In [2]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Corpus/ranking version tag; interpolated into the input and output file names.
version = "v1"

In [3]:
# Directory layout, expressed relative to the notebook's working directory.
root_dir, data_dir = '../..', 'data'
corpus_dir, terms_dir = 'corpus', 'terms'

# Input workbook for the selected `version`.
filename = f'wikidata_corpus_{version}.xlsx'
filepath = os.path.join(
    os.path.join(root_dir, data_dir, corpus_dir),
    filename,
)

In [4]:
# Load the corpus workbook, using its first column as the row index.
A = pd.read_excel(io=filepath, index_col=0)

In [5]:
# Preview the first five corpus rows.
A.head(n=5)

Unnamed: 0,doc_id,chunk_doc,label
0,0,berlin city-state capital largest_city germany,Q64
1,1,berlin city millions inhabitants economy based...,Q64
2,2,berlin federal capital significant industries ...,Q64
3,3,berlin city millions inhabitants_capital large...,Q64
4,4,berlin urban municipality germany east berlin ...,Q64


In [6]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Build one pseudo-document per label by concatenating all of its chunks,
# each prefixed with a single space, in row order.
pseudo_docs = defaultdict(str)
for label, chunk in zip(A['label'], A['chunk_doc']):
    pseudo_docs[label] = pseudo_docs[label] + " " + chunk

In [8]:
# Parallel lists: entity labels and their pseudo-documents, in insertion order.
entities = [*pseudo_docs]
pdocs = [pseudo_docs[label] for label in entities]

In [9]:
# Fit TF-IDF on the pseudo-documents; row k of X corresponds to entities[k].
vectorizer = TfidfVectorizer(min_df=1)
tfidf_sparse = vectorizer.fit_transform(pdocs)
# Densify so that individual rows can be enumerated and sorted below.
X = tfidf_sparse.toarray()

In [10]:
# Rank every vocabulary term for each entity by descending TF-IDF weight.
# `ranking` maps an entity label to a list of (term, weight) pairs, best first.
ranking = {}

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

for row_idx, entity in enumerate(entities):
    # Pair each term with its weight in this entity's row of X.
    weighted_terms = zip(features, X[row_idx])
    # sorted() is stable, so equally weighted terms keep vocabulary order.
    ranking[entity] = sorted(weighted_terms, key=lambda pair: -pair[1])

In [11]:
# Print the five highest-weighted terms of every entity, one block per entity.
for entity in entities:
    print(entity)
    top_terms = ranking[entity][:5]
    for term, weight in top_terms:
        print(term, round(weight, 2))
    print()

Q64
germany 0.51
berlin 0.43
capital 0.42
museum 0.25
largest_city 0.21

Q821244
city 0.46
hampshire 0.39
new 0.36
berlin 0.36
united 0.26

Q614184
human 0.49
settlement 0.49
maryland 0.37
berlin 0.29
delaware 0.17

Q1569850
city 0.43
class 0.37
fourth 0.37
wisconsin 0.35
green 0.3

Q1086827
borough 0.49
jersey 0.39
new 0.32
census 0.29
berlin 0.23

Q821199
town 0.56
berlin 0.41
connecticut 0.39
united 0.35
states 0.3



---
## Save ranking to a file

In [12]:
# One frame per entity: the label repeated next to its terms in ranked order.
# The TF-IDF weights themselves are dropped here.
df_list = [
    pd.DataFrame({
        'label': [entity] * len(terms_tfidf),
        'term': [pair[0] for pair in terms_tfidf],
    })
    for entity, terms_tfidf in ranking.items()
]

In [13]:
# Stack the per-entity frames row-wise. Each frame keeps its own 0..n-1 index,
# so the index value is the term's rank position within its label.
df = pd.concat(objs=df_list, axis='index')
df.head(n=5)

Unnamed: 0,label,term
0,Q64,germany
1,Q64,berlin
2,Q64,capital
3,Q64,museum
4,Q64,largest_city


In [14]:
# Sanity check: (rows, columns) of the combined label/term table.
df.shape

(1410, 2)

In [15]:
# Output workbook for the ranking, tagged with the same `version` as the corpus.
ranking_filename = f'ranking_wikidata_baseline_{version}.xlsx'
ranking_filepath = os.path.join(
    os.path.join(root_dir, data_dir, terms_dir),
    ranking_filename,
)

In [16]:
# Display the resolved output path before anything is written to it.
ranking_filepath

'../../data/terms/ranking_wikidata_baseline.xlsx'

In [17]:
# Persist the full ranking (every term for every label) to the terms directory.
df.to_excel(excel_writer=ranking_filepath)

In [19]:
# Export only the first five rows of each label (its top-ranked terms)
# to a workbook in the current working directory.
top5_per_label = df.groupby('label').head(5)
top5_per_label.to_excel('test1.xlsx')

---