In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Repository layout, expressed relative to the notebook's working directory.
root_dir = '../..'
data_dir, corpus_dir, terms_dir = 'data', 'corpus', 'terms'

# Dataset to process; the corpus spreadsheet name is derived from it.
dataset_name = 'nyt'
filename = f'{dataset_name}_corpus.xlsx'

# Full path of the corpus spreadsheet: <root>/<data>/<corpus>/<file>.
filepath = os.path.join(os.path.join(root_dir, data_dir, corpus_dir), filename)

In [3]:
# Load the corpus spreadsheet; its first column becomes the row index.
# Later cells read the `label` and `chunk_doc` columns of this frame.
A = pd.read_excel(filepath, index_col=0)

In [4]:
# Preview the first rows to confirm the expected columns are present.
A.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,panel new_york_state_judges hear arguments whe...,Q11201
1,1,prof_stephen_gillers op-ed_article says florid...,Q11201
2,2,seabury start sup ct action bring hastings com...,Q11201
3,3,3-judge_fed_panel_rules nys election_law would...,Q11201
4,4,federal_appeals_court_judges ruled california_...,Q11201


In [5]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Build one "pseudo-document" per label by concatenating every chunk that
# carries that label. Each chunk is prefixed with a single space, so the
# resulting string starts with a space, exactly as incremental `+=` would give.
#
# Iterating the two columns directly avoids the per-row Series construction of
# `DataFrame.iterrows()`, and collecting the pieces in a list before a single
# join avoids re-copying an ever-growing string on every row.
chunks_by_label = defaultdict(list)
for label, chunk in zip(A['label'], A['chunk_doc']):
    chunks_by_label[label].append(" " + chunk)

# Keep the defaultdict type so an unseen label still yields an empty string.
pseudo_docs = defaultdict(str)
for label, pieces in chunks_by_label.items():
    pseudo_docs[label] = "".join(pieces)

In [7]:
# Parallel lists: entities[k] is the label whose pseudo-document is pdocs[k].
# Both follow the insertion order of `pseudo_docs`.
entities = [*pseudo_docs]
pdocs = [*pseudo_docs.values()]

In [8]:
# Fit TF-IDF over the pseudo-documents (one row per entity), then densify
# the sparse result so rows can be indexed as plain arrays further down.
vectorizer = TfidfVectorizer(min_df=1)
tfidf_sparse = vectorizer.fit_transform(pdocs)
X = tfidf_sparse.toarray()

In [9]:
# For every entity, rank the whole vocabulary by descending TF-IDF weight.
# ranking[entity] is a list of (term, weight) pairs, highest weight first.
ranking = {}

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

for row_idx, entity in enumerate(entities):
    weights = X[row_idx]
    # A stable sort on the negated weights orders terms from highest to lowest
    # weight and keeps tied terms in vocabulary order.
    order = np.argsort(-weights, kind='stable')
    rank = [(features[col], weights[col]) for col in order]
    ranking[entity] = rank

In [10]:
# Show the five highest-weighted terms of each entity, weights rounded to
# two decimals, with a blank line between entities.
for entity in entities:
    top_terms = ranking[entity][:5]
    print(entity)
    for term, weight in top_terms:
        print(term, round(weight, 2))
    print()

Q11201
court 0.39
supreme_court 0.18
says 0.18
ruling 0.16
ct 0.14

Q11211
iraq 0.36
says 0.28
baghdad 0.17
killed 0.15
troops 0.14

Q1124
says 0.38
clinton 0.37
president 0.19
bill 0.17
pres 0.16

Q1384
new_york 0.38
says 0.23
photo 0.16
albany 0.15
state 0.15

Q148
china 0.73
hong_kong 0.32
taiwan 0.22
chinese 0.14
japan 0.12

Q23505
bush 0.49
says 0.24
pres 0.22
_p 0.17
_o 0.16

Q29468
republicans 0.26
says 0.25
democrats 0.2
party 0.18
republican 0.14

Q330963
says 0.45
would 0.16
pres 0.13
may 0.12
iraq 0.12

Q66096
senate 0.62
republicans 0.24
senate_democrats 0.2
democrats 0.15
says 0.15

Q744448
says 0.41
us 0.27
foreign_policy 0.2
china 0.15
policy 0.14



---
## Save ranking to a file

In [11]:
# One two-column frame per entity: the label repeated alongside its terms,
# kept in ranked order. The TF-IDF weights themselves are not stored.
df_list = []
for entity, terms_tfidf in ranking.items():
    term_column = [term for term, _ in terms_tfidf]
    label_column = [entity] * len(terms_tfidf)
    df_list.append(pd.DataFrame({'label': label_column, 'term': term_column}))

In [12]:
# Stack the per-entity frames vertically. Each frame keeps its own 0-based
# index, so the index restarts for every label.
df = pd.concat(df_list)
df.head(5)

Unnamed: 0,label,term
0,Q11201,court
1,Q11201,supreme_court
2,Q11201,says
3,Q11201,ruling
4,Q11201,ct


In [13]:
# (rows, columns) of the stacked table: one row per (label, term) pair.
df.shape

(276020, 2)

In [14]:
# Output spreadsheet for the baseline ranking, placed in the terms folder.
ranking_filename = f'ranking_{dataset_name}_baseline.xlsx'
terms_folder = os.path.join(root_dir, data_dir, terms_dir)
ranking_filepath = os.path.join(terms_folder, ranking_filename)

In [15]:
# Display the resolved output path as a sanity check before writing.
ranking_filepath

'../../data/terms/ranking_nyt_baseline.xlsx'

In [16]:
# Write the ranking table to the Excel file at `ranking_filepath`.
# Create the target folder first so a fresh checkout without the terms
# directory does not fail on the write.
output_dir = os.path.dirname(ranking_filepath)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_excel(ranking_filepath)

---