In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Which dataset to process; the corpus spreadsheet is named after it.
dataset_name = 'abstracts'
filename = f'{dataset_name}_corpus.xlsx'

# Directory layout, expressed relative to this notebook's location.
root_dir = '../..'
data_dir = 'data'
corpus_dir = 'corpus'   # input: preprocessed corpus spreadsheets
terms_dir = 'terms'     # output: term rankings

# Full path of the corpus spreadsheet to load.
filepath = os.path.join(root_dir, data_dir, corpus_dir, filename)

In [3]:
# Load the corpus spreadsheet; its first column is used as the row index.
A = pd.read_excel(filepath, index_col=0)

In [4]:
# Preview the first rows to check the columns used below (`chunk_doc`, `label`).
A.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,work mind construction concurrent_systems comp...,cs
1,1,optimal_selection interdependent projects impl...,cs
2,2,sorting one classic_problems computer_science ...,cs
3,3,dependency_analysis technique identify determi...,cs
4,4,module theorem janhunen et_al demonstrates pro...,cs


In [5]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Build one "pseudo-document" per label by concatenating every chunk that
# carries that label, separated (and preceded) by a single space.
#
# Chunks are first collected in per-label lists and joined once at the end:
# appending to a string stored in a dict re-copies the accumulated text on
# every row, and `iterrows()` builds a full Series per row, so the previous
# row-by-row concatenation scaled poorly with corpus size.
chunks_by_label = defaultdict(list)
for label, chunk in zip(A['label'], A['chunk_doc']):
    chunks_by_label[label].append(chunk)

# Same mapping type and same string contents as before (leading space kept);
# labels stay in order of first appearance in `A`.
pseudo_docs = defaultdict(lambda: "")
for label, chunks in chunks_by_label.items():
    pseudo_docs[label] = " " + " ".join(chunks)

In [7]:
# Freeze the label order once so that `entities[k]` and `pdocs[k]` always
# refer to the same label.
entities = [label for label in pseudo_docs]
pdocs = list(map(pseudo_docs.__getitem__, entities))

In [8]:
# Fit TF-IDF on the pseudo-documents: one row per label, one column per
# vocabulary term.
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(pdocs)
# Densify so each row can be iterated and sorted term by term below.
X = X.toarray()

In [9]:
# For every label, rank the whole vocabulary by descending TF-IDF weight.
# `ranking[label]` is a list of (term, weight) pairs, best term first.
ranking = {}

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement and fall back only on old installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()

for row_idx, entity in enumerate(entities):
    # `sorted` is stable, so terms with equal weight keep vocabulary order.
    by_weight = sorted(enumerate(X[row_idx]), key=lambda pair: -pair[1])
    # Distinct index names: the comprehension no longer shadows the row index.
    ranking[entity] = [(features[col_idx], weight) for col_idx, weight in by_weight]

In [10]:
# Sanity check: show the first five ranked terms of each label with their
# weights rounded to two decimals, followed by a blank line.
for entity in entities:
    top_terms = ranking[entity][:5]
    print(entity)
    for term, weight in top_terms:
        print(f"{term!s} {round(weight, 2)!s}")
    print()

cs
paper 0.33
using 0.22
show 0.2
used 0.2
also 0.19

math
show 0.26
prove 0.24
also 0.23
paper 0.2
study 0.17

physics
using 0.21
also 0.19
show 0.17
used 0.17
based 0.16

stat
paper 0.23
show 0.22
based 0.2
using 0.2
also 0.19



---
## Save ranking to a file

In [11]:
# One frame per label: the label repeated alongside its terms in ranked order
# (the weights themselves are dropped).
df_list = [
    pd.DataFrame({
        'label': [entity] * len(terms_tfidf),
        'term': [pair[0] for pair in terms_tfidf],
    })
    for entity, terms_tfidf in ranking.items()
]

In [12]:
# Stack the per-label frames vertically. The index is not reset, so it
# restarts at 0 for every label and gives each term's position within its
# label's frame.
df = pd.concat(df_list, axis=0)
df.head()

Unnamed: 0,label,term
0,cs,paper
1,cs,using
2,cs,show
3,cs,used
4,cs,also


In [13]:
# Sanity check on the stacked frame: (rows, columns), with columns `label` and `term`.
df.shape

(152712, 2)

In [14]:
# Output spreadsheet for the baseline ranking, stored under the terms directory.
ranking_filename = f'ranking_{dataset_name}_baseline.xlsx'
ranking_filepath = os.path.join(
    os.path.join(root_dir, data_dir, terms_dir),
    ranking_filename,
)

In [15]:
# Display the resolved output path before writing to it.
ranking_filepath

'../../data/terms/ranking_abstracts_baseline.xlsx'

In [16]:
# Make sure the output directory exists so a fresh checkout can run the
# notebook end to end; `to_excel` does not create missing parent directories.
ranking_dir = os.path.dirname(ranking_filepath)
if ranking_dir:
    os.makedirs(ranking_dir, exist_ok=True)

# Write the ranking (index included) to the spreadsheet.
df.to_excel(ranking_filepath)

---