In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Project layout, expressed relative to this notebook's working directory.
root_dir, data_dir = '../..', 'data'
corpus_dir, terms_dir = 'corpus', 'terms'

# Location of the input corpus workbook: <root>/<data>/<corpus>/<filename>.
filename = 'alaska_corpus.xlsx'
filepath = os.path.join(*(root_dir, data_dir, corpus_dir, filename))

In [3]:
# data = '../../data/alaska_corpus.xlsx'

In [4]:
# Load the corpus workbook from `filepath`; the first spreadsheet column is
# used as the DataFrame index rather than as a data column.
A = pd.read_excel(io=filepath, index_col=0)

In [5]:
# Preview the first rows of the loaded corpus as a quick sanity check.
A.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,nikon_d3200_dslr_camera 18_55mm 55_200mm_lense...,ENTITY#44
1,1,nikon_d3200 18-55/3 5-5 6 55-200/4 0-5 6 new_z...,ENTITY#44
2,2,nikon_d3200 digital_dslr_camera w 18 55mm 55 2...,ENTITY#44
3,3,nikon_d3200_digital_dslr_camera 24 1 w 18 55mm...,ENTITY#44
4,4,nikon_d3200 body_price india bangalore hyderab...,ENTITY#44


In [6]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Build one "pseudo-document" per entity label by concatenating every chunk
# that carries that label, in the order the rows appear in `A`.
#
# Chunks are first collected in per-label lists and joined once, instead of
# growing a string with `+=` inside `iterrows()`; this avoids re-copying the
# accumulated text on every row and skips building a Series per row.
chunks_by_label = defaultdict(list)
for label, chunk in zip(A['label'], A['chunk_doc']):
    chunks_by_label[label].append(chunk)

# Maps label -> concatenated text; unknown labels still default to "".
pseudo_docs = defaultdict(str)
for label, chunks in chunks_by_label.items():
    # The leading space keeps the text identical to the previous
    # `+= " " + chunk` accumulation (every chunk is preceded by one space).
    pseudo_docs[label] = " " + " ".join(chunks)

In [8]:
# Freeze the label order once so that position j in `pdocs` (and later row j
# of the tf-idf matrix) always corresponds to `entities[j]`.
entities = [*pseudo_docs]
pdocs = list(pseudo_docs.values())

In [9]:
# Fit tf-idf on the per-entity pseudo-documents (one document per entity),
# then densify the result so each row can be indexed and sorted directly.
vectorizer = TfidfVectorizer(min_df=1)
tfidf_sparse = vectorizer.fit_transform(pdocs)
X = tfidf_sparse.toarray()

In [10]:
# Vocabulary terms, indexed by column of X.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# For every entity, pair each vocabulary term with its tf-idf weight and order
# the pairs by decreasing weight. The sort is stable, so equally weighted terms
# keep vocabulary (column) order. Every vocabulary term is included for every
# entity, including those whose weight is 0.
ranking = {}
for row, entity in enumerate(entities):
    by_weight = sorted(enumerate(X[row]), key=lambda pair: -pair[1])
    ranking[entity] = [(features[col], weight) for col, weight in by_weight]

In [11]:
# Show the five highest-weighted terms of each entity, weights rounded to two
# decimals, with a blank line separating consecutive entities.
top_k = 5
for entity in entities:
    best_terms = ranking[entity][:top_k]
    print(entity)
    for term, weight in best_terms:
        print(term, round(weight, 2))
    print()

ENTITY#44
d3200 0.67
nikon_d3200 0.36
18 0.31
nikon 0.3
24_2_mp_digital_slr_camera_black_kit 0.21

ENTITY#23
canon_eos_7d 0.78
013803117493 0.32
013803117493_ebay 0.31
18 0.29
mp_digital_slr_camera_black_body 0.17

ENTITY#18
canon_eos_60d 0.49
60d 0.47
canon 0.32
eos 0.31
18 0.31

ENTITY#36
nikon_d3100 0.74
14_2_mp_digital_slr_camera_black_kit 0.26
18 0.25
dx_vr 0.25
d3100 0.23

ENTITY#41
18 0.46
d5200 0.45
nikon_d5200 0.31
24_1_mp_digital_slr_camera_black_kit 0.28
nikon 0.22

ENTITY#21
d5100 0.66
16 0.37
018208254781 0.35
nikon 0.25
18 0.19

ENTITY#75
d7000 0.74
nikon 0.32
16 0.3
018208254682 0.27
nikon_d7000 0.19

ENTITY#96
70d 0.52
canon 0.35
eos 0.32
013803221596 0.32
18 0.23

ENTITY#6
018208015191 0.54
18 0.41
nikon_d5300 0.36
nikon_d5300_kit 0.24
ebay 0.21

ENTITY#101
canon_eos 0.6
5d_mark_iii 0.55
22_3_mp_digital_slr_camera_black_body 0.26
ebay 0.25
5d_mark 0.19

ENTITY#102
canon_eos_5d 0.61
mark 0.58
ii 0.36
013803105384_ebay 0.22
21_1_mp_digital_slr_camera_black_body 0.2

ENTI

---
## Save ranking to a file

In [12]:
# One two-column frame per entity: the entity label repeated on every row,
# next to its terms in ranked order (the tf-idf weights are not kept).
df_list = [
    pd.DataFrame({
        'label': [entity] * len(terms_tfidf),
        'term': [term for term, _ in terms_tfidf],
    })
    for entity, terms_tfidf in ranking.items()
]

In [13]:
# Stack the per-entity frames vertically. Indices are not reset, so the index
# restarts at 0 for each entity and equals the term's position in its ranking.
df = pd.concat(objs=df_list, axis='index')
df.head()

Unnamed: 0,label,term
0,ENTITY#44,d3200
1,ENTITY#44,nikon_d3200
2,ENTITY#44,18
3,ENTITY#44,nikon
4,ENTITY#44,24_2_mp_digital_slr_camera_black_kit


In [14]:
# (rows, columns) of the stacked ranking table.
df.shape

(55360, 2)

In [15]:
# Output workbook for the ranking: <root>/<data>/<terms>/<ranking_filename>.
ranking_filename = 'ranking_alaska_baseline.xlsx'
ranking_path_parts = (root_dir, data_dir, terms_dir, ranking_filename)
ranking_filepath = os.path.join(*ranking_path_parts)

In [16]:
# Display the resolved output path before writing to it.
ranking_filepath

'../../data/terms/ranking_alaska_baseline.xlsx'

In [17]:
# Make sure the destination folder exists: `to_excel` does not create missing
# parent directories and would otherwise fail on a fresh checkout.
ranking_dir = os.path.dirname(ranking_filepath)
if ranking_dir:
    os.makedirs(ranking_dir, exist_ok=True)

# Write the ranking table (index included) to the Excel workbook.
df.to_excel(ranking_filepath)

---