In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Locations are expressed relative to the notebook: <root>/<data>/<corpus|terms>/<file>.
root_dir = '../..'
data_dir, corpus_dir, terms_dir = 'data', 'corpus', 'terms'
filename = 'alaska_corpus_noisy.xlsx'

# Full path of the input corpus spreadsheet.
filepath = os.path.join(
    root_dir,
    data_dir,
    corpus_dir,
    filename,
)

In [3]:
# data = '../../data/alaska_corpus.xlsx'

In [4]:
# Load the corpus spreadsheet; the first sheet column is used as the row index.
A = pd.read_excel(filepath, index_col=0)

In [5]:
# Preview the first rows of the loaded corpus.
A.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,nikon_d3200_dslr_camera 18_55mm 55_200mm_lense...,ENTITY#44
1,1,nikon_d3200 18-55/3 5-5 6 55-200/4 0-5 6 new_z...,ENTITY#44
2,2,nikon_d3200 digital_dslr_camera w 18 55mm 55 2...,ENTITY#44
3,3,nikon_d3200_digital_dslr_camera 24 1 w 18 55mm...,ENTITY#44
4,4,nikon_d3200 body_price india bangalore hyderab...,ENTITY#44


In [6]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Build one "pseudo-document" per label by concatenating, in row order, the text of
# every chunk carrying that label. Each chunk is preceded by a single space.
pseudo_docs = defaultdict(str)
for label, chunk in zip(A['label'], A['chunk_doc']):
    pseudo_docs[label] = pseudo_docs[label] + " " + chunk

In [8]:
# Parallel lists: entities[k] is the label whose concatenated text is pdocs[k].
# Both follow the dict's insertion order, so they stay aligned.
entities = list(pseudo_docs)
pdocs = list(pseudo_docs.values())

In [9]:
# Fit TF-IDF on the pseudo-documents: one row per entry of `pdocs`, one column per term.
vectorizer = TfidfVectorizer(min_df=1)
tfidf_sparse = vectorizer.fit_transform(pdocs)
# Convert the sparse result to a dense NumPy array.
X = tfidf_sparse.toarray()

In [10]:
# Rank every vocabulary term by its TF-IDF weight within each entity's pseudo-document.
#
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer its replacement
# `get_feature_names_out()` and only fall back to the old name on releases that
# predate it. The result is kept as a plain list of strings either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# ranking maps entity -> list of (term, weight) tuples, highest weight first.
ranking = {}
for row_idx, entity in enumerate(entities):
    # Row `row_idx` of X is paired with entities[row_idx]. sorted() is stable, so
    # terms with equal weight keep their vocabulary order.
    ranking[entity] = sorted(zip(features, X[row_idx]), key=lambda pair: -pair[1])

In [11]:
# Print the five highest-weighted terms of each entity, weights rounded to two
# decimals, with a blank line between entities.
for entity in entities:
    top_terms = ranking[entity][:5]
    print(entity)
    for term, weight in top_terms:
        print(term, round(weight, 2))
    print()

ENTITY#44
canon_eos_5d 0.43
d3200 0.43
mark 0.39
ii 0.26
nikon_d3200 0.23

ENTITY#23
canon_eos_7d 0.71
013803117493 0.29
013803117493_ebay 0.29
18 0.27
1_j3 0.19

ENTITY#18
canon_eos_60d 0.45
60d 0.43
nikon_d300 0.31
18 0.31
canon 0.27

ENTITY#36
nikon_d3100 0.73
18 0.26
14_2_mp_digital_slr_camera_black_kit 0.25
dx_vr 0.23
d3100 0.22

ENTITY#41
nikon 0.39
d7000 0.38
16 0.36
d5100 0.35
18 0.32

ENTITY#96
70d 0.52
canon 0.32
013803221596 0.32
eos 0.3
ebay 0.25

ENTITY#6
018208015191 0.53
18 0.42
nikon_d5300 0.36
ebay 0.24
nikon_d5300_kit 0.24

ENTITY#101
canon_eos 0.57
5d_mark_iii 0.56
ebay 0.29
22_3_mp_digital_slr_camera_black_body 0.27
5d_mark 0.2

ENTITY#16
nikon_d90 0.79
018208254460 0.38
12_3_mp_digital_slr_camera_black_body 0.21
ebay 0.21
12_3_mp_digital_slr_camera_black_kit 0.17

ENTITY#57
018208254804 0.67
nikon_d800 0.57
36_3_mp_digital_slr_camera_black_body 0.32
ebay 0.25
d800 0.12

ENTITY#76
10 0.46
j1 0.38
nikon 0.36
d610 0.26
ebay 0.24

ENTITY#37
nikon_d80 0.8
018208254125_e

---
## Save ranking to a file

In [12]:
# One two-column table (label, term) per entity, terms kept in ranking order.
# The TF-IDF weights themselves are dropped here.
df_list = [
    pd.DataFrame({
        'label': [entity] * len(terms_tfidf),
        'term': [term for term, _ in terms_tfidf],
    })
    for entity, terms_tfidf in ranking.items()
]

In [13]:
# Stack the per-entity tables row-wise into a single long table. The row index of
# each piece is kept, so it restarts at 0 for every entity.
df = pd.concat(df_list)
df.head()

Unnamed: 0,label,term
0,ENTITY#44,canon_eos_5d
1,ENTITY#44,d3200
2,ENTITY#44,mark
3,ENTITY#44,ii
4,ENTITY#44,nikon_d3200


In [14]:
# Show the (rows, columns) size of the combined ranking table.
df.shape

(33216, 2)

In [15]:
# Output spreadsheet for the ranking, placed in the terms folder under the data folder.
ranking_filename = 'ranking_alaska_baseline_noisy.xlsx'
terms_path = os.path.join(root_dir, data_dir, terms_dir)
ranking_filepath = os.path.join(terms_path, ranking_filename)

In [16]:
# Display the resolved output path before writing to it.
ranking_filepath

'../../data/terms/ranking_alaska_baseline_noisy.xlsx'

In [17]:
# Write the ranking table (including its row index) to the Excel file.
# The target folder is created first so the notebook also runs on a fresh checkout
# where it does not exist yet; the write would otherwise fail on the missing folder.
ranking_dir = os.path.dirname(ranking_filepath)
if ranking_dir:  # os.makedirs('') raises, so skip when the path has no folder part
    os.makedirs(ranking_dir, exist_ok=True)
df.to_excel(ranking_filepath)

---