In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Directory layout, relative to the working directory:
#   <root_dir>/<data_dir>/<corpus_dir | terms_dir>/<file>
root_dir = '../..'
data_dir, corpus_dir, terms_dir = 'data', 'corpus', 'terms'

# Location of the input corpus workbook.
filename = 'alaska_corpus.xlsx'
filepath = os.path.join(root_dir, data_dir, corpus_dir, filename)

In [3]:
# Legacy corpus location, superseded by `filepath`: data = '../../data/alaska_corpus.xlsx'

In [4]:
# Read the corpus workbook; the sheet's first column is used as the row index.
A = pd.read_excel(io=filepath, index_col=0)

In [5]:
# Preview the first rows of the loaded corpus.
A.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,nikon_d3200_dslr_camera 18_55mm 55_200mm_lense...,ENTITY#44
1,1,nikon_d3200 18-55/3 5-5 6 55-200/4 0-5 6 new_z...,ENTITY#44
2,2,nikon_d3200 digital_dslr_camera w 18 55mm 55 2...,ENTITY#44
3,3,nikon_d3200_digital_dslr_camera 24 1 w 18 55mm...,ENTITY#44
4,4,nikon_d3200 body_price india bangalore hyderab...,ENTITY#44


Split noun chunks into single terms

In [6]:
# Multi-word chunks are glued together with underscores; replacing every
# underscore with a space breaks each chunk into its individual terms.
A['chunk_doc'] = A['chunk_doc'].map(lambda chunk: chunk.replace('_', ' '))
A.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,nikon d3200 dslr camera 18 55mm 55 200mm lense...,ENTITY#44
1,1,nikon d3200 18-55/3 5-5 6 55-200/4 0-5 6 new z...,ENTITY#44
2,2,nikon d3200 digital dslr camera w 18 55mm 55 2...,ENTITY#44
3,3,nikon d3200 digital dslr camera 24 1 w 18 55mm...,ENTITY#44
4,4,nikon d3200 body price india bangalore hyderab...,ENTITY#44


In [7]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Build one pseudo-document per entity label: every chunk_doc carrying that
# label is appended in row order, each piece preceded by a single space.
pseudo_docs = defaultdict(str)
for label, chunk in zip(A['label'], A['chunk_doc']):
    pseudo_docs[label] = pseudo_docs[label] + " " + chunk

In [9]:
# Fix an entity order and line the pseudo-documents up with it, so that
# pdocs[i] is the text collected for entities[i].
entities = list(pseudo_docs)
pdocs = list(pseudo_docs.values())

In [10]:
# Fit TF-IDF on the per-entity pseudo-documents. The fitted result is sparse;
# it is densified so each row (one pseudo-document, in `pdocs` order) can be
# indexed directly in the following cells.
vectorizer = TfidfVectorizer(min_df=1)
tfidf_sparse = vectorizer.fit_transform(pdocs)
X = tfidf_sparse.toarray()

In [11]:
# Rank the vocabulary of every entity by descending TF-IDF weight.
# ranking[entity] is a list of (term, weight) pairs that covers the whole
# vocabulary, so it also contains the terms whose weight is 0 for that entity.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

ranking = {}
for row_idx, entity in enumerate(entities):
    weights = X[row_idx]
    # Stable sort on the negated weights: highest weight first, ties kept in
    # vocabulary order (same ordering as a stable sorted() on -weight).
    order = np.argsort(-weights, kind='stable')
    ranking[entity] = [(features[col], weights[col]) for col in order]

In [12]:
# For each entity, print its label followed by its five highest-ranked terms
# (weight rounded to two decimals), then a blank separator line.
for label in entities:
    print(label)
    top_terms = ranking[label][:5]
    for term, weight in top_terms:
        print(term, round(weight, 2))
    print()

ENTITY#44
d3200 0.78
nikon 0.33
18 0.21
camera 0.16
digital 0.16

ENTITY#23
7d 0.59
eos 0.44
canon 0.44
013803117493 0.33
18 0.19

ENTITY#18
60d 0.65
canon 0.43
eos 0.42
013803129052 0.22
18 0.21

ENTITY#36
d3100 0.74
14 0.33
nikon 0.31
18 0.18
camera 0.17

ENTITY#41
d5200 0.78
nikon 0.35
18 0.22
vr 0.16
camera 0.16

ENTITY#21
d5100 0.72
018208254781 0.31
nikon 0.31
16 0.27
18 0.18

ENTITY#75
d7000 0.76
nikon 0.32
018208254682 0.31
16 0.25
digital 0.16

ENTITY#96
70d 0.64
canon 0.42
eos 0.41
stm 0.2
20 0.17

ENTITY#6
d5300 0.77
nikon 0.34
018208015191 0.24
18 0.2
camera 0.16

ENTITY#101
5d 0.5
mark 0.44
iii 0.41
eos 0.37
canon 0.37

ENTITY#102
5d 0.45
mark 0.41
013803105384 0.39
21 0.36
canon 0.33

ENTITY#16
d90 0.7
12 0.37
018208254460 0.31
nikon 0.29
camera 0.17

ENTITY#57
d800 0.58
018208254804 0.5
36 0.43
nikon 0.27
camera 0.18

ENTITY#76
d610 0.68
018208015405 0.37
nikon 0.33
24 0.24
fx 0.2

ENTITY#19
d3300 0.77
nikon 0.34
018208015320 0.23
18 0.19
vr 0.15

ENTITY#58
j1 0.62
10 0.

---
## Save ranking to a file

In [13]:
# One two-column frame per entity: the entity label repeated on every row,
# next to its terms in ranked order. The TF-IDF weights are not kept.
df_list = [
    pd.DataFrame({
        'label': [entity] * len(terms_tfidf),
        'term': [term for term, _ in terms_tfidf],
    })
    for entity, terms_tfidf in ranking.items()
]

In [14]:
# Stack the per-entity frames vertically. The original indices are kept, so
# the index restarts at 0 for every entity and equals the term's position in
# that entity's ranking.
df = pd.concat(df_list, axis='index', ignore_index=False)
df.head()

Unnamed: 0,label,term
0,ENTITY#44,d3200
1,ENTITY#44,nikon
2,ENTITY#44,18
3,ENTITY#44,camera
4,ENTITY#44,digital


In [15]:
# (rows, columns) of the stacked ranking table.
df.shape

(21760, 2)

In [16]:
# Output workbook for the ranking, placed in the terms folder under the data
# directory.
ranking_filename = 'ranking_alaska_baseline_single_terms.xlsx'
ranking_filepath = os.path.join(
    os.path.join(root_dir, data_dir, terms_dir),
    ranking_filename,
)

In [17]:
# Display the resolved output path before writing to it.
ranking_filepath

'../../data/terms/ranking_alaska_baseline_single_terms.xlsx'

In [18]:
# to_excel does not create missing parent directories, so make sure the
# output folder exists; otherwise the write fails when the folder is absent.
out_dir = os.path.dirname(ranking_filepath)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write the ranking table; the frame's index is written as the first column.
df.to_excel(ranking_filepath)

---