# Build TF-IDF based baseline

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Corpus version tag; it is interpolated into the input and output file names.
version = 'v2'

In [3]:
# Directory layout: <root_dir>/<data_dir>/<corpus_dir> holds the input corpus;
# terms_dir names the sibling folder used for the exported ranking.
root_dir, data_dir = '../..', 'data'
corpus_dir, terms_dir = 'corpus', 'terms'

# Input workbook for the selected corpus version.
filename = f'wikidata_corpus_{version}.xlsx'
filepath = os.path.join(root_dir, data_dir, corpus_dir, filename)

In [4]:
# Read the corpus workbook; the first spreadsheet column becomes the index.
A = pd.read_excel(io=filepath, index_col=0)

In [5]:
# Preview the first rows of the loaded corpus.
A.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,stockholm city capital sweden,Q1754
1,1,stockholm capital capital sweden,Q1754
2,2,sthlm city city stretches across fourteen_isla...,Q1754
3,3,sthlm capital city stretches across fourteen_i...,Q1754
4,4,stockholm big city hosts annual nobel prize ce...,Q1754


Split noun chunks into single terms

In [6]:
# Underscores join the words of a noun chunk; swap them for spaces so that
# every word is treated as a separate term downstream.
A['chunk_doc'] = A['chunk_doc'].map(lambda chunk: chunk.replace('_', ' '))
A.head()

Unnamed: 0,doc_id,chunk_doc,label
0,0,stockholm city capital sweden,Q1754
1,1,stockholm capital capital sweden,Q1754
2,2,sthlm city city stretches across fourteen isla...,Q1754
3,3,sthlm capital city stretches across fourteen i...,Q1754
4,4,stockholm big city hosts annual nobel prize ce...,Q1754


In [7]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Build one pseudo-document per entity: every chunk_doc string sharing a label
# is concatenated, each preceded by a single space, in original row order.
#
# The two columns are iterated directly instead of via DataFrame.iterrows(),
# which constructs a Series for every row, and the text is joined once per
# label rather than grown with repeated string concatenation. The resulting
# mapping is identical to the row-by-row version.
chunks_by_label = defaultdict(list)
for label, chunk_doc in zip(A['label'], A['chunk_doc']):
    chunks_by_label[label].append(chunk_doc)

pseudo_docs = defaultdict(lambda: "")
for label, chunks in chunks_by_label.items():
    # Leading space kept so the strings match the incremental " " + chunk form.
    pseudo_docs[label] = " " + " ".join(chunks)

In [9]:
# Fix a common ordering: pdocs[k] is the pseudo-document of entities[k].
entities = [*pseudo_docs]
pdocs = list(pseudo_docs.values())

In [10]:
# Fit the TF-IDF model on the pseudo-documents and materialise the
# document-term matrix as a dense array (row k belongs to pdocs[k]).
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(raw_documents=pdocs).toarray()

In [11]:
# For every entity, rank the whole vocabulary by descending TF-IDF weight.
# ranking maps an entity label to a list of (term, weight) tuples, best first.
ranking = {}

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

for doc_idx, entity in enumerate(entities):
    weights = X[doc_idx]
    # Stable sort on the negated weights: descending order, and terms with
    # equal weight keep their vocabulary order.
    order = np.argsort(-weights, kind='stable')
    ranking[entity] = [(features[term_idx], weights[term_idx]) for term_idx in order]

In [12]:
# Show the five highest-weighted terms of each entity, weights rounded to
# two decimals, with a blank line between entities.
for entity in entities:
    top_terms = ranking[entity][:5]
    print(entity)
    for term, weight in top_terms:
        print(term, round(weight, 2))
    print()

Q1754
stockholm 0.56
sthlm 0.36
city 0.31
sweden 0.26
arena 0.17

Q1787199
stockholm 0.53
town 0.4
dakota 0.4
states 0.34
united 0.34

Q976601
wisconsin 0.71
village 0.5
stockholm 0.4
pepin 0.17
66 0.12

Q1484620
asteroid 0.64
stockholm 0.58
10552 0.5
00 0.0
000 0.0

Q3447382
stockholm 0.54
maine 0.52
town 0.41
states 0.29
united 0.29

Q30599096
film 0.65
stockholm 0.4
april 0.2
2018 0.19
budreau 0.13

Q906
čelâbinsk 0.55
russia 0.34
city 0.28
chelyabinsk 0.27
ural 0.2

Q1482450
chelyabinsk 0.58
asteroid 0.53
21088 0.3
1992 0.19
belgian 0.11

Q5714
oblast 0.81
russia 0.4
chelyabinsk 0.33
chelyabinskaya 0.08
subject 0.08

Q1777988
university 0.75
ural 0.33
state 0.26
chelyabinsk 0.21
south 0.19

Q4661508
meteor 0.55
earth 0.42
near 0.37
asteroid 0.28
chelyabinsk 0.25

Q2332010
time 0.69
zone 0.38
yekaterinburg 0.31
utc 0.27
ahead 0.19

Q600277
tractor 0.5
plant 0.42
chelyabinsk 0.39
чтз 0.3
russian 0.21

Q894
samara 0.67
russia 0.32
city 0.29
town 0.27
volga 0.19

Q475697
fruit 0.53
typ

---
## Save ranking to a file

In [13]:
# One two-column frame per entity: the entity label repeated next to each of
# its ranked terms (weights are dropped, only the order is kept).
df_list = [
    pd.DataFrame({
        'label': [entity] * len(terms_tfidf),
        'term': [pair[0] for pair in terms_tfidf],
    })
    for entity, terms_tfidf in ranking.items()
]

In [14]:
# Stack the per-entity frames vertically into one long table. Each frame keeps
# its own 0-based index, so the index restarts for every entity.
df = pd.concat(df_list, axis='index')
df.head()

Unnamed: 0,label,term
0,Q1754,stockholm
1,Q1754,sthlm
2,Q1754,city
3,Q1754,sweden
4,Q1754,arena


In [15]:
# Row and column counts of the combined ranking table.
df.shape

(842790, 2)

In [16]:
# Output workbook for the ranking, placed in <root_dir>/<data_dir>/<terms_dir>.
ranking_filename = f'ranking_wikidata_baseline_{version}_single_terms.xlsx'
ranking_filepath = os.path.join(
    os.path.join(root_dir, data_dir, terms_dir),
    ranking_filename,
)

In [17]:
# Display the resolved output path for a quick visual check.
ranking_filepath

'../../data/terms/ranking_wikidata_baseline_v2_single_terms.xlsx'

In [None]:
# Persist the ranking table as an Excel workbook. The target directory is
# created first so the export does not fail, after all the preceding work,
# merely because the output folder is missing.
output_dir = os.path.dirname(ranking_filepath)
if output_dir:  # empty when the path has no directory component
    os.makedirs(output_dir, exist_ok=True)
df.to_excel(ranking_filepath)

---