# Process terms 
Process the terms retrieved from the neural network model using `LIME`: rank them within each label and save the selected ranking.

---
## Load the retrieved terms

In [1]:
import os

In [None]:
# Version tag interpolated into the input and output file names below
version = 'v1'

In [2]:
# Build the path to the CSV of retrieved terms for the selected version.
# `root_dir` and `data_dir` are reused later to build the output path.
root_dir = '../..'
data_dir = 'data/terms'
filename = f'relevant_terms_wikidata_nn_{version}.csv'
filepath = os.path.join(root_dir, data_dir, filename)

In [3]:
import pandas as pd

In [4]:
# Load the retrieved terms; the columns used below are
# `label`, `term`, `weight` and `data_id`.
terms_df = pd.read_csv(filepath)

In [5]:
# Preview the first rows of the loaded terms
terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,Q64,germany,0.262304,0
1,Q64,berlin,0.167252,0
2,Q64,largest_city,0.165496,0
3,Q64,capital,0.155119,0
4,Q64,convention_venues,0.027901,1


---

## Rank terms

### Add `rank` column
Add a `rank` column: within each document (a `label`, `data_id` pair), terms are ranked by their LIME weight in descending order, so rank 1 is the term with the largest weight.

In [6]:
# One group per document, identified by its (label, data_id) pair
per_document = terms_df.groupby(['label', 'data_id'])
# Largest weight gets rank 1 inside each document
terms_df['rank'] = per_document['weight'].rank(ascending=False)

In [7]:
# Preview the terms together with the new `rank` column
terms_df.head()

Unnamed: 0,label,term,weight,data_id,rank
0,Q64,germany,0.262304,0,1.0
1,Q64,berlin,0.167252,0,2.0
2,Q64,largest_city,0.165496,0,3.0
3,Q64,capital,0.155119,0,4.0
4,Q64,convention_venues,0.027901,1,1.0


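### Compute stats
For each (`label`, `term`) pair, compute the total weight, the mean rank and the number of occurrences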

In [8]:
# Aggregate per (label, term) in a single groupby pass:
#   weight - sum of the term's weights over all of its rows
#   rank   - mean of the term's per-document ranks
#   count  - number of rows in which the term appears for that label
terms_stats_df = (
    terms_df.groupby(['label', 'term'])
    .agg(
        weight=('weight', 'sum'),
        rank=('rank', 'mean'),
        count=('weight', 'size'),
    )
    .reset_index()
)

In [9]:
# Preview the per-(label, term) statistics
terms_stats_df.head()

Unnamed: 0,label,term,weight,rank,count
0,Q1086827,+23_4,0.073926,13.0,8
1,Q1086827,1927,0.155277,4.0,8
2,Q1086827,2010,0.082367,12.0,8
3,Q1086827,29,0.083359,11.0,8
4,Q1086827,477_+8_4,0.131093,6.0,8


### Ranking #1
Within each label, order terms by mean rank (ascending), breaking ties by total weight (descending)

In [12]:
# Order the terms of each label by mean rank (lowest first), breaking ties by
# total weight (largest first). A single multi-key sort with `label` as the
# leading key produces the same label-by-label layout as sorting every group
# separately, and avoids `groupby(...).apply(...)`, which newer pandas
# releases deprecate for functions that operate on the grouping column.
ranking1_df = (
    terms_stats_df
    .sort_values(by=['label', 'rank', 'weight'], ascending=[True, True, False])
    .reset_index(drop=True)
)

In [14]:
# Show the first five terms of ranking #1 for each label
ranking1_df.groupby('label').head(5)

Unnamed: 0,label,term,weight,rank,count
0,Q1086827,borough,2.502292,1.0,20
1,Q1086827,borough_population,0.198643,2.0,8
2,Q1086827,jersey,2.220848,2.4,20
3,Q1086827,camden,0.901545,4.0,12
4,Q1086827,1927,0.155277,4.0,8
21,Q1569850,county,0.547549,1.0,5
22,Q1569850,lake,1.104643,2.0,16
23,Q1569850,2010_census,0.961878,2.0,4
24,Q1569850,wisconsin,1.139769,2.214286,14
25,Q1569850,fourth-class,2.303453,2.4,20


---

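## Ranking #2
Within each label, order terms by a score that favours frequent, highly ranked terms: `score = count / rank**2` (descending)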

In [15]:
# Score = occurrence count divided by the squared mean rank, computed
# column-wise rather than with a row-by-row `apply`.
terms_stats_df['score'] = terms_stats_df['count'] / terms_stats_df['rank'] ** 2

In [16]:
# Preview the statistics together with the new `score` column
terms_stats_df.head()

Unnamed: 0,label,term,weight,rank,count,score
0,Q1086827,+23_4,0.073926,13.0,8,0.047337
1,Q1086827,1927,0.155277,4.0,8,0.5
2,Q1086827,2010,0.082367,12.0,8,0.055556
3,Q1086827,29,0.083359,11.0,8,0.066116
4,Q1086827,477_+8_4,0.131093,6.0,8,0.222222


In [17]:
# Order the terms of each label by score, highest first. Sorting once with
# `label` as the leading key keeps each label's rows contiguous and avoids
# `groupby(...).apply(...)`, which newer pandas releases deprecate for
# functions that operate on the grouping column.
ranking2_df = (
    terms_stats_df
    .sort_values(by=['label', 'score'], ascending=[True, False])
    .reset_index(drop=True)
)

In [25]:
# Show the five highest-scoring terms for each label
ranking2_df.groupby('label').head(5)

Unnamed: 0,label,term,weight,rank,count,score
0,Q1086827,borough,2.502292,1.0,20,20.0
1,Q1086827,jersey,2.220848,2.4,20,3.472222
2,Q1086827,borough_population,0.198643,2.0,8,2.0
3,Q1086827,camden,0.901545,4.0,12,0.75
4,Q1086827,new,1.134633,5.4,20,0.685871
21,Q1569850,county,0.547549,1.0,5,5.0
22,Q1569850,lake,1.104643,2.0,16,4.0
23,Q1569850,fourth-class,2.303453,2.4,20,3.472222
24,Q1569850,wisconsin,1.139769,2.214286,14,2.855359
25,Q1569850,city,1.594706,3.8,20,1.385042


---

## Save selected ranking

In [19]:
# Build the output path for the selected ranking, next to the input CSV.
# NOTE(review): the input file name contains "wikidata" while this output
# name contains "wikipedia" - confirm the difference is intentional.
ranking_filename = f'ranking_wikipedia_nn_{version}.xlsx'
ranking_filepath = os.path.join(root_dir, data_dir, ranking_filename)

In [20]:
# Display the output path as a sanity check before writing
ranking_filepath

'../../data/terms/ranking_wikipedia_nn_v1.xlsx'

In [21]:
# Save ranking #1 as the selected ranking. The DataFrame index is written
# as the first column of the sheet (pandas default `index=True`).
ranking1_df.to_excel(ranking_filepath)

---