# Process terms 
Process the terms that were retrieved from the neural network model with `LIME`: rank them per label and save the resulting ranking.

---
## Load the retrieved terms

In [1]:
import os

In [2]:
# Version tag interpolated into the input CSV and output XLSX file names.
version = 'v2'

In [3]:
# Build the path of the input CSV: <root_dir>/<data_dir>/<filename>.
# NOTE(review): root_dir is relative, so it resolves against the kernel's
# working directory — presumably the notebook's own folder; confirm.
root_dir = '../..'
data_dir = 'data/terms'
# The file name carries the `version` tag.
filename = f'relevant_terms_wikidata_nn_tfidf_{version}.csv'
filepath = os.path.join(root_dir, data_dir, filename)

In [4]:
import pandas as pd

In [5]:
# Read the retrieved terms from the CSV at `filepath` into a DataFrame.
terms_df = pd.read_csv(filepath)

In [6]:
# Preview the first rows to check the load.
terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,Q1754,stockholm,0.442581,0
1,Q1754,sweden,0.352026,0
2,Q1754,city,0.123024,0
3,Q1754,capital,0.110941,0
4,Q1754,stockholm,0.47374,1


---

## Rank terms

### Add `rank` column
Add a `rank` column: within each document (a `label`/`data_id` pair), terms are ranked by their LIME weight in descending order, so the term with the highest weight gets rank 1.

In [7]:
# Rank the terms of each document, i.e. each (label, data_id) group, by LIME
# weight: the highest weight gets rank 1. Tied weights share the average of
# their positions (pandas' default method='average'), hence the float ranks.
terms_df['rank'] = (
    terms_df
    .groupby(['label', 'data_id'])['weight']
    .rank(ascending=False)
)

In [8]:
# Preview the frame again to check the new `rank` column.
terms_df.head()

Unnamed: 0,label,term,weight,data_id,rank
0,Q1754,stockholm,0.442581,0,1.0
1,Q1754,sweden,0.352026,0,2.0
2,Q1754,city,0.123024,0,3.0
3,Q1754,capital,0.110941,0,4.0
4,Q1754,stockholm,0.47374,1,1.0


### Compute stats

In [9]:
# Aggregate per (label, term) pair, reusing one GroupBy object for both steps.
term_groups = terms_df.groupby(['label', 'term'])

# Total LIME weight and mean per-document rank of each term.
terms_stats_df_1 = term_groups.agg({'weight': 'sum', 'rank': 'mean'})

# Number of rows in which the term occurs for the label.
terms_stats_df_2 = term_groups.size().to_frame('count')

# Both frames share the (label, term) index; join them and flatten that index
# back into ordinary columns.
terms_stats_df = terms_stats_df_1.join(terms_stats_df_2).reset_index()

In [10]:
# Preview the per-(label, term) statistics.
terms_stats_df.head()

Unnamed: 0,label,term,weight,rank,count
0,Q1005682,2010_census,0.158228,4.5,2
1,Q1005682,513,0.877568,1.0,2
2,Q1005682,carver,3.252445,1.0,9
3,Q1005682,city,0.432651,5.4,5
4,Q1005682,hamburg,2.680185,2.363636,11


### Ranking 
Within each label, rank terms by mean rank (ascending), breaking ties by total weight (descending).

In [11]:
# Order the terms of every label by mean rank (best first), breaking ties by
# the larger total weight.
#
# A single multi-key sort gives the same ordering as sorting inside
# `groupby('label').apply(...)`: groupby emits the labels in sorted order, and
# the rows of each label are then ordered by (rank asc, weight desc). It avoids
# calling a Python lambda once per label, and it does not depend on
# `GroupBy.apply` passing the grouping column to the function, which pandas
# has deprecated (DeprecationWarning since 2.2).
ranking1_df = (
    terms_stats_df
    .sort_values(by=['label', 'rank', 'weight'], ascending=[True, True, False])
    .reset_index(drop=True)  # fresh 0..n-1 index in ranking order
)

In [12]:
# Show the first five rows of each label, i.e. its five best-ranked terms
# (GroupBy.head defaults to n=5).
ranking1_df.groupby('label').head()

Unnamed: 0,label,term,weight,rank,count
0,Q1005682,carver,3.252445,1.000000,9
1,Q1005682,513,0.877568,1.000000,2
2,Q1005682,hamburg,2.680185,2.363636,11
3,Q1005682,minnesota,2.503608,2.600000,10
4,Q1005682,states,0.913663,4.090909,11
...,...,...,...,...,...
4699,Q998718,fremont,1.838321,1.000000,4
4700,Q998718,nishnabotna,1.156465,1.000000,5
4701,Q998718,iowa,1.240331,2.333333,6
4702,Q998718,rivers,0.783508,2.400000,5


---

## Save selected ranking

In [13]:
# Output Excel file: written to the same <root_dir>/<data_dir> directory as the
# input terms and tagged with the same `version`.
ranking_filename = f'ranking_wikidata_nn_tfidf_{version}.xlsx'
ranking_filepath = os.path.join(root_dir, data_dir, ranking_filename)

In [14]:
# Echo the output path to confirm where the ranking will be written.
ranking_filepath

'../../data/terms/ranking_wikidata_nn_tfidf_v2.xlsx'

In [15]:
# Write the ranking to the Excel file. With the default index=True, the frame's
# integer index is written as an extra leading column.
# NOTE(review): pass index=False if that column is not wanted downstream —
# confirm with the consumers of this file before changing it.
ranking1_df.to_excel(ranking_filepath)

---