# Process terms 
Process the terms that were retrieved from the neural network model with `LIME`.

---
## Load the retrieved terms

In [1]:
import os

In [2]:
# Version tag interpolated into both the input CSV filename and the output
# ranking filename built further down.
version = 'v2'

In [3]:
# Build the path of the CSV holding the retrieved terms for the selected
# version. root_dir and data_dir are reused later for the output file.
root_dir = '../..'
data_dir = 'data/terms'
filename = f'relevant_terms_wikidata_nn_{version}.csv'
filepath = os.path.join(root_dir, data_dir, filename)

In [4]:
import pandas as pd

In [5]:
# Load the retrieved terms. The later cells rely on the columns
# 'label', 'term', 'weight' and 'data_id' being present.
terms_df = pd.read_csv(filepath)

In [6]:
terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,Q1754,stockholm,0.555859,0
1,Q1754,sweden,0.406443,0
2,Q1754,city,0.138746,0
3,Q1754,capital,0.130178,0
4,Q1754,stockholm,0.395502,1


---

## Rank terms

### Split noun chunks into single terms

In [7]:
def split_noun_chunks(input_df: pd.DataFrame) -> pd.DataFrame:
    """Split underscore-joined noun chunks into one row per single term.

    Each row's ``term`` is split on ``'_'``; every resulting piece becomes its
    own output row carrying the source row's ``label``, ``weight`` and
    ``data_id`` unchanged (the weight is copied to each piece, not divided).

    Args:
        input_df: Frame with at least the columns ``label``, ``term``,
            ``weight`` and ``data_id``. Any other columns are ignored.

    Returns:
        A new DataFrame with exactly the columns ``label``, ``term``,
        ``weight`` and ``data_id`` and a fresh 0..n-1 index. An empty input
        yields an empty frame that still has these four columns, so
        downstream ``groupby(['label', 'term'])`` calls keep working.
    """
    columns = ['label', 'term', 'weight', 'data_id']

    # Walk the needed columns in lockstep instead of using iterrows(), which
    # materialises a Series object for every single row.
    rows = zip(*(input_df[column] for column in columns))

    df_data = [
        {'label': label, 'term': term, 'weight': weight, 'data_id': data_id}
        for label, noun_chunk, weight, data_id in rows
        for term in noun_chunk.split('_')
    ]

    # Passing columns= pins the schema even when df_data is empty.
    return pd.DataFrame(df_data, columns=columns)

In [8]:
# Rebinds terms_df to the one-row-per-single-term frame; the original
# noun-chunk rows are no longer reachable through this name afterwards.
terms_df = split_noun_chunks(terms_df)

In [9]:
terms_df.head(10)

Unnamed: 0,label,term,weight,data_id
0,Q1754,stockholm,0.555859,0
1,Q1754,sweden,0.406443,0
2,Q1754,city,0.138746,0
3,Q1754,capital,0.130178,0
4,Q1754,stockholm,0.395502,1
5,Q1754,sweden,0.207462,1
6,Q1754,capital,0.106679,1
7,Q1754,city,0.34199,2
8,Q1754,fourteen,0.33089,2
9,Q1754,islands,0.33089,2


### Aggregate (sum) LIME weights for each term

In [10]:
# Total weight per (label, term) pair; as_index=False keeps the grouping
# keys as ordinary columns next to the summed 'weight'.
grouped_terms = terms_df.groupby(['label', 'term'], as_index=False)
terms_stats_df = grouped_terms['weight'].sum()

In [11]:
terms_stats_df.head()

Unnamed: 0,label,term,weight
0,Q1005682,2010,0.304872
1,Q1005682,513,0.418916
2,Q1005682,carver,0.563979
3,Q1005682,census,0.304872
4,Q1005682,city,0.409819


### Sort terms
Rank the terms within each label by their total weight (descending).

In [12]:
# Order rows by label, then by descending total weight within each label.
# A single multi-key sort replaces groupby('label').apply(sort_values):
# apply() operating on the grouping column is deprecated since pandas 2.2,
# and once that column is no longer handed to the function the following
# reset_index(drop=True) would discard 'label' altogether.
ranking_df = terms_stats_df\
                .sort_values(by=['label', 'weight'], ascending=[True, False])\
                .reset_index(drop=True)

In [13]:
ranking_df.groupby('label').head()

Unnamed: 0,label,term,weight
0,Q1005682,hamburg,6.433465
1,Q1005682,states,2.278421
2,Q1005682,united,2.012429
3,Q1005682,minnesota,1.838184
4,Q1005682,county,1.060182
...,...,...,...
4261,Q998718,hamburg,2.675068
4262,Q998718,iowa,1.819555
4263,Q998718,states,1.636533
4264,Q998718,united,1.130525


---

## Save ranking

In [14]:
# The ranking is written next to the input terms file (same root_dir and
# data_dir), with the version tag embedded in the filename.
ranking_filename = f'ranking_wikidata_nn_{version}_single_terms.xlsx'
ranking_filepath = os.path.join(root_dir, data_dir, ranking_filename)

In [15]:
ranking_filepath

'../../data/terms/ranking_wikidata_nn_v2_single_terms.xlsx'

In [16]:
# Write the per-label ranking to the Excel file. to_excel is called with its
# defaults, so the DataFrame index is written as a leading column as well.
ranking_df.to_excel(ranking_filepath)

---