# Process terms 
Process terms retrieved from the neural network model using `LIME`

---
## Load the retrieved terms

In [1]:
import os

In [2]:
# Experiment selectors: the dataset and the model whose terms are processed.
dataset_name, model_name = 'nyt', 'bert'

In [3]:
# The input file name is derived from the selected dataset/model pair.
filename = f'relevant_terms_{dataset_name}_{model_name}.csv'
# Relative locations; they are resolved against the current working directory.
root_dir, data_dir = '../..', 'data/terms'
filepath = os.path.join(root_dir, data_dir, filename)

In [4]:
import pandas as pd

In [5]:
# Load the terms table from the CSV at `filepath`.
# NOTE(review): pandas' default NA parsing turns cells such as "null" or "NA"
# into NaN; if such strings can occur as real terms they would be dropped by
# the later groupby on `term` — confirm whether NA handling must be restricted.
terms_df = pd.read_csv(filepath)

In [6]:
# Preview the first rows to check the loaded columns.
terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,Q11201,appeals_court,0.100571,0
1,Q11201,one-year_sentence,0.066058,0
2,Q11201,panel,0.046044,0
3,Q11201,arguments,0.036779,0
4,Q11201,whether,0.036227,0


---

## Rank terms by LIME weights

### Aggregate (sum) LIME weights for each (label, term) pair

In [7]:
# Total LIME weight per (label, term) pair, as a flat table with
# `label`, `term` and `weight` columns.
terms_stats_df = (
    terms_df
    .groupby(['label', 'term'], as_index=False)['weight']
    .sum()
)

In [8]:
# Preview the aggregated table.
terms_stats_df.head()

Unnamed: 0,label,term,weight
0,Q11201,1,0.174647
1,Q11201,"1,500_fee",0.002563
2,Q11201,"1,500_fees",0.001279
3,Q11201,105-page_document,0.028491
4,Q11201,10_warrants,0.029505


### Sort terms
Rank the terms within each label by total weight (descending).

In [9]:
# Rank the terms inside each label by total weight, highest first.
#
# A single two-key sort yields labels in ascending order (the order groupby
# produced) and, within a label, weights in descending order. It replaces
# `groupby('label').apply(sort_values)`, which depended on `apply` handing the
# grouping column to the function: that behaviour is deprecated in pandas 2.2,
# and once the grouping column is excluded `reset_index(drop=True)` would
# discard `label` altogether. The direct sort also avoids a Python-level call
# per group.
ranking_df = (
    terms_stats_df
    .sort_values(by=['label', 'weight'], ascending=[True, False])
    .reset_index(drop=True)  # positions 0..n-1, contiguous within each label
)

In [10]:
# Show the top rows (5 by default) of every label's ranking.
ranking_df.groupby('label').head()

Unnamed: 0,label,term,weight
0,Q11201,court,9.744176
1,Q11201,supreme_court,7.793146
2,Q11201,article,3.224292
3,Q11201,high_court,2.532203
4,Q11201,supreme_court_justice,1.695288
1634,Q11211,troops,6.565331
1635,Q11211,u.,4.751721
1636,Q11211,iraq,4.275116
1637,Q11211,s.,2.826302
1638,Q11211,us,2.091001


---

## Save ranking

In [11]:
# The output workbook is named after the dataset/model pair and is placed
# under the same root/data directories used for the input.
ranking_filename = f'ranking_{dataset_name}_{model_name}.xlsx'
ranking_filepath = os.path.join(
    root_dir,
    data_dir,
    ranking_filename,
)

In [12]:
# Display the output path as a sanity check before writing.
ranking_filepath

'../../data/terms/ranking_nyt_bert.xlsx'

In [13]:
# Write the ranking to the Excel file at `ranking_filepath`.
# NOTE(review): by default the DataFrame index is written as an extra first
# column — confirm downstream readers expect it (otherwise pass `index=False`).
ranking_df.to_excel(ranking_filepath)

---