# Process terms 
Process the terms that were retrieved from the neural network model with `LIME`.

## Load the retrieved terms

In [1]:
import os

In [2]:
# Which experiment to process: the dataset and the test run that produced the terms.
dataset_name = 'alaska'
test_name = 'sampling_terms_test2'

# Where the term files live, relative to this notebook.
root_dir = '../..'
data_dir = 'data/terms'

# Full path of the CSV holding the retrieved terms for that experiment.
filename = f'relevant_terms_{dataset_name}_bert_{test_name}.csv'
filepath = os.path.join(root_dir, data_dir, filename)

In [3]:
import pandas as pd

In [4]:
# Load the retrieved terms; the later cells rely on the `label`, `term` and
# `weight` columns of this frame.
terms_df = pd.read_csv(filepath)

In [5]:
terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,ENTITY#44,nikon_d3200_dslr_camera,0.865474,0
1,ENTITY#44,nikon_d3200,0.861488,1
2,ENTITY#44,nikon_d3200,0.7611,2
3,ENTITY#44,digital_dslr_camera,0.039113,2
4,ENTITY#44,nikon_d3200_digital_dslr_camera,0.808393,3


In [6]:
terms_df.shape

(3295, 4)

---

## Rank terms

### Aggregate (sum) LIME weights for each (label, term) pair

In [7]:
# One row per (label, term) pair, with `weight` holding the sum of all the
# weights recorded for that pair. `as_index=False` keeps the group keys as
# ordinary columns, so no reset_index() is needed afterwards.
weights_by_term = terms_df.groupby(['label', 'term'], as_index=False)['weight']
terms_stats_df = weights_by_term.sum()

In [8]:
terms_stats_df.head()

Unnamed: 0,label,term,weight
0,ENTITY#101,105mm_ultimate_lens_bundle_package,0.0713
1,ENTITY#101,22_3mp_brand,0.069255
2,ENTITY#101,2_batteries,0.105568
3,ENTITY#101,3,0.264943
4,ENTITY#101,3_ebay,0.062474


### Sort terms
Rank the terms of each label by total weight (descending).

In [9]:
# Rank the terms of every label by total weight, heaviest first.
#
# A single multi-key sort replaces the former
# `groupby('label').apply(lambda grp: grp.sort_values(...))`:
#   * `apply` operating on the grouping column is deprecated since pandas 2.2;
#     once that column is excluded from the applied frame, the following
#     `reset_index(drop=True)` would discard `label` altogether and break any
#     later `groupby('label')` on the result.
#   * it avoids a Python-level sort per group.
# Labels come out in ascending order (as the groupby produced them) and,
# within a label, weights in descending order. The multi-key sort is stable,
# so equal weights keep their existing relative order.
# The fresh 0..n-1 index makes a row's position its place in the ranking.
ranking_df = terms_stats_df\
                .sort_values(by=['label', 'weight'], ascending=[True, False])\
                .reset_index(drop=True)

In [10]:
ranking_df.groupby('label').head(3)

Unnamed: 0,label,term,weight
0,ENTITY#101,5d_mark_iii,34.828042
1,ENTITY#101,5d_mark,7.037905
2,ENTITY#101,canon_eos_5d_mark_iii_22_3_mp_full_frame,3.797256
69,ENTITY#102,ii,49.854408
70,ENTITY#102,mark,15.690475
71,ENTITY#102,canon_eos_5d,14.837075
106,ENTITY#16,nikon_d90,58.386318
107,ENTITY#16,nikon_d90_camera_body,1.92114
108,ENTITY#16,nikon_d90_camera,1.91176
139,ENTITY#18,canon_eos_60d,50.040521


---

## Save ranking

In [11]:
# The ranking is stored in the same directory as the input terms, named after
# the same dataset and test run.
ranking_filename = f'ranking_{dataset_name}_bert_{test_name}.xlsx'
ranking_filepath = os.path.join(root_dir, data_dir, ranking_filename)

In [12]:
ranking_filepath

'../../data/terms/ranking_alaska_bert_sampling_terms_test2.xlsx'

In [13]:
# Write the ranking to an Excel workbook (pandas needs an Excel writer engine
# such as openpyxl for .xlsx files). With the default `index=True` the row
# index is written as the first, unnamed column of the sheet.
ranking_df.to_excel(ranking_filepath)

---