# Process terms 
Aggregate, rank and save the relevant terms retrieved from the neural network model using `LIME`.

---
## Load the retrieved terms

In [1]:
import os

In [2]:
# Experiment selectors: both values are interpolated into the input (terms)
# and output (ranking) file names further down.
dataset_name, model_name = 'stackoverflow', 'bert'

In [3]:
# Name of the CSV holding the retrieved terms for the selected dataset/model.
filename = f'relevant_terms_{dataset_name}_{model_name}.csv'

# Both directories are relative paths, so they resolve against the current
# working directory of the notebook kernel.
data_dir = 'data/terms'
root_dir = '../..'
filepath = os.path.join(
    root_dir,
    data_dir,
    filename,
)

In [4]:
import pandas as pd

In [5]:
# Load the retrieved terms into a DataFrame.
terms_df = pd.read_csv(filepath_or_buffer=filepath)

In [6]:
# Preview the first rows to check the loaded columns.
terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,.net,net,0.730877,0
1,.net,program_file_association_windows_net_program,0.206415,1
2,.net,net_windows_application,0.202642,1
3,.net,see,0.0088,1
4,.net,whatever_program_associated_file_type_windows,0.008557,1


---

## Rank terms by LIME weights

### Aggregate (sum) LIME weights for each (label, term) pair

In [7]:
# Total weight of every (label, term) pair: sum the `weight` column over all
# rows sharing the same label and term, then turn the group keys back into
# regular columns.
weight_totals = terms_df.groupby(['label', 'term'])['weight'].sum()
terms_stats_df = weight_totals.reset_index()

In [8]:
# Preview the aggregated table (one row per label/term pair).
terms_stats_df.head()

Unnamed: 0,label,term,weight
0,.net,100_images_server,0.046995
1,.net,11,0.0117
2,.net,2,0.024258
3,.net,2009,0.029553
4,.net,23000_pm,0.022063


### Sort terms
Within each label, rank terms by total (summed) LIME weight, in descending order.

In [9]:
# Rank terms inside each label: rows are ordered by label (ascending, the same
# order groupby produces for its keys) and, within a label, by summed weight
# from highest to lowest. The index is then renumbered 0..N-1.
#
# A single multi-key sort is used instead of groupby('label').apply(sort_values):
# the apply form runs a Python-level sort per group and relies on the grouping
# column being handed to the applied function, which pandas 2.2 deprecates.
# Once grouping columns are excluded, reset_index(drop=True) would discard
# `label` altogether.
ranking_df = terms_stats_df.sort_values(
    by=['label', 'weight'],
    ascending=[True, False],
).reset_index(drop=True)

In [10]:
# GroupBy.head() defaults to n=5, so this shows the first five rows of each
# label; because ranking_df is sorted by weight within each label, these are
# the five highest-weighted terms per label.
ranking_df.groupby('label').head()

Unnamed: 0,label,term,weight
0,.net,net,8.574018
1,.net,net_application,1.086539
2,.net,linq_net_two_data_tables,0.820128
3,.net,best_net_code,0.791083
4,.net,net_convert_number_string_representation,0.775314
...,...,...,...
13834,sql,sql,18.673883
13835,sql,sql_query,5.353227
13836,sql,query,2.725643
13837,sql,table,1.980812


---

## Save ranking

In [11]:
# The ranking goes into the same directory as the input terms file, as an
# .xlsx file named after the dataset and model.
ranking_filename = f'ranking_{dataset_name}_{model_name}.xlsx'
ranking_filepath = os.path.join(
    root_dir,
    data_dir,
    ranking_filename,
)

In [12]:
# Display the destination path before writing.
ranking_filepath

'../../data/terms/ranking_stackoverflow_bert.xlsx'

In [13]:
# Write the ranking to the Excel file. With the default index=True the
# DataFrame index (0..N-1 after the reset above) is written as the first column.
ranking_df.to_excel(excel_writer=ranking_filepath)

---