# Process terms 
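Process the relevant terms that were retrieved from the neural network model using `LIME`: rank them, split noun chunks into single terms, and save the resulting ranking.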

---

## Load the retrieved terms

In [1]:
import os

In [2]:
# Relative location of the CSV file that holds the retrieved terms.
root_dir, data_dir = '../..', 'data/terms'
filename = 'relevant_terms_alaska_nn.csv'
filepath = os.path.join(root_dir, data_dir, filename)

In [3]:
import pandas as pd

In [4]:
# Read the CSV at `filepath` into a DataFrame with pandas' default parsing options.
terms_df = pd.read_csv(filepath)

In [5]:
terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,ENTITY#44,nikon_d3200_dslr_camera,0.563572,0
1,ENTITY#44,18_55mm,0.26582,0
2,ENTITY#44,55_200mm_lenses,0.175857,0
3,ENTITY#44,black_ebay,0.12929,0
4,ENTITY#44,nikon_d3200,0.812544,1


---

## Rank terms

### Add `rank` column
Add a ranking column by sorting each document's terms by their LIME weight in descending order

In [6]:
# Rank the weights inside every (label, data_id) group; rank 1 is the largest weight.
# NOTE(review): rank() runs with its default tie method, which pandas documents as
# 'average', so equal weights would share a fractional rank - confirm this is intended.
terms_df['rank'] = terms_df.groupby(['label', 'data_id'])['weight'].rank(ascending=False)

In [7]:
terms_df.head()

Unnamed: 0,label,term,weight,data_id,rank
0,ENTITY#44,nikon_d3200_dslr_camera,0.563572,0,1.0
1,ENTITY#44,18_55mm,0.26582,0,2.0
2,ENTITY#44,55_200mm_lenses,0.175857,0,3.0
3,ENTITY#44,black_ebay,0.12929,0,4.0
4,ENTITY#44,nikon_d3200,0.812544,1,1.0


### Split noun chunks into single terms

In [8]:
def split_noun_chunks(input_df):
    """Split each underscore-joined noun chunk into one row per single term.

    Every piece of ``term`` (split on ``'_'``) becomes its own row that carries
    the ``label``, ``weight``, ``data_id`` and ``rank`` of the chunk it came from.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with the columns ``label``, ``term``, ``weight``, ``data_id``
        and ``rank``. Any other column is ignored. The frame is not modified.
        ``term`` values are expected to be strings; a missing term is kept as
        a single row with a missing term.

    Returns
    -------
    pandas.DataFrame
        New frame with exactly the five columns above, in that order, and a
        fresh 0..n-1 index. An empty input gives an empty frame that still
        has those columns, so later ``groupby`` calls find their keys.
    """
    columns = ['label', 'term', 'weight', 'data_id', 'rank']

    if input_df.empty:
        return pd.DataFrame(columns=columns)

    # Work on a copy with a clean positional index so the caller's frame is
    # untouched and explode() never has to deal with duplicate index labels.
    chunks = input_df[columns].reset_index(drop=True)

    # Replace every chunk by the list of its terms, then emit one row per list
    # item; explode() repeats the other columns and preserves the row order.
    chunks['term'] = chunks['term'].str.split('_')
    return chunks.explode('term').reset_index(drop=True)

In [9]:
# Rebinds `terms_df` to the frame returned by split_noun_chunks, so the previous
# chunk-level frame is no longer reachable under this name after this cell runs.
terms_df = split_noun_chunks(terms_df)

In [10]:
terms_df.head(10)

Unnamed: 0,label,term,weight,data_id,rank
0,ENTITY#44,nikon,0.563572,0,1.0
1,ENTITY#44,d3200,0.563572,0,1.0
2,ENTITY#44,dslr,0.563572,0,1.0
3,ENTITY#44,camera,0.563572,0,1.0
4,ENTITY#44,18,0.26582,0,2.0
5,ENTITY#44,55mm,0.26582,0,2.0
6,ENTITY#44,55,0.175857,0,3.0
7,ENTITY#44,200mm,0.175857,0,3.0
8,ENTITY#44,lenses,0.175857,0,3.0
9,ENTITY#44,black,0.12929,0,4.0


### Compute stats

In [11]:
# Group once by (label, term) and derive both statistics frames from that grouping.
terms_by_label = terms_df.groupby(['label', 'term'])

# Total weight and mean rank of every term within its label.
terms_stats_df_1 = terms_by_label.agg({'weight': 'sum', 'rank': 'mean'})
# Number of rows in which the term occurs for that label.
terms_stats_df_2 = terms_by_label.size().to_frame('count')

# Combine both on the shared (label, term) index and turn the keys back into columns.
terms_stats_df = terms_stats_df_1.join(terms_stats_df_2).reset_index()

In [12]:
terms_stats_df.head()

Unnamed: 0,label,term,weight,rank,count
0,ENTITY#101,16-35mm,0.724391,1.0,1
1,ENTITY#101,17-40mm,0.676306,1.0,1
2,ENTITY#101,2,0.055329,4.0,1
3,ENTITY#101,22,5.230077,1.0,6
4,ENTITY#101,24-105mm,0.724367,1.0,1


### Ranking #1
Rank the terms of each label by mean rank (ascending), breaking ties by occurrence count (descending)

In [15]:
# Order the terms of each label by mean rank (lowest first) and break ties by
# occurrence count (highest first). One multi-key sort gives the same row order
# as sorting inside groupby('label').apply(...), without depending on the
# grouping column being handed to the applied function: pandas 2.2 deprecates
# that behaviour, and once it is removed `label` would be lost after
# reset_index(drop=True).
ranking1_df = terms_stats_df\
                .sort_values(by=['label', 'rank', 'count'], ascending=[True, True, False])\
                .reset_index(drop=True)

In [20]:
# Show the first five rows of every label, i.e. its five best-ranked terms.
ranking1_df.groupby('label').head(5)

Unnamed: 0,label,term,weight,rank,count
0,ENTITY#101,kit,9.754005,1.0,12
1,ENTITY#101,dslr,6.969847,1.0,8
2,ENTITY#101,price,5.540626,1.0,7
3,ENTITY#101,22,5.230077,1.0,6
4,ENTITY#101,frame,5.230077,1.0,6
...,...,...,...,...,...
1580,ENTITY#96,dslr,15.461132,1.0,19
1581,ENTITY#96,price,3.312272,1.0,4
1582,ENTITY#96,comparison,1.388067,1.0,2
1583,ENTITY#96,24-105mm,0.706829,1.0,1


---

## Save selected ranking

In [21]:
# Output workbook, placed in the same directory the terms were loaded from.
ranking_filename = 'ranking_alaska_nn_single_terms.xlsx'
ranking_filepath = os.path.join(root_dir, data_dir, ranking_filename)

In [22]:
ranking_filepath

'../../data/terms/ranking_alaska_nn_single_terms.xlsx'

In [23]:
# Write the ranking to the Excel file at `ranking_filepath`.
# NOTE(review): to_excel() is called with its defaults, which per the pandas docs
# also writes the DataFrame index as the first column - confirm that is wanted.
ranking1_df.to_excel(ranking_filepath)

---