# Process terms 
Process terms retrieved from the neural network model using `LIME`

---

## Load the retrieved terms

In [1]:
import os

In [2]:
# Relative location of the exported LIME terms: <root_dir>/<data_dir>/<filename>.
root_dir, data_dir = '../..', 'data/terms'
filename = 'relevant_terms_alaska_nn.csv'
filepath = os.path.join(os.path.join(root_dir, data_dir), filename)

In [3]:
import pandas as pd

In [4]:
# Load the exported terms table; the preview in the next cell shows the
# columns label, term, weight and data_id.
terms_df = pd.read_csv(filepath)

In [5]:
# Peek at the first rows to check that the file parsed as expected.
terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,ENTITY#44,nikon_d3200_dslr_camera,0.563572,0
1,ENTITY#44,18_55mm,0.26582,0
2,ENTITY#44,55_200mm_lenses,0.175857,0
3,ENTITY#44,black_ebay,0.12929,0
4,ENTITY#44,nikon_d3200,0.812544,1


---

## Rank terms

### Add `rank` column
Add a ranking column by sorting each document's terms by their LIME weight in descending order

In [8]:
# Rank the terms of each (label, data_id) group by weight; the largest weight gets rank 1.
# method='average' is the pandas default, spelled out here because tied weights
# share a fractional rank (e.g. 1.5) rather than distinct integers.
weight_by_document = terms_df.groupby(['label', 'data_id'])['weight']
terms_df['rank'] = weight_by_document.rank(method='average', ascending=False)

In [9]:
# Inspect the frame again, now including the per-group `rank` column.
terms_df.head()

Unnamed: 0,label,term,weight,data_id,rank
0,ENTITY#44,nikon_d3200_dslr_camera,0.563572,0,1.0
1,ENTITY#44,18_55mm,0.26582,0,2.0
2,ENTITY#44,55_200mm_lenses,0.175857,0,3.0
3,ENTITY#44,black_ebay,0.12929,0,4.0
4,ENTITY#44,nikon_d3200,0.812544,1,1.0


### Compute stats

In [10]:
# Summarise every (label, term) pair across all rows in which it appears:
#   weight -> sum of the weights, rank -> mean of the per-group ranks,
#   count  -> number of rows for that pair.
terms_by_label = terms_df.groupby(['label', 'term'])
terms_stats_df_1 = terms_by_label.agg({'weight': 'sum', 'rank': 'mean'})
terms_stats_df_2 = terms_by_label.size().to_frame('count')
# Both frames share the (label, term) index, so the join lines up row for row;
# reset_index() turns the keys back into ordinary columns.
terms_stats_df = terms_stats_df_1.join(terms_stats_df_2).reset_index()

In [11]:
# Preview the per-(label, term) statistics: summed weight, mean rank and count.
terms_stats_df.head()

Unnamed: 0,label,term,weight,rank,count
0,ENTITY#101,2,0.055329,4.0,1
1,ENTITY#101,3,0.085212,4.0,2
2,ENTITY#101,3_ebay,0.091817,4.0,1
3,ENTITY#101,5d,1.497793,2.0,5
4,ENTITY#101,5d_mark,10.682544,1.0,14


### Ranking #1
Rank terms by mean ranking (ascending) and total weight (descending)

In [12]:
# Order the terms of every label by mean rank (best first), breaking ties with the
# larger total weight. A single multi-key sort produces the same label-ordered rows
# as groupby('label').apply(sort_values), but does not depend on apply() handing the
# grouping column to the callback -- behaviour that pandas 2.2 deprecates, and
# without which the `label` column would be lost after reset_index(drop=True).
ranking1_df = terms_stats_df\
                .sort_values(by=['label', 'rank', 'weight'], ascending=[True, True, False])\
                .reset_index(drop=True)

In [13]:
# Show the three best-placed terms of each label under ranking #1.
ranking1_df.groupby('label').head(3)

Unnamed: 0,label,term,weight,rank,count
0,ENTITY#101,5d_mark_iii,38.21049,1.0,40
1,ENTITY#101,5d_mark,10.682544,1.0,14
2,ENTITY#101,canon_eos_5d_mark_iii_22_3_mp_full_frame,3.825436,1.0,4
56,ENTITY#102,canon_eos_5d_dslr_mark_ii_body_black_price,0.971038,1.0,1
57,ENTITY#102,5d_slr_mark_ii,0.969073,1.0,1
58,ENTITY#102,canon_eos_5d_mark_ii_body_w_acc,0.967563,1.0,1
102,ENTITY#16,nikon_d90,57.921286,1.0,62
103,ENTITY#16,nikon_d90_dx_12_3mp_digital_slr_camera,1.926593,1.0,2
104,ENTITY#16,nikon_d90_camera_body,1.922917,1.0,2
155,ENTITY#18,canon_eos_60d,53.296572,1.0,58


---

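## Ranking #2
Rank terms by a score that rewards frequent, highly ranked terms, `score = count / rank**2` (descending)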

In [14]:
# Score each (label, term) pair as count / rank**2: frequent terms score higher,
# while a poor mean rank is penalised quadratically. Computed on whole columns
# instead of a row-by-row apply(axis=1), which gives the same values more slowly.
terms_stats_df['score'] = terms_stats_df['count'] / terms_stats_df['rank'] ** 2

In [15]:
# Preview the statistics again, now with the `score` column.
terms_stats_df.head()

Unnamed: 0,label,term,weight,rank,count,score
0,ENTITY#101,2,0.055329,4.0,1,0.0625
1,ENTITY#101,3,0.085212,4.0,2,0.125
2,ENTITY#101,3_ebay,0.091817,4.0,1,0.0625
3,ENTITY#101,5d,1.497793,2.0,5,1.25
4,ENTITY#101,5d_mark,10.682544,1.0,14,14.0


In [16]:
# Order the terms of every label by score, highest first. A single multi-key sort
# replaces groupby('label').apply(sort_values): the rows come out label by label in
# the same way, without depending on apply() handing the grouping column to the
# callback -- behaviour that pandas 2.2 deprecates, and without which the `label`
# column would be lost after reset_index(drop=True).
# NOTE: rows that tie on score within a label may be ordered differently than before.
ranking2_df = terms_stats_df\
                .sort_values(by=['label', 'score'], ascending=[True, False])\
                .reset_index(drop=True)

In [17]:
# Show the three highest-scoring terms of each label under ranking #2.
ranking2_df.groupby('label').head(3)

Unnamed: 0,label,term,weight,rank,count,score
0,ENTITY#101,5d_mark_iii,38.21049,1.0,40,40.0
1,ENTITY#101,5d_mark,10.682544,1.0,14,14.0
2,ENTITY#101,canon_eos_5d_mark_iii_22_3_mp_full_frame,3.825436,1.0,4,4.0
56,ENTITY#102,canon_eos_5d,51.191093,1.039474,76,70.337446
57,ENTITY#102,ii,22.174833,1.944444,72,19.043265
58,ENTITY#102,mark,13.996577,3.064935,77,8.196872
102,ENTITY#16,nikon_d90,57.921286,1.0,62,62.0
103,ENTITY#16,12_3_mp_digital_slr_camera_black_kit,1.390819,2.0,15,3.75
104,ENTITY#16,12_3_mp_digital_slr_camera_black_body,0.909726,2.0,13,3.25
155,ENTITY#18,canon_eos_60d,53.296572,1.0,58,58.0


After a quick comparison, ranking #1 seems to yield better results than ranking #2, so ranking #1 is the one saved below.

---

## Save selected ranking

In [19]:
# The ranking workbook is written to the same directory the terms were read from.
ranking_filename = 'ranking_alaska_nn.xlsx'
terms_dir = os.path.join(root_dir, data_dir)
ranking_filepath = os.path.join(terms_dir, ranking_filename)

In [20]:
# Display the resolved output path as a sanity check before writing.
ranking_filepath

'../../data/terms/ranking_alaska_nn.xlsx'

In [21]:
# Persist ranking #1 as an Excel workbook.
# With the default index=True, the row index is written as the first, unnamed column.
ranking1_df.to_excel(ranking_filepath)

---