# Compare rankings

## Define filepaths

In [1]:
# Experiment identifiers: they select which ranking workbooks are read
# and how the output workbook is named.
dataset_name = 'stackoverflow'
model_name = 'bert'

# Path components; they are joined with os.path.join further down.
root_dir = '../..'
data_dir = 'data'
rank_dir = 'terms'    # sub-folder of data_dir that holds the ranking workbooks
corpus_dir = 'corpus'  # not referenced by the cells visible here
src_dir = 'src'        # not referenced by the cells visible here

## Load rankings

### LIME

In [2]:
import os
import pandas as pd

In [3]:
# The LIME ranking workbook is specific to both the dataset and the model.
lime_rankings_filename = f'ranking_{dataset_name}_{model_name}.xlsx'
lime_rankings_filepath = os.path.join(
    os.path.join(root_dir, data_dir, rank_dir),
    lime_rankings_filename,
)
# The first spreadsheet column is used as the row index.
lime_df = pd.read_excel(lime_rankings_filepath, index_col=0)
lime_df.head(5)

Unnamed: 0,label,term,weight
0,.net,net,8.574018
1,.net,net_application,1.086539
2,.net,linq_net_two_data_tables,0.820128
3,.net,best_net_code,0.791083
4,.net,net_convert_number_string_representation,0.775314


In [4]:
lime_df.shape  # (number of rows, number of columns) of the LIME ranking

(14705, 3)

### Baseline

In [5]:
# The baseline ranking workbook depends on the dataset only.
baseline_ranking_filename = f'ranking_{dataset_name}_baseline.xlsx'
baseline_ranking_filepath = os.path.join(
    os.path.join(root_dir, data_dir, rank_dir),
    baseline_ranking_filename,
)
# Read with the first column as index, then force every term to text
# (presumably some cells are not read back as strings -- TODO confirm).
baseline_df = pd.read_excel(baseline_ranking_filepath, index_col=0).assign(
    term=lambda frame: frame['term'].astype(str)
)
baseline_df.head(5)

Unnamed: 0,label,term
0,.net,net
1,.net,using
2,.net,like
3,.net,get
4,.net,cangrowtrue_cangrow


In [6]:
baseline_df.shape  # (number of rows, number of columns) of the baseline ranking

(20000, 2)

## Group by label

In [7]:
# Split both rankings on their 'label' column so that they can be
# compared one label at a time.
grouped_baseline_df = baseline_df.groupby(by='label')
grouped_lime_df = lime_df.groupby(by='label')

## Save the top N terms to a file

In [8]:
n = 30  # number of leading terms kept per label from each ranking

In [9]:
# Output workbook; it is written to the same folder the rankings are read from.
filename = f'top_n_{dataset_name}.xlsx'
filepath = os.path.join(
    os.path.join(root_dir, data_dir, rank_dir),
    filename,
)

In [10]:
with pd.ExcelWriter(filepath) as writer:
    # One worksheet per label, holding the first n baseline terms next to
    # the first n LIME terms (in the order they appear in each ranking).
    for group_name, group_lime_df in grouped_lime_df:
        lime_terms = pd.Series(group_lime_df['term'].values[:n])

        # A label found in the LIME ranking is not guaranteed to exist in
        # the baseline ranking; get_group raises KeyError in that case.
        # Fall back to an empty column so the remaining labels still export.
        try:
            group_baseline_df = grouped_baseline_df.get_group(group_name)
        except KeyError:
            baseline_terms = pd.Series(dtype=object)
        else:
            baseline_terms = pd.Series(group_baseline_df['term'].values[:n])

        # Building the frame from Series (not raw arrays) lets the columns
        # differ in length: the shorter one is padded with NaN.
        # The LIME column is named after model_name so that the header stays
        # correct when a different model's ranking is loaded.
        df = pd.DataFrame({'tfidf_term': baseline_terms,
                           f'{model_name}_term': lime_terms})

        # Worksheet names must be strings; labels may have been read from
        # the workbook as another type.
        df.to_excel(writer,
                    sheet_name=str(group_name),
                    index=False)

---