# Compare rankings

## Define filepaths

In [1]:
# Path components; the cells below combine them with os.path.join.
root_dir = '../..'
data_dir, corpus_dir, rank_dir, src_dir = 'data', 'corpus', 'terms', 'src'

# Dataset and model identifiers interpolated into the ranking filenames.
dataset_name, model_name = 'nyt', 'bert'

## Load rankings

### LIME

In [2]:
import os
import pandas as pd

In [3]:
# Workbook holding the LIME term ranking for the configured dataset and model.
lime_rankings_filename = f'ranking_{dataset_name}_{model_name}.xlsx'
lime_rankings_filepath = os.path.join(
    root_dir,
    data_dir,
    rank_dir,
    lime_rankings_filename,
)

# The first spreadsheet column is used as the DataFrame index.
lime_df = pd.read_excel(io=lime_rankings_filepath, index_col=0)

# Preview the first five rows.
lime_df.head(n=5)

Unnamed: 0,label,term,weight
0,Q11201,court,9.744176
1,Q11201,supreme_court,7.793146
2,Q11201,article,3.224292
3,Q11201,high_court,2.532203
4,Q11201,supreme_court_justice,1.695288


In [4]:
# (number of rows, number of columns) of the LIME ranking table.
lime_df.shape

(14325, 3)

### Baseline

In [5]:
# Workbook holding the baseline term ranking for the configured dataset.
baseline_ranking_filename = f'ranking_{dataset_name}_baseline.xlsx'
baseline_ranking_filepath = os.path.join(
    root_dir,
    data_dir,
    rank_dir,
    baseline_ranking_filename,
)

# The first spreadsheet column is used as the DataFrame index.
baseline_df = pd.read_excel(io=baseline_ranking_filepath, index_col=0)

# Preview the first five rows.
baseline_df.head(n=5)

Unnamed: 0,label,term
0,Q11201,court
1,Q11201,supreme_court
2,Q11201,says
3,Q11201,ruling
4,Q11201,ct


In [6]:
# (number of rows, number of columns) of the baseline ranking table.
baseline_df.shape

(276020, 2)

## Retrieve label names

In [7]:
# Workbook with the label names; it is looked up under the corpus directory.
label_names_filename = 'expansion.xlsx'
label_names_filepath = os.path.join(
    root_dir,
    data_dir,
    corpus_dir,
    label_names_filename,
)

In [8]:
# Read the label table with its first column as the index, then discard the
# 'terms' column, which is not needed for the comparison.
label_names_df = (
    pd.read_excel(label_names_filepath, index_col=0)
    .drop(columns='terms')
)

# Preview the first five rows.
label_names_df.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
Q60,New York City
Q51929311,largest city
Q1093829,city of the United States
Q486972,human settlement
Q1549591,big city


## Group by label

In [9]:
# Partition both ranking tables on the 'label' column so that the rows of one
# label can be fetched together (LIME first, then baseline).
grouped_lime_df, grouped_baseline_df = (
    ranking_df.groupby('label') for ranking_df in (lime_df, baseline_df)
)

## Save the top N terms to a file

In [10]:
# Number of leading terms per label taken from each ranking when exporting.
n = 30

In [11]:
# Destination workbook for the side-by-side comparison; it is written next to
# the ranking files (same rank_dir).
filename = f'top_n_{dataset_name}.xlsx'
filepath = os.path.join(
    root_dir,
    data_dir,
    rank_dir,
    filename,
)

In [12]:
# Write one worksheet per label: the first `n` baseline terms and the first
# `n` LIME terms side by side, plus the label's name.
with pd.ExcelWriter(filepath) as writer:
    # Iterating a GroupBy yields (group key, sub-frame) pairs; the key is the
    # value of the 'label' column.
    for group_name, group_lime_df in grouped_lime_df:
        # get_group raises KeyError when the label has no rows in the baseline
        # ranking.
        group_baseline_df = grouped_baseline_df.get_group(group_name)

        # The first n rows are taken as they appear in each table.
        # NOTE(review): this presumes both rankings are stored best-first -
        # confirm against the code that produced the workbooks.
        # `.values` discards the original row index so that both columns
        # start at row 0 of the sheet.
        lime_terms = group_lime_df['term'].values[:n]
        baseline_terms = group_baseline_df['term'].values[:n]

        label_name = label_names_df.loc[group_name, 'label']

        # Wrapping the arrays in Series lets columns of unequal length coexist
        # (the shorter one is padded with NaN); label_name is repeated on
        # every row. The LIME column header is derived from model_name rather
        # than hard-coded, so it stays correct when another model is
        # configured; for model_name == 'bert' it is still 'bert_term'.
        df = pd.DataFrame({'tfidf_term': pd.Series(baseline_terms),
                           f'{model_name}_term': pd.Series(lime_terms),
                           'label_name': label_name})

        # The sheet is named after the label key.
        df.to_excel(writer,
                    sheet_name=group_name,
                    index=False)

---