# Top N terms for Alaska

## Define filepaths

In [1]:
# Path components; all of them are relative and get joined onto root_dir.
root_dir = '../..'  # base directory for every path built in this notebook
src_dir = 'src'  # joined with root_dir and appended to sys.path
data_dir = 'data'  # parent of corpus_dir and rankings_dir
corpus_dir = 'corpus'  # directory of the ground-truth JSON file
rankings_dir = 'terms'  # directory the ranking workbooks are read from and written to
dataset_name = 'alaska'  # interpolated into the ranking file names

## Load ground truth

In [2]:
import os
import json

In [3]:
# Ground-truth file location: <root_dir>/<data_dir>/<corpus_dir>/<gt_filename>.
gt_filename = 'alaska_camera_gt.json'
gt_filepath = os.path.join(root_dir, data_dir, corpus_dir, gt_filename)

In [4]:
# Decode the JSON explicitly as UTF-8: without an `encoding` argument, open()
# falls back to the platform's locale encoding, which is not UTF-8 everywhere.
with open(gt_filepath, 'r', encoding='utf-8') as fd:
    gt_dict = json.load(fd)

In [5]:
from pprint import pprint

In [6]:
# Show the loaded ground truth (entity id -> camera model name).
pprint(gt_dict)

{'ENTITY#101': 'canon eos 5d mark iii',
 'ENTITY#102': 'canon eos 5d mark ii',
 'ENTITY#16': 'nikon d90',
 'ENTITY#18': 'canon eos 60d',
 'ENTITY#19': 'nikon d3300',
 'ENTITY#21': 'nikon d5100',
 'ENTITY#23': 'canon eos 7d',
 'ENTITY#36': 'nikon d3100',
 'ENTITY#37': 'nikon d80',
 'ENTITY#41': 'nikon d5200',
 'ENTITY#44': 'nikon d3200',
 'ENTITY#57': 'nikon d800',
 'ENTITY#58': 'nikon 1 j1',
 'ENTITY#6': 'nikon d5300',
 'ENTITY#7': 'olympus omd em5',
 'ENTITY#75': 'nikon d7000',
 'ENTITY#76': 'nikon d610',
 'ENTITY#8': 'nikon 1 j3',
 'ENTITY#84': 'nikon d300',
 'ENTITY#96': 'canon eos 70d'}


## Load, process and save rankings to a file

In [7]:
import sys
import pandas as pd

# Put <root_dir>/<src_dir> on the module search path.
# NOTE(review): presumably this is where the `training` module lives - confirm.
sys.path.append(os.path.join(root_dir, src_dir))

from training import TrainingCorpus 

In [8]:
def preprocess_rankings(df):
    """Return a copy of ``df`` whose ``term`` column has been normalized.

    In every term, underscores are turned into single spaces and hyphens are
    removed. The frame passed in is left untouched.

    Args:
        df: DataFrame with a ``term`` column holding strings.

    Returns:
        A new DataFrame with the normalized ``term`` column; all other
        columns are copied as they are.
    """
    def _normalize(term):
        # Same order as two separate passes: underscores first, then hyphens.
        return term.replace('_', ' ').replace('-', '')

    cleaned = df.copy()
    cleaned['term'] = cleaned['term'].map(_normalize)
    return cleaned

In [9]:
def _rankings_path(filename):
    """Return the path of ``filename`` inside the rankings directory."""
    return os.path.join(root_dir, data_dir, rankings_dir, filename)


def _summary_terms_frame(summaries_df):
    """Split every summary into one ``(label, term)`` row per whitespace token.

    Rows keep the order in which the tokens occur in the summary, so a term's
    position in the result reflects its position in the text. The label of a
    row is the index value of the summary it came from.

    Cells that do not hold a string (an empty Excel cell is read back as NaN)
    contribute no rows instead of raising ``AttributeError`` on ``.split()``.
    """
    rows = []
    for label, summary in summaries_df['summary'].items():
        if not isinstance(summary, str):
            continue
        rows.extend((label, token) for token in summary.split())
    return pd.DataFrame(rows, columns=['label', 'term'])


def _top_terms_by_label(ranking_df, top_n):
    """Map each label to the first ``top_n`` values of its ``term`` column.

    Labels appear in the order ``groupby('label')`` yields them; the terms of
    a label keep their row order from ``ranking_df``.
    """
    return {
        label: group['term'].iloc[:top_n].values
        for label, group in ranking_df.groupby('label')
    }


def _terms_column(top_terms, label):
    """Return the terms stored for ``label`` as a Series (empty if absent)."""
    if label in top_terms:
        return pd.Series(top_terms[label])
    # A source without this label gives an empty column rather than a KeyError.
    return pd.Series([], dtype=object)


def save_top_terms(dataset_name, test_names, gt_dict, top_n):
    """Write one workbook per test comparing the top terms of each source.

    Four sources are read from the rankings directory:

    * ``ranking_<dataset_name>_baseline.xlsx`` (tf-idf baseline),
    * ``chunk_summary_<dataset_name>.xlsx`` (BART summaries, tokenized on
      whitespace so that terms are ordered by their position in the summary),
    * ``ranking_<dataset_name>_nn_<test_name>.xlsx``,
    * ``ranking_<dataset_name>_bert_<test_name>.xlsx``.

    For every test name a workbook ``top_n_<dataset_name>_<test_name>.xlsx``
    is written to the same directory. It holds one sheet per label of the
    tf-idf ranking, with the columns ``tfidf_term``, ``summary_term``,
    ``glove_term`` (terms of the ``nn`` ranking), ``bert_term`` and
    ``target``. A label that is missing from one of the other three sources
    produces an empty column for that source.

    Args:
        dataset_name: Dataset identifier interpolated into the file names.
        test_names: Iterable of test identifiers; one workbook is written
            for each.
        gt_dict: Mapping from label to its ground-truth string. The string
            is passed through ``TrainingCorpus.tokenize``, joined with
            spaces and lower-cased to build the ``target`` column.
        top_n: Number of leading terms kept per label and source.

    Raises:
        KeyError: If a label of the tf-idf ranking is not a key of
            ``gt_dict``.
    """
    # The tf-idf and BART inputs do not depend on the test, so their
    # per-label slices are computed once, outside the loop.
    tfidf_df = pd.read_excel(
        _rankings_path(f'ranking_{dataset_name}_baseline.xlsx'), index_col=0)
    tfidf_top = _top_terms_by_label(preprocess_rankings(tfidf_df), top_n)

    bart_summaries_df = pd.read_excel(
        _rankings_path(f'chunk_summary_{dataset_name}.xlsx'), index_col=0)
    bart_df = preprocess_rankings(_summary_terms_frame(bart_summaries_df))
    bart_top = _top_terms_by_label(bart_df, top_n)

    for test_name in test_names:
        # LSTM
        nn_df = pd.read_excel(
            _rankings_path(f'ranking_{dataset_name}_nn_{test_name}.xlsx'),
            index_col=0)
        nn_top = _top_terms_by_label(preprocess_rankings(nn_df), top_n)

        # distilBERT
        # NOTE(review): unlike the other rankings this file is read without
        # index_col=0 - confirm it is stored without an index column.
        bert_df = pd.read_excel(
            _rankings_path(f'ranking_{dataset_name}_bert_{test_name}.xlsx'))
        bert_top = _top_terms_by_label(preprocess_rankings(bert_df), top_n)

        out_filepath = _rankings_path(f'top_n_{dataset_name}_{test_name}.xlsx')

        with pd.ExcelWriter(out_filepath) as writer:
            # The tf-idf ranking decides which labels get a sheet.
            for label in tfidf_top:
                target = ' '.join(TrainingCorpus.tokenize(gt_dict[label])).lower()

                # Columns may differ in length; shorter ones are padded with
                # NaN, and the scalar target is repeated on every row.
                sheet_df = pd.DataFrame({
                    'tfidf_term': _terms_column(tfidf_top, label),
                    'summary_term': _terms_column(bart_top, label),
                    'glove_term': _terms_column(nn_top, label),
                    'bert_term': _terms_column(bert_top, label),
                    'target': target,
                })

                sheet_df.to_excel(writer, sheet_name=label, index=False)

In [10]:
# Test identifiers to export. Each one selects the inputs
# ranking_<dataset>_nn_<name>.xlsx and ranking_<dataset>_bert_<name>.xlsx
# and yields one top_n_<dataset>_<name>.xlsx workbook.
test_names = ['sampling_terms_test1', 
              'sampling_terms_test2', 
              'sampling_records_test1', 
              'sampling_records_test2', 
              'sampling_records_and_terms_test1']

In [11]:
# Number of leading terms kept per label from each ranking.
top_n = 30

In [12]:
# Write one top-N workbook per entry of test_names for the selected dataset.
save_top_terms(dataset_name, test_names, gt_dict, top_n)

---