# Alaska evaluation

## Define filepaths

In [1]:
# Project root, given relative to the directory the notebook is run from.
root_dir = '../..'
# Folder names joined below `root_dir`: the data folder, then the sub-folder
# used for the ground truth and the one used for ranking files.
data_dir, corpus_dir, rank_dir = 'data', 'corpus', 'terms'

## Load ground truth

In [2]:
import os
import json

In [3]:
# Ground-truth file: <root_dir>/<data_dir>/<corpus_dir>/<gt_filename>.
gt_filename = 'alaska_camera_gt.json'
gt_filepath = os.path.join(
    os.path.join(root_dir, data_dir, corpus_dir),
    gt_filename,
)

In [4]:
# Parse the ground-truth JSON into `gt_dict`.
# The encoding is set explicitly: without it, open() decodes with the
# platform's locale encoding, which is not UTF-8 on every system.
with open(gt_filepath, 'r', encoding='utf-8') as fd:
    gt_dict = json.load(fd)

In [5]:
from pprint import pprint

In [6]:
# Display the loaded ground truth (entity label -> product name).
pprint(gt_dict)

{'ENTITY#101': 'canon eos 5d mark iii',
 'ENTITY#102': 'canon eos 5d mark ii',
 'ENTITY#16': 'nikon d90',
 'ENTITY#18': 'canon eos 60d',
 'ENTITY#19': 'nikon d3300',
 'ENTITY#21': 'nikon d5100',
 'ENTITY#23': 'canon eos 7d',
 'ENTITY#36': 'nikon d3100',
 'ENTITY#37': 'nikon d80',
 'ENTITY#41': 'nikon d5200',
 'ENTITY#44': 'nikon d3200',
 'ENTITY#57': 'nikon d800',
 'ENTITY#58': 'nikon 1 j1',
 'ENTITY#6': 'nikon d5300',
 'ENTITY#7': 'olympus omd em5',
 'ENTITY#75': 'nikon d7000',
 'ENTITY#76': 'nikon d610',
 'ENTITY#8': 'nikon 1 j3',
 'ENTITY#84': 'nikon d300',
 'ENTITY#96': 'canon eos 70d'}


## Build noisy ground truth labels

In [7]:
# For each label that is made noisy: the other entities whose names are
# appended to it. None of these donor entities is itself a noisy label.
noisy_merges = {
    "ENTITY#44": ["ENTITY#7", "ENTITY#102"],
    "ENTITY#23": ["ENTITY#8"],
    "ENTITY#18": ["ENTITY#84"],
    "ENTITY#41": ["ENTITY#21", "ENTITY#75"],
    "ENTITY#76": ["ENTITY#58", "ENTITY#19"],
}

for noisy_label, extra_labels in noisy_merges.items():
    # Start from the label's own name, i.e. the part before the first ', '.
    # On a fresh `gt_dict` that is the whole value; after a previous run of
    # this cell it strips the names already appended, so re-running the cell
    # rebuilds the same noisy value instead of appending the extras again.
    clean_name = gt_dict[noisy_label].split(', ')[0]
    gt_dict[noisy_label] = ', '.join(
        [clean_name] + [gt_dict[extra] for extra in extra_labels]
    )

# Labels whose ground truth now lists several products, in definition order.
noisy_labels = list(noisy_merges)

In [8]:
# Display the ground truth again, now with the noisy (multi-product) labels.
pprint(gt_dict)

{'ENTITY#101': 'canon eos 5d mark iii',
 'ENTITY#102': 'canon eos 5d mark ii',
 'ENTITY#16': 'nikon d90',
 'ENTITY#18': 'canon eos 60d, nikon d300',
 'ENTITY#19': 'nikon d3300',
 'ENTITY#21': 'nikon d5100',
 'ENTITY#23': 'canon eos 7d, nikon 1 j3',
 'ENTITY#36': 'nikon d3100',
 'ENTITY#37': 'nikon d80',
 'ENTITY#41': 'nikon d5200, nikon d5100, nikon d7000',
 'ENTITY#44': 'nikon d3200, olympus omd em5, canon eos 5d mark ii',
 'ENTITY#57': 'nikon d800',
 'ENTITY#58': 'nikon 1 j1',
 'ENTITY#6': 'nikon d5300',
 'ENTITY#7': 'olympus omd em5',
 'ENTITY#75': 'nikon d7000',
 'ENTITY#76': 'nikon d610, nikon 1 j1, nikon d3300',
 'ENTITY#8': 'nikon 1 j3',
 'ENTITY#84': 'nikon d300',
 'ENTITY#96': 'canon eos 70d'}


## Load rankings

### LIME on GloVe-based neural network

In [9]:
import pandas as pd

In [10]:
# Ranking file for the GloVe-based network: <root_dir>/<data_dir>/<rank_dir>/<file>.
glove_nn_rank_filename = 'ranking_alaska_nn_noisy.xlsx'
glove_nn_rank_filepath = os.path.join(
    os.path.join(root_dir, data_dir, rank_dir),
    glove_nn_rank_filename,
)

In [11]:
# Load the ranking, using the first spreadsheet column as the index.
glove_nn_rank_df = pd.read_excel(glove_nn_rank_filepath, index_col=0)
# Normalise terms: underscores become spaces and hyphens are removed.
glove_nn_rank_df['term'] = glove_nn_rank_df['term'].map(
    lambda term: term.replace('_', ' ').replace('-', '')
)
glove_nn_rank_df.head()

Unnamed: 0,label,term,weight
0,ENTITY#101,5d mark iii,37.501637
1,ENTITY#101,5d mark,9.793319
2,ENTITY#101,canon eos,9.270045
3,ENTITY#101,canon eos 5d mark iii 22 3 mp full frame,3.834493
4,ENTITY#101,canon eos 5d mark iii body,3.660367


In [12]:
# (rows, columns) of the GloVe ranking frame.
glove_nn_rank_df.shape

(1905, 3)

### LIME on DistilBERT

In [13]:
# Ranking file for DistilBERT: <root_dir>/<data_dir>/<rank_dir>/<file>.
bert_rank_filename = 'ranking_alaska_bert_noisy.xlsx'
bert_rank_filepath = os.path.join(
    os.path.join(root_dir, data_dir, rank_dir),
    bert_rank_filename,
)

In [14]:
# Load the ranking, using the first spreadsheet column as the index.
bert_rank_df = pd.read_excel(bert_rank_filepath, index_col=0)
# Normalise terms: underscores become spaces and hyphens are removed.
bert_rank_df['term'] = bert_rank_df['term'].map(
    lambda term: term.replace('_', ' ').replace('-', '')
)
bert_rank_df.head()

Unnamed: 0,label,term,weight
0,ENTITY#101,5d mark iii,36.710231
1,ENTITY#101,5d mark,10.632592
2,ENTITY#101,canon eos 5d mark iii 22 3 mp full frame,3.78872
3,ENTITY#101,canon eos 5d mark iii body,3.580491
4,ENTITY#101,5d mark iii dslr camera,2.824653


In [15]:
# (rows, columns) of the DistilBERT ranking frame.
bert_rank_df.shape

(974, 3)

### TF-IDF baseline

In [16]:
# Ranking file for the TF-IDF baseline: <root_dir>/<data_dir>/<rank_dir>/<file>.
tfidf_rank_filename = 'ranking_alaska_baseline_noisy.xlsx'
tfidf_rank_filepath = os.path.join(
    os.path.join(root_dir, data_dir, rank_dir),
    tfidf_rank_filename,
)

In [17]:
# Load the ranking, using the first spreadsheet column as the index.
tfidf_rank_df = pd.read_excel(tfidf_rank_filepath, index_col=0)
# Normalise terms: underscores become spaces and hyphens are removed.
tfidf_rank_df['term'] = tfidf_rank_df['term'].map(
    lambda term: term.replace('_', ' ').replace('-', '')
)
tfidf_rank_df.head()

Unnamed: 0,label,term
0,ENTITY#44,canon eos 5d
1,ENTITY#44,d3200
2,ENTITY#44,mark
3,ENTITY#44,ii
4,ENTITY#44,nikon d3200


In [18]:
# (rows, columns) of the TF-IDF ranking frame.
tfidf_rank_df.shape

(33216, 2)

### BART baseline

Load summaries

In [19]:
# Summaries file: <root_dir>/<data_dir>/<rank_dir>/<file>.
summaries_filename = 'chunk_summary_alaska_noisy.xlsx'
summaries_filepath = os.path.join(
    os.path.join(root_dir, data_dir, rank_dir),
    summaries_filename,
)

In [20]:
# Load the summaries; the first spreadsheet column becomes the index.
summaries_df = pd.read_excel(io=summaries_filepath, index_col=0)
summaries_df.head(5)

Unnamed: 0_level_0,summary
entity,Unnamed: 1_level_1
ENTITY#44,nikon_d3200_digital_slr_camera 24_2_megapixels...
ENTITY#23,ebay_canon_eos_7d sale 734 39. camera 28_135mm...
ENTITY#18,camerafarm_australia canon eos 60d 18 1x optic...
ENTITY#36,ebay_nikon_d3100 18-55/3_5-5_6_vr 55-300/4 5 5...
ENTITY#41,ebay buy nikon_d5200_digital_slr_camera black ...


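Rank terms by their position: each summary is split on whitespace, and the resulting terms keep the order in which they appear in the summary.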

In [21]:
# Accumulator for (label, term) pairs extracted from the summaries.
summary_rank_list = list()

In [22]:
# Flatten every summary into (label, term) pairs: the label is the row index
# and the terms are the whitespace-separated tokens of the summary, kept in
# their original order.
# The list is rebuilt from scratch here rather than appended to, so running
# this cell more than once does not duplicate the entries.
summary_rank_list = [
    (label, term)
    for label, row in summaries_df.iterrows()
    for term in row['summary'].split()
]

In [23]:
# Turn the (label, term) pairs into a two-column frame.
summary_rank_df = pd.DataFrame(data=summary_rank_list, columns=['label', 'term'])
# Normalise terms: underscores become spaces and hyphens are removed.
summary_rank_df['term'] = summary_rank_df['term'].map(
    lambda term: term.replace('_', ' ').replace('-', '')
)
summary_rank_df.head()

Unnamed: 0,label,term
0,ENTITY#44,nikon d3200 digital slr camera
1,ENTITY#44,24 2 megapixels
2,ENTITY#44,less
3,ENTITY#44,walmart
4,ENTITY#44,com


In [24]:
# (rows, columns) of the summary-based ranking frame.
summary_rank_df.shape

(176, 2)

Group every ranking by entity label, so that the terms of a single label can be retrieved directly.

In [25]:
# Group each ranking frame by its 'label' column, so the terms of a single
# label can be fetched with `get_group`.
grouped_tfidf_df, grouped_summary_df, grouped_glove_df, grouped_bert_df = (
    rank_df.groupby('label')
    for rank_df in (tfidf_rank_df, summary_rank_df, glove_nn_rank_df, bert_rank_df)
)

---

## Write top N terms to file

In [26]:
# Output workbook: <root_dir>/<data_dir>/<rank_dir>/<filename>.
filename = 'top_n_alaska_noisy.xlsx'
filepath = os.path.join(
    os.path.join(root_dir, data_dir, rank_dir),
    filename,
)

In [27]:
# Maximum number of terms taken per method for each label when writing the file.
top_n = 30

In [28]:
# Output column name -> grouped ranking that supplies its terms.
# The dict order fixes the column order in the workbook.
grouped_by_column = {
    'tfidf_term': grouped_tfidf_df,
    'summary_term': grouped_summary_df,
    'glove_term': grouped_glove_df,
    'bert_term': grouped_bert_df,
}

# Write one sheet per noisy label, named after the label.
with pd.ExcelWriter(filepath) as writer:
    for group_name in noisy_labels:
        # First `top_n` terms of this label for every method, in the order the
        # rows appear in each frame. The values are re-wrapped in a fresh
        # Series so all columns share a 0..k index and shorter ones are padded.
        columns = {
            column: pd.Series(grouped.get_group(group_name)['term'][:top_n].values)
            for column, grouped in grouped_by_column.items()
        }
        # The (noisy) ground-truth string is repeated on every row.
        target = gt_dict[group_name]
        columns['target'] = target

        df = pd.DataFrame(columns)
        df.to_excel(writer, sheet_name=group_name, index=False)

---