# Evaluation

## Load data

In [1]:
# Path components; later cells combine them with os.path.join to locate the input files.
root_dir = '../..'
data_dir = 'data'
# Sub-directory (under data_dir) that holds the ranking spreadsheets.
rank_dir = 'terms'
# Sub-directory (under data_dir) that holds the ground-truth label file.
gt_dir = 'corpus'

### Load ground-truth (GT) labels

In [2]:
import os
import json

In [3]:
# Ground-truth file: maps each entity id to the list of label strings accepted for it.
gt_filename = 'alaska_camera_gt.json'
gt_filepath = os.path.join(root_dir, data_dir, gt_dir, gt_filename)

In [4]:
# Read the ground-truth mapping (entity id -> list of accepted label strings).
# The encoding is pinned explicitly: JSON interchange files are UTF-8, whereas
# open() otherwise falls back to a platform-dependent default encoding.
with open(gt_filepath, 'r', encoding='utf-8') as fd:
    entity_to_label_dict = json.load(fd)

In [5]:
from pprint import pprint

In [6]:
# Show the whole mapping; note that some entities list several accepted label strings.
pprint(entity_to_label_dict)

{'ENTITY#101': ['canon eos 5d mark iii'],
 'ENTITY#102': ['canon eos 5d mark ii'],
 'ENTITY#16': ['nikon d90'],
 'ENTITY#18': ['canon eos 60d'],
 'ENTITY#19': ['nikon d3300'],
 'ENTITY#21': ['nikon d5100'],
 'ENTITY#23': ['canon eos 7d'],
 'ENTITY#36': ['nikon d3100'],
 'ENTITY#37': ['nikon d80'],
 'ENTITY#41': ['nikon d5200'],
 'ENTITY#44': ['nikon d3200'],
 'ENTITY#57': ['nikon d800'],
 'ENTITY#58': ['nikon 1 j1'],
 'ENTITY#6': ['nikon d5300', 'nikon d800e'],
 'ENTITY#7': ['olympus omd em5',
              'olympus om-d em5',
              'olympus om-d e-m5',
              'olympus omd e-m5'],
 'ENTITY#75': ['nikon d7000'],
 'ENTITY#76': ['nikon d610'],
 'ENTITY#8': ['nikon 1 j3'],
 'ENTITY#84': ['nikon d300'],
 'ENTITY#96': ['canon eos 70d', 'canon eos 7d']}


### Load rankings

#### LIME

In [7]:
import pandas as pd

In [8]:
# Spreadsheet with the LIME-based term ranking (columns: label, term, weight, rank, count).
lime_filename = 'ranking_alaska_nn.xlsx'
lime_filepath = os.path.join(root_dir, data_dir, rank_dir, lime_filename)

In [9]:
# Load the LIME ranking sheet; the first spreadsheet column is used as the index.
lime_df = pd.read_excel(lime_filepath, index_col=0)
# Swap every '_' inside the terms for a space (the ground-truth labels are
# written with spaces between tokens).
lime_df = lime_df.assign(
    term=lime_df['term'].map(lambda raw_term: raw_term.replace('_', ' '))
)
lime_df.head()

Unnamed: 0,label,term,weight,rank,count
0,ENTITY#101,5d mark iii,38.21049,1.0,40
1,ENTITY#101,5d mark,10.682544,1.0,14
2,ENTITY#101,canon eos 5d mark iii 22 3 mp full frame,3.825436,1.0,4
3,ENTITY#101,canon eos 5d mark iii body,3.609619,1.0,4
4,ENTITY#101,5d mark iii body,2.965371,1.0,3


In [10]:
# (rows, columns) of the LIME ranking table.
lime_df.shape

(2039, 5)

#### Baseline

In [11]:
# Spreadsheet with the baseline term ranking (columns: label, term).
baseline_filename = 'ranking_alaska_baseline.xlsx'
baseline_filepath = os.path.join(root_dir, data_dir, rank_dir, baseline_filename)

In [12]:
# Load the baseline ranking sheet; the first spreadsheet column is used as the index.
baseline_df = pd.read_excel(baseline_filepath, index_col=0)
# Swap every '_' inside the terms for a space (the ground-truth labels are
# written with spaces between tokens).
baseline_df = baseline_df.assign(
    term=baseline_df['term'].map(lambda raw_term: raw_term.replace('_', ' '))
)
baseline_df.head()

Unnamed: 0,label,term
0,ENTITY#44,d3200
1,ENTITY#44,nikon d3200
2,ENTITY#44,18
3,ENTITY#44,nikon
4,ENTITY#44,24 2 mp digital slr camera black kit


In [13]:
# (rows, columns) of the baseline ranking table.
baseline_df.shape

(55360, 2)

---

## Compute HIT@n

In [14]:
import numpy as np

In [21]:
# Number of distinct entity labels that appear in the LIME ranking.
num_classes = len(lime_df['label'].unique())
# Cut-off: only the first n terms of each label's ranking are inspected.
n = 100

In [16]:
def hit_at_n(entity_id: str,
             df: pd.DataFrame,
             entity_to_label_dict: dict,
             n: int):
    """Build the cumulative hit indicator for one entity's ranked terms.

    The first ``n`` values of ``df['term']`` are scanned in row order
    (presumably already sorted best-first -- confirm with the caller) until
    one of them contains, as a substring, any of the ground-truth labels
    stored under ``entity_id`` in ``entity_to_label_dict``.

    Parameters
    ----------
    entity_id : key used to look up the ground-truth labels.
    df : frame with a ``'term'`` column of strings.
    entity_to_label_dict : maps entity ids to a list of accepted label strings.
    n : number of leading rows to inspect; also the length of the result.

    Returns
    -------
    numpy.ndarray of length ``n`` holding 0.0 before the position of the
    first matching term and 1.0 from that position onwards. It is all zeros
    when none of the inspected terms matches.

    Raises
    ------
    KeyError if ``entity_id`` is missing from ``entity_to_label_dict``.

    Notes
    -----
    Matching is plain substring containment, so a label such as
    ``'nikon d80'`` also matches a term containing ``'nikon d800'``.
    """
    candidates = df.head(n)['term'].tolist()
    labels = entity_to_label_dict[entity_id]

    hits = np.zeros(n)
    for position, candidate in enumerate(candidates):
        if any(label in candidate for label in labels):
            # Once a label is found at this rank, every larger cut-off is a hit too.
            hits[position:] = 1
            break

    return hits

### LIME-based

In [22]:
# Score each label's LIME ranking separately: every group holds the rows of one
# label, and grp.name (the group key, i.e. the label) is what hit_at_n uses to
# look up the ground-truth terms. The result has one hit vector per label.
hits_lime = lime_df.groupby('label').apply(lambda grp: hit_at_n(grp.name,
                                                                grp,
                                                                entity_to_label_dict,n))

### Baseline (TF-IDF-based)

In [23]:
# Score each label's baseline ranking separately: every group holds the rows of
# one label, and grp.name (the group key, i.e. the label) is what hit_at_n uses
# to look up the ground-truth terms. The result has one hit vector per label.
hits_baseline = baseline_df.groupby('label').apply(lambda grp: hit_at_n(grp.name,
                                                                        grp,
                                                                        entity_to_label_dict,n))

### Plot hits

In [24]:
import plotly.graph_objects as go

# Turn the per-label hit vectors into "percentage of labels with a hit within
# the first k terms". Each ranking is normalised by its OWN number of labels:
# dividing the baseline by the label count of the LIME ranking gives a wrong
# percentage (possibly above 100%) whenever the two rankings do not cover the
# same set of labels.
hits_lime_sum = hits_lime.sum(axis=0) / len(hits_lime) * 100
hits_baseline_sum = hits_baseline.sum(axis=0) / len(hits_baseline) * 100
# Cut-offs 1..n on the x axis (position k of a hit vector corresponds to HIT@k+1).
x_values = np.arange(1, hits_lime_sum.shape[0] + 1)

fig = go.Figure()

fig.add_trace(go.Scatter(x=x_values,
                         y=hits_lime_sum,
                         mode='markers+lines',
                         name='LIME hits'))
fig.add_trace(go.Scatter(x=x_values,
                         y=hits_baseline_sum,
                         mode='markers+lines',
                         name='Baseline hits'))

fig.update_layout(xaxis_title='N',yaxis_title='Hits (%)')
fig.show()

---