# Evaluation
Evaluate our ranked lists of terms against the ground-truth labels of each entity.

---

## Load data

In [1]:
# Path components used by the cells below; the full paths are built with
# os.path.join(root_dir, data_dir, <sub-folder>, <file name>).
root_dir = '../..'  # base directory, relative to the working directory
data_dir = 'data'   # data folder under root_dir
gt_dir = 'corpus'   # sub-folder of data_dir holding the ground-truth file
rank_dir = 'terms'  # sub-folder of data_dir holding the ranking file

### Load ground truth labels

In [2]:
import os
import json

In [3]:
# Ground-truth file: <root_dir>/<data_dir>/<gt_dir>/<gt_filename>
gt_filename = 'alaska_camera_gt.json'
gt_dir_path = os.path.join(root_dir, data_dir, gt_dir)
gt_filepath = os.path.join(gt_dir_path, gt_filename)

In [4]:
# Decode the file explicitly as UTF-8 (the standard encoding for JSON) instead
# of relying on the platform's default text encoding.
# The result maps each entity id to the list of its accepted labels.
with open(gt_filepath, 'r', encoding='utf-8') as fd:
    entity_to_label_dict = json.load(fd)

In [5]:
from pprint import pprint

In [6]:
# Show the ground truth: entity id -> list of accepted labels.
pprint(entity_to_label_dict)

{'ENTITY#101': ['canon eos 5d mark iii'],
 'ENTITY#102': ['canon eos 5d mark ii'],
 'ENTITY#16': ['nikon d90'],
 'ENTITY#18': ['canon eos 60d'],
 'ENTITY#19': ['nikon d3300'],
 'ENTITY#21': ['nikon d5100'],
 'ENTITY#23': ['canon eos 7d'],
 'ENTITY#36': ['nikon d3100'],
 'ENTITY#37': ['nikon d80'],
 'ENTITY#41': ['nikon d5200'],
 'ENTITY#44': ['nikon d3200'],
 'ENTITY#57': ['nikon d800'],
 'ENTITY#58': ['nikon 1 j1'],
 'ENTITY#6': ['nikon d5300', 'nikon d800e'],
 'ENTITY#7': ['olympus omd em5',
              'olympus om-d em5',
              'olympus om-d e-m5',
              'olympus omd e-m5'],
 'ENTITY#75': ['nikon d7000'],
 'ENTITY#76': ['nikon d610'],
 'ENTITY#8': ['nikon 1 j3'],
 'ENTITY#84': ['nikon d300'],
 'ENTITY#96': ['canon eos 70d', 'canon eos 7d']}


### Load ranking

In [7]:
import pandas as pd

In [8]:
# Ranking file: <root_dir>/<data_dir>/<rank_dir>/<ranking_filename>
ranking_filename = 'ranking_alaska_nn.xlsx'
ranking_dir_path = os.path.join(root_dir, data_dir, rank_dir)
ranking_filepath = os.path.join(ranking_dir_path, ranking_filename)

In [9]:
# Read the ranking spreadsheet, taking its first column as the row index,
# and preview the first five rows.
alaska_ranking_df = pd.read_excel(io=ranking_filepath, index_col=0)
alaska_ranking_df.head(n=5)

Unnamed: 0,label,term,weight,rank,count
0,ENTITY#101,5d_mark_iii,38.21049,1.0,40
1,ENTITY#101,5d_mark,10.682544,1.0,14
2,ENTITY#101,canon_eos_5d_mark_iii_22_3_mp_full_frame,3.825436,1.0,4
3,ENTITY#101,canon_eos_5d_mark_iii_body,3.609619,1.0,4
4,ENTITY#101,5d_mark_iii_body,2.965371,1.0,3


In [10]:
# (number of rows, number of columns) of the ranking table
alaska_ranking_df.shape

(2039, 5)

Replace every '_' character in the terms with a space

In [11]:
def _underscores_to_spaces(term):
    """Return `term` with every '_' replaced by a single space."""
    return term.replace('_', ' ')


# The ground-truth labels separate tokens with spaces, so bring the ranked
# terms into the same form before comparing them.
alaska_ranking_df['term'] = alaska_ranking_df['term'].map(_underscores_to_spaces)
alaska_ranking_df.head()

Unnamed: 0,label,term,weight,rank,count
0,ENTITY#101,5d mark iii,38.21049,1.0,40
1,ENTITY#101,5d mark,10.682544,1.0,14
2,ENTITY#101,canon eos 5d mark iii 22 3 mp full frame,3.825436,1.0,4
3,ENTITY#101,canon eos 5d mark iii body,3.609619,1.0,4
4,ENTITY#101,5d mark iii body,2.965371,1.0,3


---

## Mean rank
For each label (i.e. entity id), find the rank of the first term that contains one of the entity's ground-truth labels, then average these ranks over all entities.

In [12]:
import re


def first_match_rank(terms, labels):
    """Return the 1-based position of the first term that contains a label.

    A label only counts when it is not directly preceded or followed by
    another word character, so the label 'nikon d80' matches the term
    'nikon d80 body' but not 'nikon d800'.

    Parameters
    ----------
    terms : iterable of str
        Candidate terms, best-ranked first.
    labels : iterable of str
        Ground-truth labels accepted for the entity.

    Returns
    -------
    int or None
        Rank (starting at 1) of the first matching term, or None when no
        term contains any of the labels.
    """
    # re.escape keeps characters such as '-' in a label literal.
    label_patterns = [
        re.compile(r'(?<!\w)' + re.escape(label) + r'(?!\w)')
        for label in labels
    ]
    for rank, term in enumerate(terms, start=1):
        if any(pattern.search(term) for pattern in label_patterns):
            return rank
    return None


label_to_rank_dict = {}
# Entities whose ranking contains no ground-truth label. They are kept out of
# label_to_rank_dict, so any statistic computed from that dict ignores them.
unmatched_entity_ids = []

# groupby keeps the rows of each group in their original order, and that
# order is what is used as the ranking of an entity's terms.
for entity_id, group_df in alaska_ranking_df.groupby('label'):
    entity_labels = entity_to_label_dict[entity_id]
    rank = first_match_rank(group_df['term'], entity_labels)
    if rank is None:
        unmatched_entity_ids.append(entity_id)
    else:
        label_to_rank_dict[entity_id] = rank

In [13]:
# Show, per entity id, the rank of the first term matching its ground truth.
pprint(label_to_rank_dict)

{'ENTITY#101': 3,
 'ENTITY#102': 3,
 'ENTITY#16': 1,
 'ENTITY#18': 1,
 'ENTITY#19': 1,
 'ENTITY#21': 1,
 'ENTITY#23': 1,
 'ENTITY#36': 2,
 'ENTITY#37': 1,
 'ENTITY#41': 1,
 'ENTITY#44': 1,
 'ENTITY#57': 1,
 'ENTITY#58': 4,
 'ENTITY#6': 1,
 'ENTITY#7': 2,
 'ENTITY#75': 1,
 'ENTITY#76': 1,
 'ENTITY#8': 6,
 'ENTITY#84': 1,
 'ENTITY#96': 1}


In [14]:
# Mean first-match rank over the entities present in label_to_rank_dict
# (entities for which no matching term was found are not in the dict).
first_match_ranks = list(label_to_rank_dict.values())
sum(first_match_ranks) / len(first_match_ranks)

1.7

In [16]:
# Look at the ten best-ranked terms of ENTITY#8 to see which terms precede
# its first ground-truth match.
rankings_by_entity = alaska_ranking_df.groupby('label')
rankings_by_entity.get_group('ENTITY#8').head(10)

Unnamed: 0,label,term,weight,rank,count
1833,ENTITY#8,1 j3,21.505273,1.0,28
1834,ENTITY#8,nikon j3 interchangable lens digital camera,5.700558,1.0,8
1835,ENTITY#8,j3,2.917831,1.0,7
1836,ENTITY#8,1 j3 digital camera silver,0.947486,1.0,1
1837,ENTITY#8,1 j3 digital camera beige vvk185xh,0.945133,1.0,1
1838,ENTITY#8,nikon 1 j3 14 2 mp hd digital camera system,0.925174,1.0,1
1839,ENTITY#8,new boxed nikon 1 j3 camera body,0.914038,1.0,1
1840,ENTITY#8,nikon 1 j3 digital camera vr,0.890436,1.0,1
1841,ENTITY#8,nikon j3 digital camera,0.855032,1.0,1
1842,ENTITY#8,nikon 1 j3 beige digital camera kit,0.813688,1.0,1


---