# Define the ground truth labels

## Define filepaths

In [1]:
# Path fragments that later cells combine with os.path.join.
root_dir = '../..'  # base directory, relative to the notebook's working directory
data_dir = 'data/corpus'  # data folder under root_dir
alaska_dir = 'alaska_camera'  # NOTE(review): not referenced by any cell visible here — confirm it is still needed

---

## Load the dataset

In [2]:
import os
import pandas as pd

In [3]:
# Ground-truth workbook holding the entity_id/spec_id pairs; read from root_dir/data_dir.
filename = 'camera_entity_resolution_gt.xlsx'

In [4]:
# Build the workbook path once, load it into a DataFrame, and preview the first rows.
gt_workbook_path = os.path.join(root_dir, data_dir, filename)
alaska_camera_df = pd.read_excel(gt_workbook_path)
alaska_camera_df.head()

Unnamed: 0,"entity_id,spec_id"
0,"ENTITY#1,buy.net//4236"
1,"ENTITY#1,www.ebay.com//46670"
2,"ENTITY#10,www.pcconnection.com//12363"
3,"ENTITY#10,buy.net//6531"
4,"ENTITY#10,www.shopmania.in//698"


In [5]:
# (rows, columns) of the table as loaded, before the combined column is split.
alaska_camera_df.shape

(3865, 1)

In [6]:
# Split each 'entity_id,spec_id' string once, at the first comma only, so both
# parts come from a single vectorised pass and a spec id is never truncated at
# a later comma.
split_ids = alaska_camera_df['entity_id,spec_id'].str.split(',', n=1, expand=True)
# Fail loudly on rows that are missing or lack the separator instead of
# letting NaN values flow into the following cells.
if split_ids.shape[1] != 2 or split_ids.isna().any().any():
    raise ValueError("every row must have the form '<entity_id>,<spec_id>'")
alaska_camera_df['entity_id'] = split_ids[0]
alaska_camera_df['spec_id'] = split_ids[1]

In [7]:
# The combined column has been split into entity_id and spec_id; discard it
# and preview the resulting table.
alaska_camera_df = alaska_camera_df.drop('entity_id,spec_id', axis='columns')
alaska_camera_df.head()

Unnamed: 0,entity_id,spec_id
0,ENTITY#1,buy.net//4236
1,ENTITY#1,www.ebay.com//46670
2,ENTITY#10,www.pcconnection.com//12363
3,ENTITY#10,buy.net//6531
4,ENTITY#10,www.shopmania.in//698


---

## Filter selected entities

In [8]:
import sys

In [9]:
# Folder under root_dir that is appended to sys.path so the `training` module can be imported.
src_dir = 'src'

In [10]:
# Make the project's modules importable. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [11]:
from training import TrainingCorpus

In [12]:
# JSON file passed to TrainingCorpus.load(); resolved under root_dir/data_dir.
alaska_corpus_filename = 'alaska_corpus.json'

In [13]:
# Create a TrainingCorpus and call its load() with the corpus JSON path.
# NOTE(review): TrainingCorpus comes from the project's `training` module; what
# load() does with the file is not visible from this notebook — confirm there.
alaska_corpus = TrainingCorpus()
alaska_corpus.load(os.path.join(root_dir, data_dir, alaska_corpus_filename))

In [14]:
# Labels exposed by the corpus; the next cell uses them as the entity ids to keep.
selected_clusters_id = alaska_corpus.labels
selected_clusters_id

['ENTITY#6',
 'ENTITY#7',
 'ENTITY#8',
 'ENTITY#16',
 'ENTITY#18',
 'ENTITY#19',
 'ENTITY#21',
 'ENTITY#23',
 'ENTITY#36',
 'ENTITY#37',
 'ENTITY#41',
 'ENTITY#44',
 'ENTITY#57',
 'ENTITY#58',
 'ENTITY#75',
 'ENTITY#76',
 'ENTITY#84',
 'ENTITY#96',
 'ENTITY#101',
 'ENTITY#102']

In [15]:
# Keep only the rows whose entity_id is one of the selected clusters.
is_selected = alaska_camera_df['entity_id'].isin(selected_clusters_id)
alaska_camera_df = alaska_camera_df.loc[is_selected]
alaska_camera_df

Unnamed: 0,entity_id,spec_id
35,ENTITY#101,www.ebay.com//47984
36,ENTITY#101,www.ebay.com//55902
37,ENTITY#101,www.ebay.com//46638
38,ENTITY#101,www.ebay.com//43377
39,ENTITY#101,www.ebay.com//54031
...,...,...
3860,ENTITY#96,www.ebay.com//58771
3861,ENTITY#96,www.ebay.com//53048
3862,ENTITY#96,www.priceme.co.nz//1714
3863,ENTITY#96,www.ebay.com//48341


---

## Find GT labels for the selected clusters

In [16]:
# Folder under root_dir/data_dir from which retrieve_gt_label reads
# <source>/<id>.json spec files.
dir_name = 'alaska_camera_annotated'

In [17]:
import json

In [18]:
def retrieve_gt_label(spec_id, specs_dir=None):
    """Return the ground-truth model label stored in a spec's JSON file.

    Parameters
    ----------
    spec_id : str
        Identifier of the form ``'<source>//<id>'``, e.g. ``'buy.net//4236'``.
        The file read is ``<specs_dir>/<source>/<id>.json``.
    specs_dir : str, optional
        Directory holding one sub-folder per source. When omitted, it is
        built from the notebook-level ``root_dir``, ``data_dir`` and
        ``dir_name`` settings.

    Returns
    -------
    The value stored under the ``'__extracted_model'`` key, or ``None`` when
    the file does not contain that key.

    Raises
    ------
    ValueError
        If ``spec_id`` does not contain exactly one ``'//'`` separator.
    OSError
        If the spec file cannot be opened.
    """
    parts = spec_id.split('//')
    if len(parts) != 2:
        raise ValueError(
            "spec_id must look like '<source>//<id>', got {!r}".format(spec_id)
        )
    spec_src, spec_name = parts

    if specs_dir is None:
        specs_dir = os.path.join(root_dir, data_dir, dir_name)
    spec_filepath = os.path.join(specs_dir, spec_src, spec_name + '.json')

    # JSON is UTF-8 by specification; do not rely on the platform's default
    # text encoding when decoding the file.
    with open(spec_filepath, encoding='utf-8') as fd:
        spec_data = json.load(fd)

    if '__extracted_model' in spec_data:
        return spec_data['__extracted_model']
    return None

In [19]:
from collections import defaultdict

In [20]:
# Maps entity_id -> list of distinct labels collected for it; a missing key starts as an empty list.
entity_to_label_dict = defaultdict(list)

In [21]:
# Walk the selected rows and record every distinct, non-empty label found for
# each entity, in the order the rows appear.
for row in alaska_camera_df.itertuples(index=False):
    entity_id = row.entity_id
    entity_label = retrieve_gt_label(row.spec_id)
    if not entity_label:
        # Spec has no extracted model (or an empty one): nothing to record.
        continue
    known_labels = entity_to_label_dict[entity_id]
    if entity_label not in known_labels:
        known_labels.append(entity_label)


In [22]:
from pprint import pprint

In [23]:
# Show every label collected per entity (values are lists at this point).
pprint(entity_to_label_dict)

defaultdict(<class 'list'>,
            {'ENTITY#101': ['canon eos 5d mark iii', 'canon eos 5d'],
             'ENTITY#102': ['canon eos 5d mark ii', 'canon eos 5d'],
             'ENTITY#16': ['nikon d90'],
             'ENTITY#18': ['canon eos 60d'],
             'ENTITY#19': ['nikon d3300'],
             'ENTITY#21': ['nikon d5100'],
             'ENTITY#23': ['canon eos 7d'],
             'ENTITY#36': ['nikon d3100'],
             'ENTITY#37': ['nikon d80'],
             'ENTITY#41': ['nikon d5200'],
             'ENTITY#44': ['nikon d3200'],
             'ENTITY#57': ['nikon d800'],
             'ENTITY#58': ['nikon 1 j1'],
             'ENTITY#6': ['nikon d5300', 'nikon d800e'],
             'ENTITY#7': ['olympus omd em5'],
             'ENTITY#75': ['nikon d7000'],
             'ENTITY#76': ['nikon d610'],
             'ENTITY#8': ['nikon 1 j3'],
             'ENTITY#84': ['nikon d300'],
             'ENTITY#96': ['canon eos 70d', 'canon eos 7d']})


Keep only the first label collected for each entity; any additional labels are treated as noise.

In [24]:
# Collapse each entity's label list to its first element. The isinstance guard
# keeps the cell idempotent: on a re-run the values are already plain strings,
# and indexing them with [0] would reduce each label to its first character.
for e_id, labels in entity_to_label_dict.items():
    if isinstance(labels, list):
        entity_to_label_dict[e_id] = labels[0]

In [25]:
# Show the single label kept per entity (values are plain strings now).
pprint(entity_to_label_dict)

defaultdict(<class 'list'>,
            {'ENTITY#101': 'canon eos 5d mark iii',
             'ENTITY#102': 'canon eos 5d mark ii',
             'ENTITY#16': 'nikon d90',
             'ENTITY#18': 'canon eos 60d',
             'ENTITY#19': 'nikon d3300',
             'ENTITY#21': 'nikon d5100',
             'ENTITY#23': 'canon eos 7d',
             'ENTITY#36': 'nikon d3100',
             'ENTITY#37': 'nikon d80',
             'ENTITY#41': 'nikon d5200',
             'ENTITY#44': 'nikon d3200',
             'ENTITY#57': 'nikon d800',
             'ENTITY#58': 'nikon 1 j1',
             'ENTITY#6': 'nikon d5300',
             'ENTITY#7': 'olympus omd em5',
             'ENTITY#75': 'nikon d7000',
             'ENTITY#76': 'nikon d610',
             'ENTITY#8': 'nikon 1 j3',
             'ENTITY#84': 'nikon d300',
             'ENTITY#96': 'canon eos 70d'})


## Save GT to a file

In [26]:
# Output file for the entity -> label mapping, written under root_dir/data_dir.
gt_filename = 'alaska_camera_gt.json'
gt_filepath = os.path.join(root_dir, data_dir, gt_filename)

In [27]:
# Serialise the entity -> label mapping as JSON; mode 'w' overwrites any existing file.
with open(gt_filepath, 'w') as fd:
    json.dump(entity_to_label_dict, fd)

---