###  use abt-buy dataset as evaluation

In [1]:
import rltk
import csv
import pandas as pd
import numpy as np

# You can use this tokenizer in case you need to manipulate some data
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [87]:
'''
Feel free to add more columns here for use in record linkage.
'''

class Abt(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def name_string(self):
        return self.raw_object['name']
    
    @rltk.cached_property
    def name_tokens(self):
        return set(tokenizer.tokenize(self.name_string))


class Buy(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def name_string(self):
        return self.raw_object['name']
    
    @rltk.cached_property
    def name_tokens(self):
        return set(tokenizer.tokenize(self.name_string))
    

In [89]:
dir_ = '../data/evaluation_datasets/Abt-Buy/'
Abt_file = dir_ + 'Abt.csv'
Buy_file = dir_ + 'Buy.csv'

ds1 = rltk.Dataset(rltk.CSVReader(open(Abt_file,)),record_class=Abt)
ds2 = rltk.Dataset(rltk.CSVReader(open(Buy_file, encoding ='utf-8')),record_class=Buy)

You can load your csv files into RLTK using this method:

And we can inspect a few entries:

In [90]:
# print some entries
print(ds1.generate_dataframe().head(5))
print(ds2.generate_dataframe().head(5))

     id                                        name_string  \
0   552                          Sony Turntable - PSLX350H   
1   580  Bose Acoustimass 5 Series III Speaker System -...   
2  4696                             Sony Switcher - SBV40S   
3  5644                   Sony 5 Disc CD Player - CDPCE375   
4  6284  Bose 27028 161 Bookshelf Pair Speakers In Whit...   

                                         name_tokens  
0                     {-, PSLX350H, Turntable, Sony}  
1  {Series, System, Speaker, 5, Acoustimass, III,...  
2                        {-, SBV40S, Switcher, Sony}  
3           {5, CDPCE375, CD, -, Player, Sony, Disc}  
4  {161WH, Pair, -, 27028, Bookshelf, Speakers, I...  
         id                                        name_string  \
0  10011646  Linksys EtherFast EZXS88W Ethernet Switch - EZ...   
1  10140760          Linksys EtherFast EZXS55W Ethernet Switch   
2  10221960    Netgear ProSafe FS105 Ethernet Switch - FS105NA   
3  10246269  Belkin Pro Series Hi

#### make labeled groundtruth

In [91]:
dev_set_file = dir_ + 'abt_buy_perfectMapping.csv'
dev = []
with open(dev_set_file, encoding='utf-8', errors="replace") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if len(row) <= 1:
            continue
        if line_count == 0:
            columns = row
            line_count += 1
        else:
            dev.append(row)
    print(f'Column names are: {", ".join(columns)}')
    print(f'Processed {len(dev)} lines.')

dev_labeled = []
for instance in dev:
    instance_labeled = instance
    instance_labeled.append(1)
    dev_labeled.append(instance_labeled)
dev_labeled

Column names are: idAbt, idBuy
Processed 1097 lines.


[['38477', '10011646', 1],
 ['38475', '10140760', 1],
 ['33053', '10221960', 1],
 ['27248', '10246269', 1],
 ['25262', '10315184', 1],
 ['36260', '10316920', 1],
 ['35810', '10326220', 1],
 ['32034', '10333368', 1],
 ['38473', '10333846', 1],
 ['23686', '10333848', 1],
 ['38474', '10343605', 1],
 ['33377', '10346525', 1],
 ['27479', '10351869', 1],
 ['37606', '10353206', 1],
 ['27975', '10360978', 1],
 ['36207', '10363070', 1],
 ['17372', '10363243', 1],
 ['39179', '10365096', 1],
 ['21047', '10373002', 1],
 ['36258', '10376165', 1],
 ['20448', '10377226', 1],
 ['13202', '10379358', 1],
 ['39176', '10380060', 1],
 ['32579', '10388980', 1],
 ['22558', '10412590', 1],
 ['21175', '10412788', 1],
 ['26461', '200924629', 1],
 ['23296', '201647442', 1],
 ['20454', '201692648', 1],
 ['20450', '201692660', 1],
 ['33047', '201692665', 1],
 ['20453', '201692677', 1],
 ['20461', '201692679', 1],
 ['20466', '201692695', 1],
 ['26617', '201692705', 1],
 ['35467', '201699873', 1],
 ['21875', '201700

In [92]:
Abt_df = ds1.generate_dataframe()
Buy_df = ds2.generate_dataframe()

Abt_id = list(Abt_df['id'])
Buy_id = list(Buy_df['id'])
# print(Abt_id,Buy_id)

In [93]:
from random import sample

for abtid in Abt_id:
    buyids = sample(Buy_id, 2)
    if [abtid, buyids[0]] not in dev:
        dev_labeled.append([abtid, buyids[0], 0])
    if [abtid, buyids[1]] not in dev:
        dev_labeled.append([abtid, buyids[1], 0])

In [94]:
print(len(dev_labeled))
print(dev_labeled)

3259
[['38477', '10011646', 1], ['38475', '10140760', 1], ['33053', '10221960', 1], ['27248', '10246269', 1], ['25262', '10315184', 1], ['36260', '10316920', 1], ['35810', '10326220', 1], ['32034', '10333368', 1], ['38473', '10333846', 1], ['23686', '10333848', 1], ['38474', '10343605', 1], ['33377', '10346525', 1], ['27479', '10351869', 1], ['37606', '10353206', 1], ['27975', '10360978', 1], ['36207', '10363070', 1], ['17372', '10363243', 1], ['39179', '10365096', 1], ['21047', '10373002', 1], ['36258', '10376165', 1], ['20448', '10377226', 1], ['13202', '10379358', 1], ['39176', '10380060', 1], ['32579', '10388980', 1], ['22558', '10412590', 1], ['21175', '10412788', 1], ['26461', '200924629', 1], ['23296', '201647442', 1], ['20454', '201692648', 1], ['20450', '201692660', 1], ['33047', '201692665', 1], ['20453', '201692677', 1], ['20461', '201692679', 1], ['20466', '201692695', 1], ['26617', '201692705', 1], ['35467', '201699873', 1], ['21875', '201700823', 1], ['35111', '201701865'

In [95]:
gt = rltk.GroundTruth()
for row in dev_labeled:    
    r1 = ds1.get_record(row[0])
    r2  = ds2.get_record(row[1])
    if row[-1] == 1:
        gt.add_positive(r1.raw_object['id'], r2.raw_object['id'])
    else:
        gt.add_negative(r1.raw_object['id'], r2.raw_object['id'])

rltk.Trial(gt)

<rltk.evaluation.trial.Trial at 0x185215a6100>

In [96]:
gt_df = gt.generate_dataframe()
gt_df[gt_df['label']==True]

Unnamed: 0,id1,id2,label
0,38477,10011646,True
1,38475,10140760,True
2,33053,10221960,True
3,27248,10246269,True
4,25262,10315184,True
...,...,...,...
1092,30593,204860316,True
1093,28059,205592435,True
1094,28225,205592439,True
1095,31176,205844279,True


### Entity Linking Evaluation

In [97]:
def name_string_similarity_1(r1, r2):
    ''' Example dummy similiary function '''
    s1 = r1.name_string
    s2 = r2.name_string
    
    return rltk.jaro_winkler_similarity(s1, s2)
    
def name_string_similarity_2(r1, r2):
    ''' Example dummy similiary function '''
    s1 = r1.name_tokens
    s2 = r2.name_tokens
    
    return rltk.jaccard_index_similarity(s1, s2)

Here's how you can combine multiple similarity functions into a single weightened scoring function:

In [114]:
# threshold value to determine if we are confident the record match
MY_TRESH = 0.5 # this number is just an example, you need to change it

# entity linkage scoring function
def rule_based_method(r1, r2):
    score_1 = name_string_similarity_1(r1, r2)
    score_2 = name_string_similarity_2(r1, r2)
    
    total = 0.5 * score_1 + 0.5 * score_2
#     total = score_1
    
    # return two values: boolean if they match or not, float to determine confidence
    return total > MY_TRESH, total

Lets run some candidates using the ground-truth

In [115]:
trial = rltk.Trial(gt)
candidate_pairs = rltk.get_record_pairs(ds1, ds2, ground_truth=gt)

count = 1
for r1, r2 in candidate_pairs:
    result, confidence = rule_based_method(r1, r2)
    # print(r1.name_string, r2.name_string, confidence)
    trial.add_result(r1, r2, result, confidence)
    # if count == 10:
    #     break
    # count += 1


Now lets evaluate our trial results

In [116]:
trial.evaluate()
print('Trial statistics based on Ground-Truth from development set data:')
print(f'tp: {trial.true_positives:.06f} [{len(trial.true_positives_list)}]')
print(f'fp: {trial.false_positives:.06f} [{len(trial.false_positives_list)}]')
print(f'tn: {trial.true_negatives:.06f} [{len(trial.true_negatives_list)}]')
print(f'fn: {trial.false_negatives:.06f} [{len(trial.false_negatives_list)}]')

Trial statistics based on Ground-Truth from development set data:
tp: 0.701642 [769]
fp: 0.003238 [7]
tn: 0.996762 [2155]
fn: 0.298358 [327]


In [117]:
trial.f_measure

0.8215811965811965

### Save Test predictions
You will be evaluated on dev and test predictions, over a hidden ground truth.

In [57]:
# test_set_file = dir_ + 'test.csv'
# test = []
# with open(test_set_file, encoding='utf-8', errors="replace") as csv_file:
#     csv_reader = csv.reader(csv_file, delimiter=',')
#     line_count = 0
#     for row in csv_reader:
#         if len(row) <= 1:
#             continue
#         if line_count == 0:
#             columns = row
#             line_count += 1
#         else:
#             test.append(row)
#     print(f'Column names are: {", ".join(columns)}')
#     print(f'Processed {len(test)} lines.')

In [58]:
# predictions = []
# for id1, id2 in test:
#     r1 = ds1.get_record(id1)
#     r2  = ds2.get_record(id2)
#     result, confidence = rule_based_method(r1, r2)
#     predictions.append((r1.id, r2.id, result, confidence))

In [59]:
# len(predictions), len(ds1.generate_dataframe()), len(ds2.generate_dataframe())

In [60]:
# with open(dir_ + 'predictions.csv', mode='w') as file:
#     writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#     for row in predictions:
#         writer.writerow(row)