In [1]:
import pandas as pd
import numpy as np

In [2]:
%%time
# Load the entity-mention table from CSV. The recorded output shows the columns
# entity, page_id, item_id and text_id, with a row index running to 35,840,005;
# the recorded timing for this read is about 19 s.
entity_df = pd.read_csv('data/intro_entity.csv')
# Bare last expression: renders the (truncated) frame as the cell output.
entity_df

CPU times: user 17.7 s, sys: 1.26 s, total: 19 s
Wall time: 19 s


Unnamed: 0,entity,page_id,item_id,text_id
0,anti-authoritarian,867979,1030234,12
1,political,23040,179805,12
2,social philosophy,586276,180592,12
3,hierarchies,13998,188619,12
4,self-managed,40949353,15981562,12
...,...,...,...,...
35840002,Carl Randall,40277554,16215506,62473330
35840003,The World Ends With You,6987282,1416303,62473330
35840004,2016 Summer Olympics closing ceremony,44593137,18741083,62473330
35840005,2020 Summer Olympics,1610886,181278,62473330


In [3]:
%%time
# Build the label -> candidate-ids lookup: the feather file is indexed by its
# `en_label` column and the `item_ids` column becomes the dict values.
# NOTE(review): baseline_model looks keys up with .strip().casefold(), so the
# labels are presumably stored already case-folded — confirm against how the
# file was produced.
item_dict = pd.read_feather('data/item_dict.ftr', use_threads=True).set_index('en_label').item_ids.to_dict()
# Number of keys in the lookup (48,191,954 in the recorded run).
display(len(item_dict))
# Spot check of one ambiguous label; the recorded output shows the value is a
# numpy array of integer ids.
item_dict['tesla']

48191954

CPU times: user 1min 1s, sys: 7.11 s, total: 1min 8s
Wall time: 1min 8s


array([    9036,   163343,   210893,   478214,   622424,   765530,
         780348,  1050485,  1428953,  1463050,  1548225,  1634161,
        2384079,  2406220,  3982823,  5172712,  7035686,  7705502,
        7705506,  7705515, 16258100, 19565583, 19845823, 23663332,
       27701406, 31803712, 37251206, 56084926])

In [4]:
# Per-item statistics, indexed by item_id. The recorded output shows the
# remaining columns are page_id, title, views and counts.
# NOTE(review): the exact meaning of `views` and `counts` is not visible here —
# presumably page views and mention counts; confirm with the data producer.
id_counts_df = pd.read_csv("data/id_counts.csv").set_index('item_id')
display(id_counts_df)
# Plain dicts (item_id -> value) used by baseline_model to rank candidate ids.
item_views_dict = id_counts_df.views.to_dict()
item_counts_dict = id_counts_df.counts.to_dict()
# Spot check against the first displayed row (item 6199, "Anarchism").
print(item_views_dict[6199])
print(item_counts_dict[6199])

Unnamed: 0_level_0,page_id,title,views,counts
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6199,12,Anarchism,31335,3540
38404,25,Autism,49693,2114
101038,39,Albedo,14573,2825
9659,290,A,25859,175
173,303,Alabama,52765,11125
...,...,...,...,...
76894635,62470350,Daming Zhu,16,0
76894633,62470423,Tony Dews,7,2
76896959,62470432,Samsung PL20,9,0
6034153,62470465,Nils-Fredrik Palmstierna,8,3


31335
3540


In [5]:
%%time
# Fixed-seed draw of 250,000 rows converted to an object ndarray; per the
# recorded output each row is [entity, page_id, item_id, text_id].
# NOTE(review): this module-level `samples` is not read by any later cell shown
# here (run_test draws its own local sample) — confirm whether it is still needed.
samples = entity_df.sample(250000, random_state=1).to_numpy()
samples

CPU times: user 1.86 s, sys: 109 ms, total: 1.97 s
Wall time: 1.96 s


array([['Military Cross', 488249, 1335064, 187626],
       ['Wales', 69894, 25, 42507907],
       ['Joe Conforte', 11674164, 6209172, 1051658],
       ...,
       ['Manchester City F.C', 165813, 50602, 761536],
       ['Christian', 5211, 5043, 7857964],
       ['Chicago, Illinois', 6886, 1297, 6199675]], dtype=object)

In [6]:
def baseline_model(sample, mode='view', *, item_lookup=None, views_lookup=None, counts_lookup=None):
    """Score one entity mention with the popularity baseline.

    The mention text is normalised (stripped and case-folded) and looked up in
    the label -> candidate-ids mapping. Among the candidates, the id with the
    highest popularity score is predicted; ids without a score count as 0 and
    ties go to the earliest candidate (``np.argmax`` semantics).

    Parameters
    ----------
    sample : sequence
        One row of the entity table; ``sample[0]`` is the mention text and
        ``sample[2]`` is the true item id.
    mode : {'view', 'count'}
        Which score ranks the candidates: ``'view'`` uses the views mapping,
        ``'count'`` uses the counts mapping.
    item_lookup, views_lookup, counts_lookup : mapping, optional
        Overrides for the module-level ``item_dict``, ``item_views_dict`` and
        ``item_counts_dict``. Left as ``None`` the globals are used, which is
        the original behaviour.

    Returns
    -------
    bool
        ``True`` when the predicted id equals the true id; ``False`` otherwise,
        including when the mention has no candidates at all.

    Raises
    ------
    ValueError
        If ``mode`` is not ``'view'`` or ``'count'``. This is checked up front,
        so a bad mode is reported even for mentions with 0 or 1 candidates.
    """
    if mode not in ('view', 'count'):
        raise ValueError(f'Invalid Mode: {mode}')

    if item_lookup is None:
        item_lookup = item_dict

    entity = sample[0]
    true_id = sample[2]

    # With default settings read_csv parses labels such as "NA" or "null" as a
    # float NaN; such a value has no .strip(), so treat it as an unknown entity
    # instead of crashing the whole evaluation run.
    if isinstance(entity, str):
        candidate_ids = item_lookup.get(entity.strip().casefold(), [])
    else:
        candidate_ids = []

    # No candidate ids for this entity name: count it as a miss.
    if len(candidate_ids) == 0:
        print(f'entity not in dictionary: {sample[0]}')
        return False

    # Resolve only the score table the selected mode needs.
    if mode == 'view':
        scores = item_views_dict if views_lookup is None else views_lookup
    else:
        scores = item_counts_dict if counts_lookup is None else counts_lookup

    # A single candidate needs no special case: argmax of a one-element list is 0.
    tie_break_list = [scores.get(i, 0) for i in candidate_ids]
    candidate_id = candidate_ids[np.argmax(tie_break_list)]

    # Comparing numpy integers yields np.bool_; normalise to a plain bool.
    return bool(candidate_id == true_id)

# Smoke test: score a fixed-seed sample of 100 rows with the default mode
# ('view') and print the per-row hit/miss results.
print([baseline_model(i) for i in entity_df.sample(100, random_state=1).to_numpy()])

[True, False, True, True, True, False, True, False, True, False, True, True, True, True, True, False, True, True, True, True, True, False, True, True, True, True, False, True, False, True, True, True, True, True, True, True, False, False, True, False, True, True, False, False, True, True, True, False, True, True, True, True, True, True, False, False, True, False, True, True, False, False, True, False, True, False, True, False, True, True, False, False, True, True, True, True, True, True, True, True, False, True, True, False, True, True, False, True, True, True, True, True, True, True, True, False, True, True, True, True]


In [7]:
import multiprocessing as mp
from functools import partial
# CPU count as reported by multiprocessing (16 in the recorded run); run_test
# derives its worker-pool size from this value.
n_cores = mp.cpu_count()
n_cores

16

In [8]:
def run_test(samples_per_test, iterations, mode='view', multi = True, random_state=None):
    """Estimate baseline accuracy over repeated random samples and print it.

    For each iteration a fresh sample of ``samples_per_test`` rows is drawn from
    the module-level ``entity_df``, every row is scored with ``baseline_model``
    and one tab-separated summary line is printed.

    Parameters
    ----------
    samples_per_test : int
        Number of rows drawn per iteration.
    iterations : int
        Number of independent samples to evaluate.
    mode : str
        Tie-break mode forwarded to ``baseline_model``.
    multi : bool
        When truthy, rows are scored in a ``multiprocessing`` pool; otherwise
        they are scored serially in this process.
    random_state : int, optional
        Base seed for reproducible runs; iteration ``k`` (0-based) samples with
        ``random_state + k``. ``None`` (the default) keeps the original
        unseeded sampling.

    Returns
    -------
    None
        Results are only printed.
    """
    score_row = partial(baseline_model, mode=mode)

    # Create one pool for the whole run rather than one per iteration: the
    # original never closed its pools, so every iteration leaked a full set of
    # worker processes. max(1, ...) keeps this valid on a single-CPU host,
    # where n_cores - 1 would be 0 and Pool would raise.
    pool = mp.Pool(max(1, n_cores - 1)) if multi else None
    try:
        for i in range(iterations):
            seed = None if random_state is None else random_state + i
            samples = entity_df.sample(n=samples_per_test, random_state=seed).to_numpy()

            if pool is None:
                results = [score_row(row) for row in samples]
            else:
                results = pool.map(score_row, samples)

            # An empty sample has no defined accuracy; report nan instead of
            # failing with ZeroDivisionError.
            accuracy = sum(results)/len(results) if len(results) else float('nan')
            print(f'test: {i+1}\tnum_samples: {len(results)}\taccuracy: {accuracy}\tmode: {mode}')
    finally:
        if pool is not None:
            pool.close()
            pool.join()
        

In [9]:
# Ten independent samples of 100,000 mentions each, ranking candidates by views.
# The recorded runs land at roughly 0.66 accuracy.
print('Baseline accuracy when disambiguating using the entity with the most views')
run_test(100000, 10, 'view')

Baseline accuracy when disambiguating using the entity with the most views
test: 1	num_samples: 100000	accuracy: 0.66431	mode: view
test: 2	num_samples: 100000	accuracy: 0.66493	mode: view
test: 3	num_samples: 100000	accuracy: 0.66461	mode: view
test: 4	num_samples: 100000	accuracy: 0.66268	mode: view
test: 5	num_samples: 100000	accuracy: 0.66151	mode: view
test: 6	num_samples: 100000	accuracy: 0.66432	mode: view
test: 7	num_samples: 100000	accuracy: 0.665	mode: view
test: 8	num_samples: 100000	accuracy: 0.66245	mode: view
test: 9	num_samples: 100000	accuracy: 0.66233	mode: view
test: 10	num_samples: 100000	accuracy: 0.66144	mode: view


In [10]:
# Same protocol as the views run, but ranking candidates by counts.
# The recorded runs land at roughly 0.72 accuracy.
print('Baseline accuracy when disambiguating using the entity with the most counts')
run_test(100000, 10, 'count')

Baseline accuracy when disambiguating using the entity with the most counts
test: 1	num_samples: 100000	accuracy: 0.72147	mode: count
test: 2	num_samples: 100000	accuracy: 0.72308	mode: count
test: 3	num_samples: 100000	accuracy: 0.71851	mode: count
test: 4	num_samples: 100000	accuracy: 0.71938	mode: count
test: 5	num_samples: 100000	accuracy: 0.71966	mode: count
test: 6	num_samples: 100000	accuracy: 0.71882	mode: count
test: 7	num_samples: 100000	accuracy: 0.72122	mode: count
test: 8	num_samples: 100000	accuracy: 0.72045	mode: count
test: 9	num_samples: 100000	accuracy: 0.72091	mode: count
test: 10	num_samples: 100000	accuracy: 0.72051	mode: count
