In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
%%time
# Read the unfiltered entity table from CSV. The bare name on the last line
# makes the notebook render a preview of the frame as the cell output.
entity_df_full = pd.read_csv('combined_entities_full.csv')
entity_df_full

Wall time: 25 s


Unnamed: 0,entity,page_id,item_id,text_id
0,anti-authoritarian,867979,1030234,0
1,political,23040,179805,0
2,social philosophy,586276,180592,0
3,hierarchies,13998,188619,0
4,self-managed,40949353,15981562,0
...,...,...,...,...
35195863,Tomasa Tequiero,39519608,6148330,5343559
35195864,Sos mi hombre,39950100,6132611,5343559
35195865,Luis Gatica,2099374,6316177,5343559
35195866,Lucho Gatica,2112544,954681,5343559


In [3]:
%%time
# Filtered variant of the entity table: it only contains entities that have a
# matching item and item_id in the dictionary.
# NOTE(review): "the dictionary" presumably means item_dict, which is loaded in
# a later cell -- confirm.
entity_df_filtered = pd.read_csv('combined_entities_filtered.csv')
entity_df_filtered

Wall time: 20.7 s


Unnamed: 0,entity,page_id,item_id,text_id
0,social philosophy,586276,180592,0
1,cooperative,89313,4539,0
2,far-left,18247344,1129409,0
3,political spectrum,23490,210918,0
4,collectivism,5708,237789,0
...,...,...,...,...
26712752,Tomasa Tequiero,39519608,6148330,5343559
26712753,Sos mi hombre,39950100,6132611,5343559
26712754,Luis Gatica,2099374,6316177,5343559
26712755,Lucho Gatica,2112544,954681,5343559


In [4]:
%%time
# Load the pickled lookup that is keyed by strings such as 'tesla' and yields
# lists of integer ids (see the printed entry). The context manager closes the
# file handle as soon as loading finishes instead of leaving it open until the
# garbage collector gets to it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
with open("item_dict_full.p", "rb") as item_dict_file:
    item_dict = pickle.load(item_dict_file)
print(item_dict['tesla'])

[163343, 478214, 765530, 1050485, 1428953, 1548225, 2384079, 2406220, 7705502, 19565583, 27701406, 31803712, 37251206, 56084926, 9036, 16258100]
Wall time: 1min 56s


In [5]:
# Load the pickled lookup that maps an integer key to an integer value (see the
# printed entry for key 6199). The context manager closes the file handle as
# soon as loading finishes instead of leaving it open.
# NOTE(review): presumably item id -> page-view count, judging by the name --
# confirm against whatever produced item_views.p.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
with open("item_views.p", "rb") as item_views_file:
    item_views_dict = pickle.load(item_views_file)
print(item_views_dict[6199])

31335


In [195]:
%%time
# Much faster than DataFrame.sample, but it samples with replacement, so the same row can appear more than once

def get_samples(n, dataframe, seed=None):
    """Draw ``n`` rows uniformly at random, with replacement, as a NumPy array.

    Row positions are drawn independently, so the same row may be returned
    more than once.

    Args:
        n: Number of rows to draw.
        dataframe: pandas DataFrame to sample from.
        seed: Optional seed for a reproducible draw. ``None`` seeds from
            fresh entropy, giving a different sample on each call.

    Returns:
        The sampled rows as produced by ``DataFrame.to_numpy()``, one array
        row per drawn DataFrame row.
    """
    # A private RandomState produces exactly the stream that
    # np.random.seed(seed) followed by np.random.randint would, but it leaves
    # the process-wide generator untouched, so calling this helper no longer
    # resets seeding done elsewhere in the notebook.
    rng = np.random.RandomState(seed)
    idx = rng.randint(len(dataframe), size=n)
    return dataframe.iloc[idx, :].to_numpy()

# Draw 100,000 rows from the full table (unseeded, so every run differs) and
# display the resulting array.
sampled_entities = get_samples(100000, entity_df_full)
sampled_entities

Wall time: 519 ms


array([['Peshawar', 66230, 1113311, 764495],
       ['Ajab Prem Ki Ghazab Kahani', 18279525, 2355787, 641511],
       ['Alma mater', 7971721, 1357677, 545077],
       ...,
       ['English', 18803164, 42406, 1670888],
       ['Fort Zeelandia (Taiwan)', 680757, 20706255, 2358275],
       ['studio album', 528282, 482994, 4299804]], dtype=object)

In [196]:
%%time
# Same sample size drawn with pandas, this time without replacement, so no row
# repeats. This rebinds sampled_entities, replacing the array produced by the
# get_samples cell above.
sampled_entities = entity_df_full.sample(n=100000, replace=False).to_numpy()
sampled_entities

Wall time: 3.02 s


array([['Scotland', 26994, 22, 1878185],
       ['disclosed', 11586, 842234, 11942],
       ['ICC', 381906, 722694, 2643619],
       ...,
       ['Tamarix gallica', 23263182, 164285, 4980280],
       ['police', 23627, 35535, 579309],
       ['Philips Consumer Electronics', 23550, 170416, 268512]],
      dtype=object)

In [206]:
def baseline_model(sample, skip=True):
    """Predict an entity's item id as its most-viewed candidate and score the guess.

    The entity text is lowercased and looked up in the global ``item_dict`` to
    obtain candidate ids. The prediction is the candidate with the largest
    value in the global ``item_views_dict`` (missing ids count as 0); ties go
    to the earliest candidate in the list.

    Args:
        sample: Indexable row with the entity text at position 0 and the true
            item id at position 2.
        skip: Selects the result for an entity without candidates: ``None``
            when true (so the caller can drop the row), ``False`` when false
            (so the row counts as a miss).

    Returns:
        ``True`` if the predicted id equals the true id, ``False`` if it does
        not, or ``None``/``False`` as described under ``skip`` when there is
        nothing to predict from.
    """
    entity_name = sample[0]
    true_id = sample[2]

    # pandas.read_csv turns cells such as "NA", "null" or "None" into float NaN
    # by default, so the entity is not guaranteed to be a string. Such a row is
    # handled like an unknown entity instead of crashing on ``.lower()``.
    if isinstance(entity_name, str):
        candidate_ids = item_dict.get(entity_name.lower(), [])
    else:
        candidate_ids = []

    # Nothing to choose from: either drop the row (None) or count it as wrong.
    if len(candidate_ids) == 0:
        return None if skip else False

    # max() keeps the first of several equally-viewed candidates and trivially
    # handles the single-candidate case, so no special branch is needed.
    predicted_id = max(candidate_ids, key=lambda item_id: item_views_dict.get(item_id, 0))
    return bool(predicted_id == true_id)

# Spot-check the baseline on the first 100 sampled rows: True = correct
# prediction, False = wrong prediction, None = no candidates (skipped).
print([baseline_model(i, skip=True) for i in sampled_entities[:100]])

[False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, None, False, True, False, True, True, False, True, False, True, True, True, None, False, False, None, True, True, True, True, True, True, False, False, True, False, False, None, True, True, True, None, True, True, False, True, False, True, True, True, True, True, True, None, True, True, True, True, True, False, True, True, False, True, False, None, True, False, True, True, False, None, True, True, True, True, True, None, False, True, False, None, False, False, None, True, True, False, None, True, False, True, None, True, True]


In [203]:
def run_test(samples_per_test, iterations, skip=True):
    """Repeatedly sample entities and print the baseline model's accuracy.

    Each iteration draws ``samples_per_test`` rows without replacement from
    the global ``entity_df_full``, scores every row with ``baseline_model``
    and prints one line with the number of scored rows and the accuracy.

    Args:
        samples_per_test: Rows to draw per iteration.
        iterations: Number of independent draws to run.
        skip: Forwarded to ``baseline_model``. When true, rows for which it
            returns ``None`` are left out of the accuracy; when false, they
            are scored as misses by the model itself.

    Returns:
        None. Results are only printed.
    """
    for _ in range(iterations):
        sampled_entities = entity_df_full.sample(n=samples_per_test, replace=False).to_numpy()
        outcomes = [baseline_model(row, skip) for row in sampled_entities]
        # None marks a skipped row; with skip=False this filter removes nothing.
        scored = [outcome for outcome in outcomes if outcome is not None]
        # Every row can be skipped (or zero rows requested); report nan
        # rather than dividing by zero.
        accuracy = sum(scored) / len(scored) if scored else float('nan')
        print(f'num_samples: {len(scored)} accuracy: {accuracy}')
        
# Ten trials of 100,000 rows each. With the default skip=True, rows whose
# entity has no candidates are left out of the accuracy.
run_test(100000, 10)

num_samples: 88056 accuracy: 0.7968111201962388
num_samples: 87876 accuracy: 0.7941645045291092
num_samples: 87736 accuracy: 0.7936308926780341
num_samples: 88086 accuracy: 0.7951660876870331
num_samples: 87771 accuracy: 0.7963222476672249
num_samples: 87937 accuracy: 0.7953080045941981
num_samples: 87869 accuracy: 0.7992010834310166
num_samples: 88043 accuracy: 0.7975875424508478
num_samples: 87946 accuracy: 0.7952720987878926
num_samples: 87880 accuracy: 0.7983045061447428


In [204]:
# Same experiment, but rows whose entity has no candidates now count as misses
# (skip=False), so every sampled row is scored.
run_test(100000, 10, skip=False)

num_samples: 100000 accuracy: 0.69729
num_samples: 100000 accuracy: 0.69849
num_samples: 100000 accuracy: 0.69877
num_samples: 100000 accuracy: 0.70079
num_samples: 100000 accuracy: 0.70093
num_samples: 100000 accuracy: 0.69958
num_samples: 100000 accuracy: 0.70084
num_samples: 100000 accuracy: 0.6988
num_samples: 100000 accuracy: 0.69966
num_samples: 100000 accuracy: 0.69691
