In [1]:
import pandas as pd
import numpy as np
import pickle
import sys

In [2]:
%%time
#unidirection will take ~2.5 min
#bidirectional will take ~5 min
graph_df = pd.read_csv('statements_uni.csv', delimiter = ',')
graph_dict = dict(zip(graph_df.Source.values, [np.fromstring(i, dtype=int, sep=',') for i in graph_df.Targets.values]))
del graph_df
print(graph_dict[9036])

[  367211       60  6581097 16086586 12750167       30    28513   131964
        5    31519   689775  1905051    43035   101333     9301   169470
   205375  1326886  1906857  2267418   632404  4920135   613201    93728
   111734   678263  1332315  1366018  1517792  3332215  5442484 17552952
 41799198   127885  1280677   131566  1132636  1132636    12152       60
       90     1085     1781     3617    13298    49258   367211 37251206
 15501913    93996  6374306  3739104   602358  2041543 12912667 17378135
 67311526    38104      150      188      397      652     1860     9056
     9067     9301    83364 16098713  5460604  1700481     1860]
Wall time: 2min 28s


In [3]:
dataset = pickle.load(open("dataset_full.p", "rb"))
dataset

[([[185764,
    834155,
    1422782,
    3153728,
    3538525,
    7838803,
    7838804,
    7838807,
    16034524,
    17035733,
    25096865,
    48625672,
    50645897,
    50845853,
    61085400],
   [],
   []],
  array(['trespass', 'trees &mdash', 'the forested area'], dtype=object),
  ('trespass', 215415, 3153728)),
 ([[], [], [8108, 2493880, 37460778, 39073105], [], [6590, 12681495], [18973]],
  array(['safavid iranian', 'garisi', 'georgian', 'tetritsqaro', '1556',
         'georgians'], dtype=object),
  ('Safavid Iranian', 181786, 161205)),
 ([[18470070]],
  array(['metropolitan region of sorocaba'], dtype=object),
  ('Metropolitan Region of Sorocaba', 47961025, 18470070)),
 ([[1253103],
   [44539,
    226802,
    243999,
    427075,
    1660212,
    2142152,
    2362130,
    2418186,
    2854437,
    2877448,
    2985936,
    3287625,
    3447972,
    3517472,
    3624241,
    3709258,
    3983166,
    7654661,
    7698406,
    7698407,
    7698408,
    7698410,
    7698415,
 

In [4]:
item_views_dict = pickle.load(open("item_views.p", "rb"))
print(item_views_dict[6199])

31335


In [29]:
def get_samples(n, dataset, seed=None):
    np.random.seed(seed)
    index_list = np.random.randint(len(dataset), size=n)
    return [dataset[i] for i in index_list] 

samples = get_samples(100, dataset, 0)
samples[:5]

[([[40,
    359060,
    783470,
    2325193,
    4825411,
    25114801,
    57406029,
    57406035,
    57406046,
    64156771,
    71872582,
    131964,
    211216,
    533534,
    36521584,
    49813633],
   [34713, 43325, 694307, 19507024, 30073416, 32276132, 186719]],
  array(['austria', 'salzburg'], dtype=object),
  ('Austria', 26964606, 40)),
 ([[128011, 62061764],
   [18582228],
   [2810976],
   [778, 800198, 4842324, 6459112],
   [7569750],
   [49096186]],
  array(['greater antilles', 'mesoglossus', 'harrimaniidae', 'bahamas',
         'southern caribbean', 'gulf of mexico'], dtype=object),
  ('Greater Antilles', 298550, 128011)),
 ([[],
   [843, 4121082, 20775002, 57939780, 57939782, 64147117, 2006542],
   [160554]],
  array(['established in 1947', 'pakistan', 'muhammad ali jinnah'],
        dtype=object),
  ('established in 1947', 265059, 129053)),
 ([[545449],
   [17672736],
   [408612,
    769560,
    898149,
    1440787,
    2665675,
    2815972,
    3024870,
    3542588,


In [45]:
def check(guess, truth):
    return guess == truth

def get_next_level(level_graph):
    next_level = np.unique(np.concatenate([graph_dict.get(i, np.array([], dtype=int)) for i in level_graph[-1]]))
    return np.setdiff1d(next_level, np.concatenate(level_graph))

def check_for_match(neighbor_ids, level_ids):
    #print(np.concatenate(neighbor_ids))
    #print(level_ids)
    matches = np.intersect1d(np.concatenate(neighbor_ids), level_ids)
    #print(matches)
    return matches

def get_highest_view_id(potential_ids, true_id):
    view_list = [item_views_dict.get(i, 0) for i in potential_ids]
    guess_id = potential_ids[np.argmax(view_list)]
    return check(guess_id, true_id)
    

def bfs(sample, graph, baseline=False, levels=5):
    potential_ids = np.array(sample[0][0])
    viable_neighbor_ids = [i for i in sample[0][1:] if len(i) > 0]
    true_id = sample[2][2]
    
    #print(f'true: {sample[2]}\npotential: {potential_ids}\nneighbors: {viable_neighbor_ids}')
    
    #return [len(i) for i in sample[0]]
    
    #return false if there are no potential ids for a given entity name
    if len(potential_ids) == 0:
        return False
    
    #if there is only one id then check if that id is a match
    if len(potential_ids) == 1:
        return check(potential_ids[0], true_id)
    
    #bypasses graph traversal and checks the id with the most view in 2 scenarios
    #1. baseline model
    #2. There are no neighboring entities or all the neighboring entities have no item_ids
    if baseline or len(viable_neighbor_ids) == 0:
        return get_highest_view_id(potential_ids, true_id)
    
    match_strength = []
    total_neighbors = len(np.concatenate(viable_neighbor_ids))
    for idx, potential_id in enumerate(potential_ids):
        match = False
        level_id_list = [pd.unique(graph_dict.get(potential_id, np.array([], dtype=int)))]
        matches = check_for_match(viable_neighbor_ids, level_id_list[-1])
        level_matches = [matches]
        total_matches = len(matches)
        
        curr_level = 2
        while curr_level < levels+1 and total_matches < total_neighbors and len(level_id_list[-1] > 0):
            level_id_list.append(get_next_level(level_id_list))
            matches = check_for_match(viable_neighbor_ids, level_id_list[-1])
            level_matches.append(matches)
            total_matches += len(matches)
            curr_level += 1
            
        #print the level, number of nodes, and match ids
        [print(i+1, len(j[0]), j[1]) for i, j in enumerate(zip(level_id_list, level_matches))]
        match_strength.append(sum([len(j)/2**(i) for i, j in enumerate(level_matches)]))
        
    print(f'match_strength: {match_strength}')
    
    #get all ids with the highest match strength score
    strongest_ids = [potential_ids[i] for i, j in enumerate(match_strength) if j == max(match_strength)]
    print(f'strongest_ids: {strongest_ids}')
    
    if len(strongest_ids) == 1:
        return check(strongest_ids[0], true_id)
    else:
        return get_highest_view_id(strongest_ids, true_id)
        
        #for i in level_id_list:
        #    print(potential_id, len(i))
    
#bfs(samples[1], graph_dict, baseline=True)
#bfs(samples[6], graph_dict)

results = [bfs(i, graph_dict, True) for i in get_samples(1000, dataset)]
sum(results)/len(results)

0.701

In [169]:
a = [5, 6, 7, 5, 7]
b = [1, 2, 3, 4, 5]
c = []
[b[i] for i, j in enumerate(a) if j == max(a)]
pd.unique(c)
sum([True, True, False])

2

In [4]:
def bfs(sample, graph):
    primary_ent = sample

def traverse(root, levels=1):
    level = [np.unique(graph_dict.get(root))]
    for i in range(1, levels):
        temp = np.unique(np.concatenate([graph_dict.get(j, np.array([], dtype=int)) for j in level[i-1]]))
        level.append(np.setdiff1d(temp, np.concatenate(level)))
    return level

for i in range(1, 11): 
    print(f'{i} levels')
    %time test = traverse(9036, i)
    print(len(test[-1]), 'nodes')

1 levels
Wall time: 11 ms
66 nodes
2 levels
Wall time: 46 ms
2462 nodes
3 levels
Wall time: 612 ms
37805 nodes
4 levels
Wall time: 1.86 s
166908 nodes
5 levels
Wall time: 2.44 s
326619 nodes
6 levels
Wall time: 4.06 s
371614 nodes
7 levels
Wall time: 6.02 s
305859 nodes
8 levels
Wall time: 6.26 s
211902 nodes
9 levels
Wall time: 6.74 s
156552 nodes
10 levels
Wall time: 10.9 s
99839 nodes
