In [1]:
import pandas as pd
import numpy as np
import time
import pickle

pd.options.display.float_format = "{:.5f}".format

In [2]:
%%time
statements_full_df = pd.read_csv('KDWD/statements.csv', dtype='int32')
statements_full_df.sort_values(by=['source_item_id'], inplace=True)
display(statements_full_df.dtypes)
statements_full_df

source_item_id      int32
edge_property_id    int32
target_item_id      int32
dtype: object

CPU times: user 26.4 s, sys: 2.52 s, total: 28.9 s
Wall time: 28.9 s


Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
21,1,1343,602358
22,1,1419,5457948
23,1,1424,22903368
24,1,1552,11412
...,...,...,...
141206848,77257484,59,9286
141206849,77257491,31,318
141206850,77257491,59,9286
141206851,77257493,31,318


In [3]:
property_df = pd.read_csv('data/property_details.csv')
property_df.priority = property_df.priority.fillna(4).astype(int)
property_df

Unnamed: 0,priority,property_id,en_label,en_description,popularity,aliases,aliases_count
0,1,31,instance of,that class of which this subject is a particul...,26016076,"['is a', 'is an', 'has class', 'has type', 'is...",24.00000
1,1,279,subclass of,all instances of these items are instances of ...,1731815,"['rdfs:subClassOf', 'hyponym of', 'has supercl...",19.00000
2,1,460,said to be the same as,"this item is said to be the same as that item,...",78860,"['same as', 'disputed equivalence', 'the same ...",15.00000
3,1,734,family name,part of full name of person,1692849,"['last name', 'surname']",2.00000
4,1,155,follows,immediately prior item in a series of which th...,289262,"['succeeds to', 'previous is', 'before was', '...",11.00000
...,...,...,...,...,...,...,...
1213,4,5439,research measurement,a measurement of an experimentally observed va...,1,,
1214,4,7174,school class,(qualifier) class of a given year/time period ...,1,"['school cohort', 'cohort', 'class']",3.00000
1215,4,2443,stage reached,ultimate point in an event or competition reac...,1,"['eliminated at', 'round reached', 'point achi...",3.00000
1216,4,1310,statement disputed by,entity that disputes a given statement,1,"['disputed by', 'rejected by', 'opposed by']",3.00000


In [4]:
remove_property_ids = property_df[property_df.priority > 4].property_id
remove_property_ids

126     527
127     681
128     282
129     682
130     137
       ... 
189    5996
190    1754
191    1855
192    4329
193    1889
Name: property_id, Length: 68, dtype: int64

In [5]:
%%time
statements_df_filtered = statements_full_df[~statements_full_df.edge_property_id.isin(remove_property_ids)]
statements_df_filtered

CPU times: user 8.99 s, sys: 1.68 s, total: 10.7 s
Wall time: 10.7 s


Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
22,1,1419,5457948
24,1,1552,11412
27,1,2184,136407
28,1,2579,338
...,...,...,...
141206843,77257472,31,318
141206845,77257483,31,318
141206847,77257484,31,318
141206849,77257491,31,318


In [6]:
id_count_df = pd.read_csv('data/id_counts.csv')
display(id_count_df)
display(id_count_df[['counts','views']].describe())

Unnamed: 0,page_id,item_id,title,views,counts
0,12,6199,Anarchism,31335,3540
1,25,38404,Autism,49693,2114
2,39,101038,Albedo,14573,2825
3,290,9659,A,25859,175
4,303,173,Alabama,52765,11125
...,...,...,...,...,...
5362169,62470350,76894635,Daming Zhu,16,0
5362170,62470423,76894633,Tony Dews,7,2
5362171,62470432,76896959,Samsung PL20,9,0
5362172,62470465,6034153,Nils-Fredrik Palmstierna,8,3


Unnamed: 0,counts,views
count,5362174.0,5362174.0
mean,22.72128,455.49829
std,356.40711,6783.22845
min,0.0,5.0
25%,1.0,21.0
50%,3.0,53.0
75%,10.0,181.0
max,283127.0,13300025.0


In [7]:
light_filter_ids = id_count_df[(id_count_df.counts > 0) & (id_count_df.views > 0)].page_id
heavy_filter_ids = id_count_df[(id_count_df.counts >= 10) & (id_count_df.views >= 181)].page_id
print(len(light_filter_ids), len(heavy_filter_ids))

4517152 885419


In [8]:
%%time
statements_df_large = statements_df_filtered[statements_df_filtered.source_item_id.isin(light_filter_ids) \
                                             | statements_df_filtered.target_item_id.isin(light_filter_ids)]
statements_df_large

CPU times: user 19.1 s, sys: 1.83 s, total: 20.9 s
Wall time: 20.9 s


Unnamed: 0,source_item_id,edge_property_id,target_item_id
27,1,2184,136407
29,1,2670,6999
31,1,2670,79925
34,1,2670,6005984
32,1,2670,185674
...,...,...,...
141206292,77257096,159,752762
141206432,77257167,175,122003
141206527,77257218,17,664
141206654,77257283,407,9309


In [9]:
%%time
statements_df_medium = statements_df_large[statements_df_large.source_item_id.isin(light_filter_ids) \
                                           & statements_df_large.target_item_id.isin(light_filter_ids)]
statements_df_medium

CPU times: user 6.34 s, sys: 608 ms, total: 6.95 s
Wall time: 6.95 s


Unnamed: 0,source_item_id,edge_property_id,target_item_id
1384,25,194,493517
1383,25,163,185692
1405,25,2633,2670632
1407,25,2936,9309
1354,25,36,10690
...,...,...,...
125370990,62469942,186,296955
125371045,62469970,136,191163
125371344,62470119,276,1192305
125371341,62470119,186,296955


In [10]:
%%time
statements_df_small = statements_df_medium[statements_df_medium.source_item_id.isin(heavy_filter_ids) \
                                           & statements_df_medium.target_item_id.isin(heavy_filter_ids)]
statements_df_small

CPU times: user 451 ms, sys: 7.94 ms, total: 459 ms
Wall time: 458 ms


Unnamed: 0,source_item_id,edge_property_id,target_item_id
1384,25,194,493517
1383,25,163,185692
1359,25,85,160036
1362,25,150,156150
1372,25,150,697126
...,...,...,...
125319547,62439410,279,8054
125339416,62451323,279,8054
125339415,62451323,31,8054
125366357,62467275,31,8054


In [11]:
def make_graph_dict(df):
    st = time.time()
    # get unique entities and index of those entities
    print(f'Generating indices of source nodes {time.time()-st}')
    statements_np = df[['source_item_id', 'target_item_id']].to_numpy()
    statements_np = statements_np[statements_np[:,0].argsort()]
    display(statements_np)

    unique, indices = np.unique(statements_np[:,0], return_index=True)
    print('source ids:', len(unique), unique)

    indices = np.append(indices, len(statements_np))
    print('id indicies:', len(indices), indices[-10:])
    
    print(f'Creating Dict {time.time()-st}')
    statement_dict = {j:np.unique(statements_np[indices[i]:indices[i+1], 1]) for i, j in enumerate(unique)}
    print(len(statement_dict.items()))
    return statement_dict

In [12]:
%%time
statement_dict_filtered = make_graph_dict(statements_df_filtered)
display(len(statement_dict_filtered))
statement_dict_filtered[25]

Generating indices of source nodes 4.76837158203125e-07


array([[       1, 36906466],
       [       1,  1079826],
       [       1,  2051667],
       ...,
       [77257484,      318],
       [77257491,      318],
       [77257493,      318]], dtype=int32)

source ids: 38488233 [       1        2        3 ... 77257484 77257491 77257493]
id indicies: 38488234 [98499888 98499889 98499890 98499891 98499892 98499893 98499894 98499895
 98499896 98499897]
Creating Dict 5.399371385574341
38488233


38488233

CPU times: user 6min 53s, sys: 4.15 s, total: 6min 58s
Wall time: 6min 58s


array([      21,       46,      145,     1860,     6256,     6266,
           9309,    10690,    25224,   109128,   156150,   160036,
         168159,   185692,   207176,   213361,   217142,   217829,
         217840,   331697,   493517,   505610,   596885,   643919,
         650682,   666063,   697126,   748065,   748078,   817960,
         817971,   843868,   844784,  1063608,  2670632,  3024240,
        3112646,  3306663,  3336843,  6767407, 10996863, 11294004,
       19588569, 24342199, 27103330], dtype=int32)

In [13]:
%%time
statement_dict_large = make_graph_dict(statements_df_large)
display(len(statement_dict_large))
statement_dict_large[25]

Generating indices of source nodes 7.152557373046875e-07


array([[       1,   136407],
       [       1,     6999],
       [       1,    79925],
       ...,
       [77257218,      664],
       [77257283,     9309],
       [77257288,     3947]], dtype=int32)

source ids: 18861189 [       1        2        3 ... 77257218 77257283 77257288]
id indicies: 18861190 [32159865 32159867 32159868 32159869 32159871 32159872 32159873 32159874
 32159875 32159876]
Creating Dict 1.7143964767456055
18861189


18861189

CPU times: user 3min 20s, sys: 2.06 s, total: 3min 22s
Wall time: 3min 22s


array([      21,       46,      145,     1860,     6256,     6266,
           9309,    10690,    25224,   109128,   156150,   160036,
         168159,   185692,   207176,   213361,   217142,   217829,
         217840,   331697,   493517,   505610,   596885,   643919,
         650682,   666063,   697126,   748065,   748078,   817960,
         817971,   843868,   844784,  1063608,  2670632,  3024240,
        3112646,  3306663,  3336843,  6767407, 10996863, 11294004,
       19588569, 24342199, 27103330], dtype=int32)

In [14]:
%%time
statement_dict_medium = make_graph_dict(statements_df_medium)
display(len(statement_dict_medium))
statement_dict_medium[25]

Generating indices of source nodes 2.384185791015625e-07


array([[      25,   493517],
       [      25,   213361],
       [      25,   697126],
       ...,
       [62470119,   296955],
       [62470119,  1192305],
       [62470119,  1192305]], dtype=int32)

source ids: 1338715 [      25       39      303 ... 62469942 62469970 62470119]
id indicies: 1338716 [2256789 2256791 2256793 2256795 2256797 2256799 2256800 2256801 2256802
 2256805]
Creating Dict 0.11435866355895996
1338715


1338715

CPU times: user 14.3 s, sys: 116 ms, total: 14.4 s
Wall time: 14.4 s


array([   9309,   10690,  109128,  156150,  160036,  185692,  213361,
        493517,  697126, 2670632], dtype=int32)

In [15]:
%%time
statement_dict_small = make_graph_dict(statements_df_small)
display(len(statement_dict_small))
statement_dict_small[25]

Generating indices of source nodes 2.384185791015625e-07


array([[      25,   493517],
       [      25,   185692],
       [      25,   160036],
       ...,
       [62451323,     8054],
       [62467275,     8054],
       [62467275,     8054]], dtype=int32)

source ids: 337883 [      25       39      303 ... 62439410 62451323 62467275]
id indicies: 337884 [588513 588515 588517 588519 588521 588523 588525 588527 588529 588531]
Creating Dict 0.033460378646850586
337883


337883

CPU times: user 3.65 s, sys: 16.1 ms, total: 3.67 s
Wall time: 3.66 s


array([156150, 160036, 185692, 213361, 493517, 697126], dtype=int32)

In [16]:
%%time
pickle.dump(statement_dict_filtered, open('data/graph_filtered.p', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

CPU times: user 6min 10s, sys: 14.9 s, total: 6min 25s
Wall time: 6min 21s


In [17]:
%%time
pickle.dump(statement_dict_large, open('data/graph_large.p', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

CPU times: user 2min 53s, sys: 7.08 s, total: 3min
Wall time: 3min 5s


In [18]:
%%time
pickle.dump(statement_dict_medium, open('data/graph_medium.p', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

CPU times: user 11.6 s, sys: 558 ms, total: 12.2 s
Wall time: 12.2 s


In [19]:
%%time
pickle.dump(statement_dict_small, open('data/graph_small.p', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

CPU times: user 2.87 s, sys: 112 ms, total: 2.98 s
Wall time: 3.01 s


### Format graph to run through node2vec and generate embeddings

In [20]:
property_priority_dict = property_df.set_index('property_id').priority.to_dict()
property_priority_dict[31]

1

In [21]:
%%time
statements_df_embed = statements_df_large.copy() # choose which size graph you want to embed
statements_df_embed['priority'] = [property_priority_dict[i] for i in statements_df_embed.edge_property_id.values]
statements_df_embed

CPU times: user 1min 34s, sys: 371 ms, total: 1min 34s
Wall time: 1min 34s


Unnamed: 0,source_item_id,edge_property_id,target_item_id,priority
27,1,2184,136407,4
29,1,2670,6999,4
31,1,2670,79925,4
34,1,2670,6005984,4
32,1,2670,185674,4
...,...,...,...,...
141206292,77257096,159,752762,3
141206432,77257167,175,122003,2
141206527,77257218,17,664,4
141206654,77257283,407,9309,4


In [22]:
weight_mapping = {1:5, 2:4, 3:3, 4:1}
statements_df_embed['weight'] = [weight_mapping.get(i, -1) for i in statements_df_embed.priority]
statements_df_embed = statements_df_embed.groupby(['source_item_id', 'target_item_id'])['weight'].sum().to_frame().reset_index()
statements_df_embed

Unnamed: 0,source_item_id,target_item_id,weight
0,1,6999,1
1,1,79925,1
2,1,136407,1
3,1,185674,1
4,1,497745,1
...,...,...,...
30684210,77257096,752762,3
30684211,77257167,122003,4
30684212,77257218,664,1
30684213,77257283,9309,1


In [23]:
%%time
#Exports to the format used by node2vec
statements_df_embed.to_csv('data/statements_embed.csv', header=False, index=False, sep=' ')

CPU times: user 54.1 s, sys: 506 ms, total: 54.6 s
Wall time: 54.6 s
