In [1]:
import pandas as pd
import numpy as np
import time
import pickle

from itertools import chain 

# Render every float in DataFrame displays with five decimal places.
pd.options.display.float_format = "{:.5f}".format

In [2]:
%%time
# Load the statement edge list (source item, property, target item).
# Every column is forced to int32 by the dtype argument.
statements_full_df = pd.read_csv('KDWD/statements.csv', dtype='int32')
display(statements_full_df.dtypes)
statements_full_df

source_item_id      int32
edge_property_id    int32
target_item_id      int32
dtype: object

CPU times: user 22.3 s, sys: 1.54 s, total: 23.9 s
Wall time: 32.4 s


Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,31,36906466
1,1,279,3695190
2,1,398,497745
3,1,398,1133705
4,1,398,1139177
...,...,...,...
141206848,77257484,59,9286
141206849,77257491,31,318
141206850,77257491,59,9286
141206851,77257493,31,318


In [3]:
# Load the property catalogue. Properties without an assigned priority are
# placed in bucket 4 before the column is cast to integers.
property_df = pd.read_csv('data/property_details.csv')
property_df['priority'] = property_df['priority'].fillna(4).astype(int)
display(property_df)
# Show which priority buckets actually occur.
np.unique(property_df['priority'])

Unnamed: 0,priority,property_id,en_label,en_description,popularity,aliases,aliases_count
0,1,31,instance of,that class of which this subject is a particul...,26016076,"['is a', 'is an', 'has class', 'has type', 'is...",24.00000
1,1,279,subclass of,all instances of these items are instances of ...,1731815,"['rdfs:subClassOf', 'hyponym of', 'has supercl...",19.00000
2,1,460,said to be the same as,"this item is said to be the same as that item,...",78860,"['same as', 'disputed equivalence', 'the same ...",15.00000
3,1,734,family name,part of full name of person,1692849,"['last name', 'surname']",2.00000
4,1,155,follows,immediately prior item in a series of which th...,289262,"['succeeds to', 'previous is', 'before was', '...",11.00000
...,...,...,...,...,...,...,...
1213,4,5439,research measurement,a measurement of an experimentally observed va...,1,,
1214,4,7174,school class,(qualifier) class of a given year/time period ...,1,"['school cohort', 'cohort', 'class']",3.00000
1215,4,2443,stage reached,ultimate point in an event or competition reac...,1,"['eliminated at', 'round reached', 'point achi...",3.00000
1216,4,1310,statement disputed by,entity that disputes a given statement,1,"['disputed by', 'rejected by', 'opposed by']",3.00000


array([ 1,  2,  3,  4,  8,  9, 10])

In [4]:
# property_id -> priority lookup table; spot-check it with property 31.
property_priority_dict = dict(zip(property_df['property_id'], property_df['priority']))
property_priority_dict[31]

1

In [5]:
%%time
# Attach the priority of each statement's property.
# Series.map does the dictionary lookup in one vectorised pass instead of one
# Python-level lookup per row.
mapped_priority = statements_full_df['edge_property_id'].map(property_priority_dict)

# Series.map yields NaN for ids missing from the dict, whereas a plain dict
# lookup raises KeyError. Keep failing loudly rather than silently carrying NaN
# priorities (and a float column) into the filter below.
if mapped_priority.isna().any():
    unknown_ids = statements_full_df.loc[mapped_priority.isna(), 'edge_property_id'].unique()
    raise KeyError(f'edge_property_id values without a priority: {unknown_ids[:10].tolist()}')

statements_full_df['priority'] = mapped_priority
statements_full_df

CPU times: user 55.2 s, sys: 1.24 s, total: 56.4 s
Wall time: 56.3 s


Unnamed: 0,source_item_id,edge_property_id,target_item_id,priority
0,1,31,36906466,1
1,1,279,3695190,1
2,1,398,497745,4
3,1,398,1133705,4
4,1,398,1139177,4
...,...,...,...,...
141206848,77257484,59,9286,8
141206849,77257491,31,318,1
141206850,77257491,59,9286,8
141206851,77257493,31,318,1


In [6]:
# Keep only statements whose property priority is at most 4. The .copy()
# detaches the result so later column assignments do not target a slice of
# statements_full_df.
statements_filtered_df = statements_full_df.loc[statements_full_df['priority'].le(4)].copy()
statements_filtered_df

Unnamed: 0,source_item_id,edge_property_id,target_item_id,priority
0,1,31,36906466,1
1,1,279,3695190,1
2,1,398,497745,4
3,1,398,1133705,4
4,1,398,1139177,4
...,...,...,...,...
141206843,77257472,31,318,1
141206845,77257483,31,318,1
141206847,77257484,31,318,1
141206849,77257491,31,318,1


In [7]:
%%time
# Edge weight contributed by a single statement, keyed by property priority.
weight_mapping = {1:5, 2:4, 3:3, 4:1}

# Vectorised lookup; priorities absent from the mapping get weight -1, and the
# cast restores an integer column after fillna.
statements_filtered_df['weight'] = (
    statements_filtered_df['priority'].map(weight_mapping).fillna(-1).astype('int64')
)

# Collapse parallel statements: a (source, target) pair linked through several
# properties receives the sum of their weights.
# NOTE: this rebinds statements_filtered_df to the aggregated frame, which no
# longer has a `priority` column, so re-running this cell on its own fails.
# Re-run the filtering cell first.
statements_filtered_df = (
    statements_filtered_df
    .groupby(['source_item_id', 'target_item_id'])['weight']
    .sum()
    .reset_index()
)
statements_filtered_df

CPU times: user 1min 26s, sys: 6.53 s, total: 1min 32s
Wall time: 1min 32s


Unnamed: 0,source_item_id,target_item_id,weight
0,1,323,5
1,1,338,1
2,1,6999,1
3,1,11412,1
4,1,18343,1
...,...,...,...
95133823,77257472,318,5
95133824,77257483,318,5
95133825,77257484,318,5
95133826,77257491,318,5


In [8]:
# Index the per-item statistics by item_id, then expose the `views` and
# `counts` columns as plain dicts for id -> value lookups.
id_counts_df = pd.read_csv("data/id_counts.csv").set_index('item_id')
display(id_counts_df)
item_views_dict = id_counts_df['views'].to_dict()
item_counts_dict = id_counts_df['counts'].to_dict()
# Spot-check a single item in both lookups (one value per line).
print(item_views_dict[6199], item_counts_dict[6199], sep='\n')

Unnamed: 0_level_0,page_id,title,views,counts
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6199,12,Anarchism,31335,3540
38404,25,Autism,49693,2114
101038,39,Albedo,14573,2825
9659,290,A,25859,175
173,303,Alabama,52765,11125
...,...,...,...,...
76894635,62470350,Daming Zhu,16,0
76894633,62470423,Tony Dews,7,2
76896959,62470432,Samsung PL20,9,0
6034153,62470465,Nils-Fredrik Palmstierna,8,3


31335
3540


In [9]:
%%time
# Annotate every edge with its target's `counts` and `views`.
# Series.map replaces two Python-level lookups per row. Targets missing from
# id_counts get 0, and the cast keeps the columns integer after fillna.
statements_filtered_df['target_counts'] = (
    statements_filtered_df['target_item_id'].map(item_counts_dict).fillna(0).astype('int64')
)
statements_filtered_df['target_views'] = (
    statements_filtered_df['target_item_id'].map(item_views_dict).fillna(0).astype('int64')
)
statements_filtered_df

CPU times: user 1min 46s, sys: 1.89 s, total: 1min 48s
Wall time: 1min 48s


Unnamed: 0,source_item_id,target_item_id,weight,target_counts,target_views
0,1,323,5,1183,56881
1,1,338,1,1333,10328
2,1,6999,1,558,5718
3,1,11412,1,2311,46414
4,1,18343,1,453,16939
...,...,...,...,...,...
95133823,77257472,318,5,1348,25105
95133824,77257483,318,5,1348,25105
95133825,77257484,318,5,1348,25105
95133826,77257491,318,5,1348,25105


In [10]:
%%time
# Order edges by source item, then within each source by descending weight,
# target counts and target views. The index is renumbered after sorting.
statements_sorted_df = (
    statements_filtered_df
    .sort_values(
        by=['source_item_id', 'weight', 'target_counts', 'target_views'],
        ascending=[True, False, False, False],
    )
    .reset_index(drop=True)
)
statements_sorted_df

CPU times: user 43.6 s, sys: 6.66 s, total: 50.3 s
Wall time: 50.3 s


Unnamed: 0,source_item_id,target_item_id,weight,target_counts,target_views
0,1,323,5,1183,56881
1,1,3695190,5,18,1148
2,1,36906466,5,0,0
3,1,273508,4,342,8005
4,1,837317,4,116,4778
...,...,...,...,...,...
95133823,77257472,318,5,1348,25105
95133824,77257483,318,5,1348,25105
95133825,77257484,318,5,1348,25105
95133826,77257491,318,5,1348,25105


In [11]:
# Flatten the sorted edge table into a (rows, 3) array of
# [source_item_id, target_item_id, weight].
statements_np = statements_sorted_df.loc[:, ['source_item_id', 'target_item_id', 'weight']].to_numpy()
display(statements_np)

# Distinct source ids plus the row at which each one first appears.
unique, indices = np.unique(statements_np[:, 0], return_index=True)
print('source ids:', unique.size, unique)

# Add the total row count as a final boundary so that
# indices[k]:indices[k + 1] delimits the rows of the k-th source id.
indices = np.concatenate((indices, [statements_np.shape[0]]))
print('id indices:', indices.size, indices[-10:])

array([[       1,      323,        5],
       [       1,  3695190,        5],
       [       1, 36906466,        5],
       ...,
       [77257484,      318,        5],
       [77257491,      318,        5],
       [77257493,      318,        5]])

source ids: 38488233 [       1        2        3 ... 77257484 77257491 77257493]
id indices: 38488234 [95133819 95133820 95133821 95133822 95133823 95133824 95133825 95133826
 95133827 95133828]


In [13]:
%%time
# One entry per source id: [source id, array of its targets, array of the
# matching weights]. Consecutive values of `indices` delimit each source's rows.
statements_groups = [
    [source_id, statements_np[start:stop, 1], statements_np[start:stop, 2]]
    for source_id, start, stop in zip(unique, indices[:-1], indices[1:])
]
len(statements_groups)

CPU times: user 1min 31s, sys: 3.22 s, total: 1min 34s
Wall time: 1min 34s


38488233

In [18]:
%%time
# One row per source item: its id, the array of its target ids and the
# position-aligned array of edge weights (built in statements_groups).
grouped_df = pd.DataFrame(statements_groups, columns=['source_item_id', 'target_item_ids', 'weights'])
grouped_df

CPU times: user 19.8 s, sys: 1.04 s, total: 20.8 s
Wall time: 20.7 s


Unnamed: 0,source_item_id,target_item_ids,weights
0,1,"[323, 3695190, 36906466, 273508, 837317, 12086...","[5, 5, 5, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, ..."
1,2,"[36133, 16502, 3504248, 323, 3400, 7879772, 18...","[5, 5, 5, 4, 4, 4, 4, 3, 1, 1, 1, 1, 1, 1, 1, ..."
2,3,"[483247, 203872, 937228, 1322005, 7239, 420, 1...","[5, 5, 5, 5, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,4,"[3, 2996394, 2956046, 3505845, 1931388, 149086...","[5, 5, 5, 5, 2, 1, 1, 1, 1, 1, 1, 1]"
4,5,"[215627, 15978631, 164509, 154954, 55983715, 1...","[6, 5, 5, 5, 5, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
38488228,77257472,[318],[5]
38488229,77257483,[318],[5]
38488230,77257484,[318],[5]
38488231,77257491,[318],[5]


In [24]:
def reduce_graph(df, n):
    """Return a copy of ``df`` whose per-row edge lists keep at most ``n`` entries.

    Args:
        df: frame with ``target_item_ids`` and ``weights`` columns, each
            holding one sliceable sequence per row.
        n: number of leading entries to keep from every sequence.

    Returns:
        The truncated copy with an extra ``length`` column holding the number
        of targets kept in each row. The columns of ``df`` are left as they were.

    Side effects:
        Prints the sum of the ``length`` column.
    """
    reduced = df.copy()
    for column in ('target_item_ids', 'weights'):
        reduced[column] = [sequence[:n] for sequence in reduced[column]]
    reduced['length'] = list(map(len, reduced['target_item_ids']))
    total_kept = np.sum(reduced['length'])
    print(f'Total items in graph: {total_kept}')
    return reduced

In [25]:
%%time
# Graph variant keeping at most 128 targets per source item.
statement_group_128 = reduce_graph(grouped_df, n=128)
statement_group_128

Total items in graph: 95048260
CPU times: user 46.3 s, sys: 2.67 s, total: 48.9 s
Wall time: 48.9 s


Unnamed: 0,source_item_id,target_item_ids,weights,length
0,1,"[323, 3695190, 36906466, 273508, 837317, 12086...","[5, 5, 5, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, ...",29
1,2,"[36133, 16502, 3504248, 323, 3400, 7879772, 18...","[5, 5, 5, 4, 4, 4, 4, 3, 1, 1, 1, 1, 1, 1, 1, ...",37
2,3,"[483247, 203872, 937228, 1322005, 7239, 420, 1...","[5, 5, 5, 5, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1]",14
3,4,"[3, 2996394, 2956046, 3505845, 1931388, 149086...","[5, 5, 5, 5, 2, 1, 1, 1, 1, 1, 1, 1]",12
4,5,"[215627, 15978631, 164509, 154954, 55983715, 1...","[6, 5, 5, 5, 5, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...",29
...,...,...,...,...
38488228,77257472,[318],[5],1
38488229,77257483,[318],[5],1
38488230,77257484,[318],[5],1
38488231,77257491,[318],[5],1


In [26]:
%%time
# Graph variant keeping at most 32 targets per source item.
statement_group_32 = reduce_graph(grouped_df, n=32)
statement_group_32

Total items in graph: 94644203
CPU times: user 46.6 s, sys: 2.88 s, total: 49.5 s
Wall time: 49.5 s


Unnamed: 0,source_item_id,target_item_ids,weights,length
0,1,"[323, 3695190, 36906466, 273508, 837317, 12086...","[5, 5, 5, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, ...",29
1,2,"[36133, 16502, 3504248, 323, 3400, 7879772, 18...","[5, 5, 5, 4, 4, 4, 4, 3, 1, 1, 1, 1, 1, 1, 1, ...",32
2,3,"[483247, 203872, 937228, 1322005, 7239, 420, 1...","[5, 5, 5, 5, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1]",14
3,4,"[3, 2996394, 2956046, 3505845, 1931388, 149086...","[5, 5, 5, 5, 2, 1, 1, 1, 1, 1, 1, 1]",12
4,5,"[215627, 15978631, 164509, 154954, 55983715, 1...","[6, 5, 5, 5, 5, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...",29
...,...,...,...,...
38488228,77257472,[318],[5],1
38488229,77257483,[318],[5],1
38488230,77257484,[318],[5],1
38488231,77257491,[318],[5],1


In [27]:
%%time
# Graph variant keeping at most 10 targets per source item.
statement_group_10 = reduce_graph(grouped_df, n=10)
statement_group_10

Total items in graph: 90802072
CPU times: user 45.8 s, sys: 3.23 s, total: 49.1 s
Wall time: 49 s


Unnamed: 0,source_item_id,target_item_ids,weights,length
0,1,"[323, 3695190, 36906466, 273508, 837317, 12086...","[5, 5, 5, 4, 4, 4, 4, 4, 4, 1]",10
1,2,"[36133, 16502, 3504248, 323, 3400, 7879772, 18...","[5, 5, 5, 4, 4, 4, 4, 3, 1, 1]",10
2,3,"[483247, 203872, 937228, 1322005, 7239, 420, 1...","[5, 5, 5, 5, 4, 1, 1, 1, 1, 1]",10
3,4,"[3, 2996394, 2956046, 3505845, 1931388, 149086...","[5, 5, 5, 5, 2, 1, 1, 1, 1, 1]",10
4,5,"[215627, 15978631, 164509, 154954, 55983715, 1...","[6, 5, 5, 5, 5, 4, 2, 1, 1, 1]",10
...,...,...,...,...
38488228,77257472,[318],[5],1
38488229,77257483,[318],[5],1
38488230,77257484,[318],[5],1
38488231,77257491,[318],[5],1


In [28]:
%%time
# Persist each reduced graph as a Feather file.
for frame, path in (
    (statement_group_128, 'data/graph_128.ftr'),
    (statement_group_32, 'data/graph_32.ftr'),
    (statement_group_10, 'data/graph_10.ftr'),
):
    frame.to_feather(path)

CPU times: user 28.4 s, sys: 7.1 s, total: 35.5 s
Wall time: 34.8 s


### Format graph to run through node2vec and generate embeddings

In [32]:
%%time
# Work on a separate copy so columns added afterwards do not show up on
# statement_group_128.
embed_group_df = statement_group_128.copy(deep=True)
embed_group_df

CPU times: user 701 ms, sys: 0 ns, total: 701 ms
Wall time: 700 ms


Unnamed: 0,source_item_id,target_item_ids,weights,length
0,1,"[323, 3695190, 36906466, 273508, 837317, 12086...","[5, 5, 5, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, ...",29
1,2,"[36133, 16502, 3504248, 323, 3400, 7879772, 18...","[5, 5, 5, 4, 4, 4, 4, 3, 1, 1, 1, 1, 1, 1, 1, ...",37
2,3,"[483247, 203872, 937228, 1322005, 7239, 420, 1...","[5, 5, 5, 5, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1]",14
3,4,"[3, 2996394, 2956046, 3505845, 1931388, 149086...","[5, 5, 5, 5, 2, 1, 1, 1, 1, 1, 1, 1]",12
4,5,"[215627, 15978631, 164509, 154954, 55983715, 1...","[6, 5, 5, 5, 5, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...",29
...,...,...,...,...
38488228,77257472,[318],[5],1
38488229,77257483,[318],[5],1
38488230,77257484,[318],[5],1
38488231,77257491,[318],[5],1


In [33]:
%%time
# For every row, repeat the source id once per kept target so that it lines up
# element-wise with target_item_ids and weights.
# Iterating a DataFrame directly yields its column labels rather than its rows,
# so unpacking `for i, j in embed_group_df[[...]]` fails. Zip the two columns to
# walk the (source id, length) pairs row by row.
embed_group_df['source_item_ids'] = [
    np.full(length, source_id)
    for source_id, length in zip(embed_group_df['source_item_id'], embed_group_df['length'])
]
embed_group_df

CPU times: user 2min 46s, sys: 1.79 s, total: 2min 48s
Wall time: 2min 48s


Unnamed: 0,source_item_id,target_item_ids,weights,length,source_item_ids
0,1,"[323, 3695190, 36906466, 273508, 837317, 12086...","[5, 5, 5, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, ...",29,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,2,"[36133, 16502, 3504248, 323, 3400, 7879772, 18...","[5, 5, 5, 4, 4, 4, 4, 3, 1, 1, 1, 1, 1, 1, 1, ...",37,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,3,"[483247, 203872, 937228, 1322005, 7239, 420, 1...","[5, 5, 5, 5, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1]",14,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"
3,4,"[3, 2996394, 2956046, 3505845, 1931388, 149086...","[5, 5, 5, 5, 2, 1, 1, 1, 1, 1, 1, 1]",12,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]"
4,5,"[215627, 15978631, 164509, 154954, 55983715, 1...","[6, 5, 5, 5, 5, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...",29,"[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
...,...,...,...,...,...
38488228,77257472,[318],[5],1,[77257472]
38488229,77257483,[318],[5],1,[77257483]
38488230,77257484,[318],[5],1,[77257484]
38488231,77257491,[318],[5],1,[77257491]


In [35]:
%%time
# Flatten the per-source arrays back into one edge per row. The three columns
# stay aligned because each is concatenated in the same row order.
statement_embed = pd.DataFrame({
    'source_item_id': np.concatenate(embed_group_df['source_item_ids'].to_numpy()),
    'target_item_id': np.concatenate(embed_group_df['target_item_ids'].to_numpy()),
    'weight': np.concatenate(embed_group_df['weights'].to_numpy()),
})
statement_embed

CPU times: user 25.9 s, sys: 1.07 s, total: 27 s
Wall time: 26.9 s


Unnamed: 0,source_item_id,target_item_id,weight
0,1,323,5
1,1,3695190,5
2,1,36906466,5
3,1,273508,4
4,1,837317,4
...,...,...,...
95048255,77257472,318,5
95048256,77257483,318,5
95048257,77257484,318,5
95048258,77257491,318,5


In [41]:
# Frequency table of edge weights: column 0 holds each distinct weight and
# column 1 the number of edges carrying it.
weight_counts = pd.DataFrame(
    np.unique(statement_embed['weight'].to_numpy(), return_counts=True)
).transpose()
weight_counts

Unnamed: 0,0,1
0,1,28437784
1,2,360994
2,3,17797542
3,4,17427287
4,5,28459196
...,...,...
56,132,1
57,168,1
58,204,1
59,252,1


In [37]:
# Summary statistics of the numeric columns of id_counts_df.
id_counts_df.describe()

Unnamed: 0,page_id,views,counts
count,5362174.0,5362174.0,5362174.0
mean,26913327.74195,455.49829,22.72128
std,18726305.66391,6783.22845,356.40711
min,12.0,5.0,0.0
25%,9634382.5,21.0,1.0
50%,25436919.5,53.0,3.0
75%,42601766.5,181.0,10.0
max,62473330.0,13300025.0,283127.0


In [38]:
# Light filter: items with a non-zero `counts` value and more than 5 views.
light_filter_ids = id_counts_df.loc[
    id_counts_df['counts'].gt(0) & id_counts_df['views'].gt(5)
].index
print(light_filter_ids)
# Heavy filter: items with more than 10 `counts` and more than 181 views.
heavy_filter_ids = id_counts_df.loc[
    id_counts_df['counts'].gt(10) & id_counts_df['views'].gt(181)
].index
print(heavy_filter_ids)

Int64Index([    6199,    38404,   101038,     9659,      173,    41746,
                  91,      868,   853997,   277751,
            ...
            56374723, 76892151, 15690957, 16696928, 76891681, 76891691,
            76894639, 76894633,  6034153, 21083961],
           dtype='int64', name='item_id', length=4475122)
Int64Index([    6199,    38404,   101038,     9659,      173,    41746,
                  91,      868,   853997,   277751,
            ...
            76149547,  1461197, 76373863, 76495389, 76431987,  1069320,
            76548457,  4339424,  9026959, 76874768],
           dtype='int64', name='item_id', length=847100)


In [39]:
%%time
# Light pruning: edges with weight > 5 are always kept, while edges with
# weight <= 5 survive only when both endpoints pass the light item filter.
# The always-kept edges come first in the result.
statement_embed_f1 = pd.concat([
    statement_embed.loc[statement_embed['weight'] > 5],
    statement_embed.loc[
        (statement_embed['weight'] <= 5)
        & statement_embed['source_item_id'].isin(light_filter_ids)
        & statement_embed['target_item_id'].isin(light_filter_ids)
    ],
])
statement_embed_f1

CPU times: user 20.3 s, sys: 2.23 s, total: 22.5 s
Wall time: 22.5 s


Unnamed: 0,source_item_id,target_item_id,weight
92,5,215627,6
146,16,49,7
274,17,1490,7
275,17,188712,6
434,20,46,7
...,...,...,...
94856958,76952747,2526255,4
94856959,76952747,3288991,3
94866531,76961040,482994,5
94868574,76963213,5,5


In [43]:
# Per-weight edge counts after the light filter.
# The counts are aligned on the weight value in column 0 rather than by
# position, so a weight that disappears entirely after filtering gets 0
# instead of causing a length mismatch.
weight_counts['f1'] = (
    weight_counts[0].map(statement_embed_f1['weight'].value_counts()).fillna(0).astype('int64')
)
weight_counts[:12]

Unnamed: 0,0,1,f1
0,1,28437784,4179724
1,2,360994,151514
2,3,17797542,4766966
3,4,17427287,6643346
4,5,28459196,4831888
5,6,271709,271709
6,7,527045,527045
7,8,371689,371689
8,9,44629,44629
9,10,1289173,1289173


In [40]:
%%time
# Heavy pruning on top of the light-filtered edges: edges with weight > 3 are
# always kept, while edges with weight <= 3 survive only when both endpoints
# pass the heavy item filter. The always-kept edges come first in the result.
statement_embed_f2 = pd.concat([
    statement_embed_f1.loc[statement_embed_f1['weight'] > 3],
    statement_embed_f1.loc[
        (statement_embed_f1['weight'] <= 3)
        & statement_embed_f1['source_item_id'].isin(heavy_filter_ids)
        & statement_embed_f1['target_item_id'].isin(heavy_filter_ids)
    ],
])
statement_embed_f2

CPU times: user 5.31 s, sys: 464 ms, total: 5.77 s
Wall time: 5.77 s


Unnamed: 0,source_item_id,target_item_id,weight
92,5,215627,6
146,16,49,7
274,17,1490,7
275,17,188712,6
434,20,46,7
...,...,...,...
94555191,76547561,145,3
94586587,76568609,30,1
94586588,76568609,1860,1
94688345,76719589,1484397,3


In [46]:
# Per-weight edge counts after the heavy filter.
# The counts are aligned on the weight value in column 0 rather than by
# position, so a weight that disappears entirely after filtering gets 0
# instead of causing a length mismatch.
weight_counts['f2'] = (
    weight_counts[0].map(statement_embed_f2['weight'].value_counts()).fillna(0).astype('int64')
)
display(weight_counts[:12])
# Column totals: the number of edges remaining at each filtering stage.
weight_counts.sum()

Unnamed: 0,0,1,f1,f2
0,1,28437784,4179724,1003749
1,2,360994,151514,46965
2,3,17797542,4766966,1018850
3,4,17427287,6643346,6643346
4,5,28459196,4831888,4831888
5,6,271709,271709,271709
6,7,527045,527045,527045
7,8,371689,371689,371689
8,9,44629,44629,44629
9,10,1289173,1289173,1289173


0         3277
1     95048260
f1    23138895
f2    16110255
dtype: int64

In [47]:
%%time
# Export both edge lists in the format used by node2vec: one edge per line,
# space-separated, with no header row and no index column.
for edges, path in (
    (statement_embed_f1, 'data/statements_embed_large.csv'),
    (statement_embed_f2, 'data/statements_embed_small.csv'),
):
    edges.to_csv(path, header=False, index=False, sep=' ')

CPU times: user 1min 8s, sys: 572 ms, total: 1min 9s
Wall time: 1min 9s
