In [1]:
import json
import numpy as np
import torch
import pandas as pd
import pickle

In [2]:
import time
import graph_utils
from utils import get_metrics_dict
from graph_utils import build_graph, get_train_val_test_masks

In [3]:
with open('./nodes_new.json', 'r') as fh:
    dataset_nodes = json.load(fh)

In [4]:
with open('./edges.json', 'r') as fh:
    dataset_edges = json.load(fh)

### Summary statistics

In [5]:
print("number of nodes:", len(dataset_nodes.keys()))

number of nodes: 101244


In [6]:
print("number of nodes in relation set:", len(dataset_edges.keys()))

number of nodes in relation set: 100337


In [7]:
#join nodes with edges to get all the edge information 
#flatten node dataframe
df_node = pd.DataFrame.from_dict(dataset_nodes.items())
df_node.columns = ['node_id_str','features']
df_node = pd.concat([df_node, df_node['features'].apply(pd.Series)], axis = 1).drop('features', axis = 1)

In [8]:
df_node.head()

Unnamed: 0,node_id_str,gl,ms,se,ims
0,bn:00073045n,[A portable heating appliance used to warm an ...,WN:EN:space_heater,"[Space_heating, Spaceheater, space_heater]","[b3b7f9023408b19fec70e883030fc730181dc67c, 33e..."
1,bn:02412128n,[A tournament-style downhill speed skiing comp...,WIKI:EN:Ski_cross,"[Skiercross, skiercross, Skier-X, Skicross, Sk...","[da045aff0cc2a22d5f34df67e3ee066c906ae1d5, 494..."
2,bn:15049822n,"[The currency of Bermuda., Currency, The Bermu...",WIKI:EN:Bermudian_dollar,"[Bermudian_Dollar, Bermuda_dollar, Bermudian_d...","[6f9080eb62d0eb21ab96d0d9e5527da626149868, 3b1..."
3,bn:03603146n,"[Grenade launcher, The HK69A1 is a 40 mm grena...",WIKI:EN:Heckler_&_Koch_HK69A1,"[HK_79, Heckler_&_Koch_HK79A1, Heckler_&_Koch_...","[2a7f31a15c0f90e994711afe1fbd2b17905c30dc, d76..."
4,bn:03725374n,[The Jerusalem District is one of six administ...,GEONM:EN:Jerusalem_District,"[Jerusalem, Al_Quds_District, Jerusalem_Distri...","[28df9a413a48cd25f53ad773c7605f995c09b1bc, c99..."


In [9]:
df_node.se.isnull().values.any()

False

In [10]:
df_node.gl.isnull().values.any()

False

In [11]:
#flatten edge dataframe
from pandas.io.json import json_normalize
df_edges = pd.DataFrame.from_dict(dataset_edges.items())
df_edges.columns = ['node_id_str','features']

In [12]:
df_edges.head()

Unnamed: 0,node_id_str,features
0,bn:00046576n,"[{'s': 'bn:03760212n', 'r': 'RelatedTo', 'r_id..."
1,bn:00033022n,"[{'s': 'bn:00023394n', 'r': 'RelatedTo', 'r_id..."
2,bn:17006187n,"[{'s': 'bn:03547708n', 'r': 'RelatedTo', 'r_id..."
3,bn:00047705n,"[{'s': 'bn:00066209n', 'r': 'RelatedTo', 'r_id..."
4,bn:00009459n,"[{'s': 'bn:03722869n', 'r': 'RelatedTo', 'r_id..."


In [13]:
#get flatten edge df
df_list = []
for index, row in df_edges.iterrows():
    df_raw = json_normalize(row['features'])
    df_raw['node_id_str'] = row['node_id_str']
    df_list.append(df_raw)
df_edges_all = pd.concat(df_list, ignore_index=True)

In [14]:
df_edges_all.head()

Unnamed: 0,s,r,r_id,node_id_str
0,bn:03760212n,RelatedTo,133,bn:00046576n
1,bn:03237118n,RelatedTo,209,bn:00046576n
2,bn:02057488n,RelatedTo,352,bn:00046576n
3,bn:00013153n,RelatedTo,146,bn:00046576n
4,bn:00076121n,RelatedTo,324,bn:00046576n


In [15]:
df_edges_all.shape

(1901531, 4)

In [16]:
#Get list of unique nodes ID (both source and destination)
first_list = df_edges_all.node_id_str.unique().tolist()
second_list = df_edges_all.s.unique().tolist()
unique_nodes = first_list + list(set(second_list) - set(first_list))

In [17]:
len(first_list), len(second_list), len(unique_nodes)

(100337, 67900, 101244)

In [18]:
#create dictionary look up for string to numeric ID
node_num_dict = {node:i for i, node in enumerate(unique_nodes)}

In [28]:
len(node_num_dict.keys())

101244

In [19]:
#create dictionary look up for edgetype
edge_type_num_dict = {edge:i for i, edge in enumerate(df_edges_all.r.unique())}

In [20]:
#create dictionary look up for edge_id
edge_id_dict = {edge:i for i, edge in enumerate(df_edges_all.r_id.unique())}

In [26]:
#save ref dict
save_dict = {'node_num':node_num_dict,
            'edge_type_num':edge_type_num_dict,
            'edge_id':edge_id_dict }

In [27]:
pickle.dump(save_dict, open( "ref_dict.pkl", "wb" ) )

In [19]:
#helper functions
def node_id_num(row):
    return node_num_dict.get(row.node_id_str)

def to_nodes_list(row):
    return [node_num_dict.get(item['s']) for item in row.features]  

def edge_types_list(row):
    return [edge_type_num_dict.get(item['r']) for item in row.features]

def edge_ids_list(row):
    return [item['r_id'] for item in row.features]

In [20]:
#strip features dict into: to_nodes; edge_type; edge_id
df_edges['node_id'] = df_edges.apply(lambda row: node_id_num(row),axis=1)
df_edges['to_nodes'] = df_edges.apply (lambda row: to_nodes_list(row), axis=1)
df_edges['edge_types'] = df_edges.apply (lambda row: edge_types_list(row),axis=1)
df_edges['edge_id'] = df_edges.apply (lambda row: edge_ids_list(row),axis=1)

In [21]:
#sort by node_id column, so later adding graph node features will have the same ordering
df_edges = df_edges.sort_values(by=['node_id'])

In [22]:
df_edges.head()

Unnamed: 0,node_id_str,features,node_id,to_nodes,edge_types,edge_id
0,bn:00046576n,"[{'s': 'bn:03760212n', 'r': 'RelatedTo', 'r_id...",0,"[69282, 20318, 82276, 10181, 10825, 15633, 380...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[133, 209, 352, 146, 324, 325, 347, 251, 235, ..."
1,bn:00033022n,"[{'s': 'bn:00023394n', 'r': 'RelatedTo', 'r_id...",1,"[31253, 87835, 16212, 69742, 16219, 37192, 362...","[0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, ...","[611, 601, 604, 627, 630, 632, 599, 622, 615, ..."
2,bn:17006187n,"[{'s': 'bn:03547708n', 'r': 'RelatedTo', 'r_id...",2,"[30114, 81120, 37191, 98401, 29384, 33029, 342...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1791, 1870, 1824, 1832, 1834, 1899, 1885, 179..."
3,bn:00047705n,"[{'s': 'bn:00066209n', 'r': 'RelatedTo', 'r_id...",3,"[17387, 59853, 9229, 5687, 20471, 62161, 86316...","[0, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[2832, 2565, 3405, 1910, 2769, 3710, 4466, 263..."
4,bn:00009459n,"[{'s': 'bn:03722869n', 'r': 'RelatedTo', 'r_id...",4,"[68051, 74827, 85951, 15512, 100321, 66043, 12...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4713, 4659, 4746, 4712, 4591, 4599, 4852, 466..."


In [23]:
df_edges.shape

(100337, 6)

In [24]:
#assign same numeric id to node_df
df_node['node_id'] = df_node.apply(lambda row: node_id_num(row),axis=1)

In [25]:
#join edge_df with node_df
join_df = pd.merge(df_edges,df_node, how='left', on='node_id')

In [4]:
join_df.shape

(100337, 11)

In [31]:
edge_list = join_df.edge_types.tolist()
sum([len(item) for item in edge_list])

1901531

In [32]:
edge_feat = np.array([elem for item in edge_list for elem in item])

In [41]:
print((np.sort(np.bincount(edge_feat)),))

[      9      10      34     165     635     677    1441   15718   23535
   66935  111276  115260 1565836]


In [35]:
join_df.head()

Unnamed: 0,node_id_str_x,features,node_id,to_nodes,edge_types,edge_id,node_id_str_y,gl,ms,se,ims
0,bn:00046576n,"[{'s': 'bn:03760212n', 'r': 'RelatedTo', 'r_id...",0,"[69282, 20318, 82276, 10181, 10825, 15633, 380...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[133, 209, 352, 146, 324, 325, 347, 251, 235, ...",bn:00046576n,[Industry is the production of goods or relate...,WN:EN:industry,"[Industrial_activity, Industries, IndustrY, Su...","[2a350e80122cb5cbeaf3830d915623edcbfee1e1, 4d5..."
1,bn:00033022n,"[{'s': 'bn:00023394n', 'r': 'RelatedTo', 'r_id...",1,"[31253, 87835, 16212, 69742, 16219, 37192, 362...","[0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, ...","[611, 601, 604, 627, 630, 632, 599, 622, 615, ...",bn:00033022n,"[Family of fishes, Any member of the Lamnidae....",WN:EN:family_Isuridae,"[lamnid, white_sharks, mackerel_sharks, Isurid...","[63fe1fd039a8ccfe02f95cc0a7cb680deb891426, f85..."
2,bn:17006187n,"[{'s': 'bn:03547708n', 'r': 'RelatedTo', 'r_id...",2,"[30114, 81120, 37191, 98401, 29384, 33029, 342...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1791, 1870, 1824, 1832, 1834, 1899, 1885, 179...",bn:17006187n,[A near-threatened species is a species which ...,WIKIDATA:EN:LR/nt,"[near_threatened, Near-threatened, NT, Near_Th...","[b24e142a50e1b68dc888c0cb8fe5df853e7a1356, 68f..."
3,bn:00047705n,"[{'s': 'bn:00066209n', 'r': 'RelatedTo', 'r_id...",3,"[17387, 59853, 9229, 5687, 20471, 62161, 86316...","[0, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[2832, 2565, 3405, 1910, 2769, 3710, 4466, 263...",bn:00047705n,[Italy is a unitary parliamentary republic in ...,WN:EN:Italy,"[IT, Austrian_Empire_(Italy), The_Italian_repu...","[60c41e2fc8a1a185c7842a05ffc1212da712be09, 953..."
4,bn:00009459n,"[{'s': 'bn:03722869n', 'r': 'RelatedTo', 'r_id...",4,"[68051, 74827, 85951, 15512, 100321, 66043, 12...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4713, 4659, 4746, 4712, 4591, 4599, 4852, 466...",bn:00009459n,[A northwestern state of the United States of ...,WN:EN:Oregon,"[Oregón, Oregon_(State), Oregon_(U.S._state), ...","[80143c1ccf29d5fc2480b544d2a71d4f2ef668ff, 125..."


In [33]:
#save result
join_df.to_pickle("./join_df.pkl")

### DGL: create graph

In [3]:
join_df = pickle.load(open( "./join_df.pkl", "rb" ) )

In [13]:
join_df.gl.isnull().values.any()

False

In [35]:
%%time
#node_num_dict: 101244
G = build_graph(join_df, num_nodes=101244,directed=True,edge_feat=False)

CPU times: user 19.6 s, sys: 160 ms, total: 19.7 s
Wall time: 19.7 s


In [4]:
%%time
#node_num_dict: 101244
G = build_graph(join_df, num_nodes=101244,directed=True,edge_feat=True)

1901531
1901531
adding edge features
CPU times: user 20.3 s, sys: 498 ms, total: 20.8 s
Wall time: 20.8 s


In [6]:
print(G)

DGLGraph(num_nodes=101244, num_edges=1901531,
         ndata_schemes={}
         edata_schemes={'type': Scheme(shape=(1,), dtype=torch.int64)})


In [8]:
G.edges[3, 9229].data['type']

tensor([[6],
        [2],
        [3]])

In [29]:
# graph statistics:
# IN-DEGREE:
G_ind = G.in_degrees().float()
print("In-degree Median/Max/Min:", G_ind.mean(), G_ind.max(), G_ind.min())
# OUT-DEGREE:
G_out = G.out_degrees().float()
print("Out-degree Median/Max/Min:", G_out.median(), G_out.max(), G_out.min())

In-degree Median/Max/Min: tensor(18.7817) tensor(122.) tensor(0.)
Out-degree Median/Max/Min: tensor(6.) tensor(8643.) tensor(0.)


In [6]:
#Add edge/node features
#G.ndata['features'] = join_df.node_embedding

### R-GCN