In [46]:
import numpy as np
import pandas as pd
import networkx as nx
import os
import pickle

import datetime
import tqdm

In [47]:
p = os.path.abspath('..')
DATASET = 'COLLAB'
## Possible LIST : DD, ENZYMES, PROTEINS / REDDIT-MULTI-12K, COLLAB (Exhaustive tasks)
#DATASET = 'REDDIT-MULTI-12K'
DATA_DIR = p + '/data/' + DATASET

HEADER = None
NODE_LAB = True
NODE_ATT = True

This folder contains the following comma separated text files 
(replace DS by the name of the dataset):

$n$ = total number of nodes

$m$ = total number of edges

$N$ = number of graphs

(1) DS_A.txt (m lines) 
	
    sparse (block diagonal) adjacency matrix for all graphs,
	
    each line corresponds to (row, col) resp. (node_id, node_id)

(2) DS_graph_indicator.txt (n lines)
	
    column vector of graph identifiers for all nodes of all graphs,
	
    the value in the i-th line is the graph_id of the node with node_id i

(3) DS_graph_labels.txt (N lines) 
	
    class labels for all graphs in the dataset,
	
    the value in the i-th line is the class label of the graph with graph_id i

(4) DS_node_labels.txt (n lines)

	column vector of node labels,

    the value in the i-th line corresponds to the node with node_id i

In [48]:
ds_A = pd.read_csv(DATA_DIR + '/' + DATASET + '_A.txt', header = HEADER)
ds_gInd = pd.read_csv(DATA_DIR + '/' + DATASET + '_graph_indicator.txt', header = HEADER)
ds_gLab = pd.read_csv(DATA_DIR + '/' + DATASET + '_graph_labels.txt', header = HEADER)

In [49]:
is_class_indexed_one = (min(ds_gLab[0]) > 0)
is_graph_indexed_one = (min(ds_gInd[0]) > 0)
is_node_indexed_one = (min(ds_gInd[0]) > 0)

if(is_class_indexed_one):
    ds_gLab = ds_gLab - 1
if(is_graph_indexed_one):
    ds_gInd = ds_gInd - 1
if(is_node_indexed_one):
    ds_A = ds_A - 1

In the case of index starting from 1, we have to adjust them

In [50]:
try:
    ds_nLab = pd.read_csv(DATA_DIR + '/' + DATASET + '_node_labels.txt', header = HEADER)
except IOError:
    NODE_LAB = False
    print("No node labels")
    
try:
    ds_nAtt = pd.read_csv(DATA_DIR + '/' + DATASET + '_node_attributes.txt', header = HEADER)
except IOError:
    NODE_ATT = False
    print("No node attributes")

No node labels
No node attributes


In [51]:
n_classes = len(ds_gLab[0].unique())
n_graphs = len(ds_gLab)

In [52]:
## This is too exhaustive when the total number of edges |E| > 10M
#print(datetime.datetime.now())
#edge_to_graph = ds_A[0].apply(lambda x : ds_gInd[0][x])
#y_label = edge_to_graph.apply(lambda x : ds_gLab[0][x])
#ds_A = ds_A.assign(gL = edge_to_graph)
#ds_A = ds_A.assign(y = y_label)
#print(datetime.datetime.now())

In [53]:
graph_index_list = ds_gLab.index.values
graph_list = []

#for i in graph_index_list:
for i in tqdm.tqdm(graph_index_list):
    corresp_node_list = ds_gInd[ds_gInd[0] == i].index.values
    corresp_edge_list = ds_A[ds_A[0].isin(corresp_node_list)]
    edgelist = corresp_edge_list.apply(lambda x : (x[0],x[1]), axis = 1).tolist()
    ## Graph Attributes -> Graph Label
    graph_list.append(nx.Graph(edgelist, y = ds_gLab[0][i]))

100%|██████████| 5000/5000 [3:34:00<00:00,  2.57s/it]  


In [54]:
## Node Label -> OneHot vector? RexYing DiffPool chamzohagi
print(datetime.datetime.now()) 
if(NODE_LAB):
    for g in graph_list:
        for pt in g.nodes():
            g.node[pt]['label'] = ds_nLab[0][pt]
print(datetime.datetime.now())
## Node Attributes -> Node Attributes
if(NODE_ATT):
    for g in graph_list:
        for pt in g.nodes:
            g.node[pt]['attributes'] = ds_nAtt[0][pt]
print(datetime.datetime.now())

2018-10-18 20:49:50.601323
False
2018-10-18 20:49:50.601445
False
2018-10-18 20:49:50.601533


In [55]:
if not os.path.exists(p + '/pkls'):
    os.makedirs(p + '/pkls')
with open(p + '/pkls/' + DATASET + '_graphlist.pkl', 'wb') as f:
    pickle.dump(graph_list,f)
f.close()