In [1]:
import json
from collections import defaultdict
import pickle
import numpy as np
import pandas as pd
import networkx as nx
from scipy import sparse

In [245]:
# BlogCatalog data
# All four raw tables are read with header=None, so pandas labels the columns 0, 1, ...
nodes_raw, edges_raw, groups_raw, groups_edges_raw = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        './data/raw/blogcatalog/nodes.csv',
        './data/raw/blogcatalog/edges.csv',
        './data/raw/blogcatalog/groups.csv',
        './data/raw/blogcatalog/group-edges.csv',
    )
)

In [246]:
# Shift every id down by one (presumably the raw files use 1-based ids — confirm).
# `nodes` and `edges` become plain NumPy arrays; the group-membership table stays a DataFrame.
nodes = nodes_raw.values - 1
edges = edges_raw.values - 1
# NOTE(review): this rebinds groups_edges_raw to its own shifted copy, so running
# this cell twice subtracts 1 twice. Re-run the loading cell first if you re-run this one.
groups_edges_raw = groups_edges_raw - 1

In [247]:
# Multi-hot label matrix: one row per distinct value of column 0, one column per
# distinct value of column 1 (presumably node id and group id — confirm).
memberships_sorted = groups_edges_raw.sort_values(by=0)
membership_one_hot = pd.get_dummies(memberships_sorted, columns=[1])
labels_per_node = membership_one_hot.groupby([0], as_index=True).sum()
node_labels = labels_per_node.astype(int).values

In [248]:
# Undirected adjacency: record each edge under both of its endpoints.
adj_dict = defaultdict(set)
for edge in edges:
    src, dst = edge[0], edge[1]
    adj_dict[src].add(dst)
    adj_dict[dst].add(src)

In [7]:
def dict_to_matrix(dic):
    """Convert an adjacency dictionary into a sparse 0/1 adjacency matrix.

    The matrix is assembled directly from (row, column) coordinates, so no
    dense ``n x n`` array is ever allocated.

    Parameters
    ----------
    dic : mapping
        Maps an integer node id to an iterable (set, list, ...) of integer
        neighbour ids. Ids are used directly as row/column indices and must
        therefore be non-negative. A node may have an empty neighbour
        collection.

    Returns
    -------
    scipy.sparse.csr_matrix
        Square ``uint8`` matrix with ``m[i, j] == 1`` whenever ``j`` is listed
        under ``dic[i]``. The side length is ``len(dic)``, enlarged when
        necessary so that every key and every neighbour id fits.
    """
    row_chunks, col_chunks = [], []
    max_id = -1
    for key, neighbours in dic.items():
        max_id = max(max_id, key)
        # Explicit integer dtype: an empty collection would otherwise become a
        # float array, which cannot be used as an index.
        cols = np.fromiter(neighbours, dtype=np.int64)
        if cols.size == 0:
            continue
        row_chunks.append(np.full(cols.size, key, dtype=np.int64))
        col_chunks.append(cols)

    if col_chunks:
        rows = np.concatenate(row_chunks)
        cols = np.concatenate(col_chunks)
        max_id = max(max_id, int(cols.max()))
    else:
        rows = cols = np.zeros(0, dtype=np.int64)

    # Same size as before for contiguous ids 0..len(dic)-1; grows only when an
    # id would otherwise fall outside the matrix.
    n = int(max(len(dic), max_id + 1))
    data = np.ones(rows.size, dtype=np.uint8)
    adj_matrix = sparse.csr_matrix((data, (rows, cols)), shape=(n, n))
    # Repeated (row, col) pairs are summed during construction; clamp back to 1
    # so the result stays a pure 0/1 matrix even if a neighbour list has duplicates.
    adj_matrix.data[:] = 1
    return adj_matrix

In [249]:
# Sparse adjacency matrix for BlogCatalog, built from the adjacency sets above.
adj_matrix = dict_to_matrix(adj_dict)

In [250]:
# Persist the BlogCatalog graph in both forms: the pickled adjacency dict and
# the sparse adjacency matrix.
with open('./data/preprocessed/blogcatalog/adj_dict.pkl', 'wb') as dict_file:
    pickle.dump(adj_dict, dict_file)
sparse.save_npz('./data/preprocessed/blogcatalog/adj_matrix.npz', adj_matrix)

In [251]:
# Persist the BlogCatalog label matrix as a pickle.
with open('./data/preprocessed/blogcatalog/node_labels.pkl', 'wb') as labels_file:
    pickle.dump(node_labels, labels_file)

In [122]:
# facebook
# Every line of the combined edge list is split on whitespace and parsed as integers.
with open('./data/raw/facebook/facebook_combined.txt', 'r') as f:
    lines = f.readlines()
edges = np.array([[int(tok) for tok in line.strip().split()] for line in lines])

In [123]:
# Facebook: symmetric adjacency sets, then the sparse matrix built from them.
adj_dict = defaultdict(set)
for pair in edges:
    a, b = pair[0], pair[1]
    adj_dict[a].add(b)
    adj_dict[b].add(a)

adj_matrix = dict_to_matrix(adj_dict)

In [127]:
# Undirected edges per node: every edge was stored in both directions when the
# adjacency sets were built, so the total of the matrix is halved before
# dividing by the node count.
np.sum(adj_matrix) / 2 / adj_matrix.shape[0]

21.84550631344392

In [115]:
# Persist the Facebook graph: pickled adjacency dict plus sparse adjacency matrix.
with open('./data/preprocessed/facebook/adj_dict.pkl', 'wb') as dict_file:
    pickle.dump(adj_dict, dict_file)
sparse.save_npz('./data/preprocessed/facebook/adj_matrix.npz', adj_matrix)

In [252]:
# pubmed
def _load_pickle(path):
    """Unpickle ``path`` with latin1 decoding and close the file afterwards."""
    with open(path, 'rb') as f:
        return pickle.load(f, encoding='latin1')


# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
# allx/tx are used as sparse feature blocks and ally/ty as label blocks further down.
ind_allx = _load_pickle('./data/raw/pubmed/ind.pubmed.allx')
ind_ally = _load_pickle('./data/raw/pubmed/ind.pubmed.ally')
ind_tx = _load_pickle('./data/raw/pubmed/ind.pubmed.tx')
ind_ty = _load_pickle('./data/raw/pubmed/ind.pubmed.ty')
ind_x = _load_pickle('./data/raw/pubmed/ind.pubmed.x')
ind_y = _load_pickle('./data/raw/pubmed/ind.pubmed.y')
ind_graph = _load_pickle('./data/raw/pubmed/ind.pubmed.graph')  # defaultdict(list) - adj_dict

In [253]:
# Indices of the PubMed test nodes: one integer per line of the index file.
with open('./data/raw/pubmed/ind.pubmed.test.index', 'r') as f:
    lines = f.readlines()
test_inds = np.array([int(line) for line in lines])

In [254]:
# Copy the PubMed graph into defaultdict(set) form, turning every neighbour
# collection into a set.
adj_dict = defaultdict(
    set,
    ((node, set(neighbours)) for node, neighbours in ind_graph.items()),
)

In [255]:
# Sparse adjacency matrix for PubMed, built from the adjacency sets above.
adj_matrix = dict_to_matrix(adj_dict)

In [256]:
# Persist the PubMed graph: pickled adjacency dict plus sparse adjacency matrix.
with open('./data/preprocessed/pubmed/adj_dict.pkl', 'wb') as dict_file:
    pickle.dump(adj_dict, dict_file)
sparse.save_npz('./data/preprocessed/pubmed/adj_matrix.npz', adj_matrix)

In [257]:
# Training rows are all node indices that do not appear in test_inds.
# np.setdiff1d returns them in ascending order, as the previous list
# comprehension did, without re-scanning test_inds once per node.
train_inds = np.setdiff1d(np.arange(adj_matrix.shape[0]), test_inds)

In [258]:
# Assemble one float32 feature row per graph node: the allx block fills the
# training rows and the tx block fills the test rows. The dense buffer is then
# converted to CSR.
dense_features = np.zeros(
    shape=(adj_matrix.shape[0], ind_allx.shape[1]),
    dtype=np.float32,
)
for row_ids, feature_block in ((train_inds, ind_allx), (test_inds, ind_tx)):
    dense_features[row_ids] = feature_block.toarray()
node_features = sparse.csr_matrix(dense_features)

In [259]:
# One integer label row per graph node: the ally block fills the training rows
# and the ty block fills the test rows.
node_labels = np.zeros(
    shape=(adj_matrix.shape[0], ind_ally.shape[1]),
    dtype=int,
)
for row_ids, label_block in ((train_inds, ind_ally), (test_inds, ind_ty)):
    node_labels[row_ids] = label_block

In [260]:
# Persist the PubMed node data: pickled label matrix plus sparse feature matrix.
with open('./data/preprocessed/pubmed/node_labels.pkl', 'wb') as labels_file:
    pickle.dump(node_labels, labels_file)
sparse.save_npz('./data/preprocessed/pubmed/node_features.npz', node_features)

In [349]:
# ppi
# Each JSON file is read inside a `with` block so its handle is closed right away.
with open('./data/raw/ppi/ppi-class_map.json', 'r') as f:
    class_map = json.load(f)
with open('./data/raw/ppi/ppi-id_map.json', 'r') as f:
    id_map = json.load(f)
node_features = np.load('./data/raw/ppi/ppi-feats.npy')
with open('./data/raw/ppi/ppi-G.json', 'r') as f:
    G = json.load(f)

In [350]:
# Rebuild a networkx graph from the node-link JSON structure loaded into G.
graph = nx.node_link_graph(G)

In [351]:
# PPI: store every graph edge under both of its endpoints.
adj_dict = defaultdict(set)
for link in graph.edges:
    u, v = link[0], link[1]
    adj_dict[u].add(v)
    adj_dict[v].add(u)

In [352]:
# Sparse adjacency matrix for PPI, built from the adjacency sets above.
# (Alternative left disabled by the author: adj_matrix = nx.adjacency_matrix(graph))
adj_matrix = dict_to_matrix(adj_dict)

In [353]:
# Label matrix for PPI: one row per id_map entry, and as many columns as the
# label vector stored under class_map['0'].
# NOTE(review): rows are addressed by the id_map *keys* parsed as ints; the
# id_map values are never consulted — confirm the keys equal the row indices.
nodes = np.array(sorted(int(key) for key in id_map))
node_labels = np.zeros((len(nodes), len(class_map['0'])), dtype=int)
for node_id in nodes:
    label_vector = np.array(class_map[str(node_id)])
    node_labels[node_id, :] = label_vector.astype(int)

In [354]:
# Convert the PPI feature array into CSR form so it can be written with save_npz.
# NOTE: this rebinds node_features from the loaded array to the sparse matrix.
node_features = sparse.csr_matrix(node_features)

In [355]:
# Persist the PPI outputs. The two pickles (adjacency dict, label matrix) are
# written first, then the two sparse matrices (adjacency, features).
with open('./data/preprocessed/ppi/adj_dict.pkl', 'wb') as dict_file:
    pickle.dump(adj_dict, dict_file)
with open('./data/preprocessed/ppi/node_labels.pkl', 'wb') as labels_file:
    pickle.dump(node_labels, labels_file)

sparse.save_npz('./data/preprocessed/ppi/adj_matrix.npz', adj_matrix)
sparse.save_npz('./data/preprocessed/ppi/node_features.npz', node_features)

In [3]:
# reddit
# Each JSON file is read inside a `with` block so its handle is closed right away.
with open('./data/raw/reddit/reddit-class_map.json', 'r') as f:
    class_map = json.load(f)
with open('./data/raw/reddit/reddit-id_map.json', 'r') as f:
    id_map = json.load(f)
node_features = np.load('./data/raw/reddit/reddit-feats.npy')
with open('./data/raw/reddit/reddit-G_full.json', 'r') as f:
    G = json.load(f)

In [4]:
# Rebuild a networkx graph from the node-link JSON structure loaded into G.
graph = nx.node_link_graph(G)

In [5]:
# Reddit: store every graph edge under both of its endpoints.
adj_dict = defaultdict(set)
for link in graph.edges:
    u, v = link[0], link[1]
    adj_dict[u].add(v)
    adj_dict[v].add(u)

In [8]:
# Sparse adjacency matrix for Reddit, built from the adjacency sets above.
adj_matrix = dict_to_matrix(adj_dict)

In [25]:
# One-hot label matrix for Reddit.
# id_map values are used as row indices and class_map values as column indices.
node_ids = sorted(id_map.items(), key=lambda x: x[1])
# Number of label columns = largest class id + 1. The maximum must be taken
# over the dict *values*; taking max() of a list that wraps the items view
# returns the view itself, and adding 1 to it raises a TypeError.
class_size = max(class_map.values()) + 1
node_labels = np.zeros([len(node_ids), class_size], dtype=np.uint8)
for raw_id, row in node_ids:
    node_labels[row, class_map[raw_id]] = 1

In [14]:
# Convert the Reddit feature array into CSR form so it can be written with save_npz.
# NOTE: this rebinds node_features from the loaded array to the sparse matrix.
node_features = sparse.csr_matrix(node_features)

In [32]:
# Persist the Reddit outputs. The two pickles (adjacency dict, label matrix) are
# written first, then the two sparse matrices (adjacency, features).
with open('./data/preprocessed/reddit/adj_dict.pkl', 'wb') as dict_file:
    pickle.dump(adj_dict, dict_file)
with open('./data/preprocessed/reddit/node_labels.pkl', 'wb') as labels_file:
    pickle.dump(node_labels, labels_file)

sparse.save_npz('./data/preprocessed/reddit/adj_matrix.npz', adj_matrix)
sparse.save_npz('./data/preprocessed/reddit/node_features.npz', node_features)