In [1]:
import pathlib
import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd
import networkx as nx
import utils.preprocess
from sklearn.model_selection import train_test_split

# Directory that receives every processed artifact written by the "save" section.
save_prefix = r'./data/IMDB_processed/'
# Directory holding the raw IMDB pickles. The value previously mixed '/' with a
# Windows-only '\' separator, which is not a valid path on POSIX systems; forward
# slashes work on every platform. The (misspelled) name is kept as-is so that any
# code referring to it keeps working.
read_perfix = r'./data/IMDB'
# Number of node types in the graph (movies, directors, actors).
num_ntypes = 3

## load
This IMDB dataset comes from the GTN (Graph Transformer Networks) repository: https://github.com/seongjunyun/Graph_Transformer_Networks

In [6]:
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code while loading; only use
    # this on files obtained from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One adjacency matrix per edge type (printed below).
edges = _load_pickle(r'./data/IMDB/edges.pkl')
print(edges)

# The label file holds several groups of labelled entries; report the size of each
# group, then stack all groups into a single array.
data = _load_pickle(r'./data/IMDB/labels.pkl')
label_groups = [data[k] for k in range(len(data))]
print([len(group) for group in label_groups])
labels = np.concatenate([np.array(group) for group in label_groups], axis = 0)

# Node feature matrix; only its shape is displayed.
features = _load_pickle(r'./data/IMDB/node_features.pkl')
print(features.shape)

[<12772x12772 sparse matrix of type '<class 'numpy.float64'>'
	with 4661 stored elements in Compressed Sparse Row format>, <12772x12772 sparse matrix of type '<class 'numpy.float64'>'
	with 4661 stored elements in Compressed Sparse Row format>, <12772x12772 sparse matrix of type '<class 'numpy.float64'>'
	with 13983 stored elements in Compressed Sparse Row format>, <12772x12772 sparse matrix of type '<class 'numpy.float64'>'
	with 13983 stored elements in Compressed Sparse Row format>]
[300, 300, 2339]
(12772, 1256)


In [12]:
# List the non-zero entries of edge type 0 as (row, col) index pairs, one pair per row.
edge_1 = np.transpose(np.vstack(edges[0].nonzero()))
edge_1

array([[   0, 4661],
       [   1, 4662],
       [   2, 4663],
       ...,
       [4658, 6357],
       [4659, 6930],
       [4660, 6492]])

In [13]:
# Count the distinct row (source) node ids that have at least one non-zero entry in
# edge type 1. This count is the size raw_dims uses for the director node type.
edge1_rows = edges[1].nonzero()[0]
np.unique(edge1_rows).shape

(2270,)

In [14]:
# Count the distinct row (source) node ids that have at least one non-zero entry in
# edge type 3. This count is the size raw_dims uses for the actor node type.
edge3_rows = edges[3].nonzero()[0]
np.unique(edge3_rows).shape

(5841,)

## type_mask

In [27]:
# Number of nodes of each type, listed in type-id order:
# 0 for movies 4661, 1 for directors 2270, 2 for actors 5841
raw_dims = [4661, 2270, 5841]
dim = sum(raw_dims)
print(raw_dims)
print(dim)

# Exclusive prefix sums of raw_dims: prefix_operator[i] is the first global node
# index of type i and prefix_operator[i+1] is one past its last index.
prefix_operator = np.concatenate(([0], np.cumsum(raw_dims))).astype(int)
print(prefix_operator)

# type_mask[n] is the type id of global node n: each type id is repeated once per
# node of that type, so every contiguous index range carries its own id.
type_mask = np.repeat(np.arange(num_ntypes), raw_dims)

[4661, 2270, 5841]
12772
[    0  4661  6931 12772]


## adjM

In [28]:
# Add the four per-edge-type matrices into one adjacency matrix over the shared
# node index space, then convert it to a dense ndarray.
# NOTE(review): the dense result is 12772 x 12772; at float64 that is roughly 1.3 GB
# of memory. `.toarray()` is the explicit spelling of the `.A` shorthand.
adj = sum(edges[1:4], edges[0])
adjM = adj.toarray()
adjM.shape

(12772, 12772)

## schema

In [29]:
# Named node-type sequence handed to utils.preprocess. With the type ids assigned in
# type_mask (0 = movie, 1 = director, 2 = actor) the 'stem' entry lists
# director, movie, actor.
# NOTE(review): how the helper interprets this sequence is defined in
# utils.preprocess, which is not shown here - confirm there.
schema = {'stem': [1, 0, 2]}

In [None]:
import time

In [None]:
# Build the schema subgraph table and report how long it takes (wall-clock seconds).
t = time.time()
# NOTE(review): both helpers live in utils.preprocess, which is not visible here.
# Presumably the first enumerates schema instances from the dense adjacency matrix
# and the second assembles them into a table - confirm against that module.
link_intances = utils.preprocess.get_intances(adjM, type_mask, schema, prefix_operator)
subgraphs = utils.preprocess.get_schema_subgraphs_parallel(schema, link_intances)
print(time.time()-t)
# Reorder the columns of the result into sorted column-label order.
subgraphs = subgraphs[subgraphs.columns.sort_values()]
print(subgraphs)

## save

### split

In [36]:
# subgraphs train/validation/test splits
#
# Nodes are split first, then every subgraph is assigned by the nodes it contains:
#   * test  - the subgraph contains at least one test node;
#   * val   - otherwise, it contains at least one validation node;
#   * train - everything that is left.
# Because a single held-out node claims every subgraph it appears in, the node-level
# fractions (0.113 and 0.051) are much smaller than the subgraph-level fractions
# printed below (about 30% / 10% / 60%).
rand_seed = 33333333
train_val_idx, test_idx = train_test_split(np.arange(len(adjM)), test_size=0.113, random_state=rand_seed)

# Row mask: True where any node id in the subgraph row is a test node.
touches_test = np.isin(subgraphs, test_idx).any(axis=1)
subgraphs_test = subgraphs[touches_test]
subgraphs_tr_val = subgraphs[~touches_test]
print(subgraphs_test.shape[0]/len(subgraphs)) # 30% for test

train_idx, val_idx = train_test_split(train_val_idx, test_size=0.051, random_state=rand_seed)

# Same rule applied to the remaining rows for the validation nodes.
touches_val = np.isin(subgraphs_tr_val, val_idx).any(axis=1)
subgraphs_val = subgraphs_tr_val[touches_val]
subgraphs_train = subgraphs_tr_val[~touches_val]
print(subgraphs_val.shape[0]/len(subgraphs)) # 10% for val
print(len(subgraphs_train)/len(subgraphs)) # 60% for train

# The three parts must partition the subgraph table exactly.
assert len(subgraphs_train) + len(subgraphs_val) + len(subgraphs_test) == len(subgraphs)

# save data

# Make sure the output directory exists; otherwise the first write below fails with
# FileNotFoundError on a fresh checkout.
pathlib.Path(save_prefix).mkdir(parents=True, exist_ok=True)

# save schema
# schema is a dict, so np.save stores it as a pickled object array; it has to be
# read back with np.load(..., allow_pickle=True).
np.save(save_prefix + 'schema.npy', schema) # schema
# NOTE(review): `schema_pairs` is not defined anywhere in this notebook, so on a
# fresh kernel the unconditional save raised NameError and none of the artifacts
# below were written. It is saved only when it exists; define it in an earlier cell
# if downstream code needs schema_pairs.npy.
if 'schema_pairs' in globals():
    np.save(save_prefix + 'schema_pairs.npy', schema_pairs)
else:
    print('schema_pairs is not defined; skipping schema_pairs.npy')
# subgraph table
np.save(save_prefix + 'subgraphs.npy', subgraphs)
# all nodes adjacency matrix
scipy.sparse.save_npz(save_prefix + 'adjM.npz', scipy.sparse.csr_matrix(adjM))
# per-type node features (0 = movies, 1 = directors, 2 = actors): one file per type,
# holding the feature rows of that type's contiguous node index range
for i in range(num_ntypes):
    scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(i), scipy.sparse.csr_matrix(features[prefix_operator[i]:prefix_operator[i+1]]))
# all nodes (movies, directors and actors) type labels
np.save(save_prefix + 'node_types.npy', type_mask)
# type prefix
np.save(save_prefix + 'prefix_operator.npy', prefix_operator)
# movie genre labels
np.save(save_prefix + 'labels.npy', labels)

np.savez(save_prefix + 'subgraphs_train_val_test.npz',
         subgraphs_train=subgraphs_train,
         subgraphs_val=subgraphs_val,
         subgraphs_test=subgraphs_test) # subgraph train&val&test

0.2993635128370164
0.10362583136665951
0.5970106557963241
