In [1]:
import pathlib

import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd
import networkx as nx
import utils.preprocess
from sklearn.model_selection import train_test_split

# Output directory for every processed artifact written by this notebook.
save_prefix = r'./data/IMDB_processed/graph_split/'
# Directory of the raw IMDB pickles. The previous value ('./data\IMDB') mixed '/'
# with a Windows-only '\' separator, which does not resolve on POSIX systems.
read_prefix = r'./data/IMDB'
read_perfix = read_prefix  # misspelled legacy name, kept for backward compatibility
# Number of node types in the graph (movies, directors, actors).
num_ntypes = 3

## load

In [6]:
import pickle


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE(review): pickle.load can execute arbitrary code contained in the file,
    so only point this at dataset files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Relation matrices: the printed output lists one sparse matrix per relation.
edges = _read_pickle(r'./data/IMDB/edges.pkl')
print(edges)

# Label groups: print the size of each group, then stack all groups row-wise.
data = _read_pickle(r'./data/IMDB/labels.pkl')
group_ids = range(len(data))
print([len(data[p]) for p in group_ids])
labels = np.concatenate([np.array(data[p]) for p in group_ids], axis=0)

# Node feature matrix.
features = _read_pickle(r'./data/IMDB/node_features.pkl')
print(features.shape)


[<12772x12772 sparse matrix of type '<class 'numpy.float64'>'
	with 4661 stored elements in Compressed Sparse Row format>, <12772x12772 sparse matrix of type '<class 'numpy.float64'>'
	with 4661 stored elements in Compressed Sparse Row format>, <12772x12772 sparse matrix of type '<class 'numpy.float64'>'
	with 13983 stored elements in Compressed Sparse Row format>, <12772x12772 sparse matrix of type '<class 'numpy.float64'>'
	with 13983 stored elements in Compressed Sparse Row format>]
[300, 300, 2339]
(12772, 1256)


In [8]:
# Shape of the label array obtained by concatenating every group in labels.pkl.
labels.shape


(2939, 2)

In [12]:
# Edge list of relation 0: one (row, col) index pair per non-zero entry of edges[0].
edge_1 = np.stack(edges[0].nonzero(), axis=1)
edge_1

array([[   0, 4661],
       [   1, 4662],
       [   2, 4663],
       ...,
       [4658, 6357],
       [4659, 6930],
       [4660, 6492]])

In [13]:
# Number of distinct row indices that have at least one non-zero entry in edges[1].
np.unique(edges[1].nonzero()[0]).shape

(2270,)

In [14]:
# Number of distinct row indices that have at least one non-zero entry in edges[3].
np.unique(edges[3].nonzero()[0]).shape

(5841,)

In [15]:
# Edge list of relation 2: one (row, col) index pair per non-zero entry of edges[2].
edge_2 = np.stack(edges[2].nonzero(), axis=1)
edge_2

array([[    0,  6931],
       [    0,  7404],
       [    0,  8558],
       ...,
       [ 4660,  8877],
       [ 4660, 10769],
       [ 4660, 12771]])

In [16]:
# Draw 932 distinct relation-0 edges (about 20% of the 4661 stored ones) to serve
# as the positive pairs of the val/test edge sets built further down.
edge_1_pos_num = 932     # 20%
# A locally seeded generator keeps the draw reproducible across kernel restarts;
# 33333333 mirrors the rand_seed used by the node split, 1 is this cell's stream id.
edge_1_pos_rng = np.random.default_rng([33333333, 1])
idx = edge_1_pos_rng.choice(len(edge_1), size=edge_1_pos_num, replace=False)
edge_1_pos = edge_1[idx]


In [17]:
# Negative candidates for relation 0: (row, col) pairs that carry no relation-0 edge,
# restricted to rows that have a relation-0 edge and columns that appear as rows of
# relation 1.
mask = np.zeros(edges[0].shape, dtype=bool)
mask[np.ix_(np.unique(edges[0].nonzero()[0]), np.unique(edges[1].nonzero()[0]))] = True
# Cast to bool while still sparse so the dense copy costs 1 byte per entry instead of
# materialising the full float64 matrix (which the previous version did twice).
edges_1_neg_adj = np.logical_and(~edges[0].astype(bool).toarray(), mask)
edge_1_neg_candicate = np.stack(edges_1_neg_adj.nonzero()).T  # (10575809, 2)

# Draw as many negatives as there are held-out positives (932).
edge_1_neg_num = 932
# Locally seeded generator for reproducibility; 2 is this cell's stream id.
edge_1_neg_rng = np.random.default_rng([33333333, 2])
idx = edge_1_neg_rng.choice(len(edge_1_neg_candicate), size=edge_1_neg_num, replace=False)
edge_1_neg = edge_1_neg_candicate[idx]

In [18]:
# Build the relation-1 evaluation rows, laid out as [edge_type, row, col, pair_class].
# Each split receives half of the positives (class 1) followed by half of the
# negatives (class 0).
edge_1_eval_num = 932                     # positives (and negatives) sampled above
edge_1_val_num = edge_1_eval_num // 2     # 466 of each go to validation, the rest to test

edge_type = np.array([1] * edge_1_eval_num).reshape(-1, 1)
pair_class = np.zeros(edge_1_eval_num, dtype=int)
pair_class[:edge_1_val_num] = 1
pair_class = pair_class.reshape(-1, 1)

# Seeded shuffles so the val/test assignment is reproducible; 3 is this cell's stream id.
edge_1_split_rng = np.random.default_rng([33333333, 3])
edge_1_pos = edge_1_pos[edge_1_split_rng.permutation(edge_1_eval_num)]
edge_1_neg = edge_1_neg[edge_1_split_rng.permutation(edge_1_eval_num)]

edge_1_val = np.concatenate(
    [edge_type,
     np.concatenate([edge_1_pos[:edge_1_val_num], edge_1_neg[:edge_1_val_num]]),
     pair_class],
    axis=1)
edge_1_test = np.concatenate(
    [edge_type,
     np.concatenate([edge_1_pos[edge_1_val_num:], edge_1_neg[edge_1_val_num:]]),
     pair_class],
    axis=1)


In [19]:
# Draw 2800 distinct relation-2 edges (about 20% of the 13983 stored ones) to serve
# as the positive pairs of the val/test edge sets.
edge_2_pos_num = 2800     # 20%
# Locally seeded generator for reproducibility; 4 is this cell's stream id.
edge_2_pos_rng = np.random.default_rng([33333333, 4])
idx = edge_2_pos_rng.choice(len(edge_2), size=edge_2_pos_num, replace=False)
edge_2_pos = edge_2[idx]

# Negative candidates for relation 2: (row, col) pairs that carry no relation-2 edge,
# restricted to rows that have a relation-2 edge and columns that appear as rows of
# relation 3.
mask = np.zeros(edges[2].shape, dtype=bool)
mask[np.ix_(np.unique(edges[2].nonzero()[0]), np.unique(edges[3].nonzero()[0]))] = True
# Cast to bool while still sparse so the dense copy costs 1 byte per entry instead of
# materialising the full float64 matrix.
edges_2_neg_adj = np.logical_and(~edges[2].astype(bool).toarray(), mask)
edge_2_neg_candicate = np.stack(edges_2_neg_adj.nonzero()).T  # (27210918, 2)

In [20]:
# Draw as many relation-2 negatives as there are held-out positives (2800).
edge_2_neg_num = 2800
# Locally seeded generator for reproducibility; 5 is this cell's stream id.
edge_2_neg_rng = np.random.default_rng([33333333, 5])
idx = edge_2_neg_rng.choice(len(edge_2_neg_candicate), size=edge_2_neg_num, replace=False)
edge_2_neg = edge_2_neg_candicate[idx]

In [21]:
# Build the relation-2 evaluation rows, laid out as [edge_type, row, col, pair_class].
# Each split receives half of the positives (class 1) followed by half of the
# negatives (class 0).
edge_2_eval_num = 2800                    # positives (and negatives) sampled above
edge_2_val_num = edge_2_eval_num // 2     # 1400 of each go to validation, the rest to test

edge_type = np.array([2] * edge_2_eval_num).reshape(-1, 1)
pair_class = np.zeros(edge_2_eval_num, dtype=int)
pair_class[:edge_2_val_num] = 1
pair_class = pair_class.reshape(-1, 1)

# Seeded shuffles so the val/test assignment is reproducible; 6 is this cell's stream id.
edge_2_split_rng = np.random.default_rng([33333333, 6])
edge_2_pos = edge_2_pos[edge_2_split_rng.permutation(edge_2_eval_num)]
edge_2_neg = edge_2_neg[edge_2_split_rng.permutation(edge_2_eval_num)]

edge_2_val = np.concatenate(
    [edge_type,
     np.concatenate([edge_2_pos[:edge_2_val_num], edge_2_neg[:edge_2_val_num]]),
     pair_class],
    axis=1)
edge_2_test = np.concatenate(
    [edge_type,
     np.concatenate([edge_2_pos[edge_2_val_num:], edge_2_neg[edge_2_val_num:]]),
     pair_class],
    axis=1)

In [22]:
# Stack the per-relation evaluation rows: edge type 1 first, then edge type 2.
imdb_val = np.concatenate((edge_1_val, edge_2_val), axis=0)
imdb_test = np.concatenate((edge_1_test, edge_2_test), axis=0)

## type_mask

In [27]:
# Node counts per type, in node-id order.
raw_dims = [4661, 2270, 5841]
dim = sum(raw_dims)
print(raw_dims)
print(dim)

# Multiplying a strictly lower-triangular matrix of ones by raw_dims gives the
# running offsets [0, n0, n0+n1, n0+n1+n2]: the first node id of each type plus the total.
strictly_lower = np.tril(np.ones((num_ntypes+1, num_ntypes)), k=-1)
prefix_operator = strictly_lower.dot(raw_dims).astype(int)
print(prefix_operator)

# 0 for movies 4661, 1 for directors 2270, 2 for actors 5841
type_mask = np.zeros(dim, dtype=int)
for ntype, (start, stop) in enumerate(zip(prefix_operator[:-1], prefix_operator[1:])):
    type_mask[start:stop] = ntype

[4661, 2270, 5841]
12772
[    0  4661  6931 12772]


## adjM

In [28]:
# Add the four relation matrices (left to right, starting from edges[0]) into one
# sparse adjacency matrix, then densify it.
adj = sum(edges[1:4], edges[0])
adjM = adj.toarray()
adjM.shape

(12772, 12772)

## ontology

In [29]:
# Schema description handed to the utils.preprocess helpers: one named chain of
# node-type ids, plus the list of node-type pairs.
ontology = dict(stem=[1, 0, 2])
ontology_pairs = [
    [1, 0],
    [0, 2],
]

In [30]:
# Extract link instances from the dense adjacency matrix, node-type mask, ontology and
# type offsets. NOTE(review): get_intances_by_pair is defined in utils.preprocess (not
# visible here); presumably it returns the edge instances per ontology pair — confirm.
link_intances = utils.preprocess.get_intances_by_pair(adjM, type_mask, ontology, prefix_operator)
# 13983 (count noted by the original author)

In [31]:
# Build the ontology-subgraph table from the link instances with the v2 helper.
subgraphs = utils.preprocess.get_ontology_subgraphs_v2(ontology, link_intances)

# Reorder the columns by ascending label before inspecting the table.
ordered_columns = subgraphs.columns.sort_values()
subgraphs = subgraphs[ordered_columns]
print(len(subgraphs))
print(subgraphs)

# 12828  (NOTE(review): count left by the original author; the recorded output shows 13983 rows)

13983
          0     1      2
0         0  4661   6931
1         0  4661   7404
2         0  4661   8558
3        25  4661   6950
4        25  4661   7069
...     ...   ...    ...
13978  4657  6929  10766
13979  4657  6929  12769
13980  4659  6930   8876
13981  4659  6930   9918
13982  4659  6930  10768

[13983 rows x 3 columns]


In [32]:
import time

In [33]:
# Time link-instance extraction plus the v3 subgraph construction (wall-clock seconds).
start_time = time.time()
link_intances = utils.preprocess.get_intances_by_pair(adjM, type_mask, ontology, prefix_operator)
subgraphs = utils.preprocess.get_ontology_subgraphs_v3(ontology, link_intances)
elapsed = time.time() - start_time
print(elapsed)

# Reorder the columns by ascending label, then show the row count and the table.
ordered_columns = subgraphs.columns.sort_values()
subgraphs = subgraphs[ordered_columns]
print(len(subgraphs))
print(subgraphs)

0.1066434383392334
13983
          0     1      2
0         0  4661   6931
1         0  4661   7404
2         0  4661   8558
3        25  4661   6950
4        25  4661   7069
...     ...   ...    ...
13978  4657  6929  10766
13979  4657  6929  12769
13980  4659  6930   8876
13981  4659  6930   9918
13982  4659  6930  10768

[13983 rows x 3 columns]


## incomplete

In [34]:
# Re-declares the same pair list as the ontology cell so this cell can run on its own.
ontology_pairs = [[1,0],[0,2]]
# NOTE(review): find_res_adj and find_incomplete_subgraph are defined in utils.preprocess
# (not visible here); presumably res_adj is the adjacency left over once the edges covered
# by `subgraphs` are removed — confirm against the helper.
res_adj = utils.preprocess.find_res_adj(adjM, subgraphs)
incomplete_ontology_subgraphs, incomplete_subgraphs = utils.preprocess.find_incomplete_subgraph(adjM, type_mask, ontology_pairs, res_adj)
# The recorded run printed 0 and an empty list here.
print(len(incomplete_ontology_subgraphs))
print(incomplete_subgraphs)


  self._set_arrayXarray(i, j, x)


Thu Apr 18 10:42:52 2024, finding pairs...
Thu Apr 18 10:42:53 2024, finding pairs...
0
[]


## save

In [35]:
# Make sure both output sub-directories exist before anything is written.
for subdir in ('complete', 'incomplete'):
    out_dir = pathlib.Path(save_prefix + '{}'.format(subdir))
    out_dir.mkdir(parents=True, exist_ok=True)

# Schema. `ontology` is a dict, so np.save stores it as a pickled object array and
# np.load needs allow_pickle=True to read it back.
np.save(save_prefix + 'complete/ontology.npy', ontology)
np.save(save_prefix + 'ontology_pairs.npy', ontology_pairs)

# Subgraph table.
np.save(save_prefix + 'complete/subgraphs.npy', subgraphs)

# Adjacency matrix over all nodes, stored sparse.
scipy.sparse.save_npz(save_prefix + 'adjM.npz', scipy.sparse.csr_matrix(adjM))

# Feature rows of each node type (0 movies, 1 directors, 2 actors), one file per type.
for ntype in range(num_ntypes):
    start, stop = prefix_operator[ntype], prefix_operator[ntype + 1]
    ntype_features = scipy.sparse.csr_matrix(features[start:stop])
    scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(ntype), ntype_features)

# Node-type id of every node (0 movies, 1 directors, 2 actors).
np.save(save_prefix + 'node_types.npy', type_mask)
# First node id of each type, followed by the total node count.
np.save(save_prefix + 'prefix_operator.npy', prefix_operator)
# movie genre labels
np.save(save_prefix + 'labels.npy', labels)

## split

In [36]:
# Split the nodes first, then route subgraphs: a subgraph is held out as soon as
# any of its nodes belongs to the held-out node set.
rand_seed = 33333333
train_val_idx, test_idx = train_test_split(np.arange(len(adjM)), test_size=0.113, random_state=rand_seed)

# Test subgraphs: rows containing at least one test node.
touches_test = np.isin(subgraphs, test_idx).any(axis=1)
subgraphs_test = subgraphs[touches_test]
subgraphs_tr_val = subgraphs[~touches_test]
print(subgraphs_test.shape[0]/len(subgraphs)) # 30% for test

# Validation subgraphs: remaining rows containing at least one validation node.
train_idx, val_idx = train_test_split(train_val_idx, test_size=0.051, random_state=rand_seed)
touches_val = np.isin(subgraphs_tr_val, val_idx).any(axis=1)
subgraphs_val = subgraphs_tr_val[touches_val]
subgraphs_train = subgraphs_tr_val[~touches_val]
print(subgraphs_val.shape[0]/len(subgraphs)) # 10% for val
print(len(subgraphs_train)/len(subgraphs)) # 60% for train

# Persist the subgraph split.
np.savez(
    save_prefix + 'complete/' + 'subgraphs_train_val_test.npz',
    subgraphs_train=subgraphs_train,
    subgraphs_val=subgraphs_val,
    subgraphs_test=subgraphs_test,
)
# Persist the node split.
np.savez(
    save_prefix + 'complete/' + 'train_val_test_nodes.npz',
    train_nodes=train_idx,
    val_nodes=val_idx,
    test_nodes=test_idx,
)

0.2993635128370164
0.10362583136665951
0.5970106557963241


## =============================================================

In [38]:
# Fraction of the total adjacency mass that sits in the rows of the test nodes.
adjM[np.unique(test_idx)].sum()/adjM.sum()

0.11086676678824287

In [39]:
# Fraction of the total adjacency mass that sits in the rows of the validation nodes.
adjM[np.unique(val_idx)].sum()/adjM.sum()

0.04620789530143746

In [None]:
# NOTE(review): `label` is not defined by any cell visible in this notebook (the load
# cell defines `labels`) and this cell has no execution count, so it presumably raises
# NameError on a fresh kernel. Confirm the intended variable or drop the cell.
label[4423]

array([0, 0, 0], dtype=uint8)

In [None]:
# NOTE(review): relies on `label`, which no visible cell defines; presumably left over
# from an earlier session. Counts the non-zero entries along the first axis.
len((label).nonzero()[0])

2939

In [None]:
# NOTE(review): `IMDB` is not defined by any cell visible in this notebook, so this
# presumably fails on a fresh kernel. If it did run, it would rebind `features`, which
# the load cell already populated from node_features.pkl.
features = IMDB['features']
features.shape

(12772, 1256)

In [None]:
# NOTE(review): `IMDB` is not defined by any cell visible in this notebook, so this
# presumably fails on a fresh kernel. If it did run, it would rebind `edges`, which
# the load cell already populated from edges.pkl.
edges = IMDB['edges']
# Print each relation matrix in turn.
for i in range(len(edges)):
    print(edges[i])

[<12772x12772 sparse matrix of type '<class 'numpy.float64'>'
 	with 4661 stored elements in Compressed Sparse Column format>
 <12772x12772 sparse matrix of type '<class 'numpy.float64'>'
 	with 4661 stored elements in Compressed Sparse Column format>
 <12772x12772 sparse matrix of type '<class 'numpy.float64'>'
 	with 13983 stored elements in Compressed Sparse Column format>
 <12772x12772 sparse matrix of type '<class 'numpy.float64'>'
 	with 13983 stored elements in Compressed Sparse Column format>]
