In [1]:
from scipy import io
import numpy as np
from scipy.sparse import csr_matrix
# Load the raw ACM dataset from the working directory; loadmat returns a dict
# keyed by MATLAB variable name (see the displayed keys in the next cell).
mat_file = io.loadmat('ACM.mat')

In [2]:
mat_file

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Mon Aug 08 18:23:50 2011',
 '__version__': '1.0',
 '__globals__': [],
 'TvsP': <1903x12499 sparse matrix of type '<class 'numpy.float64'>'
 	with 972973 stored elements in Compressed Sparse Column format>,
 'PvsA': <12499x17431 sparse matrix of type '<class 'numpy.float64'>'
 	with 37055 stored elements in Compressed Sparse Column format>,
 'PvsV': <12499x196 sparse matrix of type '<class 'numpy.float64'>'
 	with 12499 stored elements in Compressed Sparse Column format>,
 'AvsF': <17431x1804 sparse matrix of type '<class 'numpy.float64'>'
 	with 30424 stored elements in Compressed Sparse Column format>,
 'VvsC': <196x14 sparse matrix of type '<class 'numpy.float64'>'
 	with 196 stored elements in Compressed Sparse Column format>,
 'PvsL': <12499x73 sparse matrix of type '<class 'numpy.float64'>'
 	with 12499 stored elements in Compressed Sparse Column format>,
 'PvsC': <12499x14 sparse matrix of type '<class 'numpy.fl

In [3]:
# Column (conference) index of every stored entry of the paper-by-conference
# matrix 'PvsC'.
# NOTE(review): later cells use positions in this array as paper row indices,
# which assumes exactly one conference entry per paper and that nonzero()
# lists entries in row order -- confirm for the installed SciPy version.
paper_conf = mat_file['PvsC'].nonzero()[1]

In [4]:
# Select the papers of three research areas by their conference id in
# paper_conf. A dedicated, seeded generator makes the database subsample
# reproducible across kernel restarts without touching NumPy's global RNG.
sample_rng = np.random.RandomState(0)

# DataBase: conference ids 1 and 13. Only 994 of these papers are kept
# (drawn without replacement, then sorted).
# NOTE(review): 994 presumably makes the three areas add up to the 3025
# papers reported by paper_target.shape -- confirm.
paper_db = np.isin(paper_conf, [1, 13])
paper_db_idx = np.sort(
    sample_rng.choice(np.flatnonzero(paper_db), 994, replace=False)
)
# Data Mining: conference id 0, all papers kept.
paper_dm = np.isin(paper_conf, [0])
paper_dm_idx = np.flatnonzero(paper_dm)
# Wireless Communication: conference ids 9 and 10, all papers kept.
paper_wc = np.isin(paper_conf, [9, 10])
paper_wc_idx = np.flatnonzero(paper_wc)

In [5]:
# Merge the three per-area selections into a single ascending index array.
paper_idx = np.concatenate((paper_db_idx, paper_dm_idx, paper_wc_idx))
paper_idx.sort()

In [6]:
# Class label for every entry of paper_idx (same order):
# 0 : database, 1: wireless communication, 2: data mining
# Vectorised membership tests replace the per-element `in` scans over the
# index arrays. Everything starts as data mining (2); the database mask is
# written last so it keeps priority over wireless communication, matching an
# if / elif / else chain.
paper_target = np.full(len(paper_idx), 2)
paper_target[np.isin(paper_idx, paper_wc_idx)] = 1
paper_target[np.isin(paper_idx, paper_db_idx)] = 0

In [7]:
paper_target.shape

(3025,)

## Edges (PAP, PSP)
Conference ids used above (column indices of `PvsC`): [0, 1, 9, 10, 13] = KDD, SIGMOD, SIGCOMM, MobiCom, VLDB

In [8]:
mat_file['PvsA']

<12499x17431 sparse matrix of type '<class 'numpy.float64'>'
	with 37055 stored elements in Compressed Sparse Column format>

In [9]:
# Map the authors of the selected papers into the shared node id space.
# Each distinct author receives the next free id in order of first
# appearance; author ids start right after the paper ids, i.e. at
# len(paper_idx).
authors = mat_file['PvsA'][paper_idx].nonzero()[1]
author_dic = {}
re_authors = np.array([
    author_dic.setdefault(author, len(author_dic) + len(paper_idx))
    for author in authors
])

In [10]:
len(author_dic)

5912

In [11]:
# Map the subjects of the selected papers into the shared node id space.
# Subject ids follow the paper and author ids and are handed out in order of
# first appearance (dict.fromkeys keeps insertion order).
subjects = mat_file['PvsL'][paper_idx].nonzero()[1]
subject_offset = len(paper_idx) + len(author_dic)
subject_dic = {
    subject: subject_offset + rank
    for rank, subject in enumerate(dict.fromkeys(subjects))
}
re_subjects = np.array([subject_dic[subject] for subject in subjects])

In [12]:
len(subject_dic)

57

In [13]:
# Total number of graph nodes: papers + distinct authors + distinct subjects.
node_num = sum(map(len, (paper_idx, author_dic, subject_dic)))

In [14]:
node_num

8994

In [15]:
# Row position (within paper_idx) of every paper-author entry, produced by
# the same matrix expression that yielded the author column ids.
pa_block = mat_file['PvsA'][paper_idx]
papers = pa_block.nonzero()[0]
# One unit weight per entry, with the same dtype as the index array.
data = np.ones(papers.shape, dtype=papers.dtype)

In [16]:
# Paper -> author adjacency on the shared (node_num x node_num) id space.
pa_coords = (papers, re_authors)
A_pa = csr_matrix((data, pa_coords), shape=(node_num, node_num))

In [17]:
A_pa

<8994x8994 sparse matrix of type '<class 'numpy.int32'>'
	with 9936 stored elements in Compressed Sparse Row format>

In [18]:
# Row position (within paper_idx) of every paper-subject entry; this
# overwrites the `papers` / `data` pair built for the paper-author edges.
ps_block = mat_file['PvsL'][paper_idx]
papers = ps_block.nonzero()[0]
data = np.ones(papers.shape, dtype=papers.dtype)

In [19]:
# Paper -> subject adjacency on the shared (node_num x node_num) id space.
ps_coords = (papers, re_subjects)
A_ps = csr_matrix((data, ps_coords), shape=(node_num, node_num))

In [20]:
A_ps

<8994x8994 sparse matrix of type '<class 'numpy.int32'>'
	with 3025 stored elements in Compressed Sparse Row format>

In [21]:
# Author -> paper edges are the transpose of the paper -> author edges.
A_ap = A_pa.T

In [22]:
# Subject -> paper edges are the transpose of the paper -> subject edges.
A_sp = A_ps.T

In [23]:
# Edge-type adjacency matrices in the order: paper->author, author->paper,
# paper->subject, subject->paper. This list is what gets written to edges.pkl.
edges = [A_pa,A_ap,A_ps,A_sp]

# Node Features

In [24]:
# Map the terms of the selected papers to ids placed after all paper, author
# and subject ids; ids are assigned in order of first appearance.
terms = mat_file['TvsP'].transpose()[paper_idx].nonzero()[1]
term_offset = len(paper_idx) + len(author_dic) + len(subject_dic)
term_dic = {}
re_terms = np.array([
    term_dic.setdefault(term, term_offset + len(term_dic))
    for term in terms
])

In [25]:
mat_file['TvsP'].transpose()

<12499x1903 sparse matrix of type '<class 'numpy.float64'>'
	with 972973 stored elements in Compressed Sparse Row format>

In [26]:
# tmp
# Temporary adjacency matrices over an enlarged id space (papers, authors,
# subjects, then terms). They are only used to derive the node features.
tmp_num_node = node_num + len(term_dic)
tmp_shape = (tmp_num_node, tmp_num_node)

# Paper -> author
papers = mat_file['PvsA'][paper_idx].nonzero()[0]
data = np.ones_like(papers)
A_pa_tmp = csr_matrix((data, (papers, re_authors)), shape=tmp_shape)

# Paper -> subject
papers = mat_file['PvsL'][paper_idx].nonzero()[0]
data = np.ones_like(papers)
A_ps_tmp = csr_matrix((data, (papers, re_subjects)), shape=tmp_shape)

# Paper -> term. The row indices must line up entry-for-entry with re_terms,
# which was built from mat_file['TvsP'].transpose()[paper_idx]; taking the
# rows from that same expression guarantees the pairing instead of relying on
# a separately stored 'PvsT' matrix having an identical entry order.
papers = mat_file['TvsP'].transpose()[paper_idx].nonzero()[0]
data = np.ones_like(papers)
A_pt_tmp = csr_matrix((data, (papers, re_terms)), shape=tmp_shape)

In [27]:
# Binary term-indicator features for each node type. The term columns are the
# last len(term_dic) columns of the temporary matrices.
# `dtype=int` replaces `np.int`, a plain alias of the builtin int that was
# removed from NumPy, so the resulting dtype is the same.
feat_n_paper = len(paper_idx)
feat_n_author = len(author_dic)
feat_n_subject = len(subject_dic)
feat_n_term = len(term_dic)

# Papers: 1 where the paper contains the term.
paper_feat = np.array(
    A_pt_tmp[:feat_n_paper, -feat_n_term:].toarray() > 0,
    dtype=int,
)

# Authors: (author x paper) . (paper x term) > 0, i.e. 1 where any paper
# linked to the author contains the term.
author_term = A_pa_tmp.transpose().dot(A_pt_tmp)
author_feat = np.array(
    author_term[feat_n_paper:feat_n_paper + feat_n_author, -feat_n_term:].toarray() > 0,
    dtype=int,
)

# Subjects: same construction through the paper -> subject links.
subject_term = A_ps_tmp.transpose().dot(A_pt_tmp)
subject_start = feat_n_paper + feat_n_author
subject_feat = np.array(
    subject_term[subject_start:subject_start + feat_n_subject, -feat_n_term:].toarray() > 0,
    dtype=int,
)

In [28]:
# Stack the feature blocks row-wise in node id order: papers, authors, subjects.
node_feature = np.concatenate((paper_feat, author_feat, subject_feat))
# Backward-compatible alias: the pickle export cell still refers to the
# original, misspelled name.
node_faeture = node_feature

# Label

In [29]:
paper_target.shape

(3025,)

In [30]:
# Train, Valid
# For each class, 300 positions of paper_target are drawn without
# replacement: the first 200 go to training, the remaining 100 to validation.
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts.
split_rng = np.random.RandomState(0)
train_valid_DB = list(split_rng.choice(np.where(paper_target == 0)[0], 300, replace=False))
train_valid_WC = list(split_rng.choice(np.where(paper_target == 1)[0], 300, replace=False))
train_valid_DM = list(split_rng.choice(np.where(paper_target == 2)[0], 300, replace=False))

# Every *_label array has one row per paper: [paper position, class id].
train_idx = np.array(train_valid_DB[:200] + train_valid_WC[:200] + train_valid_DM[:200])
train_target = paper_target[train_idx]
train_label = np.vstack((train_idx, train_target)).transpose()

valid_idx = np.array(train_valid_DB[200:] + train_valid_WC[200:] + train_valid_DM[200:])
valid_target = paper_target[valid_idx]
valid_label = np.vstack((valid_idx, valid_target)).transpose()

# Test: every position not used for training or validation. np.setdiff1d
# returns them sorted, unlike iterating over a set difference, whose order is
# unspecified.
test_idx = np.setdiff1d(
    np.arange(paper_target.shape[0]),
    np.concatenate((train_idx, valid_idx)),
)
test_target = paper_target[test_idx]
test_label = np.vstack((test_idx, test_target)).transpose()

In [31]:
# Label splits in the order train, validation, test; each element holds
# [paper position, class id] rows. This list is what gets written to labels.pkl.
labels = [train_label,valid_label,test_label]

In [32]:
import pickle

# Write the three artefacts to the working directory, one pickle file each.
for pkl_path, payload in (
    ('node_features.pkl', node_faeture),
    ('edges.pkl', edges),
    ('labels.pkl', labels),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

In [98]:
set(test_idx) & set(valid_idx)

set()

In [92]:
set(np.arange(paper_target.shape[0])) - set(train_idx)

{1,
 2,
 3,
 4,
 5,
 7,
 9,
 10,
 11,
 13,
 14,
 15,
 16,
 18,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 39,
 40,
 41,
 43,
 44,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 59,
 60,
 61,
 62,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 75,
 77,
 79,
 80,
 82,
 83,
 84,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 117,
 118,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 147,
 148,
 149,
 150,
 152,
 153,
 154,
 155,
 156,
 158,
 159,
 161,
 164,
 165,
 166,
 168,
 169,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 187,
 188,
 189,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 203,
 204,
 205,
 206,
 207,
 209,
 210,
 211,
 212,
 213,
 215,
 216,
 217,
 218,

In [86]:
set(np.arange(paper_target.shape[0])) - set

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [27]:
mat_file['TvsP'].transpose()[paper_idx]

<3025x1903 sparse matrix of type '<class 'numpy.float64'>'
	with 257243 stored elements in Compressed Sparse Row format>

In [28]:
mat_file['TvsP']

<1903x12499 sparse matrix of type '<class 'numpy.float64'>'
	with 972973 stored elements in Compressed Sparse Column format>

In [255]:
A_ps

<3025x55 sparse matrix of type '<class 'numpy.int32'>'
	with 3025 stored elements in Compressed Sparse Row format>

In [None]:
mat_file['PvsA'][paper_idx].nonzero()[1]

In [168]:
# Entries of mat_file['P'] for papers whose conference id is one of the five
# selected conferences. The conference ids are computed inline rather than
# read from `tmp`, which is only assigned in the following cell and is
# therefore undefined when the notebook runs top to bottom on a fresh kernel.
# NOTE(review): the key 'P' does not appear in the (truncated) mat_file
# listing shown earlier -- confirm it exists.
tmp_paper = mat_file['P'][np.isin(mat_file['PvsC'].nonzero()[1], [0, 1, 9, 10, 13])]

In [169]:
# Conference column index of every stored 'PvsC' entry (the same expression
# that defines paper_conf); the following cells use it to count papers per area.
tmp = mat_file['PvsC'].nonzero()[1]

In [170]:
tmp

array([0, 0, 0, ..., 7, 7, 7], dtype=int32)

In [171]:
tmp[np.isin(tmp,[0,1,9,10,13])].shape

(4025,)

In [172]:
tmp[np.isin(tmp,[9,10])].shape

(970,)

In [134]:
tmp[np.isin(tmp,[0])].shape

(1061,)

In [176]:
tmp[np.isin(tmp,[1,13])].shape

(1994,)

In [141]:
# Total of the label entries selected by the weight vector [0,1,0].
# NOTE(review): assumes 'label' is a (papers x 3) indicator matrix, so this is
# the size of the second class -- confirm.
# NOTE(review): mat_file2 is only loaded in a later cell
# (io.loadmat('ACM3025.mat')), so this cell fails on Restart & Run All.
np.sum(mat_file2['label'] * np.array([0,1,0]))

965.0

In [138]:
# Total of the label entries selected by the weight vector [1,0,0].
# NOTE(review): assumes 'label' is a (papers x 3) indicator matrix, so this is
# the size of the first class -- confirm.
# NOTE(review): mat_file2 is only loaded in a later cell
# (io.loadmat('ACM3025.mat')), so this cell fails on Restart & Run All.
np.sum(mat_file2['label'] * np.array([1,0,0]))

1061.0

In [139]:
# Total of the label entries selected by the weight vector [0,0,1].
# NOTE(review): assumes 'label' is a (papers x 3) indicator matrix, so this is
# the size of the third class -- confirm.
# NOTE(review): mat_file2 is only loaded in a later cell
# (io.loadmat('ACM3025.mat')), so this cell fails on Restart & Run All.
np.sum(mat_file2['label'] * np.array([0,0,1]))

999.0

In [3]:
# Load a second MATLAB file, 'ACM3025.mat', from the working directory.
# Cells placed above this one already read mat_file2, so this load has to run
# before them.
mat_file2 = io.loadmat('ACM3025.mat')

In [127]:
mat_file2['feature'].shape

(3025, 1870)

In [128]:
mat_file2

{'__header__': b'MATLAB 5.0 MAT-file Platform: nt, Created on: Wed Feb 27 20:29:23 2019',
 '__version__': '1.0',
 '__globals__': [],
 'PTP': array([[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 'PLP': array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 1., 0.],
        [0., 0., 1., ..., 1., 0., 1.],
        ...,
        [0., 0., 1., ..., 1., 0., 1.],
        [0., 1., 0., ..., 0., 1., 0.],
        [0., 0., 1., ..., 1., 0., 1.]]),
 'PAP': array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]]),
 'feature': array([[1., 1., 1., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0

In [13]:
# Number of nonzero 'PAP' entries, minus 3025, halved.
# NOTE(review): presumably the number of undirected paper-paper links with
# the 3025 diagonal self-links removed; that reading assumes PAP is symmetric
# with a fully populated diagonal -- confirm.
(len(mat_file2['PAP'].nonzero()[0])- 3025)/2

13128.0

In [11]:
mat_file2['PAP'].shape

(3025, 3025)

In [15]:
a = mat_file2['PAP']

In [None]:
a.

In [16]:
# Number of nonzero 'PLP' entries, minus 3025, halved.
# NOTE(review): presumably the number of undirected paper-paper links with
# the 3025 diagonal self-links removed; that reading assumes PLP is symmetric
# with a fully populated diagonal -- confirm.
(len(mat_file2['PLP'].nonzero()[0])- 3025)/2

1103868.0

In [57]:
mat_file2

{'__header__': b'MATLAB 5.0 MAT-file Platform: nt, Created on: Wed Feb 27 20:29:23 2019',
 '__version__': '1.0',
 '__globals__': [],
 'PTP': array([[1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        ...,
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 'PLP': array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 1., 0.],
        [0., 0., 1., ..., 1., 0., 1.],
        ...,
        [0., 0., 1., ..., 1., 0., 1.],
        [0., 1., 0., ..., 0., 1., 0.],
        [0., 0., 1., ..., 1., 0., 1.]]),
 'PAP': array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 1.]]),
 'feature': array([[1., 1., 1., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0