In [1]:
import os
import numpy as np
import scipy.sparse as sp
from collections import Counter, defaultdict
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_percentage_error
import time

import pickle
import sys
import scipy.sparse as sp
import torch
import torch.nn.functional as F

# data_loader

## load_nodes

In [2]:
path = '../../data/{}'.format('iYO844')

In [3]:
def load_nodes():
    """
    return nodes dict
        total: total number of nodes
        count: a dict of int, number of nodes for each type
        attr: a dict of np.array (or None), attribute matrices for each type of nodes
        shift: node_id shift for each type. You can get the id range of a type by 
                    [ shift[node_type], shift[node_type]+count[node_type] )
    """
    nodes = {'total':0, 'count':Counter(), 'attr':{}, 'shift':{}}
    with open(os.path.join(path, 'node.dat'), 'r', encoding='utf-8') as f:
        for line in f:
            th = line.split('\t')
            if len(th) == 4:
                # Then this line of node has attribute
                node_id, node_name, node_type, node_attr = th
                node_id = int(node_id)
                node_type = int(node_type)
                node_attr = list(map(float, node_attr.split(',')))
                nodes['count'][node_type] += 1
                nodes['attr'][node_id] = node_attr
                nodes['total'] += 1
            elif len(th) == 3:
                # Then this line of node doesn't have attribute
                node_id, node_name, node_type = th
                node_id = int(node_id)
                node_type = int(node_type)
                nodes['count'][node_type] += 1
                nodes['total'] += 1
            else:
                raise Exception("Too few information to parse!")
    shift = 0
    attr = {}
    for i in range(len(nodes['count'])):
        nodes['shift'][i] = shift
        if shift in nodes['attr']:
            mat = []
            for j in range(shift, shift+nodes['count'][i]):
                mat.append(nodes['attr'][j])
            attr[i] = np.array(mat)
        else:
            attr[i] = None
        shift += nodes['count'][i]
    nodes['attr'] = attr
    return nodes

In [4]:
nodes = load_nodes()
nodes

{'total': 1000,
 'count': Counter({0: 384, 1: 616}),
 'attr': {0: array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.71546936, ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.7172692 , ..., 0.        , 0.        ,
          0.        ],
         ...,
         [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
          0.        ],
         [0.        , 0.        , 0.7081847 , ..., 0.        , 0.        ,
          0.        ]]),
  1: array([[0.        , 0.        , 0.        , ..., 0.07333138, 0.01263912,
          0.01970579],
         [0.        , 0.        , 0.        , ..., 0.07504918, 0.00998594,
          0.03200828],
         [0.        , 0.        , 0.        , ..., 0.06466646, 0.00431759,
          0.01475397],
         ...,
         [0.        ,

## load_links

In [5]:
def get_node_type(node_id):
    for i in range(len(nodes['shift'])):
        if node_id < nodes['shift'][i]+nodes['count'][i]:
            return i
        
def list_to_sp_mat(li):
    data = [x[2] for x in li]
    i = [x[0] for x in li]
    j = [x[1] for x in li]
    return sp.coo_matrix((data, (i,j)), shape=(nodes['total'], nodes['total'])).tocsr()

def load_links():
    """
    return links dict
        total: total number of links
        count: a dict of int, number of links for each type
        meta: a dict of tuple, explaining the link type is from what type of node to what type of node
        data: a dict of sparse matrices, each link type with one matrix. Shapes are all (nodes['total'], nodes['total'])
    """
    links = {'total':0, 'count':Counter(), 'meta':{}, 'data':defaultdict(list)}
    with open(os.path.join(path, 'link.dat'), 'r', encoding='utf-8') as f:
        for line in f:
            th = line.split('\t')
            h_id, t_id, r_id, link_weight = int(th[0]), int(th[1]), int(th[2]), float(th[3])
            if r_id not in links['meta']:
                h_type = get_node_type(h_id)
                t_type = get_node_type(t_id)
                links['meta'][r_id] = (h_type, t_type)
            links['data'][r_id].append((h_id, t_id, link_weight))
            links['count'][r_id] += 1
            links['total'] += 1
    new_data = {}
    for r_id in links['data']:
        new_data[r_id] = list_to_sp_mat(links['data'][r_id])
    links['data'] = new_data
    return links

In [6]:
links = load_links()
links

{'total': 1458,
 'count': Counter({0: 735, 1: 723}),
 'meta': {0: (1, 0), 1: (0, 1)},
 'data': {0: <1000x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 624 stored elements in Compressed Sparse Row format>,
  1: <1000x1000 sparse matrix of type '<class 'numpy.float64'>'
  	with 653 stored elements in Compressed Sparse Row format>}}

## load_labels

In [7]:
def load_labels(name):
    """
    return labels dict
        num_labels: total number of labels
        total: total number of labeled data
        count: number of labeled data for each node type
        data: a numpy matrix with shape (self.nodes['total'], self.labels['num_labels'])
        mask: to indicate if that node is labeled, if False, that line of data is masked
    """
    labels = {'num_labels':0, 'total':0, 'count':Counter(), 'data':None, 'mask':None}
    nl = 2
    mask = np.zeros(nodes['total'], dtype=bool)
    data = [[0.0, 0.0] for i in range(nodes['total'])]
    with open(os.path.join(path, name), 'r', encoding='utf-8') as f:
        for line in f:
            th = line.split('\t')
            node_id, node_type, node_label = int(th[0]), int(th[1]), list(map(float, th[2].split(',')))
            mask[node_id] = True
            data[node_id] = node_label
            labels['count'][node_type] += 1
            labels['total'] += 1
    labels['num_labels'] = nl
    labels['data'] = data
    labels['mask'] = mask
    return labels

In [8]:
labels_train = load_labels('label.dat')
labels_train['data']

[[0.0243, 0.0],
 [0.0, 0.0],
 [0.0, 1.72],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [30.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 6.425],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [24.075, 1190.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [11.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 1.1],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0],
 [0.0, 0.0]

In [9]:
from data_loader import data_loader
dl = data_loader('../../data/{}'.format('iYO844'))

In [10]:
features = []
for i in range(len(dl.nodes['count'])):
    th = dl.nodes['attr'][i]
    if th is None:
        features.append(sp.eye(dl.nodes['count'][i]))
    else:
        features.append(th)
        
adjM = sum(dl.links['data'].values())
labels = np.zeros((dl.nodes['count'][0], dl.labels_train['num_labels']), dtype=float)
val_ratio = 0.2
train_idx = np.nonzero(dl.labels_train['mask'])[0]
np.random.shuffle(train_idx)
split = int(train_idx.shape[0]*val_ratio)
val_idx = train_idx[:split]
train_idx = train_idx[split:]
train_idx = np.sort(train_idx)
val_idx = np.sort(val_idx)
test_idx = np.nonzero(dl.labels_test['mask'])[0]
labels[train_idx] = dl.labels_train['data'][train_idx]
labels[val_idx] = dl.labels_train['data'][val_idx]

In [11]:
data = dl.labels_train['data']
# np.array(data)
data

array([[0.0243, 0.    ],
       [0.    , 0.    ],
       [0.    , 1.72  ],
       ...,
       [0.    , 0.    ],
       [0.    , 0.    ],
       [0.    , 0.    ]])

In [12]:
train_idx

array([  0,   2,  31,  37,  47,  50,  67,  79,  88,  89, 107, 108, 110,
       118, 126, 131, 135, 155, 171, 178, 185, 193, 195, 211, 232, 237,
       240, 248, 262, 266, 276, 283, 288, 290, 296, 325, 332, 334, 352,
       366, 367, 374, 375, 382])

In [13]:
dl.labels_train['data']

array([[0.0243, 0.    ],
       [0.    , 0.    ],
       [0.    , 1.72  ],
       ...,
       [0.    , 0.    ],
       [0.    , 0.    ],
       [0.    , 0.    ]])

In [14]:
from data_loader import data_loader
from utils import load_data, mat2tensor, regression_loss

In [15]:
features_list, adjM, labels, train_val_test_idx, dl = load_data('iYO844')



TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
def load_data(prefix='iYO844'):
    from scripts.data_loader import data_loader
    dl = data_loader('../../data/{}'.format(prefix))
    features = []
    for i in range(len(dl.nodes['count'])):
        th = dl.nodes['attr'][i]
        if th is None:
            features.append(sp.eye(dl.nodes['count'][i]))
        else:
            features.append(th)
    adjM = sum(dl.links['data'].values())
    labels = np.zeros((dl.nodes['count'][0], dl.labels_train['num_labels']), dtype=float)
    val_ratio = 0.2
    train_idx = np.nonzero(dl.labels_train['mask'])[0]
    np.random.shuffle(train_idx)
    split = int(train_idx.shape[0]*val_ratio)
    val_idx = train_idx[:split]
    train_idx = train_idx[split:]
    train_idx = np.sort(train_idx)
    val_idx = np.sort(val_idx)
    test_idx = np.nonzero(dl.labels_test['mask'])[0]
    labels[train_idx] = dl.labels_train['data'][train_idx]
    labels[val_idx] = dl.labels_train['data'][val_idx]
    # labels[test_idx] = dl.labels_test['data'][test_idx]
    # if prefix != 'IMDB':
    #     labels = labels.argmax(axis=1)
    train_val_test_idx = {}
    train_val_test_idx['train_idx'] = train_idx
    train_val_test_idx['val_idx'] = val_idx
    train_val_test_idx['test_idx'] = test_idx
    
    return features,\
           adjM, \
           labels,\
           train_val_test_idx,\
            dl
            
features_list, adjM, labels, train_val_test_idx, dl = load_data('iYO844')



KeyError: 'num_classes'



KeyError: 'num_classes'

In [None]:
dl

<data_loader.data_loader at 0x7ff43eac1cd0>

In [None]:
from utils import load_data, mat2tensor, regression_loss

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
features_list = [mat2tensor(features).to(device) for features in features_list]

In [None]:
features_list

[tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.7155,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.7173,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.7082,  ..., 0.0000, 0.0000, 0.0000]]),
 tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0733, 0.0126, 0.0197],
         [0.0000, 0.0000, 0.0000,  ..., 0.0750, 0.0100, 0.0320],
         [0.0000, 0.0000, 0.0000,  ..., 0.0647, 0.0043, 0.0148],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0851, 0.0036, 0.0370],
         [0.0000, 0.0000, 0.0000,  ..., 0.0644, 0.0120, 0.0240],
         [0.0000, 0.0000, 0.0000,  ..., 0.0771, 0.0129, 0.0293]])]

In [None]:
in_dims = [features.shape[1] for features in features_list]

In [None]:
in_dims

[384, 384]

In [None]:
labels = torch.FloatTensor(labels).to(device)
train_idx = train_val_test_idx['train_idx']
train_idx = np.sort(train_idx)
val_idx = train_val_test_idx['val_idx']
val_idx = np.sort(val_idx)
test_idx = train_val_test_idx['test_idx']
test_idx = np.sort(test_idx)

In [None]:
import dgl

Using backend: pytorch


In [None]:
g = dgl.from_scipy(adjM+(adjM.T))
g = dgl.remove_self_loop(g)
g = dgl.add_self_loop(g)
g = g.to(device)

In [None]:
g

Graph(num_nodes=1000, num_edges=3506,
      ndata_schemes={}
      edata_schemes={})

In [None]:
num_labels = dl.labels_train['num_labels']
num_labels

2

In [None]:
from model import GCN, GAT

In [None]:
hidden_dim = 64
num_layers = 1
dropout = 0.5

In [None]:
net = GCN(g, in_dims, hidden_dim, num_labels, num_layers, F.elu, dropout)

In [None]:
net.to(device)

GCN(
  (layers): ModuleList(
    (0): GraphConv(in=64, out=64, normalization=both, activation=<function elu at 0x7ff43d8aaf70>)
    (1): GraphConv(in=64, out=2, normalization=both, activation=None)
  )
  (fc_list): ModuleList(
    (0): Linear(in_features=384, out_features=64, bias=True)
    (1): Linear(in_features=384, out_features=64, bias=True)
  )
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
logits = net(features_list)

In [None]:
print(logits[train_idx].shape)
logits[train_idx]

torch.Size([44, 2])


tensor([[-1.8743, -0.4848],
        [-1.0639, -0.5699],
        [-1.4826,  1.0544],
        [-0.9992, -0.4515],
        [ 0.1631,  1.8237],
        [-0.5169, -0.2541],
        [-0.3441,  0.1454],
        [-0.1851,  0.3012],
        [-0.2801,  0.2937],
        [-0.5601,  0.3947],
        [-1.4253,  0.7481],
        [-1.2102,  0.7528],
        [-0.6537, -0.1294],
        [-0.2530,  0.4410],
        [-0.5582,  0.8547],
        [-1.2973,  0.5710],
        [-2.7355,  1.3869],
        [-0.8768, -0.0272],
        [-0.0357,  1.3320],
        [-1.7878, -0.9200],
        [-0.6594, -1.0535],
        [-1.2012, -0.2564],
        [-1.3059, -0.4519],
        [-1.4498,  1.0103],
        [-1.0474, -0.1075],
        [-1.8103,  0.5304],
        [-0.1332, -0.5179],
        [-1.0840, -0.0540],
        [-1.3243,  1.0313],
        [-1.4399,  1.4311],
        [-0.8190,  0.4558],
        [-0.4589,  0.4048],
        [-3.1639, -0.6956],
        [-2.1997, -0.1904],
        [ 0.0583,  0.4077],
        [-0.8327,  0

In [None]:
print(labels[train_idx].shape)
labels[train_idx]

torch.Size([44, 2])


tensor([[2.4300e-02, 0.0000e+00],
        [0.0000e+00, 1.7200e+00],
        [3.0000e+01, 0.0000e+00],
        [0.0000e+00, 6.4250e+00],
        [2.4075e+01, 1.1900e+03],
        [1.1000e+01, 0.0000e+00],
        [0.0000e+00, 1.1000e+00],
        [5.0000e+00, 0.0000e+00],
        [1.0000e+00, 0.0000e+00],
        [0.0000e+00, 2.3200e+00],
        [1.2000e-02, 8.3000e+01],
        [8.9000e-03, 0.0000e+00],
        [0.0000e+00, 5.3000e+00],
        [9.0000e-01, 0.0000e+00],
        [0.0000e+00, 2.5950e-01],
        [2.1000e+00, 0.0000e+00],
        [8.0000e+00, 0.0000e+00],
        [4.2880e-01, 4.9657e+03],
        [8.7000e+00, 0.0000e+00],
        [0.0000e+00, 1.1000e+01],
        [2.0200e+00, 6.2600e+00],
        [5.9000e-02, 0.0000e+00],
        [3.3000e-02, 0.0000e+00],
        [4.0250e-02, 2.5775e+02],
        [5.0000e+00, 4.8500e+01],
        [1.1400e-01, 6.4800e+01],
        [1.4000e-01, 0.0000e+00],
        [0.0000e+00, 2.2950e-01],
        [3.0000e+00, 0.0000e+00],
        [0.000