# Mount drive

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

!ls /content/gdrive/My\ Drive

Mounted at /content/gdrive
 aimas2020
'Automatic Generation of Topic Labels.gslides'
'Colab Notebooks'
 cvdl2020
 iir_book.pdf
 ir_final
'Medical AI'
'Paper Slides'
 Q56094077
 res18_diabete_noaug.pth
'Towards Better Text Understanding and Retrieval through Kernel Entity Saliency Modeling.gslides'
 tsai.ipynb
 獎助學金
 申請資料


In [None]:
# !unzip /content/gdrive/MyDrive/Q56094077/snrs/hw1_0319/hw1_data.zip -d /content/gdrive/MyDrive/Q56094077/snrs/hw1_0319

# Import Library

In [20]:
import os

import torch
import torch.nn as nn

import pandas as pd
import numpy as np

from tqdm import tqdm

In [7]:
import torch_geometric
from torch_geometric.data import Data, DataLoader
import torch_geometric.utils as utils

# Define constant

In [8]:
_root = os.getcwd()

_data = os.path.join(_root, "hw1_data")

data_synthetic = os.path.join(_data, "Synthetic", "5000")
data_youtube = os.path.join(_data, "youtube")

# Dataset

## Data

- data.x	节点特征，维度是[num_nodes, num_node_features]。
- data.edge_index	维度是[2, num_edges]，描述图中节点的关联关系，每一列对应的两个元素，分别是边的起点和重点。数据类型是torch.long。需要注意的是，data.edge_index是定义边的节点的张量（tensor），而不是节点的列表（list）。
- data.edge_attr	边的特征矩阵，维度是[num_edges, num_edge_features]
- data.y	训练目标（维度可以是任意的）。对于节点相关的任务，维度为[num_nodes, *]；对于图相关的任务，维度为[1,*]。
- data.position	节点位置矩阵（Node position matrix），维度为[num_nodes, num_dimensions]。

- [Learning to Identify High Betweenness Centrality Nodes from
Scratch: A Novel Graph Neural Network Approach](https://arxiv.org/pdf/1905.10418.pdf)
- node initial feature = [$(d_v), 1, 1]

In [223]:
synthetic = []
between = []
for f in os.listdir(data_synthetic):
    if "score" in f:
        # ground truth of betweenness centrality
        p = os.path.join(data_synthetic, f)
        between.append(p)
    else:
        p = os.path.join(data_synthetic, f)
        synthetic.append(p)

between.sort()
synthetic.sort()

In [265]:
batch = 1

data_list = []

for index, f in enumerate(synthetic):
    edge_index = torch_geometric.io.read_txt_array(f, dtype=torch.long)
    edge_index = edge_index.t().contiguous()
    edge_index = utils.to_undirected(edge_index)

    row, col = edge_index  
    deg = utils.degree(col) # must use col to get degree, why?
    deg = deg.numpy()  

    vertice = []
    for d in deg:
        vertice.append([d, 1, 1])
    vertice = np.array(vertice, dtype=np.float)
    vertice = torch.from_numpy(vertice)
    
    ### between centrality
    bcs = []
    bc = torch_geometric.io.read_txt_array(between[index], dtype=torch.double)
    bc = bc.t().contiguous()
    row, col = bc
    bc = col
    bc = bc.numpy()
    for b in bc:
        bcs.append([b])

#     bcs = np.array(bcs)
    data = Data(x=vertice, edge_index=edge_index, y=bcs)

    data_list.append(data)

loader = DataLoader(data_list, batch_size=batch)
# print(loader)

# Model

In [266]:
from torch_geometric.nn import MessagePassing
import torch.nn.functional as F
from torch_geometric.nn import global_max_pool
from torch_geometric.typing import Adj, OptTensor
from torch_geometric.transforms import Distance

In [267]:
class Net(MessagePassing):
    def __init__(self, c, p, q, num_layers, aggr="add"):
        super(Net, self).__init__(aggr=aggr)
        
        self.num_layers = num_layers
        self.w_0 = torch.nn.Linear(in_features=c, out_features=p).double()
        
        self.rnn = torch.nn.GRUCell(p, p).double()
  
        self.w_4 = torch.nn.Linear(in_features=p, out_features=q).double()
        self.w_5 = torch.nn.Linear(in_features=q, out_features=1).double()

    def forward(self, data):
        
        x, edge_index = data.x, data.edge_index
        
        # h_0 = x

        # h_1
        x = self.w_0(x)
        x = F.normalize(x, p=2, dim=1)
        
        row, col = edge_index
        deg = utils.degree(col, x.size(0), dtype=x.dtype)
        deg = torch.add(deg, 1)
        deg_inv_sqrt = torch.pow(deg, -0.5)
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        h_s = [x]
        
        
        for i in range(self.num_layers-1):
            # internally calls the message(), aggregate() and update() functions
            m = self.propagate(edge_index, x=x, norm=norm)
            x = self.rnn(m, x)
            x = F.normalize(x, p=2, dim=1) 
           
            h_s.append(x)
        
        h_s = torch.stack(h_s)
#         print(h_s.shape)
        z = global_max_pool(h_s, torch.tensor([0], dtype=torch.long).to(device))
        
        ### Decoder
        z = self.w_4(z)
        z = F.relu(z)
        z = self.w_5(z)
        
        return z

    def message(self, x_j, norm: OptTensor):
        return x_j if norm is None else norm.view(-1, 1) * x_j
    

## Params

In [285]:
depth = 5
p = 128 # embedding dimension of hidden state
q = int(p/2)

epochs = 1000
model_save = os.path.join(_root, "weight.pth")

In [286]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:1


In [287]:
model = Net(c=3, p=p, q=q, num_layers=depth).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=5e-4)

criterion = torch.nn.BCEWithLogitsLoss()

In [None]:
for epoch in range(epochs):
    
#     bce_loss = 0.0
    graph_cnt = 0
    for data in tqdm(loader):
        
        optimizer.zero_grad()
        
        data = data.to(device)
        bc_pr = model(data)
        bc_gt = data.y
        bc_gt = np.array(bc_gt)
        
        picked = [(torch.rand(25000, 2) * 4999).long()]
        for b in range(batch-1):
            picked.append((torch.rand(25000, 2) * 4999).long())
        
        bce_loss = torch.tensor(0, dtype=torch.float).to(device)
        for b in range(batch):
            index = picked[b]
            for i in range(len(index)):
                s1, s2 = index[i]
                
                y_gt = bc_gt[b][s2] - bc_gt[b][s1]
                y_pr = bc_pr[b][s2] - bc_pr[b][s1]
                
                y_gt = torch.from_numpy(y_gt).to(device)
                loss = criterion(y_pr, y_gt)
                bce_loss += loss
                
#         bce_loss += data.num_graphs * loss.item()
        graph_cnt += data.num_graphs
        
        bce_loss.backward()
        optimizer.step()
        
    print("Epoch = {}, loss = {}".format(epoch, bce_loss.item()/graph_cnt))
    
    
    checkpoint = {
        'model_stat': model.state_dict(),
        'optimizer_stat': optimizer.state_dict(),
    }
    torch.save(checkpoint, model_save)

 13%|█▎        | 4/30 [00:39<04:15,  9.85s/it]

In [277]:
# test fit model
predict = model(data.to(device))
predict.shape


torch.Size([1, 5000, 1])

### Sampling nodes

In our experiments, we randomly sample 5|V | source nodes and 5|V |
target nodes with replacement

In [274]:
picked = (torch.rand(25000, 2) * 4999).long()
for b in range(batch-1):
    picked = torch.stack((picked, (torch.rand(25000, 2) * 4999).long()))
    
picked.shape

torch.Size([25000, 2])

In [275]:
for data in loader:
    print(data)

Batch(batch=[5000], edge_index=[2, 39964], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39962], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39960], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39966], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39966], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39968], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39964], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39968], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39964], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39962], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39968], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39962], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39960], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39966], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39964], x=[5000, 3], y=[1])
Batch(batch=[5000], edge_index=[2, 39964], x=[5000, 3],

In [192]:
(loader[0].y-loader.y[1]).shape

AttributeError: 'DataLoader' object has no attribute 'y'