In [92]:
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

In [93]:
from utils import read_test1_data
from utils import gen_graph
from utils import prepare_synthetic
from utils import shuffle_graph
from utils import preprocessing_data
from utils import get_pairwise_ids

from utils import prepare_test1
from utils import top_n_acc

In [94]:
RANDOM_STATE = 11
SYNTHETIC_NUM = 50
# SYNTHETIC_NUM = 1000

# number of gen nodes
# NUM_MIN = 4000
# NUM_MAX = 4001
NUM_MIN = 200
NUM_MAX = 201


MAX_EPOCHS = 10000
LEARNING_RATE = 1e-4
EMBEDDING_SIZE = 128
DEPTH = 5
BATCH_SIZE = 16
# BATCH_SIZE = 1

TEST1_NUM = 1

## Read Graph

In [95]:
test1_g, test1_bc, test1_edgeindex = read_test1_data(0)

## Generate Synthetic Graph

In [96]:
train_g = gen_graph(500, 501)
print(len(train_g.edges()))

1984


In [23]:
train_g.degree

DegreeView({0: 48, 1: 42, 2: 2, 3: 41, 4: 70, 5: 44, 6: 74, 7: 40, 8: 37, 9: 30, 10: 23, 11: 20, 12: 22, 13: 25, 14: 13, 15: 21, 16: 46, 17: 44, 18: 18, 19: 25, 20: 19, 21: 25, 22: 45, 23: 14, 24: 30, 25: 22, 26: 19, 27: 19, 28: 24, 29: 20, 30: 19, 31: 19, 32: 9, 33: 10, 34: 10, 35: 6, 36: 7, 37: 22, 38: 10, 39: 8, 40: 11, 41: 12, 42: 13, 43: 19, 44: 7, 45: 11, 46: 17, 47: 7, 48: 9, 49: 12, 50: 14, 51: 12, 52: 11, 53: 14, 54: 6, 55: 12, 56: 9, 57: 9, 58: 13, 59: 6, 60: 13, 61: 7, 62: 20, 63: 7, 64: 17, 65: 20, 66: 11, 67: 8, 68: 10, 69: 11, 70: 15, 71: 15, 72: 13, 73: 21, 74: 18, 75: 9, 76: 11, 77: 9, 78: 6, 79: 9, 80: 10, 81: 16, 82: 22, 83: 6, 84: 16, 85: 6, 86: 8, 87: 7, 88: 19, 89: 17, 90: 10, 91: 12, 92: 10, 93: 6, 94: 16, 95: 11, 96: 8, 97: 8, 98: 12, 99: 9, 100: 5, 101: 6, 102: 4, 103: 8, 104: 8, 105: 14, 106: 6, 107: 5, 108: 8, 109: 4, 110: 6, 111: 8, 112: 8, 113: 7, 114: 5, 115: 14, 116: 15, 117: 16, 118: 6, 119: 10, 120: 5, 121: 9, 122: 5, 123: 8, 124: 7, 125: 9, 126: 5, 127:

In [22]:
[train_g.degree(i) for i in range(train_g.number_of_nodes())]

[48,
 42,
 2,
 41,
 70,
 44,
 74,
 40,
 37,
 30,
 23,
 20,
 22,
 25,
 13,
 21,
 46,
 44,
 18,
 25,
 19,
 25,
 45,
 14,
 30,
 22,
 19,
 19,
 24,
 20,
 19,
 19,
 9,
 10,
 10,
 6,
 7,
 22,
 10,
 8,
 11,
 12,
 13,
 19,
 7,
 11,
 17,
 7,
 9,
 12,
 14,
 12,
 11,
 14,
 6,
 12,
 9,
 9,
 13,
 6,
 13,
 7,
 20,
 7,
 17,
 20,
 11,
 8,
 10,
 11,
 15,
 15,
 13,
 21,
 18,
 9,
 11,
 9,
 6,
 9,
 10,
 16,
 22,
 6,
 16,
 6,
 8,
 7,
 19,
 17,
 10,
 12,
 10,
 6,
 16,
 11,
 8,
 8,
 12,
 9,
 5,
 6,
 4,
 8,
 8,
 14,
 6,
 5,
 8,
 4,
 6,
 8,
 8,
 7,
 5,
 14,
 15,
 16,
 6,
 10,
 5,
 9,
 5,
 8,
 7,
 9,
 5,
 6,
 9,
 10,
 8,
 5,
 6,
 7,
 7,
 9,
 15,
 8,
 6,
 11,
 11,
 8,
 6,
 8,
 8,
 7,
 13,
 8,
 7,
 10,
 10,
 4,
 11,
 7,
 8,
 7,
 11,
 9,
 11,
 4,
 9,
 8,
 4,
 8,
 7,
 6,
 5,
 6,
 8,
 11,
 9,
 4,
 6,
 6,
 7,
 6,
 6,
 6,
 8,
 4,
 14,
 4,
 8,
 6,
 9,
 7,
 7,
 7,
 10,
 13,
 6,
 5,
 8,
 5,
 7,
 5,
 4,
 7,
 5,
 7,
 4,
 5,
 9,
 6,
 7,
 8,
 4,
 8,
 5,
 10,
 4,
 5,
 5,
 6,
 5,
 6,
 6,
 6,
 7,
 6,
 5,
 4,
 6,
 9,
 5,
 7,
 4,

In [97]:
# nx.betweenness_centrality(train_g)

In [98]:
(np.array(list(train_g.edges())) + 100)[:10]

array([[100, 104],
       [100, 108],
       [100, 109],
       [100, 113],
       [100, 114],
       [100, 115],
       [100, 119],
       [100, 122],
       [100, 130],
       [100, 150]])

In [99]:
# nx.betweenness_centrality(train_g)

## DrBC

In [100]:
from scipy import stats
# from model1 import DrBC
from model import DrBC
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [101]:
model = DrBC().to(device)
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum')

In [102]:
model.parameters

<bound method Module.parameters of DrBC(
  (linear0): Linear(in_features=3, out_features=128, bias=True)
  (gcn): GCNConv()
  (gru): GRUCell(128, 128)
  (mlp): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=1, bias=True)
  )
)>

In [103]:
# list(model.parameters())[9].grad.data

In [104]:
pm = list(model.parameters())

for i, p in enumerate(pm):
    print(f"pm{i} shape: {p.shape}")

pm0 shape: torch.Size([128, 3])
pm1 shape: torch.Size([128])
pm2 shape: torch.Size([384, 128])
pm3 shape: torch.Size([384, 128])
pm4 shape: torch.Size([384])
pm5 shape: torch.Size([384])
pm6 shape: torch.Size([64, 128])
pm7 shape: torch.Size([64])
pm8 shape: torch.Size([1, 64])
pm9 shape: torch.Size([1])


In [105]:
# list(dict(nx.degree(train_g)).values())
# list(dict(nx.degree(train_g)).values())
# list(dict(nx.betweenness_centrality(train_g)).values())

In [106]:
def validate(model, v_data):
    model.eval()
    total_acc = 0.
    total_kendall = 0.
    for val_X, val_y, val_edge_index in v_data:
        val_X, val_edge_index = val_X.to(device), val_edge_index.to(device)
        
        with torch.no_grad():
            val_y_pred = model(val_X, val_edge_index)

        # print('val_X: ', val_X[:5])
        # print('val_edge_list: ', val_edge_index[:, -5:])
        # print(val_edge_index.shape)
        # print('pred_y: ', val_y_pred[:5])
        print('val_y: ', val_y[:5])
        # return
        val_y_pred = val_y_pred.cpu().detach().numpy()
        val_y = val_y.detach().numpy()

        pred_index = val_y_pred.argsort()[::-1]
        true_index = val_y.argsort()[::-1]
        
        acc = top_n_acc(pred_index, true_index)
        kendall_t, _ = stats.kendalltau(val_y_pred, val_y)
        print(val_y_pred[:3], val_y[:5])

        total_acc += acc
        total_kendall += kendall_t

    total_acc /= len(v_data)
    total_kendall /= len(v_data)
    return total_acc, total_kendall
    

def train(model, optim, loss_fn, epochs:int):
    g_list, dg_list, bc_list  = prepare_synthetic(SYNTHETIC_NUM, (NUM_MIN, NUM_MAX))
    v_data = prepare_test1(TEST1_NUM)
    
    ls_metric = []
    batch_cnt = len(g_list) // BATCH_SIZE
    for e in range(epochs + 1):
        model.train()
        g_list, dg_list, bc_list = shuffle_graph(g_list, dg_list, bc_list)
        batch_bar = tqdm(range(batch_cnt))
        batch_bar.set_description(f'Epochs {e:<5}')
        train_loss = 0
        pair_cnt = 0
        for i in batch_bar:
            # batch
            s_index, e_index = i*BATCH_SIZE, (i+1)*BATCH_SIZE
            train_g, train_dg, train_bc = g_list[s_index: e_index], dg_list[s_index: e_index], bc_list[s_index: e_index]
            X, y, edge_index = preprocessing_data(train_g, train_dg, train_bc)
            X, y, edge_index = X.to(device), y.to(device), edge_index.to(device)
            out = model(X, edge_index)

            # pairwise-loss
            s_ids, t_ids = get_pairwise_ids(train_g)
            out_diff = out[s_ids] - out[t_ids]
            y_diff = y[s_ids] - y[t_ids]
            loss = loss_fn(out_diff, torch.sigmoid(y_diff))

            # optim
            optim.zero_grad()
            loss.backward()
            optim.step()

            pair_cnt += s_ids.shape[0]
            train_loss += (loss.item() * s_ids.shape[0])
            if i == (batch_cnt - 1):
                # last batch
                train_loss /= pair_cnt
                batch_bar.set_postfix(loss=round(train_loss, 6)) 

        if e % 50 == 0:
            # print('out: ', out[:10])
            val_acc, val_kendall = validate(model, v_data)
            ls_metric.append([e, val_acc, val_kendall])
            print(f"Val Acc: {val_acc * 100:.4f} % | Val KendallTau: {val_kendall:.4f}")
        

_ = train(model, optim, loss_fn, 200)

[Generating new training graph]: 100%|██████████| 50/50 [00:05<00:00,  8.79it/s]
[Reading test1 graph]: 100%|██████████| 1/1 [00:00<00:00,  7.61it/s]
Epochs 0    : 100%|██████████| 3/3 [00:00<00:00, 46.53it/s, loss=1.11e+4]


val_y:  tensor([0.0942, 0.0540, 0.0443, 0.0223, 0.0764])
[0.1201888  0.12030778 0.1203984 ] [0.09417453 0.0539708  0.04434366 0.02232567 0.07643765]
Val Acc: 0.0000 % | Val KendallTau: -0.5688


Epochs 1    : 100%|██████████| 3/3 [00:00<00:00, 53.49it/s, loss=1.11e+4]
Epochs 2    : 100%|██████████| 3/3 [00:00<00:00, 51.24it/s, loss=1.11e+4]
Epochs 3    : 100%|██████████| 3/3 [00:00<00:00, 52.59it/s, loss=1.11e+4]
Epochs 4    : 100%|██████████| 3/3 [00:00<00:00, 51.28it/s, loss=1.11e+4]
Epochs 5    : 100%|██████████| 3/3 [00:00<00:00, 50.04it/s, loss=1.11e+4]
Epochs 6    : 100%|██████████| 3/3 [00:00<00:00, 51.67it/s, loss=1.11e+4]
Epochs 7    : 100%|██████████| 3/3 [00:00<00:00, 49.42it/s, loss=1.11e+4]
Epochs 8    : 100%|██████████| 3/3 [00:00<00:00, 50.35it/s, loss=1.11e+4]
Epochs 9    : 100%|██████████| 3/3 [00:00<00:00, 52.23it/s, loss=1.11e+4]
Epochs 10   : 100%|██████████| 3/3 [00:00<00:00, 50.67it/s, loss=1.11e+4]
Epochs 11   : 100%|██████████| 3/3 [00:00<00:00, 49.96it/s, loss=1.11e+4]
Epochs 12   : 100%|██████████| 3/3 [00:00<00:00, 48.73it/s, loss=1.11e+4]
Epochs 13   : 100%|██████████| 3/3 [00:00<00:00, 50.55it/s, loss=1.11e+4]
Epochs 14   : 100%|██████████| 3/3 [00

val_y:  tensor([0.0942, 0.0540, 0.0443, 0.0223, 0.0764])
[0.13755865 0.13730532 0.13711198] [0.09417453 0.0539708  0.04434366 0.02232567 0.07643765]
Val Acc: 94.0000 % | Val KendallTau: 0.7397


Epochs 51   : 100%|██████████| 3/3 [00:00<00:00, 55.10it/s, loss=1.11e+4]
Epochs 52   : 100%|██████████| 3/3 [00:00<00:00, 55.22it/s, loss=1.11e+4]
Epochs 53   : 100%|██████████| 3/3 [00:00<00:00, 52.12it/s, loss=1.11e+4]
Epochs 54   : 100%|██████████| 3/3 [00:00<00:00, 50.62it/s, loss=1.11e+4]
Epochs 55   : 100%|██████████| 3/3 [00:00<00:00, 47.53it/s, loss=1.11e+4]
Epochs 56   : 100%|██████████| 3/3 [00:00<00:00, 49.79it/s, loss=1.11e+4]
Epochs 57   : 100%|██████████| 3/3 [00:00<00:00, 45.81it/s, loss=1.11e+4]
Epochs 58   : 100%|██████████| 3/3 [00:00<00:00, 55.12it/s, loss=1.11e+4]
Epochs 59   : 100%|██████████| 3/3 [00:00<00:00, 52.96it/s, loss=1.11e+4]
Epochs 60   : 100%|██████████| 3/3 [00:00<00:00, 54.68it/s, loss=1.11e+4]
Epochs 61   : 100%|██████████| 3/3 [00:00<00:00, 52.36it/s, loss=1.11e+4]
Epochs 62   : 100%|██████████| 3/3 [00:00<00:00, 51.02it/s, loss=1.11e+4]
Epochs 63   : 100%|██████████| 3/3 [00:00<00:00, 50.88it/s, loss=1.11e+4]
Epochs 64   : 100%|██████████| 3/3 [00

val_y:  tensor([0.0942, 0.0540, 0.0443, 0.0223, 0.0764])
[0.15483494 0.15361322 0.15267663] [0.09417453 0.0539708  0.04434366 0.02232567 0.07643765]
Val Acc: 98.0000 % | Val KendallTau: 0.4487


Epochs 101  : 100%|██████████| 3/3 [00:00<00:00, 46.39it/s, loss=1.11e+4]
Epochs 102  : 100%|██████████| 3/3 [00:00<00:00, 42.24it/s, loss=1.11e+4]
Epochs 103  : 100%|██████████| 3/3 [00:00<00:00, 43.76it/s, loss=1.11e+4]
Epochs 104  : 100%|██████████| 3/3 [00:00<00:00, 47.05it/s, loss=1.11e+4]
Epochs 105  : 100%|██████████| 3/3 [00:00<00:00, 46.03it/s, loss=1.11e+4]
Epochs 106  : 100%|██████████| 3/3 [00:00<00:00, 50.23it/s, loss=1.11e+4]
Epochs 107  : 100%|██████████| 3/3 [00:00<00:00, 48.66it/s, loss=1.11e+4]
Epochs 108  : 100%|██████████| 3/3 [00:00<00:00, 50.96it/s, loss=1.11e+4]
Epochs 109  : 100%|██████████| 3/3 [00:00<00:00, 48.22it/s, loss=1.11e+4]
Epochs 110  : 100%|██████████| 3/3 [00:00<00:00, 51.04it/s, loss=1.11e+4]
Epochs 111  : 100%|██████████| 3/3 [00:00<00:00, 45.99it/s, loss=1.11e+4]
Epochs 112  : 100%|██████████| 3/3 [00:00<00:00, 50.04it/s, loss=1.11e+4]
Epochs 113  : 100%|██████████| 3/3 [00:00<00:00, 45.01it/s, loss=1.11e+4]
Epochs 114  : 100%|██████████| 3/3 [00

val_y:  tensor([0.0942, 0.0540, 0.0443, 0.0223, 0.0764])
[0.22101112 0.21601504 0.21199271] [0.09417453 0.0539708  0.04434366 0.02232567 0.07643765]
Val Acc: 98.0000 % | Val KendallTau: 0.6856


Epochs 151  : 100%|██████████| 3/3 [00:00<00:00, 46.91it/s, loss=1.11e+4]
Epochs 152  : 100%|██████████| 3/3 [00:00<00:00, 47.80it/s, loss=1.11e+4]
Epochs 153  : 100%|██████████| 3/3 [00:00<00:00, 45.84it/s, loss=1.11e+4]
Epochs 154  : 100%|██████████| 3/3 [00:00<00:00, 48.82it/s, loss=1.11e+4]
Epochs 155  : 100%|██████████| 3/3 [00:00<00:00, 41.96it/s, loss=1.11e+4]
Epochs 156  : 100%|██████████| 3/3 [00:00<00:00, 47.69it/s, loss=1.11e+4]
Epochs 157  : 100%|██████████| 3/3 [00:00<00:00, 43.30it/s, loss=1.11e+4]
Epochs 158  : 100%|██████████| 3/3 [00:00<00:00, 49.31it/s, loss=1.11e+4]
Epochs 159  : 100%|██████████| 3/3 [00:00<00:00, 48.47it/s, loss=1.11e+4]
Epochs 160  : 100%|██████████| 3/3 [00:00<00:00, 40.53it/s, loss=1.11e+4]
Epochs 161  : 100%|██████████| 3/3 [00:00<00:00, 47.25it/s, loss=1.11e+4]
Epochs 162  : 100%|██████████| 3/3 [00:00<00:00, 45.11it/s, loss=1.11e+4]
Epochs 163  : 100%|██████████| 3/3 [00:00<00:00, 46.34it/s, loss=1.11e+4]
Epochs 164  : 100%|██████████| 3/3 [00

val_y:  tensor([0.0942, 0.0540, 0.0443, 0.0223, 0.0764])
[0.23258547 0.22666341 0.22184072] [0.09417453 0.0539708  0.04434366 0.02232567 0.07643765]
Val Acc: 98.0000 % | Val KendallTau: 0.7020


In [110]:
# Read file
import urllib.request  

class readFile():
  def __init__(self,file):
    if file == 'y':
      url1 = 'https://raw.githubusercontent.com/emschenn/mlg_hw1/master/hw1_data/youtube/com-youtube.txt' 
      url2 = 'https://raw.githubusercontent.com/emschenn/mlg_hw1/master/hw1_data/youtube/com-youtube_score.txt' 
    else:
      url1 = 'https://raw.githubusercontent.com/emschenn/mlg_hw1/master/hw1_data/Synthetic/5000/' + file + '.txt'
      url2 = 'https://raw.githubusercontent.com/emschenn/mlg_hw1/master/hw1_data/Synthetic/5000/' + file + '_score.txt'
    self.bc_value,s_list,t_list,self.deg_list,n = [],[],[],[],0
    for line in urllib.request.urlopen(url2):
      _,v = line.decode('utf-8').split()
      self.bc_value.append([n,float(v)])
      n += 1
    for x in range(len(self.bc_value)):
      self.deg_list.append([0,1,1])
    for line in urllib.request.urlopen(url1):
      s,t = line.decode('utf-8').split()
      s,t = int(s),int(t)
      s_list.append(s)
      t_list.append(t)
      self.deg_list[s][0]+=1
      self.deg_list[t][0]+=1
    # self.edge_index=[s_list+t_list,t_list+s_list]
    self.edge_index=[s_list,t_list]

  def get_deg_list(self):
    # print(self.deg_list)
    return torch.Tensor(self.deg_list).cuda()

  def get_edge_index(self):
    # print(self.edge_index)
    return torch.tensor(self.edge_index,dtype=torch.long).cuda()

  def get_bc_value(self):
    # print(self.bc_value)
    return self.bc_value


In [113]:
# Evaluation
f = readFile('0')
model = model
t = f.get_deg_list()
t1 = f.get_edge_index()
print(t.shape, t1.shape)
with torch.no_grad():
  outs = model(t,t1)
  print('val_X: ', t[:5])
  print('val_edge_list: ', t1[:, -5:])
  print('pred_y: ', outs[:5])

# Top-N % accuracy
def takeSecond(elem):
    return elem[1]

def topN_accuracy(file,outs,n):
  predict_value,bc_value = [],[]
  for i,j in enumerate(outs.tolist()):
    predict_value.append([i,j])
  bc_value = f.get_bc_value()
  print('val_y: ', bc_value[:5])
  bc_value.sort(key = takeSecond,reverse = True)
  predict_value.sort(key = takeSecond,reverse = True)
  p,t = [],[]
  for x in range(int(len(predict_value)*n/100)):
    p.append(predict_value[x][0])
    t.append(bc_value[x][0])
  # print(t)
  # print(p)
  return(len(set(t)&set(p)) / len(p))

print(topN_accuracy(f,outs,n=1))
print(topN_accuracy(f,outs,n=5))
print(topN_accuracy(f,outs,n=10))

# Kendall tau
import scipy.stats as stats
def kendall_tau(file,outs):
  predict_value,bc_value = [],[]
  for i,j in enumerate(outs.tolist()):
    predict_value.append(j)
  for i in f.get_bc_value():
    bc_value.append(i[1])
  # print(predict_value)
  # print(bc_value)
  tau, _ = stats.kendalltau(predict_value, bc_value)
  print(predict_value[:3], bc_value[:3])
  print(f.get_bc_value()[:3])
  return(tau)

# def kendall_tau(file,outs):
#   predict_value,bc_value = [],[]
#   for i,j in enumerate(outs.tolist()):
#     predict_value.append(*j)
#   for i in file.get_bc_value():
#     bc_value.append(i[1])
#   # print(predict_value)
#   # print(bc_value)
#   tau, _ = stats.kendalltau(predict_value, bc_value)
#   return(tau)

print(kendall_tau(f,outs))

torch.Size([5000, 3]) torch.Size([2, 19982])
val_X:  tensor([[239.,   1.,   1.],
        [178.,   1.,   1.],
        [149.,   1.,   1.],
        [ 90.,   1.,   1.],
        [196.,   1.,   1.]], device='cuda:0')
val_edge_list:  tensor([[4823, 4828, 4844, 4870, 4937],
        [4987, 4968, 4849, 4928, 4953]], device='cuda:0')
pred_y:  tensor([0.2326, 0.2267, 0.2218, 0.2004, 0.2396], device='cuda:0')
val_y:  [[0, 0.09417453090592563], [1, 0.05397079661985897], [2, 0.04434365787783783], [3, 0.022325672571532364], [4, 0.0764376504965615]]
0.98
val_y:  [[0, 0.09417453090592563], [5, 0.092789552991686], [4, 0.0764376504965615], [1, 0.05397079661985897], [6, 0.05002370607942536]]
0.884
val_y:  [[0, 0.09417453090592563], [5, 0.092789552991686], [4, 0.0764376504965615], [1, 0.05397079661985897], [6, 0.05002370607942536]]
0.852
[0.23258547484874725, 0.22666341066360474, 0.2218407243490219] [0.09417453090592563, 0.092789552991686, 0.0764376504965615]
[[0, 0.09417453090592563], [5, 0.092789552991686

In [109]:
val_y:  tensor([0.0942, 0.0540, 0.0443, 0.0223, 0.0764])
[0.23258547 0.22666341 0.22184072] [0.09417453 0.0539708  0.04434366 0.02232567 0.07643765]

SyntaxError: invalid syntax (1758474167.py, line 1)

In [None]:
# g = _[2]
# g.degree(list(range(99, 105)))

## To-Do List
* (done) loss_fn 再加上 sigmoid
* (done) pairwise 目前跨圖了
* (done) h 要 normalized
* (done) aggregate 改成 MessagePassing
* (done) synthetic graph 後，shuffle graph 的順序
* (done) 加入 Epochs
* Metric: top1, 5, 10
* Metric: kendall tau distance
* wall-clock running time
* test step
* (done) change to leaky relu -> back to relu