In [1]:
import math
import os
import random
import time

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
from utils import read_test_data
from utils import gen_graph
from utils import prepare_synthetic
from utils import shuffle_graph
from utils import preprocessing_data
from utils import get_pairwise_ids

from utils import prepare_test
from utils import top_n_acc

In [3]:
# Seed for the global Python and NumPy random generators, applied right away so
# that everything executed after this cell starts from a known RNG state.
# NOTE(review): whether worker processes spawned by the parallel graph
# generation pick up this seed depends on utils — confirm.
RANDOM_STATE = 11
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Number of synthetic graphs handed to prepare_synthetic for training.
# SYNTHETIC_NUM = 16
SYNTHETIC_NUM = 1000


# number of gen nodes
# NUM_MIN = 4000
# NUM_MAX = 4001
NUM_MIN = 5000
NUM_MAX = 5001
# Parallel generation is requested only for graphs with at least 1000 nodes.
IS_PARALLEL = NUM_MIN >= 1000


# Optimisation hyper-parameters.
MAX_EPOCHS = 10000
LEARNING_RATE = 1e-4
EMBEDDING_SIZE = 128
DEPTH = 5
BATCH_SIZE = 16

# Argument passed to prepare_test when building the validation set.
TEST1_NUM = 30

# Directory prefix (with trailing slash) for checkpoints and metric CSVs.
MODEL_SAVED_PATH = "saved_model/"

## Read Graph

In [4]:
# Load test dataset 0 through utils.read_test_data.
# NOTE(review): the three results are presumably the graph, its betweenness
# centrality values and its edge index (inferred from the names) — confirm in utils.
test1_g, test1_bc, test1_edgeindex = read_test_data(0)

## Generate Synthetic Graph

In [5]:
# Generate a single synthetic graph as a quick sanity check.
# NOTE(review): (500, 501) is presumably a node-count range — confirm in utils.gen_graph.
train_g = gen_graph(500, 501)
# Show how many edges the generated graph contains.
print(len(train_g.edges()))

1983


In [6]:
# [train_g.degree(i) for i in range(train_g.number_of_nodes())]

In [7]:
# nx.betweenness_centrality(train_g)

In [8]:
# (np.array(list(train_g.edges())) + 100)[:10]

In [9]:
# nx.betweenness_centrality(train_g)

## DrBC

In [10]:
from scipy import stats
# from model1 import DrBC
from model import DrBC
import torch

# Seed torch before any model is instantiated so that weight initialisation is
# repeatable between runs (RANDOM_STATE comes from the configuration cell).
torch.manual_seed(RANDOM_STATE)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
# Instantiate the DrBC network and move it to the selected device.
model = DrBC().to(device)
# Adam optimiser over all model parameters with the configured learning rate.
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Binary cross-entropy on raw logits, averaged over all elements.
loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean')

In [12]:
# Displays the repr of the bound `parameters` method, which embeds the module
# summary (layer names and sizes) of the model.
model.parameters

<bound method Module.parameters of DrBC(
  (linear0): Linear(in_features=3, out_features=128, bias=True)
  (gcn): GCNConv()
  (gru): GRUCell(128, 128)
  (mlp): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=64, out_features=1, bias=True)
  )
)>

In [13]:
# list(model.parameters())[9].grad.data

In [14]:
# Materialise the parameter tensors once, then report the shape of each one.
pm = list(model.parameters())

for idx in range(len(pm)):
    print(f"pm{idx} shape: {pm[idx].shape}")

pm0 shape: torch.Size([128, 3])
pm1 shape: torch.Size([128])
pm2 shape: torch.Size([384, 128])
pm3 shape: torch.Size([384, 128])
pm4 shape: torch.Size([384])
pm5 shape: torch.Size([384])
pm6 shape: torch.Size([64, 128])
pm7 shape: torch.Size([64])
pm8 shape: torch.Size([1, 64])
pm9 shape: torch.Size([1])


In [15]:
def validate(model, v_data):
    """Evaluate `model` on every graph in `v_data` and average the metrics.

    Args:
        model: network called as ``model(X, edge_index)``; switched to eval mode.
        v_data: sized iterable of ``(X, y, edge_index)`` tensor triples, one per graph.

    Returns:
        Tuple ``(acc1, acc5, acc10, kendall, seconds)``: the mean top-1/5/10
        accuracy as computed by ``top_n_acc`` and the mean Kendall tau (each
        rounded to 6 decimals), plus the elapsed wall-clock time rounded to
        2 decimals.
    """
    model.eval()
    # Running sums in the order: top-1, top-5, top-10, Kendall tau.
    sums = [0., 0., 0., 0.]
    began = time.time()
    for feats, target, edges in v_data:
        feats, edges = feats.to(device), edges.to(device)

        # Inference only: no gradient bookkeeping needed.
        with torch.no_grad():
            scores = model(feats, edges)

        scores = scores.cpu().detach().numpy()
        target = target.detach().numpy()

        # Indices sorted from highest to lowest score.
        pred_rank = scores.argsort()[::-1]
        true_rank = target.argsort()[::-1]

        per_graph = [top_n_acc(pred_rank, true_rank, n=k) for k in (1, 5, 10)]
        tau, _ = stats.kendalltau(scores, target)
        per_graph.append(tau)

        for slot, value in enumerate(per_graph):
            sums[slot] += value

    averaged = [round(total / len(v_data), 6) for total in sums]
    elapsed = time.time() - began
    return (*averaged, round(elapsed, 2))
    

def train(model, optim, loss_fn, epochs:int, batch_size:int=BATCH_SIZE,
          regen_every:int=5000, val_every:int=500):
    """Train `model` on synthetic graphs with a pairwise ranking loss.

    Each epoch shuffles the synthetic graph pool, takes the first `batch_size`
    graphs, runs one forward/backward pass and one optimiser step. The loss is
    `loss_fn` applied to the difference of predictions for sampled node pairs
    against the sigmoid of the difference of their targets.

    Args:
        model: network called as ``model(X, edge_index)``.
        optim: optimiser bound to the model parameters.
        loss_fn: loss taking ``(prediction_diff, target)``.
        epochs: number of training iterations.
        batch_size: graphs per iteration (defaults to the BATCH_SIZE config value).
        regen_every: regenerate the synthetic pool every this many epochs
            (never at epoch 0); a falsy value disables regeneration.
        val_every: validate every this many epochs (including epoch 0);
            a falsy value disables periodic validation.

    Returns:
        List of rows ``[epoch, acc1, acc5, acc10, kendall, seconds]``, one per
        validation run, ending with a final row recorded at ``epochs``.
    """
    g_list, dg_list, bc_list  = prepare_synthetic(SYNTHETIC_NUM, (NUM_MIN, NUM_MAX), IS_PARALLEL)
    v_data = prepare_test(TEST1_NUM)
    ls_metric = []

    def _validate_and_log(step):
        # Score the model on the validation set, record the row and print it.
        val_acc1, val_acc5, val_acc10, val_kendall, time_spent = validate(model, v_data)
        ls_metric.append([step, val_acc1, val_acc5, val_acc10, val_kendall, time_spent])
        print(f"[{step}] Val Acc1: {val_acc1 * 100:.2f} % | Acc5: {val_acc5 * 100:.2f} % | Acc10: {val_acc10 * 100:.2f} % | KendallTau: {val_kendall:.4f} | spend: {time_spent} secs")
        print('-'*50)

    epoch_bar = tqdm(range(epochs), desc="Epochs")
    for e in epoch_bar:
        if regen_every and (e != 0) and (e % regen_every == 0):
            # re generate synthetic graph
            g_list, dg_list, bc_list  = prepare_synthetic(SYNTHETIC_NUM, (NUM_MIN, NUM_MAX), IS_PARALLEL)

        # validate() leaves the model in eval mode, so switch back every epoch.
        model.train()
        g_list, dg_list, bc_list = shuffle_graph(g_list, dg_list, bc_list)
        train_g, train_dg, train_bc = g_list[:batch_size], dg_list[:batch_size], bc_list[:batch_size]
        X, y, edge_index = preprocessing_data(train_g, train_dg, train_bc)
        X, y, edge_index = X.to(device), y.to(device), edge_index.to(device)
        out = model(X, edge_index)

        # pairwise-loss
        s_ids, t_ids = get_pairwise_ids(train_g)
        out_diff = out[s_ids] - out[t_ids]
        y_diff = y[s_ids] - y[t_ids]
        loss = loss_fn(out_diff, torch.sigmoid(y_diff))

        # optim
        optim.zero_grad()
        loss.backward()
        optim.step()

        epoch_bar.set_postfix(loss=loss.item())
        if val_every and (e % val_every == 0):
            # validate
            _validate_and_log(e)

    # last time
    _validate_and_log(epochs)

    return ls_metric

# Run the full training loop; the result is a list with one row of validation
# metrics per checkpoint (consumed by the save cell below).
train_metric = train(model, optim, loss_fn, MAX_EPOCHS)

[Generating new training graph]: 100%|██████████| 1000/1000 [7:57:26<00:00, 28.65s/it] 
[Reading test1 graph]: 100%|██████████| 30/30 [00:12<00:00,  2.50it/s]
Epochs:   0%|          | 1/10000 [00:01<5:13:57,  1.88s/it, loss=0.867]

[0] Val Acc1: 0.00 % | Acc5: 0.00 % | Acc10: 0.00 % | KendallTau: -0.6277 | spend: 0.2 secs
--------------------------------------------------


Epochs:   5%|▌         | 501/10000 [02:37<55:15,  2.87it/s, loss=0.502] 

[500] Val Acc1: 93.27 % | Acc5: 94.49 % | Acc10: 93.57 % | KendallTau: 0.8674 | spend: 0.19 secs
--------------------------------------------------


Epochs:  10%|█         | 1001/10000 [05:09<52:34,  2.85it/s, loss=0.501] 

[1000] Val Acc1: 93.73 % | Acc5: 94.69 % | Acc10: 93.75 % | KendallTau: 0.8787 | spend: 0.19 secs
--------------------------------------------------


Epochs:  15%|█▌        | 1501/10000 [07:37<49:23,  2.87it/s, loss=0.499]

[1500] Val Acc1: 95.20 % | Acc5: 94.75 % | Acc10: 93.77 % | KendallTau: 0.8809 | spend: 0.19 secs
--------------------------------------------------


Epochs:  20%|██        | 2001/10000 [10:04<47:05,  2.83it/s, loss=0.499]

[2000] Val Acc1: 95.80 % | Acc5: 94.71 % | Acc10: 93.75 % | KendallTau: 0.8814 | spend: 0.19 secs
--------------------------------------------------


Epochs:  25%|██▌       | 2501/10000 [12:31<43:21,  2.88it/s, loss=0.5]  

[2500] Val Acc1: 96.20 % | Acc5: 94.67 % | Acc10: 93.79 % | KendallTau: 0.8817 | spend: 0.19 secs
--------------------------------------------------


Epochs:  30%|███       | 3001/10000 [14:59<41:33,  2.81it/s, loss=0.5]  

[3000] Val Acc1: 96.00 % | Acc5: 94.71 % | Acc10: 93.77 % | KendallTau: 0.8820 | spend: 0.19 secs
--------------------------------------------------


Epochs:  35%|███▌      | 3501/10000 [17:28<38:25,  2.82it/s, loss=0.502]

[3500] Val Acc1: 96.13 % | Acc5: 94.71 % | Acc10: 93.77 % | KendallTau: 0.8822 | spend: 0.19 secs
--------------------------------------------------


Epochs:  40%|████      | 4001/10000 [19:55<35:01,  2.86it/s, loss=0.5]  

[4000] Val Acc1: 96.33 % | Acc5: 94.73 % | Acc10: 93.78 % | KendallTau: 0.8823 | spend: 0.19 secs
--------------------------------------------------


Epochs:  45%|████▌     | 4501/10000 [22:22<32:11,  2.85it/s, loss=0.5]  

[4500] Val Acc1: 96.33 % | Acc5: 94.68 % | Acc10: 93.81 % | KendallTau: 0.8825 | spend: 0.19 secs
--------------------------------------------------


Epochs:  50%|█████     | 5000/10000 [24:55<24:55,  3.34it/s, loss=0.5]    

### Save model / train_loss

In [None]:
# Create the output directory first: torch.save and to_csv do not create
# missing parent directories, and failing here would discard a long run.
os.makedirs(MODEL_SAVED_PATH, exist_ok=True)

# Checkpoint name encodes graph count, node count and epoch budget.
model_saved_name = f'{MODEL_SAVED_PATH}DrBC_G{SYNTHETIC_NUM}_N{NUM_MIN}_E{MAX_EPOCHS}.pth'
torch.save(model.state_dict(), model_saved_name)

# train
df = pd.DataFrame(train_metric, columns=['epochs', 'val_acc1', 'val_acc5', 'val_acc10', 'val_kendall', 'time'])
df.to_csv(f"{MODEL_SAVED_PATH}train_metrics_G{SYNTHETIC_NUM}_N{NUM_MIN}_E{MAX_EPOCHS}.csv", index=False)
df

Unnamed: 0,epochs,val_acc1,val_acc5,val_acc10,val_kendall,time
0,0,0.918667,0.889067,0.821933,0.438386,0.19
1,500,0.959333,0.93,0.8966,0.671982,0.19
2,1000,0.955333,0.927733,0.891133,0.671589,0.19
3,1500,0.946,0.932,0.895533,0.686061,0.19
4,2000,0.944667,0.9348,0.899667,0.701001,0.19
5,2500,0.952667,0.934933,0.900133,0.70315,0.19
6,3000,0.942,0.935067,0.901467,0.712888,0.19
7,3500,0.932,0.934933,0.899,0.704082,0.19
8,4000,0.931333,0.935333,0.898933,0.713081,0.19
9,4500,0.93,0.933067,0.8972,0.704947,0.19


## Test

In [None]:
# Rebuild the network and restore the saved weights. map_location remaps the
# stored tensors onto the current device, so a checkpoint written on a GPU
# machine can still be loaded on a CPU-only one.
model = DrBC().to(device)
model.load_state_dict(torch.load(model_saved_name, map_location=device))

<All keys matched successfully>

In [None]:
# Build the test set and score the reloaded model on it.
# NOTE(review): prepare_test receives an int (TEST1_NUM) inside train() but the
# string 'y' here; what 'y' selects is defined in utils — confirm.
t_data = prepare_test('y')
test_acc1, test_acc5, test_acc10, test_kendall, test_spend = validate(model, t_data) 

  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


In [None]:
# Print the header followed by the three accuracies, one value per line.
for line in ('acc: ', test_acc1, test_acc5, test_acc10):
    print(line)
# Kendall tau goes on its own labelled line.
print('kendall: ', test_kendall)

acc: 
0.600546
0.601509
0.636026
kendall:  0.527324


In [None]:
# acc: 
# 0.600546
# 0.601509
# 0.636026
# kendall:  0.527324

In [None]:
# synthetic graph num: 100
# synthetic node num: 200
# epoch: 200


# with L2 norm
# acc: 
# 0.613588
# 0.495506
# 0.302029
# kendall:  -0.435382

# without L2 norm + bc apply log
# 0.615791
# 0.618709
# 0.643578
# kendall:  0.288244


### Experiment

In [None]:
# Compare two checkpoints (trained on 200-node and 5000-node graphs) on freshly
# generated synthetic graphs of increasing size.
# map_location lets checkpoints saved on a GPU load on a CPU-only machine.
model200 = DrBC().to(device)
model200.load_state_dict(torch.load(f'{MODEL_SAVED_PATH}DrBC_G1000_N200_E10000.pth', map_location=device))

model5000 = DrBC().to(device)
model5000.load_state_dict(torch.load(f'{MODEL_SAVED_PATH}DrBC_G1000_N5000_E10000.pth', map_location=device))

scales = [5000, 10000, 20000]
ls_metrics = []
for scale in scales:
    print('-'*15, scale)
    g_list, dg_list, bc_list = prepare_synthetic(30, (scale, scale+1), parallel=True)
    # Iterate over plain integer indices: enumerate(range(...)) would yield
    # (index, index) tuples, which cannot be used to index the lists.
    for i in range(len(g_list)):
        test_X, test_y, test_edge_index = preprocessing_data([g_list[i]], [dg_list[i]], [bc_list[i]])
        # validate() iterates over (X, y, edge_index) triples, so the single
        # graph has to be wrapped as a one-element list of one triple.
        t_data = [(test_X, test_y, test_edge_index)]
        for label, candidate in (('200', model200), ('5000', model5000)):
            _acc1, _acc5, _acc10, _kendall, _time = validate(candidate, t_data)
            ls_metrics.append([scale, label, i, _acc1, _acc5, _acc10, _kendall, _time])

In [None]:
# Column layout mirrors the rows appended in the experiment loop.
scale_columns = [
    'scale', 'model', 'test_graph_id',
    'test_acc1', 'test_acc5', 'test_acc10',
    'test_kendall', 'time',
]
df = pd.DataFrame(ls_metrics, columns=scale_columns)
# Persist the comparison table without the index column.
df.to_csv('test_scale_diff_result.csv', index=False)

## To-Do List
* (done) loss_fn 再加上 sigmoid
* (done) pairwise 目前跨圖了
* (done) h 要 normalized
* (done) aggregate 改成 MessagePassing
* (done) synthetic graph 後，shuffle graph 的順序
* (done) 加入 Epochs
* (done) change to leaky relu
* Metric: top1, 5, 10
* Metric: kendall tau distance
* wall-clock running time
* test step
