In [1]:
"""
    IMPORTING LIBS
"""
import dgl

import numpy as np
import os
import socket
import time
import random
import glob
import argparse, json
import pandas as pd

from os import listdir
from os.path import isfile, join

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from tensorboardX import SummaryWriter
from tqdm import tqdm

from sklearn.metrics import balanced_accuracy_score, confusion_matrix

import sklearn
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import math

# custom function
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)).

    Computed as exp(-log(1 + exp(-x))) via ``np.logaddexp`` so that strongly
    negative inputs do not overflow inside ``np.exp`` (the naive form emits an
    overflow RuntimeWarning there, although it still ends up at 0.0).

    Parameters
    ----------
    x : scalar or numpy array
        Input value(s); must support unary minus.

    Returns
    -------
    numpy float or numpy array with the same shape as ``x``, values in [0, 1].
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), evaluated stably.
    return np.exp(-np.logaddexp(0, -x))

In [2]:
# some_file.py
import os
import sys

# Location of the GNN_RIVM benchmark folder, which provides the nets/, data/
# and train/ packages imported below. Set the GNN_RIVM_BENCHMARK_DIR
# environment variable to point at your own checkout; the default is the
# path used on the original author's machine.
BENCHMARK_DIR = os.environ.get(
    "GNN_RIVM_BENCHMARK_DIR",
    '/home/jeroen/Documents/Toxic/GNN_rivm/benchmark',
)
sys.path.insert(1, BENCHMARK_DIR)

from nets.TUs_graph_classification.load_net import gnn_model # import GNNs
from data.data import LoadData # import dataset
from train.train_TUs_graph_classification import train_epoch, evaluate_network, evaluate_network2 # import train functions

# Settings shared by the evaluation helpers defined in later cells.
drop_last = False
device = torch.device("cpu")

In [3]:
#function to convert net params file to needed feature vector for model loading
def create_net_params(file_name):
    """Build the ``net_params`` dict for model loading from a saved config file.

    Only the 6th line of the file (index 5) is used. It is split on single
    spaces and each parameter is picked from a fixed token position; for
    every token, only the text before the first comma is kept.

    Parameters
    ----------
    file_name : str
        Path to the config text file.

    Returns
    -------
    dict
        Integer entries ('in_dim', 'hidden_dim', 'n_classes', 'L',
        'n_mlp_GIN'), float entries ('in_feat_dropout', 'dropout') and
        string entries ('learn_eps_GIN', 'neighbor_aggr_GIN', 'readout',
        'graph_norm', 'batch_norm', 'residual').

    Raises
    ------
    IndexError
        If the file has fewer than 6 lines or the line has too few tokens.
    ValueError
        If a numeric token cannot be converted.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(file_name, "r") as f:
        n_params = f.readlines()[5].split(" ")

    def token(position):
        # Value text at a token position, without the trailing ",..." part.
        return n_params[position].split(',')[0]

    # Parameter name -> token position on the parsed line.
    int_positions = {'in_dim': 29, 'hidden_dim': 3, 'n_classes': 31, 'L': 1, 'n_mlp_GIN': 9}
    float_positions = {'in_feat_dropout': 15, 'dropout': 17}
    str_positions = {'learn_eps_GIN': 11, 'neighbor_aggr_GIN': 13, 'readout': 7,
                     'graph_norm': 19, 'batch_norm': 21, 'residual': 5}

    net_params = dict()
    for par, position in int_positions.items():
        net_params[par] = int(token(position))
    for par, position in float_positions.items():
        net_params[par] = float(token(position))
    # NOTE(review): these values stay strings (surrounding single quotes are
    # stripped). If the config stores booleans such as True/False here, they
    # come back as the text "True"/"False", which is always truthy - confirm
    # what the model constructor expects.
    for par, position in str_positions.items():
        net_params[par] = token(position).strip("'")
    return net_params

In [4]:
# Select the trained GIN run to evaluate and parse its stored network parameters.
folder = "GINPBTn/"
model_str = "GIN"
gin_file = "GIN_PBT_Repn3_GPU0_08h22m01s_on_Jul_01_2020"
gin_np = create_net_params(f"{folder}config_{gin_file}.txt")
# The config file contains wrong values for these two entries; override them.
gin_np.update({'L': 4, 'hidden_dim': 128})

In [6]:
# Load both datasets through the benchmark's LoadData helper.
# dataset_pZZS: the "pZZS_Rep3" set, scored later with every cross-validation model.
# dataset: the "PBT_Repn3" set, whose per-fold test splits are used for the accuracy check.
dataset_pZZS = LoadData("pZZS_Rep3")
dataset = LoadData("PBT_Repn3")

129
preparing 129 graphs for the ALL set...
[!] Dataset:  pZZS_Rep3
Time taken: 1.3272s
971
preparing 971 graphs for the ALL set...
[!] Dataset:  PBT_Repn3
Time taken: 9.6463s


In [7]:
#print accuracy for test dataset of standard data
def test_model(net_params, model_name, dataset, model_str):
    """Print mean and standard deviation of test accuracy over 10 CV folds.

    For each fold ``i`` in 0..9 a fresh model is built, a checkpoint from the
    directory ``model_name + str(i)`` is loaded into it, and the model is
    evaluated on ``dataset.test[i]`` with ``evaluate_network``.

    Parameters
    ----------
    net_params : dict
        Network parameters passed to ``gnn_model``.
    model_name : str
        Path prefix of the per-fold checkpoint directories (fold index is appended).
    dataset :
        Dataset object providing ``test[i]`` splits and a ``collate`` function.
    model_str : str
        Model identifier passed to ``gnn_model``.

    Returns
    -------
    None. The result is printed as ``mean std``.
    """
    accs = []
    for i in range(10):
        run_dir = model_name + str(i)
        model = gnn_model(model_str, net_params)
        onlyfiles = [f for f in listdir(run_dir) if isfile(join(run_dir, f))]
        # NOTE(review): listdir() returns entries in arbitrary order, so
        # onlyfiles[-1] is not guaranteed to be the newest checkpoint when a
        # run directory holds more than one file - confirm intended selection.
        # map_location keeps checkpoints saved from a GPU run loadable on the
        # configured (CPU) device.
        state = torch.load(run_dir + "/" + onlyfiles[-1], map_location=device)
        model.load_state_dict(state)
        test_loader = DataLoader(dataset.test[i], batch_size=1, shuffle=False,
                                 drop_last=drop_last, collate_fn=dataset.collate)
        # 999 is passed in the position of the fourth argument unchanged from
        # the original call; presumably an epoch number - TODO confirm.
        _, fold_acc = evaluate_network(model, device, test_loader, 999)
        accs.append(fold_acc)
    print(np.mean(accs),np.std(accs))

In [8]:
#returns 10 scores for 10 models of cross validation
def test_model_s(net_params, model_name, dataset, model_str):
    """Score the complete dataset with each of the 10 cross-validation models.

    For each fold ``i`` in 0..9 a fresh model is built, a checkpoint from the
    directory ``model_name + str(i)`` is loaded into it, and the model is run
    over ``dataset.all`` with ``evaluate_network2``.

    Parameters
    ----------
    net_params : dict
        Network parameters passed to ``gnn_model``.
    model_name : str
        Path prefix of the per-fold checkpoint directories (fold index is appended).
    dataset :
        Dataset object providing ``all`` and a ``collate`` function.
    model_str : str
        Model identifier passed to ``gnn_model``.

    Returns
    -------
    list
        Ten entries, one per model: the third value returned by
        ``evaluate_network2`` (the scores).
    """
    # The loader does not depend on the fold, so build it once.
    test_loader = DataLoader(dataset.all, batch_size=1, shuffle=False,
                             drop_last=drop_last, collate_fn=dataset.collate)
    accs = []
    for i in range(10):
        run_dir = model_name + str(i)
        model = gnn_model(model_str, net_params)
        onlyfiles = [f for f in listdir(run_dir) if isfile(join(run_dir, f))]
        # NOTE(review): listdir() returns entries in arbitrary order, so
        # onlyfiles[-1] is not guaranteed to be the newest checkpoint when a
        # run directory holds more than one file - confirm intended selection.
        # map_location keeps checkpoints saved from a GPU run loadable on the
        # configured (CPU) device.
        state = torch.load(run_dir + "/" + onlyfiles[-1], map_location=device)
        model.load_state_dict(state)
        # Loss and labels are returned as well but are not needed here.
        _, _, score = evaluate_network2(model, device, test_loader, 999)
        accs.append(score)
    return accs

In [11]:
# Print cross-validated test accuracy on the standard dataset, then collect
# the per-model scores for every graph of the pZZS dataset.
test_model(gin_np, f"{folder}{gin_file}/RUN_", dataset, model_str)
accs = test_model_s(gin_np, f"{folder}{gin_file}/RUN_", dataset_pZZS, model_str)

0.9521484682667992 0.032889738854116106


In [14]:
# Aggregate the scores of all cross-validation models for the pZZS dataset:
#  - start from a zero array with one row per graph and two columns (class 0 / class 1)
#  - squash every model's raw scores with the sigmoid and add them up
#  - sort the rows by summed class-1 score, highest first
#  - print predicted class, mean score per class and their absolute difference

# Number of models that contributed; used to turn the sums into means.
n_models = len(accs)

score = np.zeros([len(accs[0]), 2])
for ac in accs:
    score += sigmoid(np.array(ac))
#Sort by highest score on 1
score = sorted(score, key=lambda tup: tup[1], reverse=True)
# Predicted class per (sorted) row = column with the larger summed score.
pred = np.argmax(score, axis=1)

for pr, summed in zip(pred, score):
    mean_0 = summed[0] / n_models
    mean_1 = summed[1] / n_models
    print("Predicted: %2d, Score 0: %.3f, Score 1: %.3f, diff: %.2f" %(pr, mean_0, mean_1, abs(mean_0 - mean_1)))


Predicted:  1, Score 0: 0.993, Score 1: 1.000, diff: 0.01
Predicted:  1, Score 0: 0.526, Score 1: 0.994, diff: 0.47
Predicted:  1, Score 0: 0.885, Score 1: 0.992, diff: 0.11
Predicted:  1, Score 0: 0.974, Score 1: 0.991, diff: 0.02
Predicted:  1, Score 0: 0.646, Score 1: 0.986, diff: 0.34
Predicted:  0, Score 0: 1.000, Score 1: 0.978, diff: 0.02
Predicted:  0, Score 0: 0.997, Score 1: 0.966, diff: 0.03
Predicted:  1, Score 0: 0.913, Score 1: 0.961, diff: 0.05
Predicted:  0, Score 0: 0.993, Score 1: 0.901, diff: 0.09
Predicted:  1, Score 0: 0.900, Score 1: 0.900, diff: 0.00
Predicted:  1, Score 0: 0.891, Score 1: 0.899, diff: 0.01
Predicted:  1, Score 0: 0.756, Score 1: 0.899, diff: 0.14
Predicted:  1, Score 0: 0.803, Score 1: 0.899, diff: 0.10
Predicted:  1, Score 0: 0.845, Score 1: 0.896, diff: 0.05
Predicted:  1, Score 0: 0.704, Score 1: 0.892, diff: 0.19
Predicted:  0, Score 0: 0.908, Score 1: 0.885, diff: 0.02
Predicted:  0, Score 0: 0.947, Score 1: 0.880, diff: 0.07
Predicted:  1,