# Experiment: Performance Evaluation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
from itertools import product
from models import GNNClassifier, GNNSim
from torch_geometric.datasets import TUDataset
from torch_geometric.data import DataLoader
import os

In [3]:
# import util
# graphs, _ = util.load_data("MUTAG", degree_as_tag=False)

In [4]:
# --- Reproducibility and runtime configuration ---
random_state = 1
batch_size = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed torch and NumPy before anything random runs (the dataset shuffle
# below comes after this point).
torch.manual_seed(random_state)
np.random.seed(random_state)

# --- Dataset ---
# Name of the TU benchmark to load. The cell previously also assigned
# "PROTEINS" just before this line; only this last assignment ever took effect.
dataset_name = "MUTAG"
path = os.path.join('./', 'data', 'TU')
dataset = TUDataset(path, name=dataset_name).shuffle()

# Hold out the first tenth of the shuffled dataset for testing.
n_test = len(dataset) // 10
test_dataset = dataset[:n_test]
train_dataset = dataset[n_test:]
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

# One graph per batch, in dataset order: gives one label per graph here and
# is the loader handed to the per-graph kernel computations further down.
dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
labels = [data.y.item() for data in dataloader]



In [5]:
from sklearn.metrics import accuracy_score

def train(model, optimizer, dataloader, device):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Args:
        model: Object exposing ``train()`` and ``get_loss(batch)``; the value
            returned by ``get_loss`` must support ``backward()`` and ``item()``.
        optimizer: Object whose ``zero_grad()`` and ``step()`` are called once
            per batch.
        dataloader: Iterable of batches; each batch must provide
            ``to(device)`` and ``num_graphs``.
        device: Device every batch is moved to before the forward pass.

    Returns:
        The per-batch losses weighted by ``num_graphs`` and divided by the
        number of graphs actually yielded by ``dataloader``; ``0.0`` when the
        loader yields nothing.
    """
    model.train()

    total_loss = 0.0
    total_graphs = 0
    for data in dataloader:
        data = data.to(device)
        optimizer.zero_grad()
        loss = model.get_loss(data)
        loss.backward()
        # Weight by batch size so a smaller final batch does not skew the mean
        # (assumes get_loss returns a per-graph average -- TODO confirm).
        total_loss += data.num_graphs * loss.item()
        total_graphs += data.num_graphs
        optimizer.step()

    # Normalise by what this loader produced rather than by a notebook-global
    # dataset, so the function is correct for any loader passed in.
    if total_graphs == 0:
        return 0.0
    return total_loss / total_graphs

def test(model, dataloader, device):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: Object exposing ``eval()`` and ``predict(batch)``; ``predict``
            must return a tensor of predictions for the graphs in the batch.
        dataloader: Iterable of batches; each batch must provide
            ``to(device)`` and a target tensor ``y``.
        device: Device every batch is moved to before prediction.

    Returns:
        The value of ``accuracy_score`` computed over all batches.
    """
    model.eval()

    y_pred = []
    y_true = []

    with torch.no_grad():
        for data in dataloader:
            data = data.to(device)
            y_pred.extend(model.predict(data).detach().cpu().numpy())
            y_true.extend(data.y.detach().cpu().numpy())

    # sklearn's signature is accuracy_score(y_true, y_pred); keep that order so
    # the call stays correct if it is ever swapped for an asymmetric metric.
    return accuracy_score(y_true, y_pred)
    
# --- Model / optimisation hyper-parameters ---
w = 32  # shared width of the hidden and output representations
model_config = {
    # Taken from the loaded dataset instead of being hard-coded for one
    # benchmark, so changing dataset_name does not require editing this cell.
    "input_dim": dataset.num_node_features,
    "hidden_dim": w,
    "output_dim": w,
    "n_class": dataset.num_classes,
    "c_u": 1,
    "c_sigma": 2,
    "num_layers": 2,
}
learning_rate = 0.01
epochs = 100

# Move the model to its device before handing its parameters to the optimizer.
model = GNNClassifier(model_config).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# One line per epoch: training loss, then accuracy on the train and test splits.
for epoch in range(1, epochs + 1):
    train_loss = train(model, optimizer, train_dataloader, device)
    train_acc = test(model, train_dataloader, device)
    test_acc = test(model, test_dataloader, device)
    print('Epoch: {:03d}, train_loss: {:.4f}, train_acc: {:.4f}, test_acc: {:.4f}'.format(epoch, train_loss, train_acc, test_acc))
    

Epoch: 001, train_loss: 0.7084, train_acc: 0.6706, test_acc: 0.6111
Epoch: 002, train_loss: 0.5835, train_acc: 0.6706, test_acc: 0.6111
Epoch: 003, train_loss: 0.5784, train_acc: 0.6706, test_acc: 0.6111
Epoch: 004, train_loss: 0.5640, train_acc: 0.6706, test_acc: 0.6111
Epoch: 005, train_loss: 0.5650, train_acc: 0.6706, test_acc: 0.6111
Epoch: 006, train_loss: 0.5634, train_acc: 0.6706, test_acc: 0.6111
Epoch: 007, train_loss: 0.5559, train_acc: 0.6706, test_acc: 0.6111
Epoch: 008, train_loss: 0.5542, train_acc: 0.6706, test_acc: 0.6111
Epoch: 009, train_loss: 0.5568, train_acc: 0.6706, test_acc: 0.6111
Epoch: 010, train_loss: 0.5565, train_acc: 0.6706, test_acc: 0.6111
Epoch: 011, train_loss: 0.5480, train_acc: 0.6706, test_acc: 0.6111
Epoch: 012, train_loss: 0.5479, train_acc: 0.6706, test_acc: 0.6111
Epoch: 013, train_loss: 0.5434, train_acc: 0.6706, test_acc: 0.6111
Epoch: 014, train_loss: 0.5439, train_acc: 0.6706, test_acc: 0.6111
Epoch: 015, train_loss: 0.5456, train_acc: 0.670

## Calculate finite GNTK

In [6]:
from models import clone_grads, paramdot
import tqdm

def normalize_matrix(matrix):
    """Divide every entry of ``matrix`` by its global maximum.

    The maximum is taken with ``np.max`` over the whole input. The division
    produces a new array; ``matrix`` itself is not modified.
    """
    return matrix / np.max(matrix)
def get_finite_ntk(model, dataloader):
    """Build the Gram matrix of per-batch parameter gradients of ``model``.

    For every batch the model output is back-propagated and the resulting
    gradients are snapshotted with ``clone_grads``; entry ``(i, j)`` of the
    result is ``paramdot`` of the snapshots for batches ``i`` and ``j``.

    Args:
        model: Callable module; ``model(data)`` must return something on which
            ``backward()`` can be called without arguments (presumably a
            scalar -- TODO confirm).
        dataloader: Sized iterable of batches. With ``batch_size=1`` each
            row/column of the result corresponds to one graph.

    Returns:
        A symmetric ``(M, M)`` NumPy array, where ``M = len(dataloader)``.
    """
    M = len(dataloader)
    print(M)

    # The mode only needs to be set once (assumes the forward pass itself does
    # not switch the model out of training mode).
    model.train()

    grads = []
    # `tqdm` is the module here (this cell does `import tqdm`), hence tqdm.tqdm.
    for data in tqdm.tqdm(dataloader):
        model.zero_grad()
        output = model(data)
        output.backward()
        grads.append(clone_grads(model))

    finite_ntk = np.zeros((M, M))
    for i in tqdm.tqdm(range(M)):
        # Fill the lower triangle and mirror it: the kernel is symmetric.
        for j in range(i + 1):
            finite_ntk[i, j] = finite_ntk[j, i] = paramdot(grads[i], grads[j])

    return finite_ntk


# A freshly constructed GNNSim (never trained in this notebook), built from the
# same hyper-parameter dict as the classifier.
init_model = GNNSim(model_config)
# Gram matrix over the batch_size=1 loader, i.e. one row/column per graph.
finite_ntk = get_finite_ntk(model=init_model, dataloader=dataloader)

188


100%|███████████████████████████████████████| 188/188 [00:00<00:00, 2634.53it/s]
100%|████████████████████████████████████████| 188/188 [00:00<00:00, 823.17it/s]


## Calculate infinite GNTK

In [7]:
import networkx as nx
from util import S2VGraph
from kernels import calculate_inf_gntk
from models import clone_grads, paramdot

def map_dataloader_to_graphs(dataloader):
    """Convert the items of ``dataloader`` into a list of ``S2VGraph`` objects.

    For each item a ``networkx.Graph`` is built with one node per row of
    ``data.x`` and one edge per column of ``data.edge_index``. The graph is
    wrapped in an ``S2VGraph`` (label ``data.y``, no node tags) and then given
    ``node_features``, ``neighbors`` (adjacency lists) and ``max_neighbor``
    (largest adjacency-list length).

    Args:
        dataloader: Iterable whose items expose ``x``, ``edge_index`` and
            ``y``. Presumably a loader with ``batch_size=1`` so that every
            item is a single graph -- TODO confirm against callers.

    Returns:
        List of ``S2VGraph`` objects, one per item, in iteration order.
    """
    graphs = []
    for data in dataloader:
        nx_graph = nx.Graph()
        # Insert nodes first so isolated nodes are kept and node order is 0..n-1.
        nx_graph.add_nodes_from(range(data.x.shape[0]))
        # One bulk tolist() per row instead of two .item() calls per edge.
        sources = data.edge_index[0].tolist()
        targets = data.edge_index[1].tolist()
        nx_graph.add_edges_from(zip(sources, targets))

        s2v_graph = S2VGraph(nx_graph, data.y, None)
        s2v_graph.node_features = data.x

        # Adjacency lists are derived from the graph stored on the S2VGraph.
        neighbors = [[] for _ in range(len(s2v_graph.g))]
        for a, b in s2v_graph.g.edges():
            neighbors[a].append(b)
            neighbors[b].append(a)
        s2v_graph.neighbors = neighbors
        # default=0 keeps a graph without nodes from raising in max().
        s2v_graph.max_neighbor = max((len(adj) for adj in neighbors), default=0)

        graphs.append(s2v_graph)

    return graphs


# Turn every item of the batch_size=1 loader into an S2VGraph.
graphs = map_dataloader_to_graphs(dataloader=dataloader)

# Kernel computed by calculate_inf_gntk over those graphs.
inf_ntk = calculate_inf_gntk(graphs)

In [8]:
from kernels import svc_search

def _load_folds(template):
    """Load the index files for folds 1..10 named by ``template`` as int arrays."""
    return [np.loadtxt(template.format(dataset_name, fold)).astype(int)
            for fold in range(1, 11)]


# Train indices are read first, then test indices, one file per fold.
train_fold_idx = _load_folds('dataset/{}/10fold_idx/train_idx-{}.txt')
test_fold_idx = _load_folds('dataset/{}/10fold_idx/test_idx-{}.txt')

# Search over the SVC settings using the finite NTK as the kernel matrix.
result_df = svc_search(finite_ntk, labels, train_fold_idx, test_fold_idx)
result_df



Unnamed: 0,C,normalized,train,test
0,0.01,False,0.860588,0.85
1,0.046416,False,0.882353,0.861111
2,0.215443,False,0.908824,0.883333
3,1.0,False,0.915294,0.911111
4,4.641589,False,0.927647,0.883333
5,21.544347,False,0.937059,0.872222
6,100.0,False,0.943529,0.855556
7,464.158883,False,0.91,0.838889
8,2154.43469,False,0.748824,0.694444
9,10000.0,False,0.655882,0.644444
