In [1]:
# imports
import csv
import os
import pathlib
from collections import namedtuple
from time import time
from typing import List, Tuple, Optional
from random import randint, seed

import networkx as nx
import numpy as np
import torch
import torch_geometric.utils as tutils
from torch_geometric.data import Data
from torch_geometric.datasets import TUDataset
from tqdm import tqdm

from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from grakel.kernels import WeisfeilerLehman, VertexHistogram, ShortestPath, WeisfeilerLehmanOptimalAssignment
from grakel import GraphKernel
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit

from utils import load_singleton_graphs_from_TUDataset

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Parameters
# not supported by grakel somehow 'KKI', 'OHSU', 'Peking_1',
datasets = ['AIDS', 'BZR', 'BZR_MD', 'COX2', 'COX2_MD', 'DHFR', 'DHFR_MD', 'ER_MD', 'MUTAG', 'Mutagenicity', 'NCI1', 'NCI109', 'PTC_FM', 'PTC_FR', 'PTC_MM', 'PTC_MR',
            'DD', 'ENZYMES',  'PROTEINS_full',
            'MSRC_9', 'MSRC_21', 
            ]
unlabelled_datasets = ['COLLAB', 'REDDIT-BINARY', 'REDDIT-MULTI-5K', 'REDDIT-MULTI-12K',]
seed(42)
seeds = [randint(0, 10000) for _ in range(10)]

In [3]:
from grakel.datasets import fetch_dataset

def load_dataset_grakel(name_dataset):
    dataset = fetch_dataset(name_dataset, verbose=True)
    
    graphs = dataset.data
    y = dataset.target
    
    return graphs, y


In [6]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from collections import Counter

n_jobs = 10
cv = 5
verbose = False

results = {}

for dataset in datasets:
    print(dataset)

    graphs, labels = load_dataset_grakel(dataset)
    
    # Values of C parameter of SVM
    C_grid = np.logspace(-2, 2, 5)
    accuracies = []
    f1_scores = []
    
    for c_seed in seeds:
        
        G_train, G_test, y_train, y_test = train_test_split(graphs,
                                                            labels,
                                                            test_size=0.2,
                                                            random_state=c_seed,
                                                            stratify=labels)
        

        # gk = ShortestPath(normalize=True)
        gk = WeisfeilerLehman(n_iter=4, base_graph_kernel=VertexHistogram, normalize=True)
        # gk = VertexHistogram(normalize=False)
        # gk = WeisfeilerLehmanOptimalAssignment(n_iter=4, normalize=True)
        K_train = gk.fit_transform(G_train)
        K_test = gk.transform(G_test)

        clf = GridSearchCV(estimator=SVC(kernel='precomputed'),
                           param_grid={'C': C_grid},
                           n_jobs=n_jobs,
                           cv=cv,
                           verbose=int(verbose)*3)
        
        
        clf.fit(K_train, y_train)
        y_predictions = clf.predict(K_test)
        
        
        accuracies.append(accuracy_score(y_test, y_predictions))
        average = 'binary' if len(set(labels)) <= 2 else 'macro'
        f1_scores.append(f1_score(y_test, y_predictions, average=average))
    
    print(f'{dataset}, (acc): {np.mean(accuracies)*100:.2f} +- {np.std(accuracies)*100:.2f}')
    print(f'{dataset}, (f1): {np.mean(f1_scores)*100:.2f} +- {np.std(f1_scores)*100:.2f}')
    
    print('='*60)

AIDS
Extracting dataset  AIDS..
Parsing dataset  AIDS..
Parse was succesful..
Deleting unzipped dataset files..
Going back to the original directory..
AIDS, (acc): 98.38 +- 0.74
AIDS, (f1): 98.99 +- 0.45
BZR
Extracting dataset  BZR..
Parsing dataset  BZR..
Parse was succesful..
Deleting unzipped dataset files..
Going back to the original directory..
BZR, (acc): 87.65 +- 3.40
BZR, (f1): 65.82 +- 8.48
BZR_MD
Extracting dataset  BZR_MD..
Parsing dataset  BZR_MD..
Parse was succesful..
Deleting unzipped dataset files..
Going back to the original directory..
BZR_MD, (acc): 59.03 +- 6.26
BZR_MD, (f1): 66.97 +- 4.72
COX2
Extracting dataset  COX2..
Parsing dataset  COX2..
Parse was succesful..
Deleting unzipped dataset files..
Going back to the original directory..
COX2, (acc): 80.64 +- 3.04
COX2, (f1): 41.92 +- 9.88
COX2_MD
Extracting dataset  COX2_MD..
Parsing dataset  COX2_MD..
Parse was succesful..
Deleting unzipped dataset files..
Going back to the original directory..
COX2_MD, (acc): 59.

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.