In [15]:
import numpy as np
import json
from utils import get_file_results
from itertools import product
import os
import scipy.stats as stats

In [33]:
# Names of the datasets handled by this notebook, one per line.
datasets = [
    'COLLAB',
    'IMDB-BINARY',
    'IMDB-MULTI',
    'REDDIT-BINARY',
    'REDDIT-MULTI-5K',
    'REDDIT-MULTI-12K',
    'AIDS',
    'BZR',
    'BZR_MD',
    'COX2',
    'COX2_MD',
    'DHFR',
    'DHFR_MD',
    'ER_MD',
    'MUTAG',
    'Mutagenicity',
    'NCI1',
    'NCI109',
    'PTC_FM',
    'PTC_FR',
    'PTC_MM',
    'PTC_MR',
    'DD',
    'ENZYMES',
    'KKI',
    'OHSU',
    'Peking_1',
    'PROTEINS_full',
    'MSRC_9',
    'MSRC_21',
    'FRANKENSTEIN',
    'PROTEINS',
    'COIL-DEL',
    'COIL-RAG',
    'Letter-high',
    'Letter-low',
    'Letter-med',
]

# Only the True setting is enabled; the False variant is currently switched off.
use_degrees = [True]

# Classifier identifiers available for comparison.
clf_methods = ['rbf', 'WL']

In [43]:
from typing import Dict


def load_data(filename: str) -> Dict:
    """Read and decode a JSON file addressed relative to the parent directory.

    Args:
        filename: Path of the JSON file; it is joined onto ``'..'`` before
            being opened.

    Returns:
        The decoded JSON content of the file.
    """
    # NOTE(review): the return annotation says Dict, but the value is whatever
    # the JSON document decodes to - confirm against the files' actual layout.
    results_path = os.path.join('..', filename)
    with open(results_path, 'r') as handle:
        return json.load(handle)


# Method under evaluation.
classifier = 'rbf'
use_degree = False
# NOTE(review): the configuration cell lists use_degrees = [True] while this
# comparison runs with use_degree = False - confirm this is intended.

# Baseline the method above is compared against.
baseline_classifier = 'WL'
baseline_use_degree = False

# Metrics read from every result entry; identical for all datasets, so they
# are defined once instead of being rebuilt on every loop iteration.
scorings = ['test_acc', 'test_balanced_acc', 'test_f1_macro']

CONFIDENCE_LEVEL = 0.99    # confidence level of the per-method intervals
SIGNIFICANCE_LEVEL = 0.05  # p-value threshold of the paired t-test


def _mean_confidence_interval(scores, confidence=CONFIDENCE_LEVEL):
    """Return the Student-t confidence interval around the mean of ``scores``.

    Args:
        scores: Sequence of numeric scores.
        confidence: Confidence level passed to ``stats.t.interval``.

    Returns:
        The ``(lower, upper)`` pair produced by ``stats.t.interval`` with
        ``len(scores) - 1`` degrees of freedom, centred on the mean and scaled
        by the standard error of the mean.
    """
    return stats.t.interval(confidence=confidence,
                            df=len(scores) - 1,
                            loc=np.mean(scores),
                            scale=stats.sem(scores))


for dataset in datasets:
    filename = get_file_results(None, dataset, classifier, use_degree)
    data = load_data(filename)

    filename = get_file_results(None, dataset, baseline_classifier, baseline_use_degree)
    data_to_compare = load_data(filename)

    print(dataset)

    for scoring in scorings:
        # Average the metric inside each entry, pairing the entries of both
        # methods position by position.
        # NOTE(review): zip stops at the shorter sequence, so a differing
        # number of entries is silently truncated - confirm both files always
        # hold the same number of entries.
        results_score, results_to_compare_score = [], []
        for result, result_to_compare in zip(data, data_to_compare):
            results_score.append(np.mean(result[scoring]))
            results_to_compare_score.append(np.mean(result_to_compare[scoring]))

        print(f'Metric: {scoring}')
        print(f'Mean rbf: {np.mean(results_score):.2f} -- Mean 4-WL {np.mean(results_to_compare_score):.2f}')

        confidence_interval_results = _mean_confidence_interval(results_score)
        confidence_interval_results_to_compare = _mean_confidence_interval(results_to_compare_score)
        print(confidence_interval_results)
        print(confidence_interval_results_to_compare)

        # Paired t-test: a positive statistic means the first sample (rbf)
        # has the higher mean.
        comparison_stats = stats.ttest_rel(results_score, results_to_compare_score)
        print(comparison_stats)

        is_significant = comparison_stats.pvalue < SIGNIFICANCE_LEVEL
        print('HA' if is_significant else 'H0')
        if is_significant and comparison_stats.statistic > 0:
            print('RBF is better than 4-WL')

        # NOTE(review): this fires as soon as the upper bound of the rbf
        # interval reaches the lower bound of the baseline interval, i.e. also
        # when the two intervals merely overlap - confirm that the message
        # matches the intended criterion.
        if confidence_interval_results[1] >= confidence_interval_results_to_compare[0]:
            print('RBF is better reject dataset')
        print()

    print()

COLLAB
Metric: test_acc
Mean rbf: 0.56 -- Mean 4-WL 0.78
(0.5618036599018708, 0.5624763400981294)
(0.7804313382642853, 0.7842486617357145)
Ttest_relResult(statistic=-331.96399038012254, pvalue=1.0396973822828535e-19)
HA

Metric: test_balanced_acc
Mean rbf: 0.38 -- Mean 4-WL 0.76
(0.3788598823586124, 0.37949466216021305)
(0.7593881358734733, 0.7649487324892983)
Ttest_relResult(statistic=-417.7228656718524, pvalue=1.3145737580248868e-20)
HA

Metric: test_f1_macro
Mean rbf: 0.32 -- Mean 4-WL 0.75
(0.31853404945765496, 0.319667987093019)
(0.7522087706485729, 0.7569555532763612)
Ttest_relResult(statistic=-513.603642017777, pvalue=2.0471862860464276e-21)
HA


IMDB-BINARY
Metric: test_acc
Mean rbf: 0.54 -- Mean 4-WL 0.72
(0.5305551693116259, 0.5518448306883741)
(0.7076523629234163, 0.7309476370765838)
Ttest_relResult(statistic=-38.33186500852999, pvalue=2.7795952369924905e-11)
HA

Metric: test_balanced_acc
Mean rbf: 0.54 -- Mean 4-WL 0.72
(0.5305551693116259, 0.5518448306883741)
(0.7076523629