In [1]:
# Dependencies


import numpy as np
import networkx as nx
import functools as ft
import math
import random
from datasets import load_dataset

# import torch
# torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    HalvingGridSearchCV,
    LearningCurveDisplay,
    learning_curve,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_predict,
)

import sys

sys.path.append("../")

from graph.graph import process_dataset, transform
from hdc import hdv, bind, bundle, ItemMemory, hdvw, hdva

In [2]:
# encode_graph -> graphHD (graph, vertices, dimensions)


def encode_graph(graph, vertices, dimensions):
    """Encode one graph by binding the vectors of each edge's endpoints and bundling the results.

    Parameters
    ----------
    graph : object exposing ``nodes`` and ``edges`` iterables
        Each edge must be indexable; positions 0 and 1 are its endpoints.
    vertices : dict
        Node -> vector lookup. It is mutated in place: every node of
        ``graph`` that is missing receives a fresh ``hdv(dimensions)`` entry.
    dimensions : int
        Forwarded to ``hdv`` when a new node vector is created.

    Returns
    -------
    Whatever ``bundle`` returns for the list of per-edge ``bind`` results.
    """
    # Allocate a vector only for nodes that have never been seen before, so
    # the same node label maps to the same vector across graphs.
    for node in graph.nodes:
        if node in vertices:
            continue
        vertices[node] = hdv(dimensions)

    # One bound vector per edge, in edge iteration order.
    edge_vectors = [
        bind([vertices[edge[0]], vertices[edge[1]]]) for edge in graph.edges
    ]

    return bundle(edge_vectors)

In [3]:
class PagerankTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that pairs each input graph with its PageRank scores.

    Parameters
    ----------
    alpha : float, default=0.55
        Passed to ``nx.pagerank`` as its second positional argument.
    """

    def __init__(self, alpha=0.55):
        self.alpha = alpha

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list of ``(graph, pagerank_dict)`` tuples, one per graph in ``X``."""
        return [(graph, nx.pagerank(graph, self.alpha)) for graph in X]

In [4]:
class DigitsTransformer(BaseEstimator, TransformerMixin):
    """Relabel every node with its PageRank score rounded to a fixed number of digits.

    Parameters
    ----------
    digits : int, default=5
        Number of decimal digits passed to ``round`` before the score is
        turned into a string label.
    """

    def __init__(self, digits=5):
        self.digits = digits

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Turn ``(graph, scores)`` pairs into graphs whose node labels are rounded-score strings.

        ``X`` is an iterable of ``(graph, scores)`` pairs where ``scores`` maps
        each node to a number. The result is a list of relabelled graphs
        produced by ``nx.relabel_nodes``.
        """
        relabelled = []

        for graph, scores in X:
            # NOTE(review): nodes whose scores round to the same string get the
            # same label, so relabel_nodes presumably collapses them into one
            # node -- confirm this is the intended coarsening.
            mapping = {
                node: str(round(score, self.digits)) for node, score in scores.items()
            }
            relabelled.append(nx.relabel_nodes(graph, mapping))

        return relabelled

In [5]:
class GraphHD(BaseEstimator, ClassifierMixin):
    """Classifier that stores bundled graph encodings per class in an ``ItemMemory``.

    Parameters
    ----------
    dimensions : int, default=10000
        Forwarded to ``encode_graph`` for every graph that is encoded.
    step : int, default=20
        Maximum number of encoded training graphs of one class that are
        combined (via ``bundle``) into a single memory entry.

    Notes
    -----
    Class labels are stored as ``str(label)`` and converted back with
    ``int(...)`` in ``predict``, so labels must survive that round trip.
    """

    def __init__(self, dimensions=10000, step=20):
        self.dimensions = dimensions
        self.step = step
        # Created here so both attributes exist before fit(); fit() replaces them.
        self.memory = ItemMemory()
        self.vertices = dict()

    def fit(self, X, y):
        """Encode the training graphs and store per-class bundles in the memory.

        ``X`` is a sequence of graphs accepted by ``encode_graph`` and ``y``
        holds the matching labels. Returns ``self``.
        """
        # Start from a clean slate: without this reset, calling fit() a second
        # time on the same instance would keep every entry added by the
        # previous fit and mix old and new training data.
        self.memory = ItemMemory()
        self.vertices = dict()

        classes = {label: [] for label in set(y)}

        for graph, label in zip(X, y):
            encoded = encode_graph(graph, self.vertices, self.dimensions)
            classes[label].append(encoded)

        for key, value in classes.items():
            # Bundle the encodings of one class in chunks of `step`; every
            # chunk becomes its own memory entry under the same label.
            for i in range(0, len(value), self.step):
                H = bundle(value[i : i + self.step])
                self.memory.add_vector(str(key), H)

        return self

    def predict(self, X):
        """Return a list with one integer label per graph in ``X``.

        Each graph is encoded with the vertex table built during ``fit``
        (``encode_graph`` adds entries for node labels it has not seen yet)
        and the first element of ``memory.cleanup``'s result is taken as the
        predicted label.
        """
        predictions = []

        for query in X:
            query_vector = encode_graph(query, self.vertices, self.dimensions)
            (label, _, _) = self.memory.cleanup(query_vector)

            predictions.append(int(label))

        return predictions

In [6]:
# Load the "train" split of the MUTAG graph dataset; this is the dataset the
# experiment below is run on.
MUTAG = load_dataset("graphs-datasets/MUTAG")["train"]
# Alternative datasets, currently disabled:
# PROTEINS = load_dataset("graphs-datasets/PROTEINS")["train"]
# AIDS = load_dataset("graphs-datasets/AIDS")["full"]

In [7]:
# Evaluation setup: number of cross-validation folds and how many times the
# cross-validation is repeated by `reps`.
FOLDS = 10
REPS = 3

# Hand-picked hyperparameters; only referenced by the commented-out pipeline
# in `main`.
ALPHA = 0.65
DIGITS = 2
DIMENSIONS = 10000
STEP = 4

# Graphs and their labels as returned by the project's `process_dataset` helper.
graphs, labels = process_dataset(MUTAG)


def reps(pipe, graphs, labels):
    """Run ``cross_val_score`` ``REPS`` times and print each mean plus their average.

    Every repetition uses ``cv=FOLDS`` and ``n_jobs=-1``. Nothing is returned;
    the results are only printed.
    """
    # NOTE(review): with an integer `cv`, scikit-learn documents that the folds
    # are not shuffled, so every repetition presumably sees the same splits and
    # only the model's own randomness can differ -- confirm this is intended.
    total = 0
    for rep in range(REPS):
        mean_score = cross_val_score(
            pipe, graphs, labels, n_jobs=-1, cv=FOLDS
        ).mean()
        total += mean_score
        print(rep, "->", "%.5f" % mean_score)

    print("S => %.5f" % (total / REPS))


def confusionMatrix(pipe, graphs, labels):
    """Print the confusion matrix of out-of-fold predictions for ``pipe``.

    Predictions come from ``cross_val_predict`` with ``cv=FOLDS`` and
    ``n_jobs=-1``; ``labels`` is used as the ground truth.
    """
    out_of_fold = cross_val_predict(pipe, graphs, labels, n_jobs=-1, cv=FOLDS)
    matrix = confusion_matrix(labels, out_of_fold)
    print(matrix)


def hpSearch():
    """Search the pipeline's hyperparameters with successive halving and return the best pipeline.

    Uses the module-level ``graphs`` and ``labels``. Prints the best parameter
    combination and its score, then returns ``best_estimator_`` (refit on the
    data passed to ``fit`` because ``refit=True``).
    """
    steps = [
        ("pagerank", PagerankTransformer()),
        ("digits", DigitsTransformer()),
        ("classifier", GraphHD()),
    ]
    pipeline = Pipeline(steps)

    param_grid = {
        "pagerank__alpha": [0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85],
        "digits__digits": [2, 3, 4, 5],
        "classifier__dimensions": [10000],
        # NOTE(review): 124 breaks the doubling pattern of its neighbours --
        # possibly meant to be 128; left unchanged, confirm with the author.
        "classifier__step": [4, 8, 16, 32, 64, 124, 256],
    }

    halving = HalvingGridSearchCV(
        pipeline, param_grid, n_jobs=-1, verbose=0, refit=True
    )
    search = halving.fit(graphs, labels)

    print("Search P =>", search.best_params_)
    print("Search S => %.5f" % search.best_score_)
    return search.best_estimator_


def learningCurve(pipe, graphs, labels):
    """Compute and plot a learning curve for ``pipe``.

    The curve is evaluated at ten training-set fractions from 10% to 100%
    with ``FOLDS``-fold cross-validation (the same fold count the other
    evaluation helpers use) and drawn with error bars. Nothing is returned.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        pipe,
        graphs,
        labels,
        cv=FOLDS,  # shared constant instead of a second hard-coded fold count
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
    )

    display = LearningCurveDisplay(
        train_sizes=train_sizes,
        train_scores=train_scores,
        test_scores=test_scores,
        score_name="Score",
    )
    display.plot(std_display_style="errorbar")


def main():
    """Run the hyperparameter search, then score the winning pipeline with ``reps``."""
    best_pipeline = hpSearch()

    # To skip the search and use the hand-picked constants instead:
    # best_pipeline = Pipeline(
    #     [
    #         ("pagerank", PagerankTransformer(ALPHA)),
    #         ("digits", DigitsTransformer(DIGITS)),
    #         ("classifier", GraphHD(DIMENSIONS, STEP)),
    #     ],
    # )

    reps(best_pipeline, graphs, labels)
    # Optional extra diagnostics:
    # confusionMatrix(best_pipeline, graphs, labels)
    # learningCurve(best_pipeline, graphs, labels)


main()

Search P => {'classifier__dimensions': 10000, 'classifier__step': 4, 'digits__digits': 2, 'pagerank__alpha': 0.75}
Search S => 0.85841
0 -> 0.80731
1 -> 0.80731
2 -> 0.80731
S => 0.80731
