In [1]:
import numpy as np
import networkx as nx
import functools as ft
import math
import random
from datasets import load_dataset
import torch
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    HalvingGridSearchCV,
    LearningCurveDisplay,
    learning_curve,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_predict,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make torch create new tensors on the GPU when CUDA is available, otherwise on the CPU.
torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# def hdv(d):
#     return torch.sign(torch.randint(-10000000, 10000000, (d,), dtype=torch.float32))


# def bind(xs):
#     return torch.prod(torch.stack(xs), axis=0)


# def bundle(xs):
#     return torch.sign(torch.sum(torch.stack(xs), axis=0))


# def cosine_similarity(A, B):
#     dot_product = torch.dot(A, B)
#     norm_A = torch.norm(A)
#     norm_B = torch.norm(B)

#     if norm_A == 0 or norm_B == 0:
#         return 0

#     return dot_product / (norm_A * norm_B)


def hdv(d):
    """Draw a random bipolar hypervector.

    Parameters
    ----------
    d : int or tuple of int
        Output shape, forwarded to ``np.random.choice`` as ``size``.

    Returns
    -------
    numpy.ndarray
        Array whose entries are sampled from ``{-1, 1}`` using NumPy's
        global random state.
    """
    bipolar_values = [-1, 1]
    return np.random.choice(bipolar_values, size=d)


def bind(xs):
    """Bind hypervectors by multiplying them together element-wise.

    Parameters
    ----------
    xs : iterable
        Operands to combine with ``*``, folded left to right. A single
        operand is returned unchanged; an empty iterable raises the
        ``TypeError`` produced by ``functools.reduce``.

    Returns
    -------
    The running product of all operands.
    """

    def _multiply(accumulated, operand):
        return accumulated * operand

    return ft.reduce(_multiply, xs)


def bundle(xs):
    """Bundle hypervectors: add them element-wise and keep only the sign.

    Parameters
    ----------
    xs : iterable
        Operands to combine with ``+``, folded left to right. An empty
        iterable raises the ``TypeError`` produced by ``functools.reduce``.

    Returns
    -------
    ``np.sign`` of the accumulated sum, so positions where the operands
    cancel out exactly come back as 0.
    """

    def _add(accumulated, operand):
        return accumulated + operand

    summed = ft.reduce(_add, xs)
    return np.sign(summed)


def cosine_similarity(A, B):
    """Return the cosine of the angle between vectors ``A`` and ``B``.

    The dot product is divided by the product of the two Euclidean norms.
    When either vector has zero norm the similarity is undefined and the
    integer ``0`` is returned instead.
    """
    inner = np.dot(A, B)
    length_a, length_b = np.linalg.norm(A), np.linalg.norm(B)

    if length_a != 0 and length_b != 0:
        return inner / (length_a * length_b)

    # At least one operand is the zero vector.
    return 0


class ItemMemory:
    """Associative memory holding ``(label, hypervector)`` pairs."""

    def __init__(self, vectors=None):
        """Create the memory.

        Parameters
        ----------
        vectors : list of (label, vector) tuples, optional
            Initial content. The list is stored as-is (not copied). When
            omitted, a fresh empty list is created for this instance; a
            mutable ``[]`` default would be shared by every instance built
            without arguments.
        """
        self.vectors = [] if vectors is None else vectors

    def addVector(self, label, V):
        """Append hypervector ``V`` to the memory under ``label``."""
        self.vectors.append((label, V))

    def count(self):
        """Return the number of stored ``(label, vector)`` pairs."""
        return len(self.vectors)

    def cleanup(self, V):
        """Return the stored ``(label, vector)`` pair most similar to ``V``.

        Similarity is measured with ``cosine_similarity``; on ties the
        earliest stored pair wins (``max`` semantics).

        Raises
        ------
        ValueError
            If the memory is empty.
        """
        if len(self.vectors) == 0:
            raise ValueError("cannot clean up a query against an empty ItemMemory")
        return max(self.vectors, key=lambda entry: cosine_similarity(V, entry[1]))

In [4]:
def encodeGraph(graph, vertices, dimensions):
    """Encode a graph as a single hypervector.

    Every edge is encoded by binding the hypervectors of its two endpoints,
    and the graph is the bundle of all its edge hypervectors.

    Parameters
    ----------
    graph : object exposing ``nodes`` and ``edges`` iterables
        Each edge must be indexable, with the endpoints at positions 0 and 1.
    vertices : dict
        Maps node -> hypervector. It is updated in place: nodes that are not
        present yet receive a fresh random hypervector from ``hdv``.
    dimensions : int
        Dimensionality used for newly created hypervectors.

    Returns
    -------
    numpy.ndarray
        The graph hypervector. A graph without edges yields an all-zero
        vector (the bundle of nothing) instead of crashing inside
        ``functools.reduce`` on an empty sequence.
    """
    for node in graph.nodes:
        if node not in vertices:
            vertices[node] = hdv(dimensions)

    edge_vectors = [
        bind([vertices[edge[0]], vertices[edge[1]]]) for edge in graph.edges
    ]

    if not edge_vectors:
        # Nothing to bundle: return the neutral element so callers can still
        # bundle or compare the result.
        return np.zeros(dimensions, dtype=int)

    return bundle(edge_vectors)

In [5]:
def processDataset(dataset):
    """Convert dataset records into networkx graphs and their labels.

    Each record is read through two keys: ``"edge_index"``, whose first two
    entries are zipped into (source, target) edge pairs, and ``"y"``, whose
    first element is used as the label. Only edges are added to the graph,
    so nodes that appear in no edge are not part of the result.

    Returns
    -------
    tuple
        ``(graphs, labels)`` as two parallel lists.
    """
    graphs, labels = [], []

    for record in dataset:
        edge_index = record["edge_index"]
        edge_pairs = zip(edge_index[0], edge_index[1])

        G = nx.Graph()
        G.add_edges_from(edge_pairs)

        graphs.append(G)
        labels.append(record["y"][0])

    return (graphs, labels)

In [6]:
class PagerankTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that attaches PageRank scores to every graph.

    Parameters
    ----------
    alpha : float, default=0.55
        Passed to ``nx.pagerank`` as its second positional argument.
    """

    def __init__(self, alpha=0.55):
        self.alpha = alpha

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list of ``(graph, pagerank_dict)`` tuples, one per graph in ``X``."""
        return [(graph, nx.pagerank(graph, self.alpha)) for graph in X]

In [7]:
class DigitsTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that renames nodes after their rounded score.

    Parameters
    ----------
    digits : int, default=5
        Number of decimal places kept when rounding each node's score.
    """

    def __init__(self, digits=5):
        self.digits = digits

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Relabel every graph of ``X``, given as ``(graph, scores)`` pairs.

        Each node is renamed to the string form of its score rounded to
        ``self.digits`` places, so nodes with equal rounded scores receive
        the same label.
        NOTE(review): networkx presumably merges nodes that end up with the
        same label; confirm this is the intended behaviour.
        """
        relabeled = []

        for graph, scores in X:
            mapping = {
                node: str(round(score, self.digits)) for node, score in scores.items()
            }
            relabeled.append(nx.relabel_nodes(graph, mapping))

        return relabeled

In [8]:
class GraphHD(BaseEstimator, ClassifierMixin):
    """Graph classifier based on hyperdimensional computing.

    Training graphs are encoded as hypervectors with ``encodeGraph``. For
    each class, the encodings are bundled in chunks of ``step`` graphs and
    every chunk is stored as one prototype in an ``ItemMemory``. A query
    graph is assigned the label of the most similar prototype.

    Parameters
    ----------
    dimensions : int, default=10000
        Dimensionality of the hypervectors.
    step : int, default=20
        Maximum number of same-class graphs bundled into one prototype.
    """

    def __init__(self, dimensions=10000, step=20):
        self.dimensions = dimensions
        self.step = step

    def fit(self, X, y):
        """Build the class prototypes from graphs ``X`` and labels ``y``.

        Labels are stored in the memory as ``str(label)``; ``predict``
        converts them back with ``int``, so labels must be integer-like.
        """
        self.vertices = dict()
        self.memory = ItemMemory([])
        self.labels = list(set(y))

        encoded_by_label = {label: [] for label in self.labels}

        for graph, label in zip(X, y):
            encoded = encodeGraph(graph, self.vertices, self.dimensions)
            encoded_by_label[label].append(encoded)

        for label, encodings in encoded_by_label.items():
            # One prototype per chunk of at most ``step`` encodings.
            for start in range(0, len(encodings), self.step):
                prototype = bundle(encodings[start : start + self.step])
                self.memory.addVector(str(label), prototype)

        return self

    def predict(self, X):
        """Return a list with the predicted integer label of each graph in ``X``.

        Note that encoding a query graph adds a fresh random hypervector to
        ``self.vertices`` for every node label not seen before.
        """
        predictions = []

        for testGraph in X:
            queryVector = encodeGraph(testGraph, self.vertices, self.dimensions)
            # cleanup() already ranks the prototypes by cosine similarity, so
            # the winning similarity is not recomputed here.
            label, _prototype = self.memory.cleanup(queryVector)
            predictions.append(int(label))

        return predictions

In [9]:
# Load one graph-classification benchmark with datasets.load_dataset and convert
# it into networkx graphs plus labels. To switch benchmark, uncomment the
# matching load_dataset / DATASET pair below and comment out the active one.
# MUTAG = load_dataset("graphs-datasets/MUTAG")
PROTEINS = load_dataset("graphs-datasets/PROTEINS")
# AIDS = load_dataset("graphs-datasets/AIDS")
# DATASET = MUTAG["train"]
DATASET = PROTEINS["train"]
# DATASET = AIDS["full"]

# graphs and labels are module-level globals read by hpSearch() and main().
(graphs, labels) = processDataset(DATASET)

In [11]:
# Hyper-parameters for the hand-built pipeline; they are only referenced by the
# commented-out pipeline inside main().
ALPHA = 0.65  # PagerankTransformer alpha
DIGITS = 2  # DigitsTransformer rounding precision
DIMENSIONS = 10000  # GraphHD hypervector dimensionality
STEP = 20  # GraphHD graphs bundled per prototype

FOLDS = 10  # cross-validation folds used by reps() and confusionMatrix()
REPS = 3  # number of cross-validation repetitions averaged by reps()


def reps(pipe, graphs, labels):
    """Cross-validate ``pipe`` ``REPS`` times and print the mean accuracies.

    Each repetition runs a ``FOLDS``-fold ``cross_val_score`` on all cores
    and prints its mean score; the average over all repetitions is printed
    at the end. Nothing is returned.
    """
    total = 0
    for rep in range(REPS):
        fold_scores = cross_val_score(pipe, graphs, labels, n_jobs=-1, cv=FOLDS)
        rep_mean = fold_scores.mean()
        total += rep_mean
        print(rep, "->", "%.5f" % rep_mean)

    print("S => %.5f" % (total / REPS))


def confusionMatrix(pipe, graphs, labels):
    """Print the confusion matrix of ``pipe``'s cross-validated predictions.

    Predictions come from a ``FOLDS``-fold ``cross_val_predict`` run on all
    cores and are compared against ``labels``.
    """
    predicted = cross_val_predict(pipe, graphs, labels, n_jobs=-1, cv=FOLDS)
    matrix = confusion_matrix(labels, predicted)
    print(matrix)


def hpSearch():
    """Grid-search the pipeline hyper-parameters and return the best pipeline.

    Builds the PageRank -> digits -> GraphHD pipeline, exhaustively searches
    the grid below with ``GridSearchCV`` on the module-level ``graphs`` and
    ``labels``, prints the best parameters and score, and returns the best
    estimator refitted on all the data.
    """
    pipe = Pipeline(
        [
            ("pagerank", PagerankTransformer()),
            ("digits", DigitsTransformer()),
            ("classifier", GraphHD()),
        ],
    )
    param_grid = dict(
        pagerank__alpha=[0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85],
        digits__digits=[2, 3, 4, 5],
        classifier__dimensions=[10000],
        # Powers of two; 128 replaces the former 124, which broke the
        # progression and looks like a typo.
        classifier__step=[4, 8, 16, 32, 64, 128, 256],
    )
    clf = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=3, refit=True)
    search = clf.fit(graphs, labels)
    print("Search P =>", search.best_params_)
    print("Search S => %.5f" % search.best_score_)
    return search.best_estimator_


def learningCurve(pipe, graphs, labels):
    """Plot the learning curve of ``pipe`` with error bars.

    Scores are computed at ten training-set sizes, from 10% to 100% of the
    available training data, using ``FOLDS``-fold cross-validation (the same
    fold count as the other evaluation helpers) on all cores.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        pipe,
        graphs,
        labels,
        cv=FOLDS,
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
    )
    display = LearningCurveDisplay(
        train_sizes=train_sizes,
        train_scores=train_scores,
        test_scores=test_scores,
        score_name="Score",
    )
    display.plot(std_display_style="errorbar")


def main():
    """Run the experiment: tune the pipeline, then score it repeatedly.

    The pipeline returned by ``hpSearch`` is evaluated with ``reps`` on the
    module-level ``graphs`` and ``labels``, the same data the search used.
    """
    best_pipeline = hpSearch()

    # Alternative: skip the search and use the fixed hyper-parameters.
    # best_pipeline = Pipeline(
    #     [
    #         ("pagerank", PagerankTransformer(ALPHA)),
    #         ("digits", DigitsTransformer(DIGITS)),
    #         ("classifier", GraphHD(DIMENSIONS, STEP)),
    #     ],
    # )

    reps(best_pipeline, graphs, labels)
    # Optional diagnostics:
    # confusionMatrix(best_pipeline, graphs, labels)
    # learningCurve(best_pipeline, graphs, labels)


main()

Fitting 5 folds for each of 196 candidates, totalling 980 fits
[CV 1/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.25;, score=0.592 total time=   4.1s
[CV 3/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.25;, score=0.592 total time=   4.4s
[CV 2/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.25;, score=0.601 total time=   4.8s
[CV 4/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.25;, score=0.613 total time=   5.2s
[CV 2/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.35;, score=0.583 total time=   5.0s
[CV 4/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.35;, score=0.590 total time=   5.1s
[CV 5/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.35;, score=0.541 total time=