In [10]:
# Dependencies


import numpy as np
import functools as ft
import networkx as nx
from sklearn.base import BaseEstimator, ClassifierMixin
from datasets import load_dataset
from sklearn.model_selection import cross_val_score, ShuffleSplit

from graph import process_dataset, transform
from hdc import hdv, bind, bundle, ItemMemory, hdvw, hdva

In [2]:
# encode_graph(graph, vertices, dimensions): GraphHD encoding of a single graph


def encode_graph(graph, vertices, dimensions):
    """Encode one graph as a bundle of bound endpoint vectors.

    Every node of ``graph`` that has no entry in ``vertices`` yet receives a
    new vector from ``hdv(dimensions)``. The mapping is updated in place, so
    later calls that share the same ``vertices`` reuse the stored vector for
    the same node.

    Args:
        graph: Object exposing ``nodes`` and ``edges`` iterables; each edge is
            indexable, with its two endpoints at positions 0 and 1.
        vertices: Mutable mapping from node to its vector (modified in place).
        dimensions: Passed unchanged to ``hdv`` for each newly seen node.

    Returns:
        The result of ``bundle`` applied to the list of per-edge ``bind``
        results, in ``graph.edges`` iteration order.
    """
    # Register unseen nodes first so every edge endpoint can be looked up.
    for vertex in graph.nodes:
        if vertex in vertices:
            continue
        vertices[vertex] = hdv(dimensions)

    # One bound vector per edge, built from the vectors of its two endpoints.
    edge_vectors = [
        bind([vertices[edge[0]], vertices[edge[1]]]) for edge in graph.edges
    ]

    # NOTE(review): a graph without edges hands an empty list to `bundle`;
    # how `bundle` treats that is not visible from here — confirm.
    return bundle(edge_vectors)

In [7]:
class GraphHD(BaseEstimator, ClassifierMixin):
    """Scikit-learn style classifier built on ``encode_graph`` encodings.

    ``fit`` encodes every training graph with ``encode_graph``, bundles the
    encodings of each class in chunks of at most ``step`` graphs and stores
    each chunk in an ``ItemMemory`` under the stringified class label.
    ``predict`` encodes each query graph the same way and returns the label
    reported by ``ItemMemory.cleanup``.

    Args:
        alpha: Forwarded, together with ``digits``, to ``transform``.
        digits: Forwarded, together with ``alpha``, to ``transform``.
        dimensions: Forwarded to ``encode_graph`` for newly seen nodes.
        step: Maximum number of encoded graphs bundled into one stored
            class vector.
    """

    def __init__(self, alpha=0.45, digits=4, dimensions=10000, step=20):
        self.alpha = alpha
        self.digits = digits
        self.dimensions = dimensions
        self.step = step
        # Learned state; rebuilt from scratch at the start of every `fit`.
        self.memory = ItemMemory()
        self.vertices = dict()

    def fit(self, X, y):
        """Build the class memory from training inputs ``X`` and labels ``y``.

        Args:
            X: Training inputs, converted to graphs by ``transform``.
            y: One label per input, in the same order. Labels are stored as
                ``str(label)`` and read back with ``int(...)`` in ``predict``,
                so they are assumed to be integer-like.

        Returns:
            ``self``, following the scikit-learn estimator convention.
        """
        # Refitting must not keep class vectors or node vectors learned by a
        # previous call, otherwise stale entries leak into later predictions.
        self.memory = ItemMemory()
        self.vertices = dict()

        encoded_by_class = {label: [] for label in set(y)}
        graphs = transform(X, self.alpha, self.digits)

        for graph, label in zip(graphs, y):
            encoded = encode_graph(graph, self.vertices, self.dimensions)
            encoded_by_class[label].append(encoded)

        # Each class contributes one stored vector per chunk of `step` graphs.
        for label, encodings in encoded_by_class.items():
            for start in range(0, len(encodings), self.step):
                chunk_vector = bundle(encodings[start : start + self.step])
                self.memory.add_vector(str(label), chunk_vector)

        return self

    def predict(self, X):
        """Predict an integer label for every input in ``X``.

        Args:
            X: Query inputs, converted to graphs by ``transform``.

        Returns:
            A list with one ``int`` label per query, in input order.
        """
        queries = transform(X, self.alpha, self.digits)
        predictions = []

        for query in queries:
            # `encode_graph` also adds vectors for nodes not seen during
            # `fit` to `self.vertices`.
            query_vector = encode_graph(query, self.vertices, self.dimensions)
            # Only the first element of the cleanup result (the label) is used.
            label, _, _ = self.memory.cleanup(query_vector)
            predictions.append(int(label))

        return predictions

In [4]:
# Load the MUTAG graph dataset with `datasets.load_dataset` and keep the entry
# stored under the "train" key (presumably the only split needed here).
MUTAG = load_dataset("graphs-datasets/MUTAG")["train"]
# Alternative datasets, currently disabled:
# PROTEINS = load_dataset("graphs-datasets/PROTEINS")["train"]
# IMDB = load_dataset("graphs-datasets/IMDB-BINARY")["train"]

In [17]:
# Cross-validation setup: `cv` value per run and number of repeated runs.
FOLDS = 10
REPS = 3

# GraphHD constructor arguments (alpha, digits, dimensions, step).
ALPHA = 0.65
DIGITS = 2
DIMENSIONS = 10000
STEP = 4

# `process_dataset` returns a pair that is unpacked into graphs and labels.
graphs, labels = process_dataset(MUTAG)


def main():
    """Print the cross-validated GraphHD accuracy averaged over REPS runs.

    Each run calls ``cross_val_score`` with ``cv=FOLDS`` on the module-level
    ``graphs`` and ``labels`` and keeps the mean of the fold scores; the
    printed value is the mean of those per-run means.
    """
    clf = GraphHD(ALPHA, DIGITS, DIMENSIONS, STEP)

    # One mean fold score per repetition, collected in run order.
    run_means = [
        cross_val_score(clf, graphs, labels, n_jobs=-1, cv=FOLDS, verbose=0).mean()
        for _ in range(REPS)
    ]

    print("Acc => %.5f" % (sum(run_means) / REPS))


main()

Acc => 0.85244
