In [16]:
# Dependencies


import numpy as np
import networkx as nx
from sklearn.base import BaseEstimator, ClassifierMixin
from datasets import load_dataset
from sklearn.model_selection import cross_val_score, ShuffleSplit

from graph import process_dataset, transform
from hdc import hdv, bind, bundle, sbundle, ItemMemory, hdvw, hdva, cosim

In [24]:
# encode_graph -> graphHD (graph, vertices, dimensions)


def encode_graph(graph, vertices, dimensions):
    """Encode a graph as the bundle of its bound edge-endpoint vectors.

    Every node of ``graph`` that has no entry in ``vertices`` is first given
    a new vector obtained from ``hdv(dimensions)``. ``vertices`` is updated
    in place, so passing the same mapping to later calls reuses the vectors.

    Args:
        graph: object exposing ``nodes`` and ``edges`` iterables; each edge
            must be indexable at positions 0 and 1 (its two endpoints).
        vertices: dict mapping node -> vector; mutated in place.
        dimensions: forwarded unchanged to ``hdv`` for unseen nodes.

    Returns:
        The result of ``bundle`` applied to the list holding one
        ``bind([v1, v2])`` result per edge, in edge iteration order.
    """
    # Make sure every node has a vector before any edge is encoded.
    for vertex in graph.nodes:
        if vertex in vertices:
            continue
        vertices[vertex] = hdv(dimensions)

    # One bound vector per edge, built from the vectors of its endpoints.
    edge_vectors = [
        bind([vertices[edge[0]], vertices[edge[1]]]) for edge in graph.edges
    ]
    return bundle(edge_vectors)

In [25]:
# encode_graphw -> vertices encoded with hdvw, plus edges (graph, vertices, base)


def encode_graphw(graph, vertices, base):
    """Encode a graph from its edges, using ``hdvw`` vectors for the nodes.

    A node without an entry in ``vertices`` gets the vector
    ``hdvw(base, float(node))``; node identifiers must therefore be
    convertible with ``float``. ``vertices`` is updated in place.

    Args:
        graph: object exposing ``nodes`` and ``edges`` iterables; each edge
            must be indexable at positions 0 and 1 (its two endpoints).
        vertices: dict mapping node -> vector; mutated in place.
        base: forwarded unchanged as the first argument of ``hdvw``.

    Returns:
        The result of ``bundle`` applied to the list holding one
        ``bind([v1, v2])`` result per edge, in edge iteration order.
    """
    # Populate vectors for nodes that have not been seen before.
    for vertex in graph.nodes:
        if not (vertex in vertices):
            vertices[vertex] = hdvw(base, float(vertex))

    bound_edges = []
    for endpoints in graph.edges:
        source_vector = vertices[endpoints[0]]
        target_vector = vertices[endpoints[1]]
        bound_edges.append(bind([source_vector, target_vector]))

    return bundle(bound_edges)

In [26]:
# encode_graphv -> vertices only, no edges (graph, vertices, base); NOTE: the code builds vertex vectors with hdvw, not hdv


def encode_graphv(graph, vertices, base):
    """Encode a graph from its node vectors only; edges are not read.

    A node without an entry in ``vertices`` gets the vector
    ``hdvw(base, float(node))``; ``vertices`` is updated in place.

    NOTE(review): this does the same work as ``encode_graphvw`` (both build
    vertex vectors with ``hdvw``); confirm whether this variant was meant
    to differ.

    Args:
        graph: object exposing a ``nodes`` iterable.
        vertices: dict mapping node -> vector; mutated in place.
        base: forwarded unchanged as the first argument of ``hdvw``.

    Returns:
        The result of ``bundle`` applied to the list of node vectors, in
        node iteration order.
    """

    def vector_for(node):
        # Create the vector on first sight, then always read it back
        # from the shared mapping.
        if node not in vertices:
            vertices[node] = hdvw(base, float(node))
        return vertices[node]

    return bundle([vector_for(node) for node in graph.nodes])

In [27]:
# encode_graphvw -> vertices encoded with hdvw, no edges (graph, vertices, base)


def encode_graphvw(graph, vertices, base):
    """Encode a graph as the bundle of its ``hdvw`` node vectors.

    Edges are not read. A node without an entry in ``vertices`` gets the
    vector ``hdvw(base, float(node))``; ``vertices`` is updated in place.

    Args:
        graph: object exposing a ``nodes`` iterable.
        vertices: dict mapping node -> vector; mutated in place.
        base: forwarded unchanged as the first argument of ``hdvw``.

    Returns:
        The result of ``bundle`` applied to the list of node vectors, in
        node iteration order.
    """
    node_vectors = []

    for vertex in graph.nodes:
        already_known = vertex in vertices
        if not already_known:
            vertices[vertex] = hdvw(base, float(vertex))
        node_vectors += [vertices[vertex]]

    return bundle(node_vectors)

In [28]:
class GraphEstimator(BaseEstimator, ClassifierMixin):
    """Scikit-learn style classifier built on an item memory of graph vectors.

    ``fit`` encodes every training graph with ``encoder``, bundles the
    encodings of each class in chunks of ``step`` graphs, and stores each
    chunk in ``self.memory`` under the class label. ``predict`` encodes each
    query graph and takes the label returned by ``self.memory.cleanup``.

    Parameters
    ----------
    encoder : callable
        Called as ``encoder(graph, vertices)`` and expected to return the
        vector for ``graph``. ``vertices`` is the estimator's shared
        node -> vector dict; the encoders defined in this notebook add
        entries to it for nodes they have not seen before.
    alpha, digits :
        Forwarded unchanged to ``transform(X, alpha, digits)``.
    step : int
        Number of encoded graphs of one class bundled into a single memory
        entry.

    Attributes
    ----------
    memory : ItemMemory
        Holds the bundled class vectors; rebuilt by every ``fit`` call.
    vertices : dict
        Node -> vector cache shared by ``fit`` and ``predict``; rebuilt by
        every ``fit`` call.
    """

    def __init__(self, encoder, alpha=0.45, digits=4, step=20):
        self.encoder = encoder
        self.alpha = alpha
        self.digits = digits
        self.step = step
        self.memory = ItemMemory()
        self.vertices = dict()

    def fit(self, X, y):
        """Build the class memory from training data.

        Parameters
        ----------
        X : passed to ``transform`` to obtain the training graphs.
        y : iterable of labels, paired with the graphs by position.

        Returns
        -------
        self
        """
        # Start from a clean state: without this, calling fit() again on the
        # same instance would keep the previous fit's memory entries and
        # cached node vectors.
        self.memory = ItemMemory()
        self.vertices = dict()

        graphs = transform(X, self.alpha, self.digits)
        classes = {label: [] for label in set(y)}

        # Pair graphs and labels by position rather than y[i], so label
        # containers with a non-positional index are handled as well.
        for graph, label in zip(graphs, y):
            classes[label].append(self.encoder(graph, self.vertices))

        # Each class contributes one memory entry per chunk of `step` graphs.
        for key, value in classes.items():
            for i in range(0, len(value), self.step):
                H = bundle(value[i : i + self.step])
                self.memory.add_vector(str(key), H)

        return self

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Labels are stored in memory as ``str(label)`` and converted back
        with ``int``, so training labels must be integer-like.

        Returns
        -------
        list of int
            One predicted label per query, in input order.
        """
        predictions = []

        for query in transform(X, self.alpha, self.digits):
            query_vector = self.encoder(query, self.vertices)
            # cleanup() returns a 3-tuple; only its first element (the
            # label) is used here.
            label, _, _ = self.memory.cleanup(query_vector)
            predictions.append(int(label))

        return predictions

In [29]:
# Load the "train" split of the MUTAG dataset via load_dataset. The
# commented-out lines are alternative datasets that can be enabled instead.
MUTAG = load_dataset("graphs-datasets/MUTAG")["train"]
# PROTEINS = load_dataset("graphs-datasets/PROTEINS")["train"]
# AIDS = load_dataset("graphs-datasets/AIDS")["full"]
# IMDB = load_dataset("graphs-datasets/IMDB-BINARY")["train"]

In [30]:
# Split the loaded dataset into `graphs` and `labels`; these are later passed
# to cross_val_score as its X and y arguments.
(graphs, labels) = process_dataset(MUTAG)

In [31]:
from functools import partial
import time

# Experiment configuration.
#   REPS       - number of cross-validation repetitions averaged per encoder
#   ALPHA, DIGITS, STEP - passed to GraphEstimator as alpha / digits / step
#   DIMENSIONS - argument given to hdv() when creating vectors
#   CV         - mirrors FOLDS
FOLDS, REPS = 10, 5
ALPHA, DIGITS, DIMENSIONS, STEP = 0.65, 2, 10000, 4
CV = FOLDS

# One entry per encoding variant, evaluated by main() in this order. The
# trailing argument of each encoder is pre-bound with partial so that every
# entry can be called as encoder(graph, vertices). Each base-vector variant
# gets its own base from a separate hdv(DIMENSIONS) call.
encoders = [
    partial(encode_graph, dimensions=DIMENSIONS),
    partial(encode_graphv, base=hdv(DIMENSIONS)),
    partial(encode_graphw, base=hdv(DIMENSIONS)),
    partial(encode_graphvw, base=hdv(DIMENSIONS)),
]


def main(seed=None):
    """Evaluate every encoder with repeated shuffle-split cross-validation.

    For each entry of ``encoders`` a ``GraphEstimator`` is built and scored
    ``REPS`` times with ``cross_val_score`` on ``graphs`` / ``labels``, each
    repetition using a new ``ShuffleSplit`` with ``FOLDS`` splits. One line
    is printed per encoder:

        S => mean accuracy over the repetitions
        T => mean elapsed seconds per repetition

    Args:
        seed: optional int. When given, repetition ``r`` uses
            ``random_state=seed + r`` so the splits are reproducible and
            identical for every encoder. The default ``None`` keeps the
            splits unseeded.
    """
    for encoder in encoders:
        clf = GraphEstimator(encoder, alpha=ALPHA, digits=DIGITS, step=STEP)
        total_score = 0.0
        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()
        for rep in range(REPS):
            splitter = ShuffleSplit(
                n_splits=FOLDS,
                random_state=None if seed is None else seed + rep,
            )
            scores = cross_val_score(
                clf,
                graphs,
                labels,
                n_jobs=-1,
                cv=splitter,
                verbose=0,
                error_score="raise",
            )
            total_score += scores.mean()
        elapsed = time.perf_counter() - start_time
        print(
            "S => %.5f" % (total_score / REPS),
            "T => %.5f" % (elapsed / REPS),
        )


# Run the benchmark; prints one "S => ... T => ..." line per encoder.
main()

S => 0.83053 T => 0.44938
S => 0.85368 T => 0.41567
S => 0.82421 T => 0.42785
S => 0.82947 T => 0.40381
