In [1]:
# Dependencies

import torch
import networkx as nx
from sklearn.base import BaseEstimator, ClassifierMixin
from datasets import load_dataset
from sklearn.model_selection import cross_val_score, ShuffleSplit, cross_val_predict
from graph import process_dataset
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
import sys
import random


sys.path.append("../")

import thdc
from hdc import pm


# Create tensors on the GPU when CUDA is available, otherwise on the CPU.
torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")
# Floating-point tensors created without an explicit dtype are half precision.
torch.set_default_dtype(torch.half)

In [2]:
# Dataset under evaluation: the "train" split of graphs-datasets/MUTAG.
DATASET = load_dataset("graphs-datasets/MUTAG")["train"]
# Alternative datasets (disabled); uncomment exactly one assignment to switch.
# DATASET = load_dataset("graphs-datasets/AIDS")["full"]
# DATASET = load_dataset("graphs-datasets/PROTEINS")["train"]

In [3]:
def create_map(length):
    """Return a random permutation of ``range(length)`` as a dict.

    Each key ``i`` in ``0 .. length - 1`` maps to a distinct value drawn from
    the same range, so the result is a bijection suitable for relabelling
    ``length`` consecutively numbered nodes.
    """
    shuffled = random.sample(range(length), length)
    return dict(enumerate(shuffled))

In [4]:
DIMENSIONS = 2500  # length of every hypervector
# Number of node hypervectors. Node labels are used as row indices into
# VECTORS, so this must be larger than the largest node label of any graph.
# NOTE(review): 28 presumably matches the largest MUTAG graph - confirm before
# switching DATASET.
NODES = 28

# Mirror the notebook-wide default device instead of hard-coding "cuda", so the
# cell also runs on machines without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

(graphs, labels) = process_dataset(DATASET)

# Give every graph a random permutation of its node labels.
for i in range(len(graphs)):
    graphs[i] = nx.relabel_nodes(graphs[i], create_map(len(list(graphs[i]))))


# One random bipolar (+1 / -1) hypervector per node label.
VECTORS = torch.randint(0, 2, (NODES, DIMENSIONS), dtype=torch.int32, device=DEVICE)
VECTORS[VECTORS == 0] = -1

# MAT = torch.from_numpy(pm(DIMENSIONS)).cuda()

In [5]:
def encode(graph):
    """Encode ``graph`` as a hypervector built from a breadth-first traversal.

    A single start node is drawn at random from ``graph``. For every BFS layer
    reached from that node, the rows of ``VECTORS`` indexed by the layer's node
    labels are summed into a running accumulator.

    Args:
        graph: a NetworkX graph whose node labels are valid row indices into
            ``VECTORS``.

    Yields:
        The accumulated 1-D tensor of length ``DIMENSIONS`` (one per sampled
        start node, i.e. exactly one with the current sample size).

    Raises:
        ValueError: if ``graph`` has no nodes (``random.sample`` cannot draw).
    """
    # Follow the device VECTORS lives on rather than hard-coding "cuda"; this
    # keeps the accumulator, the index tensor and VECTORS on the same device.
    device = VECTORS.device
    G = torch.zeros(DIMENSIONS, device=device)
    # Alternatives tried earlier and left disabled: a fixed start node, a DFS
    # pre-order traversal, using every node as a BFS source, and multiplying
    # the accumulator by MAT before adding each layer.
    for n in random.sample(list(graph), 1):
        for vs in nx.bfs_layers(graph, [n]):
            layer = torch.tensor(vs, device=device)
            # Add the sum of this layer's node vectors to the accumulator.
            G = G + torch.index_select(VECTORS, 0, layer).sum(dim=0)
        yield G

In [6]:
class GraphClassifier(BaseEstimator, ClassifierMixin):
    """Scikit-learn style classifier backed by a ``thdc.ItemMemory``.

    Training stores the encoding of every graph in the item memory under its
    label. Prediction encodes the query graph, retrieves entries from the
    memory and lets them vote; the sign of the strongest vote decides the
    class. The voting rule is hard-wired to the labels 1 (positive) and
    anything else (negative), and predictions are always 1 or 0.
    """

    def __init__(self):
        self.memory = thdc.ItemMemory()

    def fit(self, X, y):
        """Store the encodings of the graphs ``X`` under their labels ``y``.

        Returns:
            ``self``, as required by the scikit-learn estimator API.
        """
        # Start from an empty memory so that calling fit() again on the same
        # instance does not keep the vectors of a previous training set.
        self.memory = thdc.ItemMemory()
        for graph, label in zip(X, y):
            for x in encode(graph):
                self.memory.add_vector_wn(label, x)
        return self

    # Alternative fit (disabled): bundle all encodings of a graph into one
    # vector before storing it.
    # def fit(self, X, y):
    #     for i in range(len(X)):
    #         G = torch.zeros(DIMENSIONS, device="cuda")
    #         for x in encode(X[i]):
    #             G = torch.sum(torch.cat([G[None, :], x[None, :]], 0), dim=0)
    #         self.memory.add_vector_wn(y[i], G)
    #     return self

    def predict(self, X):
        """Return a list with one predicted label (1 or 0) per graph in ``X``."""
        p = []
        for query in X:
            # Signed score of the encoding with the largest magnitude so far.
            max_score = 0

            for x in encode(query):
                # p.append(self.memory.cleanup(x)[0])
                # NOTE(review): presumably the 5 best matches, each indexable
                # with the label at [0] and a similarity at [2] - confirm
                # against thdc.ItemMemory.cleanup_all.
                vs = self.memory.cleanup_all(x, 5)
                # Label-1 matches vote positive, all others negative, each
                # weighted by the 4th power of v[2].
                s = sum((1 if v[0] == 1 else -1) * (v[2] ** 4) for v in vs)
                # Compare magnitudes on both sides: comparing against the
                # signed max_score would let any later score overwrite a
                # strongly negative one.
                if abs(s) > abs(max_score):
                    max_score = s

            p.append(1 if max_score >= 0 else 0)

        return p

In [7]:
FOLDS = 10  # number of cross-validation splits per run
REPS = 3  # number of cross-validation runs averaged by main()


def main():
    """Run ``REPS`` shuffle-split cross-validations and print the accuracies.

    Each repetition evaluates a fresh ``GraphClassifier`` on ``graphs`` /
    ``labels`` with ``FOLDS`` random splits and prints its mean score; the
    average over all repetitions is printed at the end.
    """
    # Collected in a list instead of a local named ``sum``, which would shadow
    # the built-in.
    mean_scores = []
    for _ in range(REPS):
        clf = GraphClassifier()
        scores = cross_val_score(
            clf,
            graphs,
            labels,
            cv=ShuffleSplit(n_splits=FOLDS),
            n_jobs=1,
            verbose=4,
        )
        mean_scores.append(scores.mean())
        print("Acc =>", mean_scores[-1])

    print("Avg Acc =>", sum(mean_scores) / REPS)


def conf():
    """Print accuracy, adjusted balanced accuracy and the confusion matrix.

    Predictions come from a ``FOLDS``-fold ``cross_val_predict`` of a fresh
    ``GraphClassifier`` over ``graphs`` / ``labels``.
    """
    predicted = cross_val_predict(
        GraphClassifier(), graphs, labels, cv=FOLDS, n_jobs=1
    )
    reports = (
        accuracy_score,
        lambda truth, guess: balanced_accuracy_score(truth, guess, adjusted=True),
        confusion_matrix,
    )
    # Evaluate and print one report at a time, in this order.
    for report in reports:
        print(report(labels, predicted))


# Run the repeated cross-validation; call conf() instead for the
# confusion-matrix report.
main()
# conf()

[CV] END ................................ score: (test=0.947) total time=   1.8s
[CV] END ................................ score: (test=0.789) total time=   1.0s
[CV] END ................................ score: (test=0.789) total time=   1.0s
[CV] END ................................ score: (test=0.842) total time=   1.0s
[CV] END ................................ score: (test=0.737) total time=   1.0s
[CV] END ................................ score: (test=0.947) total time=   1.0s
[CV] END ................................ score: (test=0.895) total time=   1.0s
[CV] END ................................ score: (test=0.895) total time=   1.0s
[CV] END ................................ score: (test=0.842) total time=   1.0s
[CV] END ................................ score: (test=0.947) total time=   1.0s
Acc => 0.8631578947368421
[CV] END ................................ score: (test=0.842) total time=   1.0s
[CV] END ................................ score: (test=0.895) total time=   1.0s
[C