In [None]:
import numpy as np
import networkx as nx
import functools as ft
import math
import random
from datasets import load_dataset

In [2]:
MUTAG = load_dataset("graphs-datasets/MUTAG")
PROTEINS = load_dataset("graphs-datasets/PROTEINS")
AIDS = load_dataset("graphs-datasets/AIDS")

In [3]:
def hdv(d):
    return np.random.choice([-1, 1], d)


def bind(xs):
    return ft.reduce(lambda x, y: x * y, xs)


def bundle(xs):
    return np.sign(ft.reduce(lambda x, y: x + y, xs))


def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_A = np.linalg.norm(A)
    norm_B = np.linalg.norm(B)

    return dot_product / (norm_A * norm_B)


class ItemMemory:
    def __init__(self, vectors=[]):
        self.vectors = vectors

    def addVector(self, label, V):
        self.vectors.append((label, V))

    def count(self):
        return len(self.vectors)

    def cleanup(self, V):
        return max(self.vectors, key=lambda x: cosine_similarity(V, x[1]))

In [4]:
def encodeGraph(graph, vertices, dimensions, digits, alpha):
    gpr = nx.pagerank(graph, alpha)

    for pr in nx.pagerank(graph, alpha).values():
        key = str(round(pr, digits))
        vertices[key] = hdv(dimensions)

    Edges = []

    for edge in graph.edges:
        keyV1 = str(round(gpr[edge[0]], digits))
        keyV2 = str(round(gpr[edge[1]], digits))
        v1 = vertices[keyV1]
        v2 = vertices[keyV2]
        E = bind([v1, v2])
        Edges.append(E)

    Graph = bundle(Edges)

    return Graph

In [5]:
from sklearn.base import BaseEstimator, ClassifierMixin


class GraphHD(BaseEstimator, ClassifierMixin):
    def __init__(self, dimensions=10000, digits=7, alpha=0.85):
        self.dimensions = dimensions
        self.digits = digits
        self.alpha = alpha
        self.vertices = dict()

    def fit(self, X, y):
        self.memory = ItemMemory([])
        self.labels = list(set(y))
        dictLabels = dict()

        for label in self.labels:
            dictLabels[label] = []

        for i in range(len(X)):
            Graph = encodeGraph(
                X[i], self.vertices, self.dimensions, self.digits, self.alpha
            )
            dictLabels[y[i]].append(Graph)

        for key, value in dictLabels.items():
            Label = bundle(value)
            self.memory.addVector(str(key), Label)

        return self

    def predict(self, X):
        predictions = []
        similarities = []
        for testGraph in X:
            queryVector = encodeGraph(
                testGraph, self.vertices, self.dimensions, self.digits, self.alpha
            )
            cleanVector = self.memory.cleanup(queryVector)

            predictions.append(int(cleanVector[0]))
            similarities.append(cosine_similarity(queryVector, cleanVector[1]))

        print(
            "%.5f" % round(np.mean(similarities), 5),
            "0:",
            predictions.count(0),
            "1:",
            predictions.count(1),
        )
        return predictions

In [6]:
def processDataset(dataset):
    graphs = []
    labels = []

    for graph in dataset:
        G = nx.Graph()
        G.add_edges_from(zip(graph["edge_index"][0], graph["edge_index"][1]))
        graphs.append(G)  # no need for to_undirected()
        labels.append(graph["y"][0])

    return (graphs, labels)

In [7]:
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score

DATASET = MUTAG["train"]
# DATASET = PROTEINS['train']
# DATASET = AIDS["full"]
DIMENSIONS = 10000
DIGITS = 3
ALPHA = 0.35


def main():
    (graphs, labels) = processDataset(DATASET)

    sum = 0
    reps = 5
    for i in range(reps):
        print(i)
        clf = GraphHD(DIMENSIONS, DIGITS, ALPHA)

        scores = cross_val_score(clf, graphs, labels, cv=10, n_jobs=-1)
        sum += scores.mean()
        print("%.5f" % scores.mean())

    print("final score: %.5f" % (sum / reps))


def gridSearch():
    (graphs, labels) = processDataset(DATASET)
    distributions = dict(
        dimensions=[1000, 2500, 10000],
        digits=[3, 4, 5, 6],
        alpha=[0.25, 0.35, 0.45, 0.55, 0.75],
    )
    clf = GridSearchCV(GraphHD(), distributions, n_jobs=-1, verbose=1)
    search = clf.fit(graphs, labels)
    print(search.best_params_)
    print(search.best_score_)
    print(search.best_estimator_)


def confusionMatrix():
    (graphs, labels) = processDataset(DATASET)
    clf = GraphHD(DIMENSIONS, DIGITS, ALPHA)
    y_pred = cross_val_predict(clf, graphs, labels, n_jobs=-1)
    conf_mat = confusion_matrix(labels, y_pred)
    accuracy = accuracy_score(y_pred, labels)
    print(accuracy)
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat, display_labels=[0, 1])
    disp.plot()
    print(conf_mat)
    plt.show()


main()
# gridSearch()
# confusionMatrix()

0
0.30415 0: 0 1: 19
0.27969 0: 0 1: 19
0.24827 0: 3 1: 16
0.29638 0: 1 1: 18
0.30042 0: 1 1: 18
0.31968 0: 1 1: 18
0.29739 0: 1 1: 17
0.31498 0: 0 1: 19
0.21487 0: 4 1: 15
0.23214 0: 1 1: 17
0.69649
1
0.31433 0: 1 1: 18
0.31948 0: 0 1: 19
0.30403 0: 0 1: 19
0.29804 0: 2 1: 17
0.29502 0: 1 1: 18
0.29261 0: 2 1: 16
0.25147 0: 2 1: 17
0.28032 0: 2 1: 17
0.21345 0: 4 1: 15
0.23538 0: 4 1: 14
0.68596
2
0.25046 0: 2 1: 17
0.21568 0: 4 1: 15
0.23729 0: 2 1: 16
0.31974 0: 0 1: 19
0.30198 0: 2 1: 17
0.29430 0: 0 1: 18
0.31254 0: 1 1: 18
0.28028 0: 1 1: 18
0.29248 0: 1 1: 18
0.30802 0: 0 1: 19
0.67018
3
0.30206 0: 1 1: 18
0.30171 0: 0 1: 19
0.27925 0: 0 1: 19
0.24956 0: 2 1: 17
0.31365 0: 1 1: 18
0.29386 0: 1 1: 18
0.21800 0: 4 1: 15
0.29458 0: 0 1: 18
0.31969 0: 1 1: 18
0.23430 0: 3 1: 15
0.72310
4
0.27635 0: 0 1: 19
0.31139 0: 1 1: 18
0.29296 0: 3 1: 16
0.25161 0: 1 1: 18
0.30041 0: 1 1: 18
0.23078 0: 3 1: 15
0.21108 0: 5 1: 14
0.29330 0: 1 1: 17
0.30759 0: 1 1: 18
0.31889 0: 0 1: 19
0.68655


# PROTEINS Dataset

GridSearch result

```python
dimensions=[1000, 2500, 10000],
digits=[2, 3, 4, 5, 6],
alpha=[0.25, 0.35, 0.45, 0.55, 0.75],
```

```json
{'alpha': 0.25, 'digits': 2, 'dimensions': 1000}
0.5956853714701248
```

3 repetition 10 fold validation and confusion matrix

```python
0.59568
[[663   0]
 [450   0]]
```

# AIDS Dataset

GridSearch result

```python
dimensions=[1000, 2500, 10000],
digits=[3, 4, 5, 6],
alpha=[0.25, 0.35, 0.45, 0.55, 0.75],
```

```json
{'alpha': 0.55, 'digits': 6, 'dimensions': 10000}
0.6055
```

3 repetition 10 fold validation and confusion matrix

```python
0.59467
[[ 159  241]
 [ 568 1032]]
```

```
# {'alpha': 0.45, 'digits': 4, 'dimensions': 10000} 10_iter_score: 0.65167 [[17 46] [29 96]]
# {'alpha': 0.75, 'digits': 3, 'dimensions': 5000}
# {'alpha': 0.45, 'digits': 4, 'dimensions': 1000} 0.65079
# {'alpha': 0.25, 'digits': 4, 'dimensions': 1000}
# {'alpha': 0.35, 'digits': 4, 'dimensions': 1000}
# 0.7073099415204679

# {'alpha': 0.35, 'digits': 3, 'dimensions': 1000}
# 0.7131578947368421
# 10 iter 0.68120

# {'alpha': 0.45, 'digits': 3, 'dimensions': 2500}
# 0.7330409356725146
# 10_iter_score: 0.69327 [[11  52] [ 9 116]]

# MUTAG 10reps {'alpha': 0.45, 'digits': 3, 'dimensions': 2500}
# U -> final score: 0.69392, 6.9s
# G -> final score: 0.68175, 7.3s

# AIDS 5reps {'alpha': 0.55, 'digits': 6, 'dimensions': 10000}
# U -> final score: 0.58420, 42.6s
# G -> final score: 0.58930, 44.1s
```
