In [20]:
import numpy as np
import networkx as nx
from functools import reduce as ftreduce
import math
import random
from datasets import load_dataset
import torch

In [60]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

In [110]:
import timeit
import numpy as np

# Tests numpy vs torch
t1 = timeit.timeit(lambda: np.random.choice([-1, 1], 10000), number=10000)
t2 = timeit.timeit(
    lambda: torch.sign(torch.randint(-10000, 10000, (10000,), dtype=torch.float32)),
    number=10000,
)

A = np.random.choice([-1, 1], 10000)
B = np.random.choice([-1, 1], 10000)


def npmul():
    A = np.random.choice([-1, 1], 10000)
    B = np.random.choice([-1, 1], 10000)
    np.multiply(A, B)


def torchmul():
    C = torch.randint(0, 2, (10000,), dtype=torch.float32)
    D = torch.randint(0, 2, (10000,), dtype=torch.float32)
    torch.mul(C, D)


def npadd():
    A = np.random.choice([-1, 1], 10000)
    B = np.random.choice([-1, 1], 10000)
    np.add(A, B)


def torchadd():
    C = torch.randint(-1, 2, (10000,), dtype=torch.float32)
    D = torch.randint(-1, 2, (10000,), dtype=torch.float32)
    torch.add(C, D)


def npsign():
    A = np.random.choice([-1, 1], 10000)
    np.sign(A)


def torchsign():
    C = torch.randint(-1, 2, (10000,), dtype=torch.float32)
    torch.sign(C)


def npdot():
    A = np.random.choice([-1, 1], 10000)
    B = np.random.choice([-1, 1], 10000)
    np.dot(A, B)


def torchdot():
    C = torch.randint(-1, 2, (10000,), dtype=torch.float32)
    D = torch.randint(-1, 2, (10000,), dtype=torch.float32)
    torch.dot(C, D)


def npnorm():
    A = np.random.choice([-1, 1], 10000)
    np.linalg.norm(A)


def torchnorm():
    C = torch.randint(-1, 2, (10000,), dtype=torch.float32)
    torch.norm(C)


print(torch.randint(-1, 2, (10000,)))

t3 = timeit.timeit(lambda: npmul(), number=10000)
t4 = timeit.timeit(lambda: torchmul(), number=10000)
t5 = timeit.timeit(lambda: npadd(), number=10000)
t6 = timeit.timeit(lambda: torchadd(), number=10000)
t7 = timeit.timeit(lambda: npsign(), number=10000)
t8 = timeit.timeit(lambda: torchsign(), number=10000)
t9 = timeit.timeit(lambda: npdot(), number=10000)
t10 = timeit.timeit(lambda: torchdot(), number=10000)
t11 = timeit.timeit(lambda: npnorm(), number=10000)
t12 = timeit.timeit(lambda: torchnorm(), number=10000)

print(t1)
print(t2)
print(t3)
print(t4)
print(t5)
print(t6)
print(t7)
print(t8)
print(t9)
print(t10)
print(t11)
print(t12)

tensor([-1,  1,  0,  ...,  1,  0,  1], device='cuda:0')
0.46220976599988717
0.15653415899987522
0.9918594949999715
0.2454984420000983
0.9842482300000484
0.24686337200000708
0.8075713090001955
0.15588547200013636
1.0000987489997897
0.29788914199980354
0.5560962019999351
0.24648063000040565


In [3]:
MUTAG = load_dataset("graphs-datasets/MUTAG")
PROTEINS = load_dataset("graphs-datasets/PROTEINS")
AIDS = load_dataset("graphs-datasets/AIDS")

In [112]:
def hdv(d):
    return torch.sign(torch.randint(-10000, 10000, (d,), dtype=torch.float32))


def bind(xs):
    return ftreduce(torch.mul, xs)


def bundle(xs):
    return torch.sign(ftreduce(torch.add, xs))


def cosine_similarity(A, B):
    dot_product = torch.dot(A, B)
    norm_A = torch.norm(A)
    norm_B = torch.norm(B)

    if norm_A == 0 or norm_B == 0:
        return 0

    return dot_product / (norm_A * norm_B)


class ItemMemory:
    def __init__(self, vectors=[]):
        self.vectors = vectors

    def addVector(self, label, V):
        self.vectors.append((label, V))

    def count(self):
        return len(self.vectors)

    def cleanup(self, V):
        return max(self.vectors, key=lambda x: cosine_similarity(V, x[1]))

In [4]:
def hdv(d):
    return np.random.choice([-1, 1], d)


def bind(xs):
    return ftreduce(np.multiply, xs)


def bundle(xs):
    return np.sign(ftreduce(np.add, xs))


def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_A = np.linalg.norm(A)
    norm_B = np.linalg.norm(B)

    if norm_A == 0 or norm_B == 0:
        return 0

    return dot_product / (norm_A * norm_B)


class ItemMemory:
    def __init__(self, vectors=[]):
        self.vectors = vectors

    def addVector(self, label, V):
        self.vectors.append((label, V))

    def count(self):
        return len(self.vectors)

    def cleanup(self, V):
        return max(self.vectors, key=lambda x: cosine_similarity(V, x[1]))

In [6]:
def encodeGraph(graph, vertices, dimensions):
    for node in graph.nodes:
        if node not in vertices:
            vertices[node] = hdv(dimensions)

    Edges = []

    for edge in graph.edges:
        v1 = vertices[edge[0]]
        v2 = vertices[edge[1]]
        E = bind([v1, v2])
        Edges.append(E)

    Graph = bundle(Edges)

    return Graph

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin


class GraphHD(BaseEstimator, ClassifierMixin):
    def __init__(self, dimensions=10000, step=20):
        self.dimensions = dimensions
        self.step = step
        self.vertices = dict()

    def fit(self, X, y):
        self.memory = ItemMemory([])
        self.labels = list(set(y))
        dictLabels = dict()

        for label in self.labels:
            dictLabels[label] = []

        for i in range(len(X)):
            Graph = encodeGraph(X[i], self.vertices, self.dimensions)
            dictLabels[y[i]].append(Graph)

        for key, value in dictLabels.items():
            for i in range(0, len(value), self.step):
                H = bundle(value[i : i + self.step])
                self.memory.addVector(str(key), H)

        return self

    def predict(self, X):
        predictions = []
        similarities = []
        for testGraph in X:
            queryVector = encodeGraph(testGraph, self.vertices, self.dimensions)
            cleanVector = self.memory.cleanup(queryVector)

            predictions.append(int(cleanVector[0]))
            similarities.append(cosine_similarity(queryVector, cleanVector[1]))

        # print(
        #     "%.5f" % round(np.mean(similarities), 5),
        #     "0:",
        #     predictions.count(0),
        #     "1:",
        #     predictions.count(1),
        # )
        return predictions

In [8]:
def processDataset(dataset):
    graphs = []
    labels = []

    for graph in dataset:
        G = nx.Graph()
        G.add_edges_from(zip(graph["edge_index"][0], graph["edge_index"][1]))
        graphs.append(G)
        labels.append(graph["y"][0])

    return {"graphs": graphs, "y": labels}


def transformDataset(dataset, digits, alpha):
    graphs = []

    for graph in dataset:
        gpr = nx.pagerank(graph, alpha)
        nodes = dict()
        for key, value in gpr.items():
            nodes[key] = str(round(value, digits))
        H = nx.relabel_nodes(graph, nodes)
        graphs.append(H)

    return graphs

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin


class GraphHDTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, digits=5, alpha=0.55):
        self.digits = digits
        self.alpha = alpha

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return transformDataset(X, self.digits, self.alpha)

In [10]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline
from resource import *

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from joblib import Memory
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.model_selection import LearningCurveDisplay, learning_curve

# DATASET = MUTAG["train"]
DATASET = PROTEINS["train"]
# DATASET = AIDS["full"]
DIMENSIONS = 10000
DIGITS = 4
ALPHA = 0.35
STEP = 1

dataset = processDataset(DATASET)


def reps(clf, graphs, labels, reps):
    sum = 0
    for i in range(reps):
        scores = cross_val_score(clf, graphs, labels, cv=10, n_jobs=-1)
        sum += scores.mean()
        print(i + 1, "->", "%.5f" % scores.mean())

    print("r10f10 S => %.5f" % (sum / reps))


def hpSearch(graphs, labels):
    pipe = Pipeline(
        [("transformer", GraphHDTransformer()), ("classifier", GraphHD())],
    )

    distributions = dict(
        transformer__alpha=[0.35, 0.55, 0.75],
        transformer__digits=[2, 4, 6],
        classifier__dimensions=[1000],
        classifier__step=[1, 4, 32, 124],
    )
    clf = GridSearchCV(pipe, distributions, n_jobs=-1, verbose=3)
    search = clf.fit(dataset["graphs"], dataset["y"])
    print("Search P =>", search.best_params_)
    print("Search S => %.5f" % search.best_score_)
    return search.best_estimator_


def confusionMatrix(clf, graphs, labels):
    y_pred = cross_val_predict(clf, graphs, labels, n_jobs=-1)
    conf_mat = confusion_matrix(labels, y_pred)
    accuracy = accuracy_score(y_pred, labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat, display_labels=[0, 1])
    disp.plot()
    print(conf_mat)
    plt.show()


def main():
    pipe = hpSearch(dataset["graphs"], dataset["y"])
    pipe = Pipeline(
        [
            ("transformer", GraphHDTransformer(DIGITS, ALPHA)),
            ("classifier", GraphHD(DIMENSIONS, STEP)),
        ],
    )
    # print(pipe.get_params())
    reps(pipe, dataset["graphs"], dataset["y"], 9)
    confusionMatrix(pipe, dataset["graphs"], dataset["y"])


def main2():
    graphs = GraphHDTransformer(DIGITS, ALPHA).transform(dataset["graphs"])
    clf = GraphHD(DIMENSIONS, STEP)
    reps(clf, graphs, dataset["y"], 10)
    # confusionMatrix(clf, graphs, dataset["y"])


def learningCurve():
    pipe = Pipeline(
        [
            ("transformer", GraphHDTransformer(DIGITS, ALPHA)),
            ("classifier", GraphHD(DIMENSIONS, STEP)),
        ],
    )
    train_sizes, train_scores, test_scores = learning_curve(
        pipe,
        dataset["graphs"],
        dataset["y"],
        cv=10,
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    plt.plot(train_sizes, train_scores_mean, label="Training score")
    plt.plot(train_sizes, test_scores_mean, label="Cross-validation score")
    plt.title("Learning curve")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.show()
    display = LearningCurveDisplay(
        train_sizes=train_sizes,
        train_scores=train_scores,
        test_scores=test_scores,
        score_name="Score",
    )
    display.plot()
    plt.show()


# learningCurve()

# setrlimit(RLIMIT_AS, (RLIM_INFINITY, RLIM_INFINITY))

main2()

# reps(GraphHD(DIMENSIONS, DIGITS, ALPHA, STEP), 3)
# gridSearch()
# confusionMatrix(GraphHD(DIMENSIONS, DIGITS, ALPHA, STEP))

1 -> 0.61544
2 -> 0.61184
3 -> 0.61003
4 -> 0.61725
5 -> 0.61544
6 -> 0.62264
7 -> 0.61543
8 -> 0.60916
9 -> 0.61815
10 -> 0.60649
r10f10 S => 0.61419


improvements in aids with multiple hvs per class -> /20
final score: 0.65210

improvements in proteins with multiple hvs per class -> /10
final score: 0.65496
[[550 113]
 [268 182]]

Mutag
{'alpha': 0.45, 'digits': 3, 'dimensions': 10000, 'step': 1}
0.7284495021337127

## AIDS

{'alpha': 0.55, 'digits': 6, 'dimensions': 10000, 'step': 20}
0.708
final score: 0.70400


# PROTEINS Dataset

GridSearch result

```python
dimensions=[1000, 2500, 10000],
digits=[2, 3, 4, 5, 6],
alpha=[0.25, 0.35, 0.45, 0.55, 0.75],
```

```json
{'alpha': 0.25, 'digits': 2, 'dimensions': 1000}
0.5956853714701248
```

3 repetition 10 fold validation and confusion matrix

```python
0.59568
[[663   0]
 [450   0]]
```

# AIDS Dataset

### Best result 10 reps

```python
{'alpha': 0.55, 'digits': 6, 'dimensions': 10000, 'step': 20}
S => 0.70006
[[  83  317]
 [ 293 1307]]
```

---

GridSearch result

```python
dimensions=[1000, 2500, 10000],
digits=[3, 4, 5, 6],
alpha=[0.25, 0.35, 0.45, 0.55, 0.75],
```

```json
{'alpha': 0.55, 'digits': 6, 'dimensions': 10000}
0.6055
```

3 repetition 10 fold validation and confusion matrix

```python
0.59467
[[ 159  241]
 [ 568 1032]]
```

```
# {'alpha': 0.45, 'digits': 4, 'dimensions': 10000} 10_iter_score: 0.65167 [[17 46] [29 96]]
# {'alpha': 0.75, 'digits': 3, 'dimensions': 5000}
# {'alpha': 0.45, 'digits': 4, 'dimensions': 1000} 0.65079
# {'alpha': 0.25, 'digits': 4, 'dimensions': 1000}
# {'alpha': 0.35, 'digits': 4, 'dimensions': 1000}
# 0.7073099415204679

# {'alpha': 0.35, 'digits': 3, 'dimensions': 1000}
# 0.7131578947368421
# 10 iter 0.68120

# {'alpha': 0.45, 'digits': 3, 'dimensions': 2500}
# 0.7330409356725146
# 10_iter_score: 0.69327 [[11  52] [ 9 116]]
```
