In [6]:
import numpy as np
import networkx as nx
from functools import reduce as ftreduce
import math
import random
from datasets import load_dataset
import torch
from sklearn.base import BaseEstimator, TransformerMixin

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Pick the accelerator when torch reports one, otherwise stay on the CPU, and
# make that choice the default placement for every tensor created afterwards.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
torch.set_default_device(device)

In [8]:
# Load the three graph-classification datasets by their "graphs-datasets/..."
# identifiers. Later cells index the results by split name (e.g. ["train"]).
MUTAG = load_dataset("graphs-datasets/MUTAG")
PROTEINS = load_dataset("graphs-datasets/PROTEINS")
AIDS = load_dataset("graphs-datasets/AIDS")

In [9]:
def hdv(d):
    """Return a random bipolar hypervector with ``d`` components.

    Each component is drawn independently and uniformly from {-1.0, +1.0}.
    The two values are sampled directly instead of taking ``torch.sign`` of a
    wide integer range, because ``sign`` maps a draw of exactly 0 to 0.0 and
    would occasionally leave a zero component in the vector.

    Args:
        d: Number of components.

    Returns:
        A 1-D ``torch.float32`` tensor of shape ``(d,)`` created on torch's
        current default device.
    """
    # randint(0, 2) yields 0.0 or 1.0; the affine map turns that into -1.0 / +1.0.
    return torch.randint(0, 2, (d,), dtype=torch.float32) * 2 - 1


def bind(xs):
    """Combine the hypervectors in ``xs`` into one by summing and taking the sign.

    The tensors are added pairwise from left to right and the element-wise
    sign of the total is returned, so components whose sum is 0 stay 0.

    NOTE(review): in the usual HDC vocabulary "sum then sign" is called
    bundling and the element-wise product is called binding; this file uses
    the names the other way round. Confirm the swap is intentional.

    Args:
        xs: Non-empty sequence of tensors with matching shapes.

    Returns:
        Tensor holding ``sign(xs[0] + xs[1] + ...)``.
    """
    summed = ftreduce(torch.add, xs)
    return torch.sign(summed)
    # Alternative (element-wise product): torch.prod(torch.stack(xs), axis=0)


def bundle(xs):
    """Combine the hypervectors in ``xs`` into one by element-wise multiplication.

    The tensors are multiplied pairwise from left to right. Because no initial
    value is given to the reduction, an empty ``xs`` raises ``TypeError``.

    NOTE(review): in the usual HDC vocabulary the element-wise product is
    called binding and "sum then sign" is called bundling; this file uses the
    names the other way round. Confirm the swap is intentional.

    Args:
        xs: Non-empty sequence of tensors with matching shapes.

    Returns:
        Tensor holding ``xs[0] * xs[1] * ...``.
    """
    product = ftreduce(torch.mul, xs)
    return product
    # Alternative (sum then sign): torch.sign(torch.sum(torch.stack(xs), axis=0))


def cosine_similarity(A, B):
    """Return the cosine of the angle between the 1-D tensors ``A`` and ``B``.

    When either vector has zero norm the cosine is undefined; 0 is returned
    in that case. The zero is returned as a 0-dim tensor with the same dtype
    and device as the regular result, so callers always receive a tensor
    rather than a mix of ``int`` and tensor.

    Args:
        A: 1-D tensor.
        B: 1-D tensor with the same number of elements as ``A``.

    Returns:
        0-dim tensor holding ``dot(A, B) / (||A|| * ||B||)``, or 0 when either
        norm is 0.
    """
    dot_product = torch.dot(A, B)
    norm_A = torch.norm(A)
    norm_B = torch.norm(B)

    if norm_A == 0 or norm_B == 0:
        # Same dtype/device as the normal return value.
        return torch.zeros_like(dot_product)

    return dot_product / (norm_A * norm_B)


class ItemMemory:
    """Associative store of ``(label, hypervector)`` pairs with nearest-match lookup."""

    def __init__(self, vectors=None):
        """Create a memory.

        Args:
            vectors: Optional list of ``(label, hypervector)`` tuples to start
                from. The list is stored by reference, not copied. When
                omitted, every instance gets its own new empty list.
        """
        # A literal ``[]`` default is evaluated once at definition time and
        # would be shared by every instance created without an argument.
        self.vectors = [] if vectors is None else vectors

    def addVector(self, label, V):
        """Append the pair ``(label, V)`` to the memory."""
        self.vectors.append((label, V))

    def count(self):
        """Return the number of stored pairs."""
        return len(self.vectors)

    def cleanup(self, V):
        """Return the stored ``(label, hypervector)`` pair most similar to ``V``.

        Similarity is measured with ``cosine_similarity``. Raises
        ``ValueError`` (from ``max``) when the memory is empty.
        """
        return max(self.vectors, key=lambda x: cosine_similarity(V, x[1]))

In [10]:
def encodeGraph(graph, vertices, dimensions):
    """Encode ``graph`` as a single hypervector built from its edges.

    Every node without an entry in ``vertices`` is first given a new random
    hypervector from ``hdv(dimensions)``. ``vertices`` is updated in place, so
    calls that share the dict map the same node key to the same hypervector.
    Each edge is encoded by ``bind`` over its two endpoint hypervectors, and
    the edge encodings are combined with ``bundle``.

    Args:
        graph: Object exposing ``nodes`` and ``edges`` iterables, where each
            edge is indexable as ``edge[0]`` / ``edge[1]``.
        vertices: Dict mapping node key -> hypervector; mutated in place.
        dimensions: Length passed to ``hdv`` for newly seen nodes.

    Returns:
        The result of ``bundle`` over all edge encodings. A graph without
        edges hands ``bundle`` an empty list.
    """
    for node in graph.nodes:
        if node in vertices:
            continue
        vertices[node] = hdv(dimensions)

    edge_vectors = [
        bind([vertices[edge[0]], vertices[edge[1]]]) for edge in graph.edges
    ]
    return bundle(edge_vectors)

In [11]:
from sklearn.base import BaseEstimator, ClassifierMixin


class GraphHD(BaseEstimator, ClassifierMixin):
    """Hyperdimensional graph classifier with several prototypes per class.

    Training graphs are encoded with ``encodeGraph``. For every class, each
    run of ``step`` consecutive encodings is combined with ``bundle`` into one
    prototype stored in an ``ItemMemory``. Prediction returns the label of the
    prototype most similar to the encoded query graph.

    Parameters:
        dimensions: Length of the hypervectors generated for graph nodes.
        step: Number of encoded training graphs combined into one prototype.
    """

    def __init__(self, dimensions=10000, step=20):
        self.dimensions = dimensions
        self.step = step
        # Node key -> hypervector codebook, shared by fit() and predict().
        self.vertices = dict()

    def fit(self, X, y):
        """Build the class prototypes from graphs ``X`` and labels ``y``.

        Args:
            X: Iterable of graphs accepted by ``encodeGraph``.
            y: Iterable of hashable labels, aligned with ``X``.

        Returns:
            ``self``.
        """
        # Start from an empty codebook so a refit never reuses hypervectors
        # created under earlier parameters (e.g. another ``dimensions``).
        self.vertices = dict()
        self.memory = ItemMemory([])
        self.labels = list(set(y))

        encoded_by_label = {label: [] for label in self.labels}
        for graph, label in zip(X, y):
            encoded_by_label[label].append(
                encodeGraph(graph, self.vertices, self.dimensions)
            )

        for label, encoded in encoded_by_label.items():
            for start in range(0, len(encoded), self.step):
                prototype = bundle(encoded[start : start + self.step])
                # Store the label itself, not a string rendering, so that
                # predict() can return it for any hashable label type.
                self.memory.addVector(label, prototype)

        return self

    def predict(self, X):
        """Return, for each graph in ``X``, the label of its closest prototype.

        Nodes not seen during ``fit`` receive new random hypervectors, which
        are added to ``self.vertices``.

        Args:
            X: Iterable of graphs accepted by ``encodeGraph``.

        Returns:
            List of labels, one per graph, taken from the labels seen in ``fit``.
        """
        predictions = []
        for graph in X:
            query = encodeGraph(graph, self.vertices, self.dimensions)
            label, _prototype = self.memory.cleanup(query)
            predictions.append(label)

        return predictions

In [12]:
def processDataset(dataset):
    """Convert dataset records into networkx graphs and a parallel label list.

    Each record must provide ``"edge_index"`` (two parallel sequences holding
    the endpoints of every edge) and ``"y"`` (a sequence whose first element
    is the label). Graphs are built from the edge list only.

    Args:
        dataset: Iterable of such records.

    Returns:
        Tuple ``(graphs, labels)`` of two lists in dataset order.
    """
    graphs, labels = [], []

    for record in dataset:
        sources = record["edge_index"][0]
        targets = record["edge_index"][1]

        net = nx.Graph()
        net.add_edges_from(zip(sources, targets))

        graphs.append(net)
        labels.append(record["y"][0])

    return (graphs, labels)

In [13]:
def transformDataset(dataset, digits, alpha):
    """Relabel every node of every graph with its rounded PageRank score.

    For each graph, ``nx.pagerank(graph, alpha)`` is computed and each node is
    renamed to the string form of its score rounded to ``digits`` decimals,
    using ``nx.relabel_nodes``.

    Args:
        dataset: Iterable of networkx graphs.
        digits: Number of decimals kept when rounding a score.
        alpha: Second positional argument passed to ``nx.pagerank``.

    Returns:
        List of relabelled graphs, in input order.
    """

    def relabel(graph):
        ranks = nx.pagerank(graph, alpha)
        mapping = {node: str(round(score, digits)) for node, score in ranks.items()}
        return nx.relabel_nodes(graph, mapping)

    return [relabel(graph) for graph in dataset]

In [14]:
# class PagerankTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, alpha=0.55):
#         self.alpha = alpha

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X, y=None):
#         graphs = []

#         for graph in X:
#             gpr = nx.pagerank(graph, self.alpha)
#             graphs.append((graph, gpr))

#         return graphs

In [15]:
# class DigitsTransformer(BaseEstimator, TransformerMixin):
#     def __init__(self, digits=5):
#         self.digits = digits

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X, y=None):
#         graphs = []

#         for graph, gpr in X:
#             nodes = dict()
#             for key, value in gpr.items():
#                 nodes[key] = str(round(value, self.digits))
#             H = nx.relabel_nodes(graph, nodes)
#             graphs.append(H)

#         return graphs

In [20]:
from PagerankTransformer import PagerankTransformer
from DigitsTransformer import DigitsTransformer
from sklearn.pipeline import Pipeline

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    HalvingGridSearchCV,
)

# Work on the "train" split of PROTEINS and convert it to graphs + labels.
# `graphs` and `labels` are module-level names that memory() below reads.
DATASET = PROTEINS["train"]

(graphs, labels) = processDataset(DATASET)


def memory():
    """Run a successive-halving grid search over the full pipeline and print the best parameters.

    The pipeline chains ``PagerankTransformer``, ``DigitsTransformer`` and
    ``GraphHD``. The search reads the module-level ``graphs`` and ``labels``.
    Despite the name, nothing is cached: the earlier ``cachedir = mkdtemp()``
    line was removed because its value was never used, it left a temporary
    directory behind on every call, and ``mkdtemp`` is not imported before
    this cell runs.
    """
    pipe = Pipeline(
        [
            ("pagerank", PagerankTransformer()),
            ("digits", DigitsTransformer()),
            ("classifier", GraphHD()),
        ],
    )
    # Keys follow the pipeline's "<step name>__<parameter>" convention.
    distributions = dict(
        pagerank__alpha=[0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85],
        digits__digits=[2, 3, 4, 5, 6, 7],
        classifier__dimensions=[10000],
        classifier__step=[1, 2, 4, 8, 16, 32, 64, 124, 256],
    )
    clf = HalvingGridSearchCV(pipe, distributions, n_jobs=-1, verbose=3)
    search = clf.fit(graphs, labels)
    print(search.best_params_)


# Run the halving grid search defined above; its verbose log is the cell output.
memory()

n_iterations: 4
n_required_iterations: 6
n_possible_iterations: 4
min_resources_: 20
max_resources_: 1113
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 378
n_resources: 20
Fitting 5 folds for each of 378 candidates, totalling 1890 fits
[CV 2/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=2, pagerank__alpha=0.25;, score=(train=1.000, test=1.000) total time=   0.0s
[CV 1/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=2, pagerank__alpha=0.25;, score=(train=0.933, test=1.000) total time=   0.0s
[CV 3/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=2, pagerank__alpha=0.25;, score=(train=0.933, test=1.000) total time=   0.0s
[CV 4/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=2, pagerank__alpha=0.25;, score=(train=0.938, test=0.667) total time=   0.0s
[CV 2/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=2, pagerank__alpha=0.35;, score=(train=



[CV 4/5] END classifier__dimensions=10000, classifier__step=2, digits__digits=4, pagerank__alpha=0.45;, score=(train=0.701, test=0.657) total time=   0.8s
[CV 1/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.45;, score=(train=0.657, test=0.833) total time=   0.5s
[CV 3/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.45;, score=(train=0.671, test=0.667) total time=   0.4s
[CV 2/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.45;, score=(train=0.699, test=0.694) total time=   0.3s
[CV 5/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.45;, score=(train=0.667, test=0.571) total time=   0.3s
[CV 4/5] END classifier__dimensions=10000, classifier__step=4, digits__digits=2, pagerank__alpha=0.45;, score=(train=0.667, test=0.629) total time=   0.3s
[CV 1/5] END classifier__dimensions=10000, classifier__step=2, digits_



[CV 3/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=5, pagerank__alpha=0.35;, score=(train=0.761, test=0.639) total time=   5.1s
[CV 2/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=5, pagerank__alpha=0.35;, score=(train=0.796, test=0.630) total time=   5.0s
[CV 1/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=5, pagerank__alpha=0.35;, score=(train=0.775, test=0.704) total time=   5.1s
[CV 5/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=5, pagerank__alpha=0.55;, score=(train=0.780, test=0.589) total time=   5.7s
[CV 4/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=5, pagerank__alpha=0.35;, score=(train=0.806, test=0.664) total time=   5.4s
[CV 2/5] END classifier__dimensions=10000, classifier__step=1, digits__digits=5, pagerank__alpha=0.65;, score=(train=0.794, test=0.657) total time=   4.7s
[CV 5/5] END classifier__dimensions=10000, classifier__step=1, digits_

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline
from resource import *

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from joblib import Memory
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.model_selection import LearningCurveDisplay, learning_curve

# Dataset under evaluation; swap in one of the commented alternatives to change it.
DATASET = MUTAG["train"]
# DATASET = PROTEINS["train"]
# DATASET = AIDS["full"]
DIMENSIONS = 10000  # passed to GraphHD as `dimensions`
DIGITS = 3  # passed to DigitsTransformer
ALPHA = 0.35  # passed to PagerankTransformer
STEP = 2  # passed to GraphHD as `step`

# Rebinds the module-level `graphs` / `labels` that the functions below read.
(graphs, labels) = processDataset(DATASET)


def reps(clf, graphs, labels, reps):
    """Repeat 10-fold cross-validation ``reps`` times and print the scores.

    After every repetition the 1-based repetition number and the mean fold
    score are printed; at the end the average of those means is printed.

    Args:
        clf: Estimator handed to ``cross_val_score``.
        graphs: Samples to cross-validate on.
        labels: Targets aligned with ``graphs``.
        reps: Number of repetitions.
    """
    run_means = []
    for run in range(1, reps + 1):
        mean_score = cross_val_score(clf, graphs, labels, cv=10, n_jobs=-1).mean()
        run_means.append(mean_score)
        print(run, "->", "%.5f" % mean_score)

    print("r10f10 S => %.5f" % (sum(run_means) / reps))


def hpSearch(graphs, labels):
    """Grid-search the pipeline's hyperparameters and return the best fitted pipeline.

    The pipeline chains ``PagerankTransformer``, ``DigitsTransformer`` and
    ``GraphHD``. The best parameter set and its score are printed.

    Args:
        graphs: Samples to search on.
        labels: Targets aligned with ``graphs``.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    steps = [
        ("pagerank", PagerankTransformer()),
        ("digits", DigitsTransformer()),
        ("classifier", GraphHD()),
    ]
    # Keys follow the pipeline's "<step name>__<parameter>" convention.
    param_grid = {
        "pagerank__alpha": [0.35, 0.55, 0.75],
        "digits__digits": [2, 4, 6],
        "classifier__dimensions": [10000],
        "classifier__step": [1, 4, 32, 124],
    }

    grid = GridSearchCV(Pipeline(steps), param_grid, n_jobs=-1, verbose=3)
    search = grid.fit(graphs, labels)

    print("Search P =>", search.best_params_)
    print("Search S => %.5f" % search.best_score_)
    return search.best_estimator_


def confusionMatrix(clf, graphs, labels):
    """Print and plot the confusion matrix of cross-validated predictions.

    Predictions come from ``cross_val_predict``. The class labels for the
    matrix rows/columns and for the plot's tick labels are derived from the
    data (sorted union of true and predicted labels) instead of being fixed
    to ``[0, 1]``, so the plot stays consistent with the matrix for any label
    set. For 0/1 labels the output is the same as before.

    Args:
        clf: Estimator handed to ``cross_val_predict``.
        graphs: Samples to predict on.
        labels: True targets aligned with ``graphs``; must be hashable and
            mutually sortable.
    """
    y_pred = cross_val_predict(clf, graphs, labels, n_jobs=-1)
    # Sorted union matches the ordering confusion_matrix uses by default.
    class_labels = sorted(set(labels) | set(y_pred))
    conf_mat = confusion_matrix(labels, y_pred, labels=class_labels)
    disp = ConfusionMatrixDisplay(
        confusion_matrix=conf_mat, display_labels=class_labels
    )
    disp.plot()
    print(conf_mat)
    plt.show()


def main():
    """Run the hyperparameter search, then evaluate a fixed-parameter pipeline.

    The search is executed for its printed report only; its returned estimator
    is not used. Evaluation uses a pipeline built from the module-level
    ``ALPHA``, ``DIGITS``, ``DIMENSIONS`` and ``STEP`` on the module-level
    ``graphs`` / ``labels``: 9 repetitions of cross-validation followed by a
    confusion matrix.
    """
    hpSearch(graphs, labels)

    steps = [
        ("pagerank", PagerankTransformer(ALPHA)),
        ("digits", DigitsTransformer(DIGITS)),
        ("classifier", GraphHD(DIMENSIONS, STEP)),
    ]
    fixed_pipe = Pipeline(steps)

    reps(fixed_pipe, graphs, labels, 9)
    confusionMatrix(fixed_pipe, graphs, labels)


def main2():
    """Evaluate ``GraphHD`` alone on graphs transformed once up front.

    The module-level ``graphs`` are passed through ``PagerankTransformer`` and
    ``DigitsTransformer`` a single time, outside cross-validation, and the
    classifier is then scored with 10 repetitions via ``reps``.
    """
    ranked = PagerankTransformer(ALPHA).transform(graphs)
    relabelled = DigitsTransformer(DIGITS).transform(ranked)
    reps(GraphHD(DIMENSIONS, STEP), relabelled, labels, 10)
    # confusionMatrix(clf, graphs, dataset["y"])


def learningCurve():
    """Plot learning curves for the fixed-parameter pipeline.

    Scores are computed with ``learning_curve`` (10-fold CV, ten training
    sizes from 10% to 100%) on the module-level ``graphs`` / ``labels``. Two
    figures are shown: a hand-drawn plot of the mean train and test scores,
    and scikit-learn's ``LearningCurveDisplay`` of the same results.
    """
    steps = [
        ("pagerank", PagerankTransformer(ALPHA)),
        ("digits", DigitsTransformer(DIGITS)),
        ("classifier", GraphHD(DIMENSIONS, STEP)),
    ]
    sizes, train_scores, test_scores = learning_curve(
        Pipeline(steps),
        graphs,
        labels,
        cv=10,
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
    )

    # Figure 1: mean score across folds for each training size.
    curves = (
        (train_scores.mean(axis=1), "Training score"),
        (test_scores.mean(axis=1), "Cross-validation score"),
    )
    for mean_scores, caption in curves:
        plt.plot(sizes, mean_scores, label=caption)
    plt.title("Learning curve")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.show()

    # Figure 2: the same data rendered by scikit-learn's display helper.
    LearningCurveDisplay(
        train_sizes=sizes,
        train_scores=train_scores,
        test_scores=test_scores,
        score_name="Score",
    ).plot()
    plt.show()


# learningCurve()

# setrlimit(RLIMIT_AS, (RLIM_INFINITY, RLIM_INFINITY))

# Entry point of this cell: hyperparameter search, repeated CV, confusion matrix.
main()

# reps(GraphHD(DIMENSIONS, DIGITS, ALPHA, STEP), 3)
# gridSearch()
# confusionMatrix(GraphHD(DIMENSIONS, DIGITS, ALPHA, STEP))

Improvements on AIDS with multiple hypervectors per class -> /20
final score: 0.65210

Improvements on PROTEINS with multiple hypervectors per class -> /10
final score: 0.65496
[[550 113]
 [268 182]]

Mutag
{'alpha': 0.45, 'digits': 3, 'dimensions': 10000, 'step': 1}
0.7284495021337127

## AIDS

{'alpha': 0.55, 'digits': 6, 'dimensions': 10000, 'step': 20}
0.708
final score: 0.70400


# PROTEINS Dataset

GridSearch result

```python
dimensions=[1000, 2500, 10000],
digits=[2, 3, 4, 5, 6],
alpha=[0.25, 0.35, 0.45, 0.55, 0.75],
```

```json
{'alpha': 0.25, 'digits': 2, 'dimensions': 1000}
0.5956853714701248
```

3 repetitions of 10-fold cross-validation and confusion matrix

```python
0.59568
[[663   0]
 [450   0]]
```

# AIDS Dataset

### Best result 10 reps

```python
{'alpha': 0.55, 'digits': 6, 'dimensions': 10000, 'step': 20}
S => 0.70006
[[  83  317]
 [ 293 1307]]
```

---

GridSearch result

```python
dimensions=[1000, 2500, 10000],
digits=[3, 4, 5, 6],
alpha=[0.25, 0.35, 0.45, 0.55, 0.75],
```

```json
{'alpha': 0.55, 'digits': 6, 'dimensions': 10000}
0.6055
```

3 repetitions of 10-fold cross-validation and confusion matrix

```python
0.59467
[[ 159  241]
 [ 568 1032]]
```

```
# {'alpha': 0.45, 'digits': 4, 'dimensions': 10000} 10_iter_score: 0.65167 [[17 46] [29 96]]
# {'alpha': 0.75, 'digits': 3, 'dimensions': 5000}
# {'alpha': 0.45, 'digits': 4, 'dimensions': 1000} 0.65079
# {'alpha': 0.25, 'digits': 4, 'dimensions': 1000}
# {'alpha': 0.35, 'digits': 4, 'dimensions': 1000}
# 0.7073099415204679

# {'alpha': 0.35, 'digits': 3, 'dimensions': 1000}
# 0.7131578947368421
# 10 iter 0.68120

# {'alpha': 0.45, 'digits': 3, 'dimensions': 2500}
# 0.7330409356725146
# 10_iter_score: 0.69327 [[11  52] [ 9 116]]
```
