In [93]:
import numpy as np
import networkx as nx
import functools as ft
import math
import random
from datasets import load_dataset

In [2]:
MUTAG = load_dataset("graphs-datasets/MUTAG")

In [110]:
def generateRandomVector(d):
    return np.random.randint(2, size=d)


def similarity(A, B):
    if A.size != B.size:
        raise Exception("A and B have different dimensions.")

    count = 0
    for i in range(0, (A.size - 1)):
        if A[i] == B[i]:
            count += 1

    return count / A.size


def mult(A, B):
    return np.bitwise_xor(A, B)


def applyThreshold(V, n=2):
    if n % 2 == 1:
        threshold = lambda x: 1 if x > n / 2 else 0
    else:
        threshold = (
            lambda x: 1
            if x > n / 2
            else (1 if x == n / 2 and bool(random.getrandbits(1)) else 0)
        )

    vectorized_threshold = np.vectorize(threshold)
    return vectorized_threshold(V)


def arithmeticSumVectors(vectors):
    sum = ft.reduce(lambda x, y: x + y, vectors)

    return applyThreshold(sum, len(vectors))


def comparison(A, B):
    dividend = sum(A * B)
    divisor = math.sqrt(sum(A**2) * sum(B**2))

    return dividend / divisor


def cosine_similarity2(A, B):
    dot_product = np.dot(A, B)
    divisor = np.sqrt(np.sum(A**2)) * np.sqrt(np.sum(B**2))

    return dot_product / divisor


def cosine_similarity(A, B):
    dot_product = np.dot(A, B)
    norm_A = np.linalg.norm(A)
    norm_B = np.linalg.norm(B)

    return dot_product / (norm_A * norm_B)


class ItemMemory:
    def __init__(self, vectors=[]):
        self.vectors = vectors

    def addVector(self, label, V):
        self.vectors.append((label, V))

    def count(self):
        return len(self.vectors)

    def cleanup(self, V):
        return max(self.vectors, key=lambda x: similarity(V, x[1]))

In [96]:
def encodeGraph(graph, vertices, dimensions, digits, alpha):
    gpr = nx.pagerank(graph, alpha)

    for pr in nx.pagerank(graph, alpha).values():
        key = str(round(pr, digits))
        vertices[key] = generateRandomVector(dimensions)

    Edges = []

    for edge in graph.edges:
        keyV1 = str(round(gpr[edge[0]], digits))
        keyV2 = str(round(gpr[edge[1]], digits))
        v1 = vertices[keyV1]
        v2 = vertices[keyV2]
        E = mult(v1, v2)
        Edges.append(E)

    Graph = arithmeticSumVectors(Edges)

    return Graph

In [97]:
from sklearn.base import BaseEstimator, ClassifierMixin


class GraphHD(BaseEstimator, ClassifierMixin):
    def __init__(self, dimensions=10000, digits=7, alpha=0.85):
        self.dimensions = dimensions
        self.digits = digits
        self.alpha = alpha
        self.vertices = dict()

    def fit(self, X, y):
        self.memory = ItemMemory([])
        self.labels = list(set(y))
        dictLabels = dict()

        for label in self.labels:
            dictLabels[label] = []

        for i in range(len(y)):
            Graph = encodeGraph(
                X[i], self.vertices, self.dimensions, self.digits, self.alpha
            )
            dictLabels[y[i]].append(Graph)

        for key, value in dictLabels.items():
            Label = arithmeticSumVectors(value)
            self.memory.addVector(str(key), Label)

        return self

    def predict(self, X):
        predictions = []
        similarities = []
        for testGraph in X:
            queryVector = encodeGraph(
                testGraph, self.vertices, self.dimensions, self.digits, self.alpha
            )
            cleanVector = self.memory.cleanup(queryVector)

            predictions.append(int(cleanVector[0]))
            similarities.append(similarity(queryVector, cleanVector[1]))

        print("%.5f" % round(np.mean(similarities), 5), predictions)
        return predictions

In [98]:
def processDataset(dataset):
    graphs = []
    labels = []

    for graph in dataset:
        G = nx.Graph()
        G.add_edges_from(zip(graph["edge_index"][0], graph["edge_index"][1]))
        graphs.append(G)
        labels.append(graph["y"][0])

    return (graphs, labels)

In [129]:
import numpy as np
from numpy.linalg import norm

# Define your two vectors as NumPy arrays
vector_a = np.random.randint(2, size=10000)
vector_b = np.random.randint(2, size=10000)
print(vector_a)
print(vector_b)

# Calculate the dot product
dot_product = np.dot(vector_a, vector_b)

# Calculate the Euclidean norms
norm_a = norm(vector_a)
norm_b = norm(vector_b)

# Calculate the cosine similarity
cosine_similarity = dot_product / (norm_a * norm_b)

print("Cosine Similarity:", cosine_similarity)

[1 0 0 ... 0 1 0]
[0 1 0 ... 0 1 1]
Cosine Similarity: 0.49541126214899356


In [121]:
A = generateRandomVector(10000)
B = generateRandomVector(10000)

print(A)
print(similarity(A, B))
print(cosine_similarity2(A, B))
print(cosine_similarity(A, B))

[1 0 0 ... 0 1 0]
0.4954
0.4944031134101148
0.4944031134101148


In [99]:
from sklearn.model_selection import cross_val_score


def main():
    DIMENSIONS = 10000
    DIGITS = 5
    ALPHA = 0.65

    (graphs, labels) = processDataset(MUTAG["train"])

    clf = GraphHD(DIMENSIONS, DIGITS, ALPHA)
    scores = cross_val_score(clf, graphs, labels, cv=10, error_score="raise")
    print("%.5f" % scores.mean())


main()

0.50611 [1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0]
0.50746 [1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0]
0.50777 [0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1]
0.50911 [1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]
0.50629 [1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1]
0.50368 [0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1]
0.50923 [1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1]
0.50889 [0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1]
0.51270 [1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1]
0.50856 [1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0]
0.49532
