In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import time


# Note: this example requires the torch_geometric library: https://pytorch-geometric.readthedocs.io
from torch_geometric.datasets import TUDataset

# Note: this example requires the torchmetrics library: https://torchmetrics.readthedocs.io
import torchmetrics

import torchhd
from torchhd import embeddings
from torchhd.models import Centroid

import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using {} device".format(device))

DIMENSIONS = 10000  # hypervectors dimension

# for other available datasets see: https://pytorch-geometric.readthedocs.io/en/latest/notes/data_cheatsheet.html?highlight=tudatasets
dataset = "PROTEINS_full"

graphs = TUDataset("../data", dataset)
train_size = int(0.7 * len(graphs))
test_size = len(graphs) - train_size


def sparse_stochastic_graph(G):
    """
    Returns a sparse adjacency matrix of the graph G.
    The values indicate the probability of leaving a vertex.
    This means that each column sums up to one.
    """
    _, columns = G.edge_index
    # Calculate the probability for each column
    values_per_column = 1.0 / torch.bincount(columns, minlength=G.num_nodes)
    values_per_node = values_per_column[columns]
    size = (G.num_nodes, G.num_nodes)
    return torch.sparse_coo_tensor(G.edge_index, values_per_node, size)


def pagerank(G, alpha=0.45, max_iter=100, tol=1e-06):
    N = G.num_nodes
    M = sparse_stochastic_graph(G) * alpha
    v = torch.zeros(N, device=G.edge_index.device) + 1 / N
    p = torch.zeros(N, device=G.edge_index.device) + 1 / N
    for _ in range(max_iter):
        v_prev = v
        v = M @ v + p * (1 - alpha)

        err = (v - v_prev).abs().sum()
        if tol != None and err < N * tol:
            return v
    return v


def degree_centrality(G):
    """
    Compute the degree centrality for nodes.
    """
    _, columns = G.edge_index
    degree = torch.bincount(columns, minlength=G.num_nodes)
    return degree / G.num_nodes


def to_undirected(edge_index):
    """
    Returns the undirected edge_index
    [[0, 1], [1, 0]] will result in [[0], [1]]
    """
    edge_index = edge_index.sort(dim=0)[0]
    edge_index = torch.unique(edge_index, dim=1)
    return edge_index


def min_max_graph_size(graph_dataset):
    if len(graph_dataset) == 0:
        return None, None

    max_num_nodes = float("-inf")
    min_num_nodes = float("inf")

    for G in graph_dataset:
        num_nodes = G.num_nodes
        max_num_nodes = max(max_num_nodes, num_nodes)
        min_num_nodes = min(min_num_nodes, num_nodes)

    return min_num_nodes, max_num_nodes


class Default(nn.Module):
    def __init__(self, out_features, size):
        super(Default, self).__init__()
        self.out_features = out_features
        self.node_ids = embeddings.Random(size, out_features)

    def forward(self, x):
        pr = pagerank(x)
        _, pr_argsort = pr.sort()

        node_id_hvs = torch.zeros((x.num_nodes, self.out_features), device=device)
        node_id_hvs[pr_argsort] = self.node_ids.weight[: x.num_nodes]

        row, col = to_undirected(x.edge_index)

        hvs = torchhd.bind(node_id_hvs[row], node_id_hvs[col])
        return torchhd.multiset(hvs)


class Encoder(nn.Module):
    def __init__(self, out_features, size):
        super(Encoder, self).__init__()
        self.out_features = out_features
        # self.node_ids = embeddings.Level(size, out_features, sparse=True)
        self.node_ids = embeddings.Random(size, out_features)

    def forward(self, x):
        # pr = pagerank(x)
        pr = degree_centrality(x)
        sort, pr_argsort = pr.sort()
        # pr_argsort = [i for i in range(x.num_nodes)]
        # print(sort)

        node_id_hvs = torch.zeros((x.num_nodes, self.out_features), device=device)
        # node_id_hvs[pr_argsort] = self.node_ids(sort)
        node_id_hvs[pr_argsort] = self.node_ids.weight[: x.num_nodes]

        row, col = to_undirected(x.edge_index)

        hvs = torchhd.bind(node_id_hvs[row], node_id_hvs[col])
        return torchhd.multiset(hvs)

        return torchhd.multiset(node_id_hvs)


default_acc = []
test_acc = []
comparison_acc = []
iters = 30

min_graph_size, max_graph_size = min_max_graph_size(graphs)
default_encoder = Default(DIMENSIONS, max_graph_size)
default_encoder = default_encoder.to(device)

test_encoder = Encoder(DIMENSIONS, max_graph_size)
test_encoder = test_encoder.to(device)

time_acc = []


for i in tqdm(range(iters), desc=f"Testing {iters} times"):
    start = time.time()

    train_ld, test_ld = torch.utils.data.random_split(graphs, [train_size, test_size])

    default_model = Centroid(DIMENSIONS, graphs.num_classes)
    default_model = default_model.to(device)

    test_model = Centroid(DIMENSIONS, graphs.num_classes)
    test_model = test_model.to(device)

    with torch.no_grad():
        for samples in tqdm(train_ld, desc="Training", disable=True):
            samples.edge_index = samples.edge_index.to(device)
            samples.y = samples.y.to(device)

            default_samples_hv = default_encoder(samples).unsqueeze(0)
            default_model.add(default_samples_hv, samples.y)

            test_samples_hv = test_encoder(samples).unsqueeze(0)
            test_model.add(test_samples_hv, samples.y)

    default_accuracy = torchmetrics.F1Score("multiclass", num_classes=graphs.num_classes)
    test_accuracy = torchmetrics.F1Score("multiclass", num_classes=graphs.num_classes)
    comparison_accuracy = torchmetrics.F1Score("multiclass", num_classes=graphs.num_classes)

    with torch.no_grad():
        default_model.normalize()
        test_model.normalize()

        for index, samples in enumerate(tqdm(test_ld, desc="Testing_", disable=True)):
            samples.edge_index = samples.edge_index.to(device)

            default_samples_hv = default_encoder(samples).unsqueeze(0)
            default_outputs = default_model(default_samples_hv, dot=True)

            test_samples_hv = test_encoder(samples).unsqueeze(0)
            test_outputs = test_model(test_samples_hv, dot=True)

            default_accuracy.update(default_outputs.cpu(), samples.y)
            test_accuracy.update(test_outputs.cpu(), samples.y)
            comparison_accuracy.update(
                torch.argmax(test_outputs, dim=-1).cpu(), torch.argmax(default_outputs, dim=-1).cpu()
            )

    default_acc.append(default_accuracy.compute().item() * 100)
    test_acc.append(test_accuracy.compute().item() * 100)
    comparison_acc.append(comparison_accuracy.compute().item() * 100)
    end = time.time()
    time_acc.append(end - start)

print(f"Testing {dataset}")
print(f"GraphHD: accuracy of {(np.mean(default_acc)):.3f}% with std {(np.std(default_acc)):.3f}%")
print(f"Test: accuracy of {(np.mean(test_acc)):.3f}% with std {(np.std(test_acc)):.3f}%")
print(f"Comparison between predictions: {(np.mean(comparison_acc)):.3f}% with std {(np.std(comparison_acc)):.3f}%")
print(f"Time: {(np.mean(time_acc)):.3f}s with std {(np.std(time_acc)):.3f}s")

###################################################################################################################

iters = 30
time_acc = []

for i in tqdm(range(iters), desc=f"Testing {iters} times"):
    start = time.time()

    train_ld, test_ld = torch.utils.data.random_split(graphs, [train_size, test_size])

    test_model = Centroid(DIMENSIONS, graphs.num_classes)
    test_model = test_model.to(device)

    with torch.no_grad():
        for samples in tqdm(train_ld, desc="Training", disable=True):
            samples.edge_index = samples.edge_index.to(device)
            samples.y = samples.y.to(device)

            test_samples_hv = test_encoder(samples).unsqueeze(0)
            test_model.add(test_samples_hv, samples.y)

    test_accuracy = torchmetrics.F1Score("multiclass", num_classes=graphs.num_classes)

    with torch.no_grad():
        test_model.normalize()

        for index, samples in enumerate(tqdm(test_ld, desc="Testing_", disable=True)):
            samples.edge_index = samples.edge_index.to(device)

            test_samples_hv = test_encoder(samples).unsqueeze(0)
            test_outputs = test_model(test_samples_hv, dot=True)

            test_accuracy.update(test_outputs.cpu(), samples.y)

    test_acc.append(test_accuracy.compute().item() * 100)
    end = time.time()
    time_acc.append(end - start)

print(f"Test: accuracy of {(np.mean(test_acc)):.3f}% with std {(np.std(test_acc)):.3f}%")
print(f"Time: {(np.mean(time_acc)):.3f}s with std {(np.std(time_acc)):.3f}s")

Using cuda device


Downloading https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS_full.zip
Extracting ../data/PROTEINS_full/PROTEINS_full.zip
Processing...
Done!
Testing 30 times: 100%|██████████| 30/30 [02:11<00:00,  4.37s/it]


Testing PROTEINS_full
GraphHD: accuracy of 68.214% with std 2.106%
Test: accuracy of 68.902% with std 2.041%
Comparison between predictions: 94.002% with std 1.274%
Time: 4.366s with std 0.097s


Testing 30 times: 100%|██████████| 30/30 [00:23<00:00,  1.26it/s]

Test: accuracy of 68.837% with std 2.231%
Time: 0.791s with std 0.033s



