In [93]:
import os
import random

import gensim
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GAE
from torch_geometric.utils import negative_sampling
from torch.utils.data import Dataset
from graph_loader import load_graphs
from gensim.models import Word2Vec

# Hyperparameters

In [94]:
SEED = 0
random.seed(SEED)

# Embedding creation:
num_walks = 20
walk_length = 20

# Embedding model:
embedding_vector_size=64      # Size of the embedding vector
window=2            # Context window size --> Wie viele Wörter außenherum werden beachtet?
min_count=1         # Minimum frequency for a node to be included
sg=1                # Use Skip-Gram (sg=1) instead of CBOW (sg=0)
workers=4           # Number of CPU threads to use
epochs=10           # Number of training epochs

# GNN model:
learning_rate = 0.01

# Training:
num_epochs = 10


In [95]:
"""

    graph.get_edges()

    Edge:
        node: Aktuelle Node
        connected_nodes: List an Nodes, von node

        E.g.: edges.get_items() liefer alle edges:
        node: Node
        connected_nodes: [Dict(Nodes)]
        Verbindung Node 2 zu Node 0:
            Node(NodeID=2, Part=Part(PartID=58, FamilyID=31))
            [Node(NodeID=0, Part=Part(PartID=1621, FamilyID=0))]

        Verbindung Node 0 zu Nodes 1, 2, 3, 4, 5:
            Node(NodeID=0, Part=Part(PartID=1621, FamilyID=0)),
            [Node(NodeID=1, Part=Part(PartID=58, FamilyID=31)), Node(NodeID=2, Part=Part(PartID=58, FamilyID=31)), Node(NodeID=3, Part=Part(PartID=58, FamilyID=31)), Node(NodeID=4, Part=Part(PartID=58, FamilyID=31))]




"""

'\n\n    graph.get_edges()\n\n    Edge:\n        node: Aktuelle Node\n        connected_nodes: List an Nodes, von node\n\n        E.g.: edges.get_items() liefer alle edges:\n        node: Node\n        connected_nodes: [Dict(Nodes)]\n        Verbindung Node 2 zu Node 0:\n            Node(NodeID=2, Part=Part(PartID=58, FamilyID=31))\n            [Node(NodeID=0, Part=Part(PartID=1621, FamilyID=0))]\n\n        Verbindung Node 0 zu Nodes 1, 2, 3, 4, 5:\n            Node(NodeID=0, Part=Part(PartID=1621, FamilyID=0)),\n            [Node(NodeID=1, Part=Part(PartID=58, FamilyID=31)), Node(NodeID=2, Part=Part(PartID=58, FamilyID=31)), Node(NodeID=3, Part=Part(PartID=58, FamilyID=31)), Node(NodeID=4, Part=Part(PartID=58, FamilyID=31))]\n\n\n\n\n'

In [124]:
def create_edge_list(graph):
    edge_set = set()
    edges = graph.get_edges()
    for node, connected_nodes in edges.items():
        for connected_node in connected_nodes:

            # Store edges by node_ID and part_ID + node_ID and part_ID or source and target
            # Make sure each edge is only stored once (unidirectionally)
            edge = tuple(sorted((
                (node.get_id(), int(node.get_part().get_part_id())),
                (connected_node.get_id(), int(connected_node.get_part().get_part_id()))
            )))
            edge_set.add(edge)

    return list(edge_set)


In [117]:
def create_part_list(graph):
    part_list = []
    nodes = graph.get_nodes()
    for node in nodes:
        part_list.append((node.get_id(), node.get_part().get_part_id()))
    return part_list

In [118]:
class GraphDataset(Dataset):
    def __init__(self, file_path: str, train=False, validation=False, test=False, seed=42):
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Dataset file not found at {file_path}")

        self.graphs = load_graphs(file_path)

        if sum([train, validation, test]) != 1:
            raise ValueError("Exactly one of 'train', 'validation', or 'test' must be True.")

    def __getitem__(self, idx):
        graph = self.graphs[idx]
        return graph

In [119]:
def prepare_graph_data(graph_dataset):
    edge_list_dict = {}
    part_list_dict = {}

    for index, graph in enumerate(graph_dataset):
        edge_list_dict[index] =  create_edge_list(graph)
        part_list_dict[index] = create_part_list(graph)

    return edge_list_dict, part_list_dict


In [120]:
#training_set = GraphDataset("data/graphs.dat", train = True, seed=SEED)
#validation_set = GraphDataset("data/graphs.dat", validation = True, seed=SEED)
testing_set = GraphDataset("data/graphs.dat", test = True, seed=SEED)

In [125]:
edge_list, parts_list = prepare_graph_data(testing_set)
print(len(edge_list), len(parts_list))
print(edge_list[0])
print(parts_list[0])

11159 11159
[((0, 1621), (2, 58)), ((0, 1621), (3, 58)), ((0, 1621), (1, 58)), ((0, 1621), (4, 58))]
[(4, 58), (0, 1621), (2, 58), (3, 58), (1, 58)]


# 2. Embeddings

## 2.1. Generating Random Walks

In [103]:
def generate_random_walks_single_graph(edges, num_walks=10, walk_length=5):
    """
    Generate random walks for a single graph.

    Parameters:
        edges (list): Edge list for a single graph.
        num_walks (int): Number of random walks to generate per node.
        walk_length (int): Length of each random walk.

    Returns:
        list: A list of random walks, where each walk is a list of PartIDs.
    """
    # TODO in das ReadMe: Hier fraglich of random-walks funktionieren, da viel hin und her wegen non-cyclical
    # TODO Maximilian: Context


    walks = []
    graph = {}
    # Build adjacency list
    for edge in edges:
        node1, node2 = edge[0][1], edge[1][1]  # Extract PartIDs
        graph.setdefault(node1, []).append(node2)
        graph.setdefault(node2, []).append(node1)

    # Perform random walks
    for _ in range(num_walks):
        for node in graph.keys():
            walk = [node]  # Start the walk with the current node
            while len(walk) < walk_length:
                cur = walk[-1]  # Get the last node in the walk
                if cur in graph:
                    walk.append(random.choice(graph[cur]))  # Add a random neighbor
                else:
                    break
            walks.append(walk)  # Add the walk to the list of walks

    return walks

In [104]:
# Generate Random Walks for all graphs:
"""
    execution Dauer: ca. 12 Sekunden
"""

random_walks = {}
for index, graph in enumerate(testing_set):
    random_walks[index] = generate_random_walks_single_graph(edge_list[index], num_walks=num_walks, walk_length=walk_length)
print(random_walks[0])

[['1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58'], ['58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621'], ['1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58'], ['58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621'], ['1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58'], ['58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621'], ['1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58'], ['58'

## 2.2. Training Embeddings


In [105]:
"""
    flattening Dauer: instant
    training Dauer:
"""
flat_random_walks = [walk for walks in random_walks.values() for walk in walks]         # flat_random_walks Länge: 131.0240

print(len(flat_random_walks))

# Use only the first 100 random walks
limited_walks = flat_random_walks[:10000]

print("Walks flattened")
print(flat_random_walks[:10])
word2vec_model = Word2Vec(
    sentences=limited_walks,    # The random walks
    vector_size=embedding_vector_size,    # Size of the embedding vector
    window=window,              # Context window size
    min_count=min_count,        # Minimum frequency for a node to be included
    sg=sg,                      # Use Skip-Gram (sg=1) instead of CBOW (sg=0)
    workers=workers,            # Number of CPU threads to use
    epochs=epochs               # Number of training epochs
)
word2vec_model.save("node_embeddings.model")

1310240
Walks flattened
[['1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58'], ['58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621'], ['1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58'], ['58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621'], ['1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58'], ['58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621'], ['1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '58', '1621', '5

In [65]:
# Get the embedding for a specific node (e.g., '10')
embedding = word2vec_model.wv['1000']
print("Embedding for node 58:", embedding)

KeyError: "Key '1000' not present"

## Training Data Generation

In [126]:
# 1. Prepare Node Features:

# Convert the list of embeddings to a NumPy array first
node_features_array = np.array([word2vec_model.wv[str(node_id)] for node_id in word2vec_model.wv.index_to_key])

# Convert the NumPy array to a PyTorch tensor
node_features = torch.tensor(node_features_array, dtype=torch.float)

print("------------------------------------------------------------------")
print("Node Features (node_features):")
print(node_features)  # Prints the tensor values
print("Shape:", node_features.shape)  # Prints the shape of the tensor

"""
    print(node_features.shape)

    Output: node_features.shape:
    e.g. torch.Size([221, 64]):
        - 221 Vocabulary in embedding dictionary: 221 PartIDs
        - 64 dimensionality: 64 floating point embedding
"""

# 2. Prepare Edge Index: Flatten your edge_list (from prepare_graph_data)
edge_index_list = []
for edges in edge_list.values():
    for edge in edges:
        edge_index_list.append([edge[0][1], edge[1][1]])  # Extract NodeIDs

# Convert edge_index_list to tensor
edge_index = torch.tensor(edge_index_list, dtype=torch.long).t().contiguous()

print("------------------------------------------------------------------")
print("\nEdge Index (edge_index):")
print(edge_index)  # Prints the tensor values
print("Shape:", edge_index.shape)  # Prints the shape of the tensor
# Optional: Print a few edges for inspection
print("\nSample edges:")
for i in range(min(10, edge_index.size(1))):  # Print the first 10 edges or fewer
    print(f"Edge {i}: Source Node = {edge_index[0, i].item()}, Target Node = {edge_index[1, i].item()}")

"""
    print(edge_index.shape)

    Output: edge_index.shape:
    e.g. torch.Size([2, 73981])
        - 2 Rows: Source and Target Nodes
        - 73981 columns: 73981 edges between source and target nodes
        edge_list = torch.tensor([
            [0, 1, 2],  # Source nodes
            [1, 2, 3]   # Target nodes
        ])
"""

# 3. Edge Labels:
# Positive edges
pos_edge_index = edge_index
pos_edge_label = torch.ones(pos_edge_index.size(1))  # Labels = 1 for positive edges

# Negative edges
neg_edge_index = negative_sampling(
    edge_index=edge_index,
    num_nodes=node_features.size(0),
    num_neg_samples=pos_edge_index.size(1)  # Same number as positive edges
)
neg_edge_label = torch.zeros(neg_edge_index.size(1))  # Labels = 0 for negative edges

# Combine positive and negative edges
edge_label_index = torch.cat([pos_edge_index, neg_edge_index], dim=1)
edge_label = torch.cat([pos_edge_label, neg_edge_label], dim=0)

print("------------------------------------------------------------------")
print("\nPositive Edges (pos_edge_index):")
print(pos_edge_index)  # Prints the positive edge tensor
print("Shape:", pos_edge_index.shape)

print("\nPositive Edge Labels (pos_edge_label):")
print(pos_edge_label)  # Prints labels for positive edges
print("Shape:", pos_edge_label.shape)


print("------------------------------------------------------------------")
print("\nNegative Edges (neg_edge_index):")
print(neg_edge_index)  # Prints the negative edge tensor
print("Shape:", neg_edge_index.shape)

print("\nNegative Edge Labels (neg_edge_label):")
print(neg_edge_label)  # Prints labels for negative edges
print("Shape:", neg_edge_label.shape)

------------------------------------------------------------------
Node Features (node_features):
tensor([[-0.1659, -0.2757, -0.1877,  ..., -0.3490, -0.3277, -0.5999],
        [ 0.2603, -0.2521, -0.7242,  ..., -0.3345,  0.1911,  0.8414],
        [ 0.1541, -0.3777, -0.9810,  ..., -0.7086, -0.4216,  0.2896],
        ...,
        [ 0.5048, -0.1711,  0.9390,  ..., -0.9987,  0.0703,  0.0722],
        [ 0.4811, -0.7730,  0.0211,  ..., -0.8020, -0.7575, -0.9186],
        [ 0.2690, -0.6496,  0.3211,  ...,  0.1330, -1.0856, -0.1125]])
Shape: torch.Size([221, 64])
------------------------------------------------------------------

Edge Index (edge_index):
tensor([[1621, 1621, 1621,  ...,  587,  587,  587],
        [  58,   58,   58,  ...,   61,   11,   25]])
Shape: torch.Size([2, 73981])

Sample edges:
Edge 0: Source Node = 1621, Target Node = 58
Edge 1: Source Node = 1621, Target Node = 58
Edge 2: Source Node = 1621, Target Node = 58
Edge 3: Source Node = 1621, Target Node = 58
Edge 4: Source N

# Model: Graph Neural Network

In [107]:
class LinkPredictionGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(LinkPredictionGNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

    def forward(self, x, edge_index):
        # GNN layers
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

    def decode(self, z, edge_label_index):
        # Compute edge scores
        source_embeddings = z[edge_label_index[0]]  # Embeddings of source nodes
        target_embeddings = z[edge_label_index[1]]  # Embeddings of target nodes
        edge_scores = (source_embeddings * target_embeddings).sum(dim=-1)  # Dot product
        return edge_scores



### Model Initialization

In [108]:
# Model parameters:
input_dim = node_features.size(1)   # Embedding dimension from Word2Vec
hidden_dim = embedding_vector_size

# Initialize GNN:
model = LinkPredictionGNN(input_dim, hidden_dim)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.BCEWithLogitsLoss()    # Binary cross-entropy with logits


### Training Loop

In [109]:
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass: Generate node embeddings
    z = model(node_features, edge_index)

    # Decode edges: Predict scores for edges
    edge_scores = model.decode(z, edge_label_index)

    # Compute loss
    loss = criterion(edge_scores, edge_label)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")


Epoch 1/10, Loss: 0.0000
Epoch 2/10, Loss: 0.0000
Epoch 3/10, Loss: 0.0000
Epoch 4/10, Loss: 0.0000
Epoch 5/10, Loss: 0.0000
Epoch 6/10, Loss: 0.0000
Epoch 7/10, Loss: 0.0000
Epoch 8/10, Loss: 0.0000
Epoch 9/10, Loss: 0.0000
Epoch 10/10, Loss: 0.0000


### Model Evaluation


In [110]:
model.eval()
with torch.no_grad():
    z = model(node_features, edge_index)    # Node embeddings
    edge_scores = model.decode(z, edge_label_index)  # Edge predictions

    # Convert logits to probabilities
    edge_probs = torch.sigmoid(edge_scores)

    # Classify edges (threshold at 0.5)
    predicted_labels = (edge_probs > 0.5).long()

### Model Accuracy


In [111]:
# Calculate accuracy
correct = (predicted_labels == edge_label).sum().item()
accuracy = correct / edge_label.size(0)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 1.0000
