In [3]:
import os
from collections import Counter
import torch
import time
import random
from matplotlib import pyplot as plt
import numpy as np
from torch import nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from abc import ABC, abstractmethod
from typing import Set, List, Tuple
from graph_loader import load_graphs
from graph import Graph
from part import Part
from node import Node
from typing import Dict, List, Set, Tuple, Union

In [4]:
# MPS for Metal acceleration for Mac
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.has_mps else "cpu") 
SEED = 0
random.seed(SEED)
# setgrad = lambda g, *ms: [setattr(p,'requires_grad', g) for m in ms for p in m.parameters() ]  

  device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.has_mps else "cpu")


In [5]:
class GraphDataset(Dataset):
    def __init__(self, file_path: str, train=False, validation=False, test=False, seed=42):
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Dataset file not found at {file_path}")

        self.graphs = load_graphs(file_path)

        if sum([train, validation, test]) != 1:
            raise ValueError("Exactly one of 'train', 'validation', or 'test' must be True.")


        # Create global mapping for unique parts
        self.family_part_dict = {}


        unique_parts = set()
        for graph in self.graphs:
            parts = graph.get_parts()
            for part in parts:
                unique_parts.add(int(part.get_part_id()))
                self.family_part_dict[int(part.get_part_id())] = int(part.get_family_id())

        # unique parts and mapping across all graphs (not just within a certain split)
        unique_parts = sorted(list(unique_parts))
        self.total_global_part_to_idx = {part: idx for idx, part in enumerate(unique_parts)} # mapping part_id to index
        self.idx_to_part_id = {idx: part for part, idx in self.total_global_part_to_idx.items()}  # Reverse mapping
        self.total_num_unique_parts = len(unique_parts)

        # Split: 70% training, 15% validation, 15% test
        train_graphs, test_graphs = train_test_split(self.graphs, test_size=0.3, random_state=seed)
        validation_graphs, test_graphs = train_test_split(test_graphs, test_size=0.5, random_state=seed)

        if train:
            self.graphs = train_graphs
        elif validation:
            self.graphs = validation_graphs
        elif test:
            self.graphs = test_graphs

        
    def __len__(self):
        return len(self.graphs)

    
    def __getitem__(self, idx):
        # return parts und graphen
        graph = self.graphs[idx]
        
        # Initialize a count vector for parts
        part_frequency_vector = np.zeros(self.total_num_unique_parts, dtype=np.int32)

        # Count occurrences of each part
        parts = graph.get_parts()
        for part in parts:
            part_id = int(part.get_part_id())
            mapped_id = self.total_global_part_to_idx[part_id]
            part_frequency_vector[mapped_id] += 1  # Increment the count
        
        # Ensure part_features is 2D
        part_frequency_vector = torch.tensor(part_frequency_vector, dtype=torch.float).unsqueeze(-1)

        return self.graphs[idx].get_parts(), self.graphs[idx]


        
    def graph_to_global_adjacency_matrix_with_weights(self, graph):
        """
        Converts a graph to a weighted global adjacency matrix.
        Each edge weight represents the number of times a connection occurs.
        """
        # Initialize the adjacency matrix
        adj_matrix = np.zeros((self.total_num_unique_parts, self.total_num_unique_parts), dtype=np.float32)

        # Get all edges from the graph
        edges = graph.get_edges()

    
        for node, connected_nodes in edges.items():
            part_id = int(node.get_part().get_part_id()) # cast str to int
            mapped_part_idx = self.total_global_part_to_idx[part_id]

            for connected_node in connected_nodes:
                connected_part_id = int(connected_node.get_part().get_part_id())
                mapped_connected_part_idx = self.total_global_part_to_idx[connected_part_id]

                # Update the adjacency matrix symmetrically
                adj_matrix[mapped_part_idx, mapped_connected_part_idx] += 1
                adj_matrix[mapped_connected_part_idx, mapped_part_idx] += 1

        return adj_matrix



In [6]:
training_set = GraphDataset("data/graphs.dat", train = True, seed=SEED)
validation_set = GraphDataset("data/graphs.dat", validation = True, seed=SEED)
testing_set = GraphDataset("data/graphs.dat", test = True, seed=SEED)

In [6]:
import numpy as np

def compute_normalized_and_accumulated_adjacency(dataset):
    """
    Compute a global adjacency matrix where each entry represents the accumulated probability of an edge
    over all graphs in the dataset, accounting for repeated connections.
    """
    num_parts = dataset.total_num_unique_parts
    global_adj_matrix = np.zeros((num_parts, num_parts), dtype=np.float32)

    for graph in dataset.graphs:
        # Step 1: Compute weighted adjacency matrix for the current graph
        graph_adj_matrix = dataset.graph_to_global_adjacency_matrix_with_weights(graph)

        # Step 2: Normalize the local adjacency matrix to probabilities
        total_edges = graph_adj_matrix.sum() / 2  # Divide by 2 for undirected graphs

        if total_edges > 0:
            graph_adj_matrix /= total_edges  # Normalize edge weights to probabilities
        
        # Step 3: Accumulate normalized probabilities into the global adjacency matrix
        global_adj_matrix += graph_adj_matrix

    return global_adj_matrix

# Instantiate the training dataset
training_set = GraphDataset("data/graphs.dat", train=True, seed=42)

# Compute the global adjacency matrix
global_adjacency_matrix = compute_normalized_and_accumulated_adjacency(training_set)

# We want the probabilities 
global_adjacency_matrix = global_adjacency_matrix / len(training_set.graphs)

# Display or save the matrix
print("Global adjacency matrix:\n", global_adjacency_matrix)

Global adjacency matrix:
 [[0.         0.00033688 0.00016714 ... 0.         0.         0.        ]
 [0.00033688 0.         0.         ... 0.         0.         0.        ]
 [0.00016714 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [7]:
from evaluation import MyPredictionModel
import random
from graph import Graph 

class NeighbourGraphPredictionModel(MyPredictionModel):
    def __init__(self, dataset):
        """
        Initialize the model with the dataset to access the global adjacency matrix and part-to-index mapping.
        :param dataset: The dataset object containing global adjacency matrix and part mappings.
        """
        self.global_adjacency_matrix = compute_normalized_and_accumulated_adjacency(dataset)
        self.total_global_part_to_idx = dataset.total_global_part_to_idx  # PartID -> Index mapping


    def predict_graph(self, parts: Set[Part]) -> Graph:
        """
        Predicts a random graph from the given set of parts.
        :param parts: Set of Part objects.
        :return: A randomly generated Graph object.
        """

        # create empty graph
        graph = Graph()

        parts = list(parts)
        parts = sorted(parts, key=lambda part: part.get_part_id())
        
        part_1 = parts.pop()
        pos_in_parts = 0
        max_part = None

        for i, part in enumerate(parts):
            max_probability = -1
            if part == part_1:
                continue
            # look in matrix for edge probability
            part_1_idx = self.total_global_part_to_idx[int(part_1.get_part_id())]
            part_idx = self.total_global_part_to_idx[int(part.get_part_id())]
            edge_probability = self.global_adjacency_matrix[part_1_idx, part_idx]
            if edge_probability > max_probability:
                max_probability = edge_probability
                max_part = part
                pos_in_parts = i
            
        # add edge to graph
        graph.add_undirected_edge(part_1, max_part)
        # delete part with index pos_in_parts from parts
        parts.pop(pos_in_parts)
    
        for part in list(parts):  # Use a copy of the list to avoid modifying it while iterating
            graph_parts = graph.get_parts()
            max_probability = -1
            max_graph_part = None
            
            for graph_part in graph_parts:
                if graph_part == part:
                    continue

                part_idx = self.total_global_part_to_idx[int(part.get_part_id())]
                graph_part_idx = self.total_global_part_to_idx[int(graph_part.get_part_id())]
                edge_probability = self.global_adjacency_matrix[part_idx, graph_part_idx]

                if edge_probability > max_probability:
                    max_probability = edge_probability
                    max_graph_part = graph_part

            graph.add_undirected_edge(part, max_graph_part)
            
        # Remove parts as edges are added (maintain duplicates in the graph as needed)
        processed_part = parts.pop(0)
        if processed_part not in parts:
            parts.append(processed_part)


        print("menge von nodes in graph", len(graph.get_parts()))
        return graph

In [7]:
testing_list = []
for parts, graph in testing_set: 
    tuple = (parts, graph)
    testing_list.append(tuple)

len(testing_list)

1674

In [9]:
from evaluation import evaluate

neighbour_graph_builder = NeighbourGraphPredictionModel(training_set)
accuracy = evaluate(neighbour_graph_builder, testing_list)
print("Accuracy of NeighbourGraphPredictionModel: ", accuracy)


menge von nodes in graph 9
menge von nodes in graph 6
menge von nodes in graph 7
menge von nodes in graph 6
menge von nodes in graph 5
menge von nodes in graph 7
menge von nodes in graph 7
menge von nodes in graph 5
menge von nodes in graph 10
menge von nodes in graph 6
menge von nodes in graph 14
menge von nodes in graph 5
menge von nodes in graph 7
menge von nodes in graph 9
menge von nodes in graph 14
menge von nodes in graph 5
menge von nodes in graph 6
menge von nodes in graph 5
menge von nodes in graph 7
menge von nodes in graph 6
menge von nodes in graph 5
menge von nodes in graph 5
menge von nodes in graph 7
menge von nodes in graph 6
menge von nodes in graph 7
menge von nodes in graph 8
menge von nodes in graph 5
menge von nodes in graph 5
menge von nodes in graph 8
menge von nodes in graph 5
menge von nodes in graph 6
menge von nodes in graph 5
menge von nodes in graph 6
menge von nodes in graph 6
menge von nodes in graph 10
menge von nodes in graph 13
menge von nodes in grap