#### Dataset Creation

In [1]:
import random
import torch
import numpy as np

# One seed shared by every random number generator this notebook relies on.
seed = 42

# Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) in that order.
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
):
    seed_fn(seed)

In [2]:
import os

# Root folder that every dataset path below is resolved against.
datasets_dir = 'datasets'


def _dataset_path(relative_path):
    """Join ``relative_path`` onto ``datasets_dir`` and return the result."""
    return os.path.join(datasets_dir, relative_path)


# JSONL dumps, one path per dataset source.
ecore_json_path = _dataset_path('ecore_555/ecore_555.jsonl')
mar_json_path = _dataset_path('mar-ecore-github/ecore-github.jsonl')
modelsets_uml_json_path = _dataset_path('modelset/uml.jsonl')
modelsets_ecore_json_path = _dataset_path('modelset/ecore.jsonl')

In [3]:
from data_loading.data import ModelDataset

# Keyword arguments forwarded unchanged to every ModelDataset constructed below.
config_params = {
    'timeout': 120,
    'min_enr': 1.2,
    'min_edges': 10,
}

# Only the ecore_555 dataset is loaded; the other two are kept here, disabled.
ecore = ModelDataset('ecore_555', reload=False, **config_params)
# modelset = ModelDataset('modelset', reload=False, remove_duplicates=True, **config_params)
# mar = ModelDataset('mar-ecore-github', reload=False, **config_params)

# Loaded datasets keyed by a short name.
datasets = dict(
    ecore=ecore,
    # modelset=modelset,
    # mar=mar,
)

Loading ecore_555 from pickle
Loaded ecore_555 with 281 graphs
Loaded ecore_555 with 281 graphs
Graphs: 281


In [4]:
from data_loading.graph_dataset import GraphDataset

# Keyword arguments shared by every GraphDataset built in this notebook.
graph_data_params = {
    'distance': 2,
    'reload': False,
    'add_negative_train_samples': True,
    'neg_sampling_ratio': 1,
    'use_edge_types': False,
}

# Wrap the loaded ecore dataset; the modelset / mar variants stay disabled.
ecore_graph_dataset = GraphDataset(ecore, **graph_data_params)
# modelset_graph_dataset = GraphDataset(modelset, **graph_data_params)
# mar_graph_dataset = GraphDataset(mar, **graph_data_params)

Processing ecore_555:   0%|          | 0/281 [00:00<?, ?it/s]

In [5]:
from models.gnn_layers import (
    GATv2, 
    FeedForward, 
    GNNLinkPredictor
)
from trainers.link_predictor import GNNLinkPredictorTrainer


# Node and edge feature widths are read off the first graph of the dataset.
input_dim = ecore_graph_dataset[0].x.size(1)
edge_dim = ecore_graph_dataset[0].edge_attr.size(1)

# Encoder hyperparameters.
hidden_dim, output_dim = 64, 128
num_layers, num_heads = 3, 4
residual = True

gat = GATv2(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    edge_dim=edge_dim,
    residual=residual,
    dropout=0.2,
)

# Both heads take an input twice as wide as the encoder output, which is
# `output_dim` per attention head when more than one head is used.
# NOTE(review): the factor of 2 presumably reflects two concatenated endpoint
# embeddings -- confirm against GNNLinkPredictor.
if num_heads > 1:
    node_embedding_dim = output_dim * num_heads
else:
    node_embedding_dim = output_dim
head_input_dim = node_embedding_dim * 2

# Link-prediction head: a single output per candidate edge.
lp_head = FeedForward(
    input_dim=head_input_dim,
    hidden_dim=hidden_dim,
    output_dim=1,
    num_layers=3,
)

# Edge-classification head: three outputs passed through a softmax.
ec_head = FeedForward(
    input_dim=head_input_dim,
    hidden_dim=hidden_dim,
    output_dim=3,
    num_layers=3,
    final_activation='softmax',
)

# Bundle the encoder with both heads and hand the model to the trainer.
gnn_lp = GNNLinkPredictor(gat, lp_head, ec_head)
lp_trainer = GNNLinkPredictorTrainer(
    gnn_lp,
    ecore_graph_dataset,
    use_link_predictor=True,
    use_edge_classifier=True,
)

In [None]:
# Value handed to the trainer's `train_epochs` call.
num_epochs = 100
lp_trainer.train_epochs(num_epochs)

In [None]:
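# NOTE: everything in this cell is disabled scratch code (a SEAL-style
# link-prediction experiment) kept for reference; none of it is executed.
# import torch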
# import torch.nn.functional as F
# import torch.optim as optim
# from torch_geometric.data import Data
# from torch_geometric.loader import DataLoader
# from torch_geometric.nn import GCNConv
# from torch_geometric.nn.aggr import SortAggregation
# import networkx as nx
# from torch_geometric.transforms import RandomLinkSplit


# def remap_node_indices(subgraph, center_node):
#     mapping = {node: i for i, node in enumerate(subgraph.nodes())}
#     subgraph = nx.relabel_nodes(subgraph, mapping)
#     sub_edge_index = torch.tensor(list(subgraph.edges)).t().contiguous()
#     sub_x = torch.ones(subgraph.number_of_nodes(), 1)  # Example node features
#     center_node_idx = mapping[center_node]
#     return sub_x, sub_edge_index, center_node_idx

# # Prepare the train and test datasets for SEAL model
# class SEALGraphData:
#     def __init__(
#             self, 
#             graph,
#             edge_index,
#             pos_edge_index,
#             neg_edge_index,
#             hops=1
#         ):
#         self.edge_index = edge_index
#         self.pos_edge_index = pos_edge_index
#         self.neg_edge_index = neg_edge_index
#         self.graph = graph
#         self.hops = hops



#     def __len__(self):
#         return self.pos_edge_index.size(1) + self.neg_edge_index.size(1)

#     def __getitem__(self, idx):
#         if idx < self.pos_edge_index.size(1):
#             u, v = self.pos_edge_index[:, idx]
#             y = 1
#         else:
#             u, v = self.neg_edge_index[:, idx - self.pos_edge_index.size(1)]
#             y = 0

#         subgraph = nx.ego_graph(self.graph, u.item(), radius=self.hops)
#         subgraph = nx.subgraph(subgraph, list(subgraph.nodes) + [v.item()])
#         sub_x, sub_edge_index, center_node_idx = remap_node_indices(subgraph, u.item())

#         return Data(
#             x=sub_x, 
#             edge_index=sub_edge_index, 
#             y=y, 
#             center_node_idx=center_node_idx
#         )


# def get_link_prediction_train_test_graph_data(
#         graph, 
#         num_val=0, 
#         num_test=0.2, 
#         add_negative_train_samples=True,
#         neg_sampling_ratio=1,
#     ):
#     transform = RandomLinkSplit(
#         num_val=num_val, 
#         num_test=num_test, 
#         neg_sampling_ratio=neg_sampling_ratio,
#         add_negative_train_samples=add_negative_train_samples,
#         split_labels=True
#     )

#     # Apply the transform
#     train_data, _, test_data = transform(
#         Data(
#             edge_index=graph.edge_index, 
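#             # NOTE(review): reads the global `ecore_graph`, not the `graph` argument -- confirm before re-enabling
#             num_nodes=ecore_graph.number_of_nodes()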
#         )
#     )

#     return train_data, test_data

#     # train_graph_data = SEALGraphData(
#     #     graph, 
#     #     train_data.edge_index, 
#     #     train_data.pos_edge_label_index, 
#     #     train_data.neg_edge_label_index,
#     #     hops=hops
    
#     # )

#     # test_graph_data = SEALGraphData(
#     #     graph, 
#     #     test_data.edge_index, 
#     #     test_data.pos_edge_label_index, 
#     #     test_data.neg_edge_label_index,
#     #     hops=hops
#     # )

#     # return train_graph_data, test_graph_data
    

# train_data, test_data = get_link_prediction_train_test_graph_data(ecore_graph)
# # Create train and test dataloaders
# batch_size = 32

# # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# # test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Epoch: 001, Loss: 0.6956, Test Accuracy: 0.5000


In [None]:
from models.gnn_layers import GNNConv