#### Dataset Creation

In [1]:
import random
import torch
import numpy as np

# Single seed shared by every RNG the notebook touches, so a fresh
# "Restart & Run All" reproduces the same random draws.
seed = 42

# Seed Python's `random`, NumPy's legacy global RNG, and PyTorch (CPU, then CUDA).
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)

In [2]:
import os

# Root folder that holds every raw dataset, relative to the notebook's working directory.
datasets_dir = 'datasets'

# Locations of the raw JSONL dumps, each resolved against `datasets_dir`.
(
    ecore_json_path,
    mar_json_path,
    modelsets_uml_json_path,
    modelsets_ecore_json_path,
) = (
    os.path.join(datasets_dir, relative_path)
    for relative_path in (
        'ecore_555/ecore_555.jsonl',
        'mar-ecore-github/ecore-github.jsonl',
        'modelset/uml.jsonl',
        'modelset/ecore.jsonl',
    )
)

In [3]:
from data_loading.data import ModelDataset

# Keyword arguments forwarded unchanged to every ModelDataset below.
# NOTE(review): `min_enr` / `min_edges` look like graph-filtering thresholds and
# `timeout` like a per-model processing limit -- confirm against ModelDataset.
config_params = {
    'timeout': 120,
    'min_enr': 1.2,
    'min_edges': 10,
}

# reload=False: per the log printed below, each dataset is loaded from a pickle.
ecore = ModelDataset('ecore_555', reload=False, **config_params)
# Only the modelset corpus is loaded with remove_duplicates=True.
modelset = ModelDataset('modelset', reload=False, remove_duplicates=True, **config_params)
mar = ModelDataset('mar-ecore-github', reload=False, **config_params)

# Short-name lookup table for the three loaded corpora.
datasets = dict(ecore=ecore, modelset=modelset, mar=mar)

Loading ecore_555 from pickle
Loaded ecore_555 with 281 graphs
Loaded ecore_555 with 281 graphs
Graphs: 281
Loading modelset from pickle
Loaded modelset with 830 graphs
Loaded modelset with 830 graphs
Graphs: 830
Loading mar-ecore-github from pickle
Loaded mar-ecore-github with 5388 graphs
Loaded mar-ecore-github with 5388 graphs
Graphs: 5388


In [4]:
from data_loading.graph_dataset import GraphDataset

# Options shared by every GraphDataset built in this notebook.
# NOTE(review): `distance` is passed through as-is; its meaning is defined by
# GraphDataset -- confirm there. The negative-sampling options match the
# train_neg_* fields visible on the resulting Data objects.
graph_data_params = {
    'distance': 2,
    'reload': False,
    'add_negative_train_samples': True,
    'neg_sampling_ratio': 1,
}

# Only the ecore_555 corpus is converted for now; the other two stay disabled.
ecore_graph_dataset = GraphDataset(ecore, **graph_data_params)
# modelset_graph_dataset = GraphDataset(modelset, **graph_data_params)
# mar_graph_dataset = GraphDataset(mar, **graph_data_params)

Processing ecore_555:   0%|          | 0/281 [00:00<?, ?it/s]

In [5]:
# Show the sample whose node-feature matrix has the largest second dimension
# (x.shape[1]); `max` returns the first such sample when several tie.
# NOTE(review): shape[1] is the feature width (768 in the output below), not the
# node count -- if the intent was "largest graph", the key should probably be
# d.x.shape[0]. Confirm.
max(ecore_graph_dataset, key=lambda d: d.x.shape[1])

Data(x=[45, 768], edge_index=[2, 45], edge_attr=[45, 768], y=1, overall_edge_index=[2, 56], train_pos_edge_label_index=[2, 45], train_pos_edge_label=[45], train_neg_edge_label_index=[2, 45], train_neg_edge_label=[45], test_pos_edge_label_index=[2, 11], test_pos_edge_label=[11], test_neg_edge_label_index=[2, 11], test_neg_edge_label=[11], num_nodes=45)

In [6]:
# Node-feature width of the first sample; the same expression is passed to
# LinkPredictor as its input dimension further down.
ecore_graph_dataset[0].x.shape[1]

768

In [7]:
from torch_geometric.loader import DataLoader

# Number of graphs collated into one mini-batch.
batch_size = 32

# Batches are served in dataset order (shuffle=False).
# NOTE(review): this loader is named "train" but does not shuffle -- confirm
# that is intended before using it for actual training.
ecore_loader_train = DataLoader(
    dataset=ecore_graph_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv


class MultiTaskGNN(nn.Module):
    """Two-layer GCN encoder with two edge-level heads.

    Node features go through two ``GCNConv`` layers (ReLU in between). Each
    scored edge is represented by concatenating the embeddings of its two
    endpoints, and that vector feeds:

    * a link-prediction head producing one probability per edge, and
    * an edge-classification head producing ``num_edge_types`` logits per edge.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Output size of both GCN layers.
        num_edge_types: Number of classes for the edge-classification head.
    """

    def __init__(self, in_channels, hidden_channels, num_edge_types):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

        # An edge embedding is [source || target], hence twice the hidden width.
        edge_feature_dim = hidden_channels * 2
        self.link_pred_head = nn.Linear(edge_feature_dim, 1)  # Link prediction head
        self.edge_class_head = nn.Linear(edge_feature_dim, num_edge_types)  # Edge classification head

    def forward(self, x, edge_index, edge_label_index=None):
        """Encode the nodes and score edges.

        Args:
            x: Node features ``[num_nodes, in_channels]``.
            edge_index: Graph connectivity ``[2, num_edges]`` used for message
                passing.
            edge_label_index: Optional ``[2, num_scored_edges]`` index of the
                edges to score (e.g. positive + negative candidates). Defaults
                to ``edge_index``, which reproduces the original behaviour.

        Returns:
            Tuple ``(link_pred, edge_class)`` where ``link_pred`` has shape
            ``[num_scored_edges]`` (sigmoid probabilities) and ``edge_class``
            has shape ``[num_scored_edges, num_edge_types]`` (raw logits).
        """
        # GNN layers
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)

        # Score the message-passing edges unless explicit candidates are given.
        if edge_label_index is None:
            edge_label_index = edge_index

        # Edge embedding = concatenation of both endpoint embeddings.
        row, col = edge_label_index
        edge_features = torch.cat([x[row], x[col]], dim=1)  # [num_scored_edges, hidden_channels*2]

        # Link prediction. Squeeze only the trailing singleton dimension: a bare
        # .squeeze() would also drop the edge dimension when exactly one edge is
        # scored, returning a 0-d tensor instead of shape [1].
        link_pred = torch.sigmoid(self.link_pred_head(edge_features)).squeeze(-1)

        # Edge classification
        edge_class = self.edge_class_head(edge_features)  # [num_scored_edges, num_edge_types]

        return link_pred, edge_class


In [9]:
from models.gnn_layers import LinkPredictor
from settings import device

# Read both feature widths from one sample so the model always matches the data.
# Previously the node width came from the dataset while the edge-attribute width
# was a hard-coded 768, which would silently drift if the embeddings changed.
sample_graph = ecore_graph_dataset[0]

# Positional arguments: per the module repr printed below, 64 is the per-head
# output width of each conv layer and 3 is the number of conv layers.
lp_model = LinkPredictor(
    'GATv2Conv',
    sample_graph.x.shape[1],
    64,
    3,
    num_heads=4,
    dropout=0.1,
    residual=False,
    use_edge_attrs=True,
    edge_attrs_dim=sample_graph.edge_attr.shape[1],
    add_classification_head=True,
    num_edge_types=3,
    ff_hidden_dim=64,
)

# Kept as the last expression so the notebook shows the module summary.
lp_model.to(device)

LinkPredictor(
  (conv): GNNConv(
    (conv_layers): ModuleList(
      (0): GATv2Conv(768, 64, heads=4)
      (1-2): 2 x GATv2Conv(256, 64, heads=4)
    )
    (activation): ReLU()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (link_pred_head): FeedForward(
    (ff): Sequential(
      (0): Linear(in_features=1280, out_features=64, bias=False)
      (1): ReLU()
      (2): Dropout(p=0.1, inplace=False)
      (3): Linear(in_features=64, out_features=1, bias=False)
    )
  )
  (edge_class_head): FeedForward(
    (ff): Sequential(
      (0): Linear(in_features=1280, out_features=64, bias=False)
      (1): ReLU()
      (2): Dropout(p=0.1, inplace=False)
      (3): Linear(in_features=64, out_features=3, bias=False)
    )
  )
)

In [10]:
# Smoke test: run exactly one mini-batch through the model.
# next(iter(...)) raises StopIteration straight away if the loader is empty; the
# previous `for ... break` form skipped silently in that case and left `out`
# undefined (or stale from an earlier run) for the following cell.
batch = next(iter(ecore_loader_train)).to(device)
out = lp_model(batch)

In [11]:
# Display the raw forward-pass result. Per the output below it is a 2-tuple of
# tensors: a 1-D tensor and a 2-D tensor with 3 columns (matching num_edge_types=3).
out

(tensor([0.4335, 0.4738, 0.4692,  ..., 0.4525, 0.4680, 0.4391], device='cuda:0',
        grad_fn=<SqueezeBackward0>),
 tensor([[-0.0176,  0.0209,  0.1657],
         [-0.0385, -0.0550,  0.1525],
         [-0.0484, -0.0621,  0.0976],
         ...,
         [-0.0576, -0.1183,  0.0383],
         [-0.0693, -0.0172,  0.0558],
         [-0.0091,  0.0272,  0.0776]], device='cuda:0', grad_fn=<MmBackward0>))

In [None]:
# import torch
# import torch.nn.functional as F
# import torch.optim as optim
# from torch_geometric.data import Data
# from torch_geometric.loader import DataLoader
# from torch_geometric.nn import GCNConv
# from torch_geometric.nn.aggr import SortAggregation
# import networkx as nx
# from torch_geometric.transforms import RandomLinkSplit


# def remap_node_indices(subgraph, center_node):
#     mapping = {node: i for i, node in enumerate(subgraph.nodes())}
#     subgraph = nx.relabel_nodes(subgraph, mapping)
#     sub_edge_index = torch.tensor(list(subgraph.edges)).t().contiguous()
#     sub_x = torch.ones(subgraph.number_of_nodes(), 1)  # Example node features
#     center_node_idx = mapping[center_node]
#     return sub_x, sub_edge_index, center_node_idx

# # Prepare the train and test datasets for SEAL model
# class SEALGraphData:
#     def __init__(
#             self, 
#             graph,
#             edge_index,
#             pos_edge_index,
#             neg_edge_index,
#             hops=1
#         ):
#         self.edge_index = edge_index
#         self.pos_edge_index = pos_edge_index
#         self.neg_edge_index = neg_edge_index
#         self.graph = graph
#         self.hops = hops



#     def __len__(self):
#         return self.pos_edge_index.size(1) + self.neg_edge_index.size(1)

#     def __getitem__(self, idx):
#         if idx < self.pos_edge_index.size(1):
#             u, v = self.pos_edge_index[:, idx]
#             y = 1
#         else:
#             u, v = self.neg_edge_index[:, idx - self.pos_edge_index.size(1)]
#             y = 0

#         subgraph = nx.ego_graph(self.graph, u.item(), radius=self.hops)
#         subgraph = nx.subgraph(subgraph, list(subgraph.nodes) + [v.item()])
#         sub_x, sub_edge_index, center_node_idx = remap_node_indices(subgraph, u.item())

#         return Data(
#             x=sub_x, 
#             edge_index=sub_edge_index, 
#             y=y, 
#             center_node_idx=center_node_idx
#         )


# def get_link_prediction_train_test_graph_data(
#         graph, 
#         num_val=0, 
#         num_test=0.2, 
#         add_negative_train_samples=True,
#         neg_sampling_ratio=1,
#     ):
#     transform = RandomLinkSplit(
#         num_val=num_val, 
#         num_test=num_test, 
#         neg_sampling_ratio=neg_sampling_ratio,
#         add_negative_train_samples=add_negative_train_samples,
#         split_labels=True
#     )

#     # Apply the transform
#     train_data, _, test_data = transform(
#         Data(
#             edge_index=graph.edge_index, 
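#             num_nodes=ecore_graph.number_of_nodes()  # NOTE(review): reads the global `ecore_graph`, not the `graph` argument -- probably should use `graph`; confirm before re-enabling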
#         )
#     )

#     return train_data, test_data

#     # train_graph_data = SEALGraphData(
#     #     graph, 
#     #     train_data.edge_index, 
#     #     train_data.pos_edge_label_index, 
#     #     train_data.neg_edge_label_index,
#     #     hops=hops
    
#     # )

#     # test_graph_data = SEALGraphData(
#     #     graph, 
#     #     test_data.edge_index, 
#     #     test_data.pos_edge_label_index, 
#     #     test_data.neg_edge_label_index,
#     #     hops=hops
#     # )

#     # return train_graph_data, test_graph_data
    

# train_data, test_data = get_link_prediction_train_test_graph_data(ecore_graph)
# # Create train and test dataloaders
# batch_size = 32

# # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# # test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Epoch: 001, Loss: 0.6956, Test Accuracy: 0.5000


In [None]:
from models.gnn_layers import GNNConv