#### Dataset Creation

In [6]:
import random
import torch
import numpy as np

# Single seed shared by every random number generator this notebook relies on.
seed = 42

# Apply the seed, in order, to: Python's `random`, NumPy's global RNG,
# PyTorch's default generator, and PyTorch's CUDA generator.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
):
    _seed_rng(seed)

In [7]:
import os

# Root folder for all dataset files. It is a relative path, so it resolves
# against the notebook's working directory.
datasets_dir = 'datasets'

# One JSONL path per corpus, each formed by joining the root folder with the
# file's location relative to it.
(
    ecore_json_path,
    mar_json_path,
    modelsets_uml_json_path,
    modelsets_ecore_json_path,
) = [
    os.path.join(datasets_dir, relative_file)
    for relative_file in (
        'ecore_555/ecore_555.jsonl',
        'mar-ecore-github/ecore-github.jsonl',
        'modelset/uml.jsonl',
        'modelset/ecore.jsonl',
    )
]

In [8]:
from data_loading.data import ModelDataset

# Options forwarded unchanged to ModelDataset as keyword arguments.
# NOTE(review): the units/meaning are not visible in this notebook —
# presumably `timeout` is in seconds and `min_enr` / `min_edges` are
# per-graph filtering thresholds; confirm against data_loading.data.
config_params = {
    'timeout': 120,
    'min_enr': 1.2,
    'min_edges': 10,
}

# Active corpus: 'modelset', loaded without forcing a reload and with
# duplicate removal switched on.
dataset = ModelDataset(
    'modelset',
    reload=False,
    remove_duplicates=True,
    **config_params
)

# Alternative corpora (swap one in for the assignment above):
#   dataset = ModelDataset('ecore_555', reload=False, **config_params)
#   dataset = ModelDataset('mar-ecore-github', reload=True, **config_params)

Loading modelset from pickle
Loaded modelset with 830 graphs
Loaded modelset with 830 graphs
Graphs: 830


In [9]:
from data_loading.graph_dataset import GraphDataset

# Keyword arguments for GraphDataset. The embedding settings name the
# 'bert-base-uncased' model and the checkpoint under 'results/modelset_ec_ft'.
# NOTE(review): the exact effect of `distance` and `neg_sampling_ratio` is
# defined in data_loading.graph_dataset and is not visible here.
graph_data_params = {
    'distance': 2,
    'reload': False,
    'add_negative_train_samples': True,
    'neg_sampling_ratio': 1,
    'use_edge_types': False,
    'use_embeddings': True,
    'embed_model_name': 'bert-base-uncased',
    'ckpt': 'results/modelset_ec_ft',
}

# Wrap the loaded model dataset into its graph form using the options above.
graph_dataset = GraphDataset(dataset, **graph_data_params)

# Same construction for other corpora (their source datasets are not loaded
# in this notebook as written):
#   modelset_graph_dataset = GraphDataset(modelset, **graph_data_params)
#   mar_graph_dataset = GraphDataset(mar, **graph_data_params)

Processing graphs:   0%|          | 0/830 [00:00<?, ?it/s]

Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Generating...
Embeddings not found. Genera

In [6]:
from embeddings.bert import BertEmbedder

# Model name and checkpoint directory handed to BertEmbedder. These repeat
# the `embed_model_name` / `ckpt` values passed to GraphDataset.
model_name, ckpt = 'bert-base-uncased', 'results/modelset_ec_ft'

embedder = BertEmbedder(model_name, ckpt)

In [5]:
from models.gnn_layers import (
    GATv2, 
    FeedForward, 
    GNNLinkPredictor
)
from trainers.link_predictor import GNNLinkPredictorTrainer


# Feature widths taken from the first graph in the dataset: the size along
# dimension 1 of its node features `x` and of its edge features `edge_attr`.
input_dim = graph_dataset[0].x.size(1)
edge_dim = graph_dataset[0].edge_attr.size(1)

# Fixed hyperparameters for the GNN encoder and the prediction heads.
hidden_dim, output_dim = 64, 128
num_layers, num_heads = 3, 4
residual = True


# Encoder configuration, collected in one place before the model is built.
# Every value except `dropout` comes from the hyperparameters defined above.
gat_kwargs = {
    'input_dim': input_dim,
    'hidden_dim': hidden_dim,
    'output_dim': output_dim,
    'num_layers': num_layers,
    'num_heads': num_heads,
    'edge_dim': edge_dim,
    'residual': residual,
    'dropout': 0.2,
}

# GATv2 encoder (project class from models.gnn_layers).
gat = GATv2(**gat_kwargs)

# Input width shared by both heads: the encoder's output width (multiplied by
# the head count when more than one attention head is used), doubled.
# NOTE(review): the doubling presumably reflects concatenating the two
# endpoint embeddings of an edge — confirm against GNNLinkPredictor.
node_embedding_dim = output_dim * num_heads if num_heads > 1 else output_dim
head_input_dim = node_embedding_dim * 2

# Settings common to the two feed-forward heads.
shared_head_kwargs = {
    'input_dim': head_input_dim,
    'hidden_dim': hidden_dim,
    'num_layers': 3,
}

# Link-prediction head: a single output per candidate edge.
lp_head = FeedForward(output_dim=1, **shared_head_kwargs)

# Edge-classification head: three outputs with a softmax final activation.
# NOTE(review): if the trainer applies a cross-entropy loss that expects raw
# logits, this softmax would be applied twice — verify in the trainer.
ec_head = FeedForward(
    output_dim=3,
    final_activation='softmax',
    **shared_head_kwargs
)

# Combine the encoder with the link-prediction and edge-classification heads.
gnn_lp = GNNLinkPredictor(gat, lp_head, ec_head)

# Enable both the link-predictor and the edge-classifier flags on the trainer.
trainer_flags = {
    'use_link_predictor': True,
    'use_edge_classifier': True,
}
lp_trainer = GNNLinkPredictorTrainer(gnn_lp, graph_dataset, **trainer_flags)

In [None]:
# Epoch count for this run, named so it is easy to find and tune.
num_epochs = 100
lp_trainer.train_epochs(num_epochs)