#### Dataset Creation

In [1]:
from data_loading.models_dataset import EcoreModelDataset

# Build the Ecore model dataset identified by the name 'ecore_555'.
# NOTE(review): `reload=True` presumably forces a rebuild instead of reusing a
# cached copy — confirm against EcoreModelDataset.
ecore_model_dataset = EcoreModelDataset(
    'ecore_555',
    reload=True,
)

Loading Ecore_555:   0%|          | 0/548 [00:00<?, ?it/s]

EcoreNxG(001_001_001_01_BibTeX.ecore, nodes=46, edges=47, references=3, containment=2, supertypes=42)
EcoreNxG(002_001_002_01_BibTeX1.1.ecore, nodes=23, edges=43, references=10, containment=2, supertypes=31)
EcoreNxG(003_001_003_01_BIBTEXML.ecore, nodes=29, edges=74, references=24, containment=2, supertypes=48)
EcoreNxG(004_001_004_01_Book.ecore, nodes=4, edges=5, references=4, containment=1, supertypes=0)
EcoreNxG(005_001_005_01_HAL.ecore, nodes=44, edges=64, references=23, containment=13, supertypes=28)
EcoreNxG(006_001_006_01_Publication.ecore, nodes=3, edges=2, references=2, containment=0, supertypes=0)
EcoreNxG(007_001_007_01_SWRC.ecore, nodes=57, edges=133, references=84, containment=2, supertypes=47)
EcoreNxG(008_001_008_BibTeX--1918941776.ecore, nodes=23, edges=43, references=10, containment=2, supertypes=31)
EcoreNxG(009_001_009_bibtex--1935117659.ecore, nodes=2, edges=1, references=1, containment=0, supertypes=0)
EcoreNxG(010_001_010_BibTeX--1975458327.ecore, nodes=40, edges=

In [1]:
from data_loading.graph_dataset import GraphEdgeDataset
from data_loading.models_dataset import ArchiMateModelDataset

# ArchiMate model collection identified by the name 'eamodelset'.
dataset = ArchiMateModelDataset('eamodelset')

# Options forwarded verbatim as keyword arguments to GraphEdgeDataset.
# NOTE(review): `ckpt` is a relative path, so it resolves against the
# notebook's working directory — confirm the checkpoint exists there.
graph_data_params = {
    'test_ratio': 0.2,
    'add_negative_train_samples': True,
    'neg_sampling_ratio': 1,
    'distance': 1,
    'use_embeddings': True,
    'embed_model_name': 'bert-base-uncased',
    'ckpt': 'results/eamodelset/edge_cls/checkpoint-8370',
}

print("Loading graph dataset")
graph_dataset = GraphEdgeDataset(dataset, **graph_data_params)
print("Loaded graph dataset")


Loading eamodelset from pickle
Loaded eamodelset with 936 graphs
Loaded eamodelset with 936 graphs
Graphs: 936
Loading graph dataset


Creating graphs:   0%|          | 0/936 [00:00<?, ?it/s]

Processing graphs:   0%|          | 0/936 [00:00<?, ?it/s]

['AndJunction' 'ApplicationCollaboration' 'ApplicationComponent'
 'ApplicationEvent' 'ApplicationFunction' 'ApplicationInteraction'
 'ApplicationInterface' 'ApplicationProcess' 'ApplicationService'
 'Artifact' 'Assessment' 'BusinessActor' 'BusinessCollaboration'
 'BusinessEvent' 'BusinessFunction' 'BusinessInteraction'
 'BusinessInterface' 'BusinessObject' 'BusinessProcess' 'BusinessRole'
 'BusinessService' 'Capability' 'CommunicationNetwork' 'Constraint'
 'Contract' 'CourseOfAction' 'DataObject' 'Deliverable' 'Device'
 'DistributionNetwork' 'Driver' 'Equipment' 'Facility' 'Gap' 'Goal'
 'Grouping' 'ImplementationEvent' 'Junction' 'Location' 'Material'
 'Meaning' 'Node' 'OrJunction' 'Outcome' 'Path' 'Plateau' 'Principle'
 'Product' 'Representation' 'Requirement' 'Resource' 'Stakeholder'
 'SystemSoftware' 'TechnologyCollaboration' 'TechnologyEvent'
 'TechnologyFunction' 'TechnologyInteraction' 'TechnologyInterface'
 'TechnologyProcess' 'TechnologyService' 'Value' 'ValueStream'
 'WorkPack

In [9]:
from models.gnn_layers import EdgeClassifer, GNNConv


# --- Hyper-parameters --------------------------------------------------------
cls_label = 'type'
model_name = 'GATv2Conv'

# Layer widths.
input_dim = 768
hidden_dim = 128
output_dim = 128

# Depth / attention heads.
num_conv_layers = 3
num_mlp_layers = 3
num_heads = 4

# Regularisation and pooling switches.
residual = True
l_norm = False
dropout = 0.3
aggregation = 'sum'

# The number of classes is read from a dataset attribute whose name is derived
# from `cls_label`; fail early with a readable message if it is missing.
num_edges_label = f"num_edges_{cls_label}"
assert hasattr(graph_dataset, num_edges_label), f"Graph dataset does not have attribute {num_edges_label}"
num_classes = getattr(graph_dataset, num_edges_label)

# --- Graph encoder -----------------------------------------------------------
gnn_conv_kwargs = {
    'model_name': model_name,
    'input_dim': input_dim,
    'hidden_dim': hidden_dim,
    'out_dim': output_dim,
    'num_layers': num_conv_layers,
    'num_heads': num_heads,
    'residual': residual,
    'l_norm': l_norm,
    'dropout': dropout,
    'aggregation': aggregation,
    'edge_dim': 768,
}
gnn_conv_model = GNNConv(**gnn_conv_kwargs)

# --- Edge classification head ------------------------------------------------
# NOTE(review): the recorded forward pass of `gnn_conv_model` in this notebook
# returns 512-wide node embeddings, while the head below is built with
# input_dim=output_dim (128) — confirm EdgeClassifer accounts for this.
mlp_predictor_kwargs = {
    'input_dim': output_dim,
    'hidden_dim': hidden_dim,
    'num_layers': num_mlp_layers,
    'num_classes': num_classes,
    'bias': False,
}
mlp_predictor = EdgeClassifer(**mlp_predictor_kwargs)

# Last expression of the cell: display the encoder's module summary.
gnn_conv_model

GNNConv(
  (aggregation): SumAggregation()
  (conv_layers): ModuleList(
    (0): GATv2Conv(768, 128, heads=4)
    (1-2): 2 x GATv2Conv(512, 128, heads=4)
  )
  (activation): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
)

In [3]:
from torch_geometric.loader import DataLoader

# Obtain the torch_geometric representation of the graph dataset.
torch_dataset = graph_dataset.get_torch_geometric_data()

# Mini-batches of 32 graphs, reshuffled on every pass over the loader.
loader_options = {'batch_size': 32, 'shuffle': True}
dataloader = DataLoader(torch_dataset, **loader_options)

In [4]:
# Peek at the first mini-batch only: print its summary and stop iterating.
# The loop variable `data` stays bound to that batch after the loop ends.
for data in dataloader:
    print(data)
    break

DataBatch(train_edge_idx=[6737], test_edge_idx=[1668], train_pos_edge_label_index=[2, 6737], train_pos_edge_label=[6737], train_neg_edge_label_index=[2, 6737], train_neg_edge_label=[6737], test_pos_edge_label_index=[2, 1668], test_pos_edge_label=[1668], test_neg_edge_label_index=[2, 1668], test_neg_edge_label=[1668], overall_edge_index=[2, 8405], edge_index=[2, 6737], num_nodes=5418, x=[5418, 768], edge_attr=[8405, 768], node_type=[5418], node_layer=[5418], edge_type=[8405], batch=[5418], ptr=[33])


In [21]:
import torch

# Scratch check: convert a boolean mask into the list of positions that are True.
t = torch.tensor([True, False, True, True])
t.nonzero(as_tuple=True)[0].tolist()

[0, 2, 3]

In [12]:
import torch  # imported locally so this cell is self-contained

# Smoke-test the encoder on the batch held in `data` and show the output shape.
#
# The device is resolved once instead of hard-coding 'cuda', and the model is
# moved to it explicitly so the cell does not rely on an earlier, unseen cell
# having placed the model on the GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
gnn_conv_model = gnn_conv_model.to(device)

# NOTE(review): `train_edge_idx` indexes the *batched* `edge_attr`; confirm the
# indices are offset per graph during batching — TODO verify in the dataset.
gnn_conv_model(
    data.x.to(device),
    data.edge_index.to(device),
    data.edge_attr[data.train_edge_idx].to(device),
).shape

torch.Size([5418, 512])

In [18]:
import json
import os

# Re-write every `<model dir>/model.json` under the processed-models folder
# in place, pretty-printed with a 4-space indent.
models_root = 'datasets/eamodelset/processed-models'

for entry_name in os.listdir(models_root):
    model_dir = os.path.join(models_root, entry_name)
    # Only sub-directories hold a model; skip any plain files at the top level.
    if not os.path.isdir(model_dir):
        continue

    model_path = os.path.join(model_dir, 'model.json')

    # Finish reading and close the file before re-opening it for writing, so
    # the file is never truncated while a read handle is still open.
    with open(model_path) as src:
        model = json.load(src)

    # The context manager guarantees the write handle is flushed and closed.
    with open(model_path, 'w') as dst:
        json.dump(model, dst, indent=4)