#### Dataset Creation

In [1]:
import random
import torch
import numpy as np

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [2]:
import os

datasets_dir = 'datasets'
ecore_json_path = os.path.join(datasets_dir, 'ecore_555/ecore_555.jsonl')
mar_json_path = os.path.join(datasets_dir, 'mar-ecore-github/ecore-github.jsonl')
modelsets_uml_json_path = os.path.join(datasets_dir, 'modelset/uml.jsonl')
modelsets_ecore_json_path = os.path.join(datasets_dir, 'modelset/ecore.jsonl')

In [3]:
from data_loading.data import ModelDataset

config_params = dict(
    timeout = 120,
    min_enr = 1.2,
    min_edges = 10
)
ecore = ModelDataset('ecore_555', reload=False, **config_params)
modelset = ModelDataset('modelset', reload=False, remove_duplicates=True, **config_params)
mar = ModelDataset('mar-ecore-github', reload=False, **config_params)


datasets = {
    'ecore': ecore,
    'modelset': modelset,
    'mar': mar
}

Loading ecore_555 from pickle
Loaded ecore_555 with 281 graphs
Loaded ecore_555 with 281 graphs
Graphs: 281
Loading modelset from pickle
Loaded modelset with 830 graphs
Loaded modelset with 830 graphs
Graphs: 830
Loading mar-ecore-github from pickle
Loaded mar-ecore-github with 5389 graphs
Loaded mar-ecore-github with 5389 graphs
Graphs: 5389


In [4]:
from data_loading.graph_dataset import GraphDataset

graph_data_params = dict(
    distance=2,
    reload=True
)

ecore_graph_dataset = GraphDataset(ecore, **graph_data_params)
# modelset_graph_dataset = GraphDataset(modelset, **graph_data_params)
# mar_graph_dataset = GraphDataset(mar, **graph_data_params)

Processing ecore_555:   0%|          | 0/281 [00:00<?, ?it/s]

In [5]:
import pickle

graph, edge_index = pickle.load(open('subgraph.pkl', 'rb'))
graph.number_of_edges(), edge_index.shape

(124, torch.Size([2, 100]))

In [32]:
import torch
ei = torch.tensor(list(graph.numbered_graph.edges)).t()

In [45]:
import networkx as nx
n = nx.DiGraph()
n.add_edges_from([(u, v, graph.numbered_graph.edges[u, v]) for u, v in edge_index.t().tolist()])
for node, data in n.nodes(data=True):
    data = graph.numbered_graph.nodes[node]
    n.nodes[node].update(data)

n.number_of_nodes(), n.number_of_edges()

(54, 100)

In [None]:
ecore_graph = [
    g
    for g in ecore.graphs if g.number_of_nodes() >= 12 and g.number_of_edges() >= 12 and g.number_of_nodes() <= 20 and g.number_of_edges() <= 20

][0]
ecore_graph.number_of_nodes(), ecore_graph.number_of_edges()

(17, 17)

In [None]:
ecore_graph.nodes(), ecore_graph.edges(data=True)

In [None]:
import networkx as nx
from lang2graph.uml import SUPERTYPE, EcoreNxG

def create_graph_from_edge_index(G: EcoreNxG, edge_index):
    """
    Create a subgraph from G using only the edges specified in edge_index.
    
    Parameters:
    G (networkx.Graph): The original graph.
    edge_index (torch.Tensor): A tensor containing edge indices.
    
    Returns:
    networkx.Graph: A subgraph of G containing only the edges in edge_index.
    """
    # Create a new graph
    subgraph = nx.Graph()

    # Add nodes and edges from the edge_index to the subgraph
    for i in range(edge_index.size(1)):
        u = edge_index[0, i].item()
        v = edge_index[1, i].item()
        u_str, v_str = G.id2label[u], G.id2label[v]
        if G.numbered_graph.has_edge(u, v):
            subgraph.add_edge(u, v, **G.edges[u_str, v_str])

    subgraph.label2id = G.label2id
        
    subgraph.id2label = G.id2label
    return subgraph


def get_node_texts(graph: nx.DiGraph, h: int):
    """
    Create node string for each node n in a graph using neighbors of n up to h hops.
    
    Parameters:
    G (networkx.Graph): The graph.
    h (int): The number of hops.
    
    Returns:
    dict: A dictionary where keys are nodes and values are node strings.
    """
    node_texts = {}

    for node in graph.nodes():
        node_str = f"{node}"
        current_level_nodes = {node}
        all_visited_nodes = {node}

        for _ in range(1, h + 1):
            next_level_nodes = set()
            for n in current_level_nodes:
                neighbors = set(graph.neighbors(n))
                next_level_nodes.update(neighbors - all_visited_nodes)
            all_visited_nodes.update(next_level_nodes)
            if next_level_nodes:
                node_strs = [graph.id2label[i] for i in sorted(next_level_nodes)]
                node_str += f" -> {', '.join(map(str, node_strs))}"
            current_level_nodes = next_level_nodes

        node_texts[node] = node_str

    return node_texts


def get_edge_type(edge_data):

    # Reference = 0
    # Containment = 1
    # Supertype = 2

    if edge_data['type'] == SUPERTYPE:
        return 2
    if edge_data['containment']:
        return 1
    return 0


def get_edge_texts(graph: nx.DiGraph):
    """
    Create edge string for each edge in a graph.
    
    Parameters:
    G (networkx.Graph): The graph.
    
    Returns:
    dict: A dictionary where keys are edges and values are edge strings.
    """
    edge_texts = {}

    for u, v, data in graph.edges(data=True):
        edge_texts[(u, v)] = f"{graph.id2label[u]} - {get_edge_type(data)} - {graph.id2label[v]}"

    return edge_texts

In [None]:
from torch_geometric.data import Data
from torch_geometric.transforms import RandomLinkSplit
transform = RandomLinkSplit(
    num_val=0, 
    num_test=0.2, 
    is_undirected=False, 
    split_labels=True,
    neg_sampling_ratio=1,
    add_negative_train_samples=True
)

# Apply the transform
train_data, _, test_data = transform(Data(edge_index=ecore_graph.edge_index, num_nodes=ecore_graph.number_of_nodes()))

In [None]:
print(train_data.pos_edge_label_index.shape, test_data.pos_edge_label_index.shape)
print(train_data.neg_edge_label_index.shape, test_data.neg_edge_label_index.shape)

torch.Size([2, 14]) torch.Size([2, 3])
torch.Size([2, 14]) torch.Size([2, 3])


In [None]:
train_subgraph = create_graph_from_edge_index(ecore_graph, train_data.edge_index)
print(train_subgraph.edges(data=True))
node_texts = get_node_texts(train_subgraph, 1)  # Get node texts up to 2 hops
for node, text in node_texts.items():
    print(node, text)

edge_texts = get_edge_texts(train_subgraph)
for edge, text in edge_texts.items():
    print(edge, text)

In [None]:
from embeddings.bert import BertEmbedder

embedder = BertEmbedder(model_name='bert-base-uncased')

In [None]:
# import torch
# import torch.nn.functional as F
# import torch.optim as optim
# from torch_geometric.data import Data
# from torch_geometric.loader import DataLoader
# from torch_geometric.nn import GCNConv
# from torch_geometric.nn.aggr import SortAggregation
# import networkx as nx
# from torch_geometric.transforms import RandomLinkSplit


# def remap_node_indices(subgraph, center_node):
#     mapping = {node: i for i, node in enumerate(subgraph.nodes())}
#     subgraph = nx.relabel_nodes(subgraph, mapping)
#     sub_edge_index = torch.tensor(list(subgraph.edges)).t().contiguous()
#     sub_x = torch.ones(subgraph.number_of_nodes(), 1)  # Example node features
#     center_node_idx = mapping[center_node]
#     return sub_x, sub_edge_index, center_node_idx

# # Prepare the train and test datasets for SEAL model
# class SEALGraphData:
#     def __init__(
#             self, 
#             graph,
#             edge_index,
#             pos_edge_index,
#             neg_edge_index,
#             hops=1
#         ):
#         self.edge_index = edge_index
#         self.pos_edge_index = pos_edge_index
#         self.neg_edge_index = neg_edge_index
#         self.graph = graph
#         self.hops = hops



#     def __len__(self):
#         return self.pos_edge_index.size(1) + self.neg_edge_index.size(1)

#     def __getitem__(self, idx):
#         if idx < self.pos_edge_index.size(1):
#             u, v = self.pos_edge_index[:, idx]
#             y = 1
#         else:
#             u, v = self.neg_edge_index[:, idx - self.pos_edge_index.size(1)]
#             y = 0

#         subgraph = nx.ego_graph(self.graph, u.item(), radius=self.hops)
#         subgraph = nx.subgraph(subgraph, list(subgraph.nodes) + [v.item()])
#         sub_x, sub_edge_index, center_node_idx = remap_node_indices(subgraph, u.item())

#         return Data(
#             x=sub_x, 
#             edge_index=sub_edge_index, 
#             y=y, 
#             center_node_idx=center_node_idx
#         )


# def get_link_prediction_train_test_graph_data(
#         graph, 
#         num_val=0, 
#         num_test=0.2, 
#         add_negative_train_samples=True,
#         neg_sampling_ratio=1,
#     ):
#     transform = RandomLinkSplit(
#         num_val=num_val, 
#         num_test=num_test, 
#         neg_sampling_ratio=neg_sampling_ratio,
#         add_negative_train_samples=add_negative_train_samples,
#         split_labels=True
#     )

#     # Apply the transform
#     train_data, _, test_data = transform(
#         Data(
#             edge_index=graph.edge_index, 
#             num_nodes=ecore_graph.number_of_nodes()
#         )
#     )

#     return train_data, test_data

#     # train_graph_data = SEALGraphData(
#     #     graph, 
#     #     train_data.edge_index, 
#     #     train_data.pos_edge_label_index, 
#     #     train_data.neg_edge_label_index,
#     #     hops=hops
    
#     # )

#     # test_graph_data = SEALGraphData(
#     #     graph, 
#     #     test_data.edge_index, 
#     #     test_data.pos_edge_label_index, 
#     #     test_data.neg_edge_label_index,
#     #     hops=hops
#     # )

#     # return train_graph_data, test_graph_data
    

# train_data, test_data = get_link_prediction_train_test_graph_data(ecore_graph)
# # Create train and test dataloaders
# batch_size = 32

# # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# # test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Epoch: 001, Loss: 0.6956, Test Accuracy: 0.5000


In [None]:
def get_node_embeddings(graph, embedder: BertEmbedder, hops=1):
    """
    Embed nodes in a graph using a given embedder.
    
    Parameters:
    graph (networkx.Graph): The graph.
    embedder (Embedder): The embedder.
    hops (int): The number of hops.
    
    Returns:
    torch.Tensor: The node embeddings.
    """
    node_texts = get_node_texts(graph, hops)
    node_texts = list(node_texts.values())
    node_embeddings = embedder.embed(node_texts)
    return node_embeddings


def get_edge_embeddings(graph, embedder: BertEmbedder):
    """
    Embed edges in a graph using a given embedder.
    
    Parameters:
    graph (networkx.Graph): The graph.
    embedder (Embedder): The embedder.
    
    Returns:
    torch.Tensor: The edge embeddings.
    """
    edge_texts = get_edge_texts(graph)
    edge_texts = list(edge_texts.values())
    edge_embeddings = embedder.embed(edge_texts)
    return edge_embeddings

def embed_graph(graph, embedder: BertEmbedder, hops=1):
    """
    Embed a graph using a given embedder.
    
    Parameters:
    graph (networkx.Graph): The graph.
    embedder (Embedder): The embedder.
    hops (int): The number of hops.
    
    Returns:
    torch.Tensor: The graph embedding.
    """
    node_embeddings = get_node_embeddings(graph, embedder, hops)
    edge_embeddings = get_edge_embeddings(graph, embedder)
    return node_embeddings, edge_embeddings

node_embeddings, edge_embeddings = embed_graph(ecore_graph, embedder, hops=2)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch_geometric.data import Data
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.nn import GATConv


class GraphDataset(Dataset):
    def __init__(
            self, 
            graphs, 
            hops=1,
        ):
        self.graphs = graphs
        self.hops = hops

    def embed_graphs(self):
        for graph in self.graphs:
            yield graph.x


    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        return self.graphs[idx]

In [8]:
from tqdm.auto import tqdm
import json
from random import shuffle
from sklearn.model_selection import StratifiedKFold


class TorchGraph:
    def __init__(
            self, 
            graph: EcoreNxG, 
            save_dir: str,
            distance = 1,
            lptr=0.2,
            use_neg_samples=False,
            neg_samples_ratio=1,
        ):
        self.graph = graph
        self.distance = distance
        self.add_negative_train_samples = use_neg_samples
        self.neg_sampling_ratio = neg_samples_ratio
        self.lptr = lptr
        self.save_dir = save_dir
        self.process_graph()
    

    def process_graph(self):
        if not self.load_pyg_data():
            self.data = self.get_pyg_data()
            self.validate_data()
                    
        self.save()

    
    def get_pyg_data(self):
        transform = RandomLinkSplit(
            num_val=0, 
            num_test=self.lptr, 
            add_negative_train_samples=self.add_negative_train_samples,
            neg_sampling_ratio=self.neg_sampling_ratio
        )

        train_data, _, test_data = transform(Data(
            edge_index=self.graph.edge_index, 
            num_nodes=self.graph.number_of_nodes()
        ))
        edge_index = train_data.edge_index
        subgraph = self.graph.create_graph_from_edge_index(edge_index)
        node_texts = self.graph.get_node_texts(subgraph, self.distance)
        node_embeddings = embedder.embed(list(node_texts.values()))

        edge_texts = self.graph.get_edge_texts(subgraph)
        edge_embeddings = embedder.embed(list(edge_texts.values()))


        data = Data(
            x=node_embeddings,
            edge_index=edge_index,
            edge_attr=edge_embeddings,
            train_data=train_data,
            test_data=test_data,
        )

        return data
    

    def validate_data(self):
        pass

    @property
    def name(self):
        return '.'.join(self.graph.graph_id.replace('/', '_').split('.')[:-1])


    @property
    def save_idx(self):
        path = os.path.join(self.save_dir, f'{self.graph.id}')
        if embedder.finetuned:
            path = f'{path}_finetuned'
        return path


    def save_to_mapping(self):
        graph_embedding_file_map = dict()
        fp = f'{self.save_dir}/mapping.json'
        if os.path.exists(fp):
            graph_embedding_file_map = json.load(open(fp, 'r'))
        else:
            graph_embedding_file_map = dict()
        
        graph_embedding_file_map[self.name] = self.graph.id
        json.dump(graph_embedding_file_map, open(fp, 'w'), indent=4)


    def load_pyg_data(self):

        if os.path.exists(self.save_idx):
            self.save_to_mapping()
            self.data = torch.load(f"{self.save_idx}/data.pt")
            return True

        return False


    def save(self):
        os.makedirs(self.save_idx, exist_ok=True)
        torch.save(self.data, f"{self.save_idx}/data.pt")
        self.save_to_mapping()


class GraphDataset(torch.utils.data.Dataset):
    def __init__(
            self, 
            models_dataset: ModelDataset,
            save_dir='datasets/graph_data',
            distance=1
        ):
        self.save_dir = f'{save_dir}/{models_dataset.name}'
        os.makedirs(self.save_dir, exist_ok=True)
        self.graphs = [
            TorchGraph(
                g, 
                save_dir=self.save_dir,
                distance=distance
            ) 
            for g in tqdm(models_dataset, desc=f'Processing {models_dataset.name}')
        ]

        self._c = {label:j for j, label in enumerate({g.label for g in models_dataset})}
        self.labels = torch.tensor([self._c[g.label] for g in models_dataset], dtype=torch.long)
        self.num_classes = len(self._c)
        self.num_features = self.graphs[0].data.x.shape[-1]

    def __len__(self):
        return len(self.graphs)
    
    def __getitem__(self, index: int):
        return self.graphs[index].data, self.labels[index]
    
    def get_train_test_split(self, tr=0.8):
        n = len(self.graphs)
        train_size = int(n * tr)
        idx = list(range(n))
        shuffle(idx)
        train_idx = idx[:train_size]
        test_idx = idx[train_size:]
        return train_idx, test_idx
    

    def k_fold_split(self, k=10):
        kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        n = len(self.graphs)
        for train_idx, test_idx in kfold.split(np.zeros(n), np.zeros(n)):
            yield train_idx, test_idx




NameError: name 'EcoreNxG' is not defined

In [92]:
train_data, test_data

(Data(edge_index=[2, 14], num_nodes=17, edge_label=[28], edge_label_index=[2, 28]),
 Data(edge_index=[2, 14], num_nodes=17, edge_label=[0], edge_label_index=[2, 0]),
 Data(edge_index=[2, 14], num_nodes=17, edge_label=[6], edge_label_index=[2, 6]))

In [67]:
def create_train_test_val_graphs(
        graphs,
        num_val=0,
        num_test=0.2,
        add_negative_train_samples=True,
        neg_sampling_ratio=1
    ):

    # Apply RandomLinkSplit to each graph
    split_graphs = []
    for graph in graphs:
        transform = RandomLinkSplit(
            num_val=num_val, 
            num_test=num_test, 
            add_negative_train_samples=add_negative_train_samples,
            neg_sampling_ratio=neg_sampling_ratio
        )
        train_data, val_data, test_data = transform(graph)
        split_graphs.append((graph, train_data, val_data, test_data))

    # Flatten the split_graphs list to get individual datasets
    train_graphs = [(graph, train) for train, _, _ in split_graphs]
    val_graphs = [(graph, val) for _, val, _ in split_graphs]
    test_graphs = [(graph, test) for _, _, test in split_graphs]

    train_dataset = GraphDataset(train_graphs)
    val_dataset = GraphDataset(val_graphs)
    test_dataset = GraphDataset(test_graphs)

    return {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset
    }

In [68]:
def collate_fn(batch):
    x = []
    edge_index = []
    edge_label = []
    edge_label_index = []
    
    offset = 0
    for data in batch:
        x.append(data.x)
        edge_index.append(data.edge_index + offset)
        edge_label.append(data.edge_label)
        edge_label_index.append(data.edge_label_index + offset)
        
        offset += data.x.size(0)
    
    x = torch.cat(x, dim=0)
    edge_index = torch.cat(edge_index, dim=1)
    edge_label = torch.cat(edge_label, dim=0)
    edge_label_index = torch.cat(edge_label_index, dim=1)
    
    return Data(x=x, edge_index=edge_index, edge_label=edge_label, edge_label_index=edge_label_index)


def create_graph_dataloaders(data, batch_size=2):
    train_loader = DataLoader(data['train'], batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
    val_loader = DataLoader(data['val'], batch_size=batch_size, collate_fn=collate_fn, shuffle=False)
    test_loader = DataLoader(data['test'], batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

    return {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }

In [69]:
class GAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_heads):
        super(GAT, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=num_heads, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * num_heads, out_channels, heads=1, concat=False, dropout=0.6)

        self.link_pred_head = torch.nn.Linear(hidden_channels * num_heads, 1)  # Link prediction head
        self.edge_class_head = torch.nn.Linear(hidden_channels * num_heads, 3)  # Edge classification head


    def forward(self, x, edge_index):
        x = torch.nn.functional.dropout(x, p=0.6, training=self.training)
        x = torch.nn.functional.elu(self.conv1(x, edge_index))
        x = torch.nn.functional.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)

        row, col = edge_index
        edge_features = torch.cat([x[row], x[col]], dim=1)  # [num_edges, hidden_channels*2]
        
        # Link prediction
        link_pred = torch.sigmoid(self.link_pred_head(edge_features)).squeeze()  # [num_edges]
        
        # Edge classification
        edge_class = self.edge_class_head(edge_features)  # [num_edges, num_edge_types]


        return link_pred, edge_class


In [71]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GAT(in_channels=1, hidden_channels=8, out_channels=8, num_heads=8).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.BCEWithLogitsLoss()

def train(data_loader):
    model.train()
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        batch = batch.to(device)
        
        out = model(batch.x, batch.edge_index)
        
        pos_edge_index = batch.edge_label_index[:, batch.edge_label == 1]
        neg_edge_index = batch.edge_label_index[:, batch.edge_label == 0]
        
        pos_pred = torch.sigmoid((out[pos_edge_index[0]] * out[pos_edge_index[1]]).sum(dim=1))
        neg_pred = torch.sigmoid((out[neg_edge_index[0]] * out[neg_edge_index[1]]).sum(dim=1))
        
        pos_label = torch.ones(pos_pred.size(0), device=device)
        neg_label = torch.zeros(neg_pred.size(0), device=device)
        
        loss = criterion(pos_pred, pos_label) + criterion(neg_pred, neg_label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(data_loader)


def test(data_loader):
    model.eval()
    correct = 0
    total_pos = 0
    total_neg = 0
    for batch in data_loader:
        batch = batch.to(device)
        
        out = model(batch.x, batch.edge_index)
        
        pos_edge_index = batch.edge_label_index[:, batch.edge_label == 1]
        neg_edge_index = batch.edge_label_index[:, batch.edge_label == 0]
        
        pos_pred = torch.sigmoid((out[pos_edge_index[0]] * out[pos_edge_index[1]]).sum(dim=1))
        neg_pred = torch.sigmoid((out[neg_edge_index[0]] * out[neg_edge_index[1]]).sum(dim=1))
        
        pos_correct = (pos_pred > 0.5).sum().item()
        neg_correct = (neg_pred <= 0.5).sum().item()
        
        correct += pos_correct + neg_correct
        total_pos += pos_pred.size(0)
        total_neg += neg_pred.size(0)
    
    return correct / (total_pos + total_neg)

In [3]:
from torch_geometric.data import Data
import torch
## random data object

data = Data(
    x=torch.randn(5, 16),
    edge_index=torch.tensor([[0, 1, 1, 2, 2, 3, 4], [1, 0, 2, 1, 3, 2, 2]]),
    edge_label=torch.tensor([1, 0, 1, 0, 1, 0, 1]),
    edge_label_index=torch.tensor([[0, 1, 2, 3, 4, 5, 6], [1, 0, 1, 0, 1, 0, 1]])
)