#### Dataset Creation

In [1]:
import random
import torch
import numpy as np

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [2]:
import json
import os

datasets_dir = 'datasets'
ecore_json_path = os.path.join(datasets_dir, 'ecore_555/ecore_555.jsonl')
mar_json_path = os.path.join(datasets_dir, 'mar-ecore-github/ecore-github.jsonl')
modelsets_uml_json_path = os.path.join(datasets_dir, 'modelset/uml.jsonl')
modelsets_ecore_json_path = os.path.join(datasets_dir, 'modelset/ecore.jsonl')

In [3]:
from data_loading.dataset import ModelDataset


reload = False
ecore = ModelDataset('ecore_555', reload=reload)
modelset = ModelDataset('modelset', reload=reload, remove_duplicates=True)
mar = ModelDataset('mar-ecore-github', reload=reload)


datasets = {
    'ecore': ecore,
    'modelset': modelset,
    'mar': mar
}

Loading ecore_555 from pickle
Loaded ecore_555 from pickle
Graphs: 548
Loading modelset from pickle
Loaded modelset from pickle
Graphs: 2043
Loading mar-ecore-github from pickle
Loaded mar-ecore-github from pickle
Graphs: 18110


In [4]:
ecore[0].find_node_str_upto_distance(8, 2)

'Article | BIBTEX Entry | Chapter Booklet fields AbstractField Year Authors AuthorUrls Isbn Issn Institution Publisher School Howpublished Url key LocatedElement Day Type Inproceedings Manual BookTitle Organization Editor Field Series Doi Text Bibtex Edition Book Month Pages Title Number MastersThesis Note Techreport Misc Volume Address Proceedings Inbook Journal Incollection PhdThesis'

#### Training Fasttext

##### Fasttext classification

In [None]:
import fasttext
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

from encoding.common import doc_tokenizer
from encoding.encoders import (
    BertTokenizerEncoder,
    FasttextEncoder,
    ClassLabelEncoder,
    TFIDFEncoder,
    BertTFIDF
)

for name, dataset in datasets.items():
    if name not in ['ecore', 'modelset']:
        continue
    print("Dataset: ", name)
    i = 0
    accuracies, bal_accuracies = [], []
    for X_train, X_test, y_train, y_test in dataset.k_fold_split():
        print("Fold number: ", i+1)
        f_train = f'datasets/fasttext_train_{name}_{i}.txt'
        f_test = f'datasets/fasttext_test_{name}_{i}.txt'
        if not os.path.exists(f_train):
            with open(f_train, 'w') as f:
                for x, y in zip(X_train, y_train):
                    x = " ".join(doc_tokenizer(x))
                    f.write(f"__label__{y} {x}\n")
        
        if not os.path.exists(f_test):
            with open(f_test, 'w') as f:
                for x, y in zip(X_test, y_test):
                    x = " ".join(doc_tokenizer(x))
                    f.write(f"__label__{y} {x}\n")
        
        if os.path.exists(f'models/{name}_{i}.bin'):
            model = fasttext.load_model(f'models/{name}_{i}.bin')
        else:
            model = fasttext.train_supervised(
                input=f_train, 
                epoch=100, 
                lr=0.2, 
                wordNgrams=2, 
            )
            model.save_model(f'models/{name}_{i}.bin')        
        y_pred = model.predict([i.strip() for i in open(f_test).readlines()])[0]
        y_true = [i.split()[0].split('__label__')[1] for i in open(f_test).readlines()]
        y_pred = [i[0].split('__label__')[1] for i in y_pred]


        accuracy = accuracy_score(y_true, y_pred)
        bal_accuracy = balanced_accuracy_score(y_true, y_pred)
        print(f"Accuracy: {accuracy}, Balanced Accuracy: {bal_accuracy}")
        accuracies.append(accuracy)
        bal_accuracies.append(bal_accuracy)

        i += 1            
    print(f"Average Accuracy: {np.mean(accuracies)}, Average Balanced Accuracy: {np.mean(bal_accuracies)}")
        

##### Fasttext word embeddings

In [42]:
X_udata = list(set([g.text for dataset in datasets.values() for g in dataset]))
X_udata = [f" ".join(doc_tokenizer(x)) for x in X_udata]
f_udata = 'datasets/fasttext_udata.txt'
with open(f'{f_udata}', 'w') as f:
    for x in X_udata:
        f.write(f"{x}\n")


In [57]:
model = fasttext.train_unsupervised(
    input=f_udata, 
    epoch=500, 
    lr=0.1,
    minn=2,
    maxn=5,
    dim=128
)
model.save_model("models/uml_fasttext.bin")

Read 0M words
Number of words:  8120
Number of labels: 0
Progress: 100.0% words/sec/thread:    7026 lr: -0.000001 avg.loss:  1.111646 ETA:   0h 0m 0s 60.3% words/sec/thread:    7034 lr:  0.039708 avg.loss:  1.177690 ETA:   0h 3m12s100.0% words/sec/thread:    7026 lr:  0.000000 avg.loss:  1.111496 ETA:   0h 0m 0s


#### Model Encoding

In [102]:
tf_idf_encoder = TFIDFEncoder()
bert_encoder = BertTokenizerEncoder('bert-base-uncased')
bert_tfidf_encoder = BertTFIDF('bert-base-uncased')
fasttext_encoder = FasttextEncoder('models/uml_fasttext.bin')
class_label_encoder = ClassLabelEncoder()

In [103]:
from sklearn import svm
from sklearn.metrics import (
    accuracy_score, 
    balanced_accuracy_score
)
from typing import Union


def train_svm(dataset: ModelDataset, encoder: Union[TFIDFEncoder, BertTFIDF, FasttextEncoder]):
    accuracies, bal_accuracies = [], []
    for train_idx, test_idx in dataset.k_fold_split():
        X = encoder.encode(dataset.data[0])
        y = class_label_encoder.encode(dataset.data[1])

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        svm_classifier = svm.SVC(kernel='linear')  # You can change the kernel as needed
        svm_classifier.fit(X_train, y_train)
        # Predict on the test set
        y_pred = svm_classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # print(f'SVM Classifier Accuracy: {accuracy}')
        bal_accuracy = balanced_accuracy_score(y_test, y_pred)
        # print(f'SVM Classifier Balanced Accuracy: {bal_accuracy}')

        accuracies.append(accuracy)
        bal_accuracies.append(bal_accuracy)
    
    print(f'Mean Accuracy: {np.mean(accuracies)}')
    print(f'Mean Balanced Accuracy: {np.mean(bal_accuracies)}')


In [None]:
train_svm(modelset, tf_idf_encoder)

In [106]:
model.get_nearest_neighbors('petrinet', k=5)

[(0.5969380140304565, 'petrinetv3'),
 (0.5963557362556458, 'petrinetv1'),
 (0.5946762561798096, 'petrinetv2'),
 (0.5399251580238342, 'petri'),
 (0.5047121047973633, 'tokens')]

In [115]:
from transformers import Trainer

In [6]:
from transformers import BertTokenizer
import numpy as np

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def split_into_chunks(text, max_length=512):
    tokens = tokenizer.tokenize(text)
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return [' '.join(chunk) for chunk in chunks]

# # Example usage
long_text = max(modelset, key=lambda x: len(x.text)).text
chunks = split_into_chunks(long_text)
len(chunks)

15

In [43]:
from transformers import (
    Trainer, 
    TrainingArguments
)
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer
)
import torch
import numpy as np
import random
from data_loading.dataset import Dataset
from settings import device, seed
from sklearn.preprocessing import LabelEncoder
from trainers.metrics import compute_metrics


random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

max_length_map = {
    'bert-base-uncased': 512,
    'allenai/longformer-base-4096': 4096
}


# Create your dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.inputs = tokenizer(
            texts, 
            return_tensors='pt', 
            truncation=True, 
            padding='max_length', 
            max_length=max_length
        )
        self.inputs['labels'] = torch.tensor(labels, dtype=torch.long)
 

    def __len__(self):
        return len(self.inputs['input_ids'])
    

    def __getitem__(self, index):
        item = {key: val[index] for key, val in self.inputs.items()}
        return item


def train_hf(model_name, model_ds: Dataset, epochs):
    max_len = max_length_map[model_name]
    i = 0
    print(f'Device used: {device}')

    for train_idx, test_idx in model_ds.k_fold_split():
        print(f'Fold number: {i+1}')
        X, y = model_ds.data
        print(f'X: {len(X)}, y: {len(y)}')
        y = LabelEncoder().fit_transform(y)
        X_train, X_test = [X[i] for i in train_idx], [X[i] for i in test_idx]
        y_train, y_test = [y[i] for i in train_idx], [y[i] for i in test_idx]

        print(f'Train: {len(X_train)}, Test: {len(X_test)}')


        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(y)))
        model.to(device)

        train_ds = CustomDataset(X_train, y_train, tokenizer, max_length=max_len)
        test_ds = CustomDataset(X_test, y_test, tokenizer, max_length=max_len)

        # Training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=epochs,
            eval_strategy="epoch",
            save_strategy="epoch",
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            load_best_model_at_end=True,
            save_total_limit=1,
            fp16=True,
            seed=42
        )

        # Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=test_ds,
            compute_metrics=compute_metrics            
        )

        # Train the model
        trainer.train()
        results = trainer.evaluate()
        print(results)

        i += 1
        break

In [None]:
train_hf('bert-base-uncased', modelset, 10)

In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from settings import device
from data_loading.dataset import EncodingDataset

model_name = 'bert-base-uncased'
max_len = 512

i = 0
for train_idx, test_idx in modelset.k_fold_split():
    print(f'Fold number: {i+1}')
    X, y = modelset.data
    print(f'X: {len(X)}, y: {len(y)}')
    y = LabelEncoder().fit_transform(y)
    X_train, X_test = [X[i] for i in train_idx], [X[i] for i in test_idx]
    y_train, y_test = [y[i] for i in train_idx], [y[i] for i in test_idx]

    print(f'Train: {len(X_train)}, Test: {len(X_test)}')

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained('results/checkpoint-1380', num_labels=len(set(y)))
    model.to(device)

    train_ds = EncodingDataset(tokenizer, X_train, y_train, max_length=max_len)
    test_ds = EncodingDataset(tokenizer, X_test, y_test, max_length=max_len)

    break

Fold number: 1
X: 2043, y: 2043
Train: 1838, Test: 205


In [49]:
test_ds[:]['input_ids'].shape

torch.Size([205, 512])

In [51]:
with torch.no_grad():
    model.eval()
    #### Put vaues of custom data on device

    test_ds = {k: v.to(device) for k, v in test_ds[:].items()}

    outputs = model(**test_ds)
    pred_classes = torch.argmax(outputs.logits, dim=1)

In [53]:
y_pred = pred_classes.cpu().numpy()

[4, 27, 10]

In [5]:
from embeddings.bert import BertEmbedder

ft_embedder = BertEmbedder('bert-base-uncased', 'results/checkpoint-1380')
bert_embedder = BertEmbedder('bert-base-uncased')

In [6]:
texts = modelset[0].get_node_texts()
len(texts), modelset[0].number_of_nodes()

(27, 27)

In [None]:
modelset[0]

In [6]:
from data_loading.dataset import GraphDataset

graph_dataset_ft = GraphDataset(modelset, ft_embedder)
graph_dataset = GraphDataset(modelset, bert_embedder)

Processing modelset:   0%|          | 0/2043 [00:00<?, ?it/s]

Processing modelset:   0%|          | 0/2043 [00:00<?, ?it/s]

In [8]:
from models.gnn_layers import GNNClassifier

graph_classifier = GNNClassifier(
    gnn_conv_model='SAGEConv',
    input_dim=graph_dataset_ft.num_features,
    hidden_dim=64,
    output_dim=graph_dataset_ft.num_classes,
    num_layers=2,
    num_heads=None,
    dropout=0.1,
    residual=False,
    pool='sum',
    use_appnp=True,
    K=10,
    alpha=0.1
)

In [None]:
from trainers.graph_classifier import GNNTrainer
gnn_trainer = GNNTrainer(
    graph_classifier,
    graph_dataset_ft,
)

In [15]:
import json

mapping = json.load(open('datasets/graph_data/modelset/mapping.json'))

In [None]:
for data in gnn_trainer.dataloaders['train']:
    loss = gnn_trainer.step(data)
    break

In [2]:
import torch
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data

# Example graph dataset
class ExampleGraphDataset(torch.utils.data.Dataset):
    def __init__(self, num_graphs):
        self.graphs = []
        for i in range(num_graphs):
            # Create dummy data for the example
            num_nodes = 18
            num_node_features = 768
            num_edges = 35
            num_classes = 12

            x = torch.randn(num_nodes, num_node_features)  # Node features
            edge_index = torch.randint(0, num_nodes, (2, num_edges), dtype=torch.long)  # Random edge indices
            y = torch.tensor([i % num_classes], dtype=torch.long)  # Graph label
            self.graphs.append(Data(x=x, edge_index=edge_index, y=y))

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        return self.graphs[idx]

# Instantiate the dataset
dataset = ExampleGraphDataset(num_graphs=100)

# Create the DataLoader for batching
loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Iterating over the DataLoader
for batch in loader:
    print(batch)
    print(f'Batch x shape: {batch.x.shape}')
    print(f'Batch edge_index shape: {batch.edge_index.shape}')
    print(f'Batch y shape: {batch.y.shape}')
    break  # Remove this break to iterate over all batches

DataBatch(x=[288, 768], edge_index=[2, 560], y=[16], batch=[288], ptr=[17])
Batch x shape: torch.Size([288, 768])
Batch edge_index shape: torch.Size([2, 560])
Batch y shape: torch.Size([16])


In [4]:
from torch_geometric.utils import (
    negative_sampling, 
    train_test_split_edges
)

In [17]:
import torch
import torch_geometric.transforms as T
from torch_geometric.data import Data
from torch_geometric.utils import negative_sampling

def get_pos_neg_graphs(X, E, test_ratio=0.2):
    # Create a Data object
    data = Data(x=X, edge_index=E)

    # Apply RandomLinkSplit
    transform = T.RandomLinkSplit(
        num_val=0, 
        num_test=test_ratio, 
        is_undirected=True, 
        add_negative_train_samples=False
    )
    train_data, _, test_data = transform(data)

    # Positive edges
    train_pos_edge_index = train_data.edge_index
    test_pos_edge_index = test_data.edge_index

    # Negative edges
    train_neg_edge_index = negative_sampling(
        edge_index=train_pos_edge_index, 
        num_nodes=data.num_nodes, 
        num_neg_samples=train_pos_edge_index.size(1)
    )

    test_neg_edge_index = negative_sampling(
        edge_index=test_pos_edge_index, 
        num_nodes=data.num_nodes, 
        num_neg_samples=test_pos_edge_index.size(1)
    )

    # Create the graph objects
    train_pos_g = Data(x=X, edge_index=train_pos_edge_index)
    train_neg_g = Data(x=X, edge_index=train_neg_edge_index)
    test_pos_g = Data(x=X, edge_index=test_pos_edge_index)
    test_neg_g = Data(x=X, edge_index=test_neg_edge_index)

    graphs = {
        'train_pos_g': train_pos_g,
        'train_neg_g': train_neg_g,
        'test_pos_g': test_pos_g,
        'test_neg_g': test_neg_g,
        'train_g': train_data
    }
    return graphs

# Example usage:
num_nodes = 18
num_node_features = 768
num_edges = 35

X = torch.randn(num_nodes, num_node_features)  # Node features
E = torch.randint(0, num_nodes, (2, num_edges), dtype=torch.long)  # Random edge indices

graphs = get_pos_neg_graphs(X, E, 0.6)

print("Train Positive Graph:", graphs['train_pos_g'])
print("Train Negative Graph:", graphs['train_neg_g'])
print("Test Positive Graph:", graphs['test_pos_g'])
print("Test Negative Graph:", graphs['test_neg_g'])
print("Train Graph with Masked Edges:", graphs['train_g'])


Train Positive Graph: Data(x=[18, 768], edge_index=[2, 16])
Train Negative Graph: Data(x=[18, 768], edge_index=[2, 16])
Test Positive Graph: Data(x=[18, 768], edge_index=[2, 16])
Test Negative Graph: Data(x=[18, 768], edge_index=[2, 16])
Train Graph with Masked Edges: Data(x=[18, 768], edge_index=[2, 16], edge_label=[8], edge_label_index=[2, 8])


In [24]:
graphs['train_g']

Data(x=[18, 768], edge_index=[2, 16], edge_label=[8], edge_label_index=[2, 8])