#### Dataset Creation

In [1]:
import random
import torch
import numpy as np

# Seed every random number generator the notebook relies on so that runs are
# repeatable. `seed` is also reused later as the `random_state` of the k-fold
# splitter.
seed = 42
random.seed(seed)  # Python's built-in RNG (used by `random.shuffle`)
np.random.seed(seed)  # NumPy's global RNG
torch.manual_seed(seed)  # PyTorch RNG
torch.cuda.manual_seed(seed)  # PyTorch CUDA RNG

In [2]:
import json
import os

# Root directory that holds every dataset; it is also the default
# `dataset_dir` of the `Dataset` class defined further down.
datasets_dir = 'datasets'
# Paths of the individual JSON Lines files, relative to `datasets_dir`.
ecore_json_path = os.path.join(datasets_dir, 'ecore_555/ecore_555.jsonl')
mar_json_path = os.path.join(datasets_dir, 'mar-ecore-github/ecore-github.jsonl')
modelsets_uml_json_path = os.path.join(datasets_dir, 'modelset/uml.jsonl')
modelsets_ecore_json_path = os.path.join(datasets_dir, 'modelset/ecore.jsonl')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder
import fasttext
from scipy.sparse import csr_matrix

from re import finditer


# Separator inserted between tokens when they are joined back into one string.
SEP = ' '
def camel_case_split(identifier):
    """Split a camelCase / PascalCase identifier into its component words.

    A boundary is placed between a lowercase letter and a following
    uppercase letter, and before the last capital of an uppercase run that
    is followed by a lowercase letter (so ``HTTPResponse`` becomes
    ``HTTP`` + ``Response``). The pieces keep their original casing.

    Returns a list of substrings; an empty string yields an empty list.
    """
    pieces = []
    for match in finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier):
        pieces.append(match.group(0))
    return pieces


def doc_tokenizer(doc):
    """Tokenize a document into lowercase words.

    The document is split on whitespace, every chunk is split on
    underscores, and every resulting part is split on camelCase boundaries
    with ``camel_case_split``. Empty fragments are dropped and each
    surviving piece is lowercased.

    Returns the list of tokens in document order.
    """
    tokens = []
    for chunk in doc.split():
        # snake_case: break on underscores, skipping empty parts
        for part in chunk.split('_'):
            if part == '':
                continue
            # camelCase: break on case boundaries, then normalise the case
            for piece in camel_case_split(part):
                if piece != '':
                    tokens.append(piece.lower())
    return tokens


class TFIDFEncoder:
    """TF-IDF text encoder backed by scikit-learn's ``TfidfVectorizer``.

    The vectorizer uses ``doc_tokenizer`` for tokenization, does not
    lowercase on its own, and ignores terms that occur in fewer than three
    documents (``min_df=3``).
    """

    def __init__(self, X=None):
        """Create the vectorizer and, if documents are given, fit it on them.

        Args:
            X: optional collection of documents (strings). ``None`` and empty
                collections are ignored.
        """
        self.encoder = TfidfVectorizer(
            lowercase=False, tokenizer=doc_tokenizer, min_df=3
        )

        # Explicit None/length checks instead of truthiness so that NumPy
        # arrays (whose truth value is ambiguous) are accepted as well.
        if X is not None and len(X) > 0:
            self.encode(X)

    def encode(self, X):
        """Fit the vectorizer on ``X`` and return its TF-IDF matrix.

        Note that the vectorizer is re-fitted on every call, so the
        vocabulary always reflects the documents passed in last.

        Args:
            X: collection of documents (strings).

        Returns:
            A ``scipy.sparse.csr_matrix`` with one row per document.
        """
        # `fit_transform` already produces a sparse matrix. Wrapping it in
        # `csr_matrix` keeps the return type without densifying each row,
        # which needs memory proportional to n_documents * vocabulary_size.
        return csr_matrix(self.encoder.fit_transform(X))


class BertTokenizerEncoder:
    """Thin wrapper around a Hugging Face tokenizer loaded by name."""

    # Longest sequence that `encode(batch_encode=True)` will produce.
    MAX_LENGTH = 512

    def __init__(self, name, X=None):
        """Load the tokenizer called ``name`` and optionally tokenize ``X``.

        Args:
            name: identifier passed to ``AutoTokenizer.from_pretrained``.
            X: optional collection of texts. ``None`` and empty collections
                are ignored.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(name)

        if X is not None and len(X) > 0:
            self.encode(X)

    def encode(self, X, batch_encode=False, percentile=100):
        """Tokenize the texts in ``X``.

        Args:
            X: collection of texts.
            batch_encode: when true, pad and truncate all sequences to a
                common length; otherwise return the raw tokenizer output.
            percentile: when below 100, the common length is that percentile
                of the untruncated sequence lengths; otherwise it is the
                maximum length.

        Returns:
            The tokenizer output for ``X``.
        """
        tokens = self.tokenizer(X)
        if not batch_encode:
            # The unpadded encoding is exactly what was just computed, so
            # there is no need to run the tokenizer a second time.
            return tokens

        lengths = [len(ids) for ids in tokens['input_ids']]
        if not lengths:
            # Nothing to pad or truncate for an empty batch.
            return tokens

        size = int(np.percentile(lengths, percentile)) if percentile < 100 else max(lengths)
        if size > self.MAX_LENGTH:
            print(f'WARNING: Max size is {size}. Truncating to 512')
            # Clamp *down* to the limit. The previous `max(size, 512)` kept
            # the oversized value, so nothing was actually truncated.
            size = self.MAX_LENGTH

        return self.tokenizer(
            X,
            padding=True,
            truncation=True,
            max_length=size
        )


class BertTFIDF:
    """TF-IDF computed over tokenizer ids instead of raw words.

    Each text is first turned into token ids by a ``BertTokenizerEncoder``.
    The ids are then written out as a ``SEP``-separated string and handed to
    a ``TFIDFEncoder``.
    """

    def __init__(self, name, X=None):
        """Create both sub-encoders; encode ``X`` right away if it is truthy."""
        self.bert = BertTokenizerEncoder(name)
        self.tfidf = TFIDFEncoder()

        if X:
            self.encode(X)

    def encode(self, X):
        """Return the TF-IDF matrix of the token-id documents built from ``X``."""
        input_ids = self.bert.encode(X)['input_ids']
        id_documents = []
        for ids in input_ids:
            # Each id becomes one "word" of the pseudo-document.
            id_documents.append(SEP.join(map(str, ids)))
        return self.tfidf.encode(id_documents)


class FasttextEncoder:
    """Sentence encoder that uses a fastText model loaded from disk."""

    def __init__(self, model_name, X=None):
        """Load the model stored at ``model_name``; encode ``X`` if it is truthy."""
        self.model = fasttext.load_model(model_name)
        if X:
            self.encode(X)

    def encode(self, X):
        """Embed every document of ``X`` as one sentence vector.

        Documents are normalised with ``doc_tokenizer`` and re-joined with
        single spaces before being passed to the model.

        Returns:
            A NumPy array built from the per-document sentence vectors.
        """
        normalised = [" ".join(doc_tokenizer(document)) for document in X]
        embed = self.model.get_sentence_vector
        return np.array([embed(sentence) for sentence in normalised])


class ClassLabelEncoder(LabelEncoder):
    """``LabelEncoder`` with an ``encode`` method matching the text encoders."""

    def __init__(self, y=None) -> None:
        """Initialise the encoder and fit it on ``y`` when ``y`` is truthy."""
        super().__init__()
        if not y:
            return
        self.fit(y)

    def encode(self, y):
        """Fit on ``y`` and return its encoded labels (re-fits on every call)."""
        encoded = self.fit_transform(y)
        return encoded

In [4]:
from typing import List, Union
import networkx as nx
from tqdm.auto import tqdm
import pickle
from random import shuffle
from sklearn.model_selection import StratifiedKFold


class GenericGraph(nx.DiGraph):
    """Directed graph built from one JSON record of a dataset.

    The record is kept in ``json_obj``. Its ``'graph'`` entry is a JSON
    string with ``'nodes'``, ``'links'`` and ``'directed'`` keys. The
    remaining metadata (ids, model type, label, duplicate flag, text) is
    copied onto attributes.
    """

    def __init__(self, json_obj: dict, use_type=False):
        """Populate the graph and its metadata from ``json_obj``.

        Args:
            json_obj: the dataset record describing this graph.
            use_type: stored on the instance; only referenced by the
                disabled ``text`` property below.
        """
        super().__init__()
        self.use_type = use_type
        self.json_obj = json_obj

        # Metadata fields that are copied verbatim from the record.
        metadata_keys = (
            ('graph_id', 'ids'),
            ('graph_type', 'model_type'),
            ('label', 'labels'),
            ('is_duplicated', 'is_duplicated'),
        )
        for attribute, key in metadata_keys:
            setattr(self, attribute, json_obj.get(key))

        serialized_graph = json_obj.get('graph')
        self.directed = json.loads(serialized_graph).get('directed')
        self.create_graph(json_obj)
        self.text = json_obj.get('txt')


    def create_graph(self, json_obj):
        """Add the nodes and edges listed in the record's ``'graph'`` JSON."""
        payload = json.loads(json_obj['graph'])
        node_records, link_records = payload['nodes'], payload['links']
        for node_record in node_records:
            # Every key of the node record becomes a node attribute.
            self.add_node(node_record['id'], **node_record)
        for link_record in link_records:
            # Every key of the link record becomes an edge attribute.
            self.add_edge(link_record['source'], link_record['target'], **link_record)
    
    # Disabled alternative that derives the text from the node names instead
    # of reading the record's 'txt' field:
    # @property
    # def text(self):
    #     txt = list()
    #     for _, d in self.nodes(data=True):
    #         etype = d.get('type', '')
    #         name = d.get('name', '')
    #         node_data = f"{name}{etype if self.use_type else ''}"
    #         txt.append(node_data)
    #     return SEP.join(txt).strip()


    def get_node_embeddings(self):
        """Placeholder; not implemented yet (returns ``None``)."""
        return None
        
    def __repr__(self):
        """Return the raw record followed by a one-line size summary."""
        return f'{self.json_obj}\nGraph({self.graph_id}, nodes={self.number_of_nodes()}, edges={self.number_of_edges()})'


class Dataset:
    """A list of ``GenericGraph`` objects for one dataset, cached as a pickle.

    On construction the graphs are either parsed from the dataset's JSON
    files (and then pickled) or restored from an existing pickle.
    """

    def __init__(
            self, 
            dataset_name: str, 
            dataset_dir = datasets_dir,
            save_dir = 'datasets/pickles',
            reload=False,
            remove_duplicates=False,
            extension='.jsonl'
        ):
        """Load the dataset called ``dataset_name``.

        Args:
            dataset_name: sub-directory of ``dataset_dir`` holding the data;
                also the stem of the pickle file.
            dataset_dir: directory that contains all datasets.
            save_dir: directory of the pickle cache (created if missing).
            reload: parse the JSON files again even if a pickle exists.
            remove_duplicates: drop graphs flagged as duplicated after
                loading. The pickle itself always keeps every graph.
            extension: file suffix of the data files to read.
        """
        self.name = dataset_name
        self.dataset_dir = dataset_dir
        self.save_dir = save_dir
        self.extension = extension
        os.makedirs(save_dir, exist_ok=True)

        if reload or not os.path.exists(self._pickle_path):
            self.graphs: List[GenericGraph] = self._load_from_json()
            self.save()
        else:
            self.load()
        
        if remove_duplicates:
            self.remove_duplicates()

        print(f'Graphs: {len(self.graphs)}')

    @property
    def _pickle_path(self):
        """Location of this dataset's pickle cache."""
        return os.path.join(self.save_dir, f'{self.name}.pkl')

    @staticmethod
    def _read_json_objects(path):
        """Read the records stored in ``path``.

        The file may hold one JSON document (an array of records or a single
        record) or JSON Lines content with one record per line.
        """
        # Context manager so the handle is closed; JSON is read as UTF-8.
        with open(path, encoding='utf-8') as f:
            content = f.read()
        try:
            parsed = json.loads(content)
        except json.JSONDecodeError:
            # Not one document: treat it as JSON Lines. Split on '\n' only,
            # since other Unicode line separators may occur inside strings.
            return [json.loads(line) for line in content.split('\n') if line.strip()]
        return [parsed] if isinstance(parsed, dict) else parsed

    def _load_from_json(self) -> List[GenericGraph]:
        """Build the graphs from the matching data files of this dataset."""
        data_path = os.path.join(self.dataset_dir, self.name)
        graphs = []
        # Sorted because `os.listdir` returns entries in arbitrary order and
        # the graph order determines every later split.
        for file in sorted(os.listdir(data_path)):
            # Only files named 'ecore*' with the configured extension are
            # read; e.g. a 'uml.jsonl' in the same directory is skipped.
            if file.endswith(self.extension) and file.startswith('ecore'):
                json_objects = self._read_json_objects(os.path.join(data_path, file))
                graphs += [
                    GenericGraph(g) for g in tqdm(
                        json_objects, desc=f'Loading {self.name.title()}'
                    )
                ]
        return graphs

    def remove_duplicates(self):
        """Drop, in place, every graph flagged as duplicated."""
        self.graphs = self.dedup()

    def dedup(self) -> List[GenericGraph]:
        """Return the graphs whose ``is_duplicated`` flag is falsy."""
        return [g for g in self.graphs if not g.is_duplicated]

    def get_train_test_split(self, train_size=0.8):
        """Return shuffled ``(train_idx, test_idx)`` index lists.

        Args:
            train_size: fraction of the graphs assigned to the training part.
        """
        n = len(self.graphs)
        train_size = int(n * train_size)
        idx = list(range(n))
        shuffle(idx)
        train_idx = idx[:train_size]
        test_idx = idx[train_size:]
        return train_idx, test_idx

    def k_fold_split(
            self,  
            k=10
        ):
        """Yield ``(train_idx, test_idx)`` index arrays for ``k`` folds.

        NOTE(review): the splitter is a ``StratifiedKFold`` but it receives
        an all-zero label vector, so the folds are not stratified by the
        graph labels. Confirm whether that is intended.
        """
        kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        n = len(self.graphs)
        for train_idx, test_idx in kfold.split(np.zeros(n), np.zeros(n)):
            yield train_idx, test_idx

    @property
    def data(self):
        """Return ``(X, y)``: the text and the label of every graph."""
        X, y = [], []
        for g in self.graphs:
            X.append(g.text)
            y.append(g.label)
        
        return X, y

    def __repr__(self):
        return f'Dataset({self.name}, graphs={len(self.graphs)})'
    
    def __getitem__(self, key):
        return self.graphs[key]
    
    def __iter__(self):
        return iter(self.graphs)
    
    def __len__(self):
        return len(self.graphs)
    
    def save(self):
        """Pickle the current list of graphs into the cache directory."""
        print(f'Saving {self.name} to pickle')
        with open(self._pickle_path, 'wb') as f:
            pickle.dump(self.graphs, f)
        print(f'Saved {self.name} to pickle')

    def load(self):
        """Restore the list of graphs from the pickle cache."""
        print(f'Loading {self.name} from pickle')
        # SECURITY: unpickling can execute arbitrary code. Only load cache
        # files that were produced locally by `save`.
        with open(self._pickle_path, 'rb') as f:
            self.graphs = pickle.load(f)
        
        print(f'Loaded {self.name} from pickle')
    

# Set to True to re-parse the JSON files instead of using the pickle cache.
reload = False
ecore = Dataset('ecore_555', reload=reload)
# Graphs flagged as duplicated are dropped from modelset after loading.
modelset = Dataset('modelset', reload=reload, remove_duplicates=True)
# mar = Dataset('mar-ecore-github', reload=reload)


# Name -> dataset mapping iterated by the experiment cells below.
datasets = {
    'ecore': ecore,
    'modelset': modelset,
    # 'mar': mar
}

Loading ecore_555 from pickle
Loaded ecore_555 from pickle
Graphs: 548
Loading modelset from pickle
Loaded modelset from pickle
Graphs: 2043


In [111]:
ecore[0].text

'PrimitiveTypes\nBoolean\nInteger\nString\nBIBTEX\nLocatedElement\nlocation\ncommentsBefore\ncommentsAfter\nBibtex\nentries\nEntry\nkey\nfields\nArticle\nBook\nInbook\nBooklet\nInproceedings\nProceedings\nIncollection\nTechreport\nPhdThesis\nMastersThesis\nManual\nMisc\nField\nvalue\nAuthors\nAuthorUrls\nTitle\nJournal\nBookTitle\nInstitution\nOrganization\nType\nDay\nNumber\nChapter\nVolume\nSeries\nPages\nPublisher\nHowpublished\nSchool\nEditor\nEdition\nAddress\nYear\nMonth\nNote\nText\nAbstractField\nIsbn\nIssn\nUrl\nDoi'

In [112]:
"\n".join([d['name'] for n, d in ecore[0].nodes(data=True) if 'name' in d])

'PrimitiveTypes\nBoolean\nInteger\nString\nBIBTEX\nLocatedElement\nBibtex\nEntry\nArticle\nBook\nInbook\nBooklet\nInproceedings\nProceedings\nIncollection\nTechreport\nPhdThesis\nMastersThesis\nManual\nMisc\nField\nAuthors\nAuthorUrls\nTitle\nJournal\nBookTitle\nInstitution\nOrganization\nType\nDay\nNumber\nChapter\nVolume\nSeries\nPages\nPublisher\nHowpublished\nSchool\nEditor\nEdition\nAddress\nYear\nMonth\nNote\nText\nAbstractField\nIsbn\nIssn\nUrl\nDoi\nlocation\ncommentsBefore\ncommentsAfter\nentries\nkey\nfields\nvalue'

In [18]:
import pandas as pd

# Print the size of every dataset and build a DataFrame of its raw records.
for name, dataset in datasets.items():
    print(name, len(dataset))
    # NOTE(review): `df` is overwritten on every iteration and not used in
    # this cell; only the last dataset's frame survives the loop.
    df = pd.DataFrame([g.json_obj for g in dataset])

ecore 548
modelset 3337


In [19]:
len(max(modelset, key=lambda x: len(x.text)).text)

29007

#### Training Fasttext

##### Fasttext classification

In [None]:
import fasttext
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

# K-fold evaluation of a supervised fastText classifier on each dataset.
# Per fold, the train/test files and the trained model are cached on disk and
# reused when they already exist.
os.makedirs('models', exist_ok=True)  # `save_model` needs the directory

for name, dataset in datasets.items():
    if name not in ['ecore', 'modelset']:
        continue
    print("Dataset: ", name)
    X, y = dataset.data
    accuracies, bal_accuracies = [], []
    # `k_fold_split` yields index pairs, so the texts and labels of a fold
    # are looked up through those indices.
    for i, (train_idx, test_idx) in enumerate(dataset.k_fold_split()):
        print("Fold number: ", i+1)
        f_train = f'datasets/fasttext_train_{name}_{i}.txt'
        f_test = f'datasets/fasttext_test_{name}_{i}.txt'
        for path, indices in ((f_train, train_idx), (f_test, test_idx)):
            if not os.path.exists(path):
                # fastText format: "__label__<label> <space separated tokens>"
                with open(path, 'w', encoding='utf-8') as f:
                    for idx in indices:
                        text = " ".join(doc_tokenizer(X[idx]))
                        f.write(f"__label__{y[idx]} {text}\n")

        if os.path.exists(f'models/{name}_{i}.bin'):
            model = fasttext.load_model(f'models/{name}_{i}.bin')
        else:
            model = fasttext.train_supervised(
                input=f_train, 
                epoch=100, 
                lr=0.2, 
                wordNgrams=2, 
            )
            model.save_model(f'models/{name}_{i}.bin')

        # Read the test file once and close it again.
        with open(f_test, encoding='utf-8') as f:
            test_lines = [line.strip() for line in f]
        y_pred = model.predict(test_lines)[0]
        # The true label is the first token of each line.
        y_true = [line.split()[0].split('__label__')[1] for line in test_lines]
        y_pred = [labels[0].split('__label__')[1] for labels in y_pred]

        accuracy = accuracy_score(y_true, y_pred)
        bal_accuracy = balanced_accuracy_score(y_true, y_pred)
        print(f"Accuracy: {accuracy}, Balanced Accuracy: {bal_accuracy}")
        accuracies.append(accuracy)
        bal_accuracies.append(bal_accuracy)

    print(f"Average Accuracy: {np.mean(accuracies)}, Average Balanced Accuracy: {np.mean(bal_accuracies)}")
        

##### Fasttext word embeddings

In [42]:
# Build the unlabeled corpus for fastText: the distinct texts of all datasets,
# tokenized and written one document per line. The texts are sorted because
# the iteration order of a set of strings can differ between interpreter
# runs, which would make the corpus file (and the training) non-reproducible.
X_udata = sorted({g.text for dataset in datasets.values() for g in dataset})
X_udata = [f"{SEP}".join(doc_tokenizer(x)) for x in X_udata]
f_udata = 'datasets/fasttext_udata.txt'
with open(f_udata, 'w', encoding='utf-8') as f:
    f.writelines(f"{x}\n" for x in X_udata)


In [57]:
# Train unsupervised fastText embeddings on the corpus file written above and
# store them where `FasttextEncoder` loads them from later in the notebook.
model = fasttext.train_unsupervised(
    input=f_udata, 
    epoch=500, 
    lr=0.1,
    minn=2,  # shortest character n-gram
    maxn=5,  # longest character n-gram
    dim=128  # embedding size
)
model.save_model("models/uml_fasttext.bin")

Read 0M words
Number of words:  8120
Number of labels: 0
Progress: 100.0% words/sec/thread:    7026 lr: -0.000001 avg.loss:  1.111646 ETA:   0h 0m 0s 60.3% words/sec/thread:    7034 lr:  0.039708 avg.loss:  1.177690 ETA:   0h 3m12s100.0% words/sec/thread:    7026 lr:  0.000000 avg.loss:  1.111496 ETA:   0h 0m 0s


#### Model Encoding

In [102]:
# One instance of every encoder; none is given data here, so nothing is
# fitted until `encode` is called.
tf_idf_encoder = TFIDFEncoder()
bert_encoder = BertTokenizerEncoder('bert-base-uncased')
bert_tfidf_encoder = BertTFIDF('bert-base-uncased')
# Uses the embeddings saved by the unsupervised fastText training above.
fasttext_encoder = FasttextEncoder('models/uml_fasttext.bin')
# Shared label encoder, read as a global by `train_svm`.
class_label_encoder = ClassLabelEncoder()

In [103]:
from sklearn import svm
from sklearn.metrics import accuracy_score, balanced_accuracy_score



def train_svm(dataset: Dataset, encoder: Union[TFIDFEncoder, BertTFIDF, FasttextEncoder]):
    """Cross-validate a linear SVM on ``dataset`` and print the mean scores.

    Args:
        dataset: provides the texts and labels (``dataset.data``) and the
            fold indices (``dataset.k_fold_split()``).
        encoder: object whose ``encode`` turns the texts into a feature
            matrix that can be indexed with index arrays.

    Prints the mean accuracy and mean balanced accuracy over all folds.
    """
    # The features and labels do not depend on the fold, so they are encoded
    # once instead of once per fold.
    # NOTE(review): the encoders are fitted on the whole dataset, i.e. also
    # on the documents that end up in the test fold.
    texts, labels = dataset.data
    X = encoder.encode(texts)
    y = class_label_encoder.encode(labels)

    accuracies, bal_accuracies = [], []
    for train_idx, test_idx in dataset.k_fold_split():
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        svm_classifier = svm.SVC(kernel='linear')  # You can change the kernel as needed
        svm_classifier.fit(X_train, y_train)

        # Score the held-out fold.
        y_pred = svm_classifier.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        bal_accuracies.append(balanced_accuracy_score(y_test, y_pred))

    print(f'Mean Accuracy: {np.mean(accuracies)}')
    print(f'Mean Balanced Accuracy: {np.mean(bal_accuracies)}')


In [None]:
train_svm(modelset, tf_idf_encoder)

In [106]:
model.get_nearest_neighbors('petrinet', k=5)

[(0.5969380140304565, 'petrinetv3'),
 (0.5963557362556458, 'petrinetv1'),
 (0.5946762561798096, 'petrinetv2'),
 (0.5399251580238342, 'petri'),
 (0.5047121047973633, 'tokens')]

In [115]:
from transformers import Trainer

In [6]:
from transformers import BertTokenizer
import numpy as np

# Module-level tokenizer used by `split_into_chunks` below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def split_into_chunks(text, max_length=512):
    """Split ``text`` into pieces of at most ``max_length`` tokens.

    The text is tokenized with the module-level ``tokenizer``. Consecutive
    runs of ``max_length`` tokens are joined with single spaces.

    Returns:
        A list of strings, one per chunk (empty if there are no tokens).
    """
    tokens = tokenizer.tokenize(text)
    chunks = []
    for start in range(0, len(tokens), max_length):
        window = tokens[start:start + max_length]
        chunks.append(' '.join(window))
    return chunks

# Example usage: chunk the modelset text with the most characters and count
# the resulting chunks.
long_text = max(modelset, key=lambda x: len(x.text)).text
chunks = split_into_chunks(long_text)
len(chunks)

15

In [7]:
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from sklearn.metrics import accuracy_score, balanced_accuracy_score
import numpy as np

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def compute_metrics(p):
    """Compute accuracy and balanced accuracy for an evaluation result.

    Args:
        p: object with ``predictions`` (per-class scores, reduced with an
            argmax over axis 1) and ``label_ids`` (the true labels).

    Returns:
        A dict with the keys ``'accuracy'`` and ``'balanced_accuracy'``.
    """
    predicted_labels = np.argmax(p.predictions, axis=1)
    true_labels = p.label_ids

    metrics = {}
    metrics['accuracy'] = accuracy_score(true_labels, predicted_labels)
    metrics['balanced_accuracy'] = balanced_accuracy_score(true_labels, predicted_labels)
    return metrics

# Create your dataset
class CustomDataset(torch.utils.data.Dataset):
    """Torch dataset that tokenizes one text per item, on access."""

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the texts, their labels, the tokenizer and the target length."""
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer tensors of item ``idx`` plus a ``'labels'`` tensor.

        The text is padded/truncated to ``max_length``. The leading dimension
        of every tensor returned by the tokenizer is squeezed away.
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )

        example = {}
        for key, tensor in encoded.items():
            example[key] = tensor.squeeze(0)
        example['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return example


def train_hf(model_name, model_ds: Dataset, max_length=4096):
    """Fine-tune and evaluate a Hugging Face classifier with k-fold CV.

    A fresh model is initialised from ``model_name`` for every fold, trained
    on the fold's training texts and evaluated on its test texts.

    Args:
        model_name: checkpoint name for the tokenizer and the model.
        model_ds: provides the texts and labels (``model_ds.data``) and the
            fold indices (``model_ds.k_fold_split()``).
        max_length: length every text is padded/truncated to. Defaults to
            the previously hard-coded 4096; lower it for models with a
            shorter context or to reduce GPU memory use.

    Returns:
        A list with the ``Trainer.evaluate()`` result of every fold.
    """
    print(f'Device used: {device}')

    # Fold-independent work is done once: label encoding, class count and
    # tokenizer loading.
    X, y = model_ds.data
    y = LabelEncoder().fit_transform(y)
    num_labels = len(set(y))
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    fold_metrics = []
    # `enumerate` keeps the printed fold number in step with the loop; the
    # former manual counter was never incremented.
    for fold, (train_idx, test_idx) in enumerate(model_ds.k_fold_split(), start=1):
        print(f'Fold number: {fold}')
        X_train, X_test = [X[j] for j in train_idx], [X[j] for j in test_idx]
        y_train, y_test = [y[j] for j in train_idx], [y[j] for j in test_idx]

        # A new model per fold so no weights leak between folds.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        model.to(device)

        train_ds = CustomDataset(X_train, y_train, tokenizer, max_length=max_length)
        test_ds = CustomDataset(X_test, y_test, tokenizer, max_length=max_length)

        # Training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=10,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
        )

        # Without `compute_metrics` the evaluation reports only the loss, so
        # accuracy and balanced accuracy were never computed.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=test_ds,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        metrics = trainer.evaluate()
        print(metrics)
        fold_metrics.append(metrics)

    return fold_metrics

In [9]:
train_hf('allenai/longformer-base-4096', ecore)

Device used: cuda
Fold number: 1


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
    output = module(*input, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py", line 1916, in forward
    outputs = self.longformer(
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py", line 1729, in forward
    encoder_outputs = self.encoder(
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py", line 1309, in forward
    layer_outputs = layer_module(
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py", line 1237, in forward
    self_attn_outputs = self.attention(
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py", line 1173, in forward
    self_outputs = self.self(
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py", line 642, in forward
    attn_output = self._compute_attn_output_with_global_indices(
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py", line 1009, in _compute_attn_output_with_global_indices
    attn_output_without_global = self._sliding_chunks_matmul_attn_probs_value(
  File "/home/sali/Miniconda/miniconda3/envs/ML/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py", line 890, in _sliding_chunks_matmul_attn_probs_value
    chunked_attn_probs = attn_probs.transpose(1, 2).reshape(
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 194.00 MiB. GPU 0 has a total capacty of 23.68 GiB of which 97.75 MiB is free. Process 3515915 has 2.84 GiB memory in use. Including non-PyTorch memory, this process has 20.74 GiB memory in use. Of the allocated memory 19.50 GiB is allocated by PyTorch, and 866.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
