In [4]:
import os
import random
import torch
import numpy as np

# Seed every random number generator used in this notebook so runs are repeatable.
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)


# Locations of the raw JSONL dumps, all resolved relative to `datasets_dir`.
datasets_dir = 'datasets'
(
    ecore_json_path,
    mar_json_path,
    modelsets_uml_json_path,
    modelsets_ecore_json_path,
) = (
    os.path.join(datasets_dir, relative_path)
    for relative_path in (
        'ecore_555/ecore_555.jsonl',
        'mar-ecore-github/ecore-github.jsonl',
        'modelset/uml.jsonl',
        'modelset/ecore.jsonl',
    )
)

In [15]:
from data_loading.data import ModelDataset

# Dataset to load. Names used in this notebook so far:
# 'modelset', 'ecore_555', 'mar-ecore-github'.
dataset_name = 'mar-ecore-github'

# Keyword arguments forwarded unchanged to ModelDataset.
config_params = {
    'timeout': 120,
    'min_enr': 1.2,
    'min_edges': 10,
}

# NOTE: the variable is called `modelset` regardless of which dataset
# `dataset_name` selects; later cells refer to it by this name.
modelset = ModelDataset(
    dataset_name,
    reload=False,
    remove_duplicates=True,
    **config_params,
)

Loading mar-ecore-github from pickle
Loaded mar-ecore-github with 5389 graphs
Loaded mar-ecore-github with 5389 graphs
Graphs: 2997


In [8]:
from data_loading.graph_dataset import GraphDataset

# Keyword arguments forwarded unchanged to GraphDataset.
graph_data_params = {
    'distance': 2,
    'reload': False,
    'add_negative_train_samples': True,
    'neg_sampling_ratio': 1,
    'use_edge_types': False,
}

# Build the graph dataset on top of the models loaded into `modelset`.
graph_dataset = GraphDataset(modelset, **graph_data_params)

Processing modelset:   0%|          | 0/830 [00:00<?, ?it/s]

In [6]:
from transformers import AutoTokenizer

# Extra marker tokens that get registered with the tokenizer as additional
# special tokens (see get_special_tokens).
EDGE_START, EDGE_END = '<edge_begin>', '<edge_end>'
NODE_SEP, NODE_PATH_SEP = '<node_sep>', '<node_path_sep>'


def get_special_tokens():
    """Return the extra marker tokens as a mapping with the single key
    ``'additional_special_tokens'`` whose value lists the four tokens."""
    extra_tokens = [EDGE_START, EDGE_END, NODE_SEP, NODE_PATH_SEP]
    return dict(additional_special_tokens=extra_tokens)


def get_tokenizer(model_name, special_tokens, max_length):
    """Load a pretrained tokenizer and prepare it for this notebook.

    Args:
        model_name: Checkpoint identifier passed to ``AutoTokenizer.from_pretrained``.
        special_tokens: Mapping passed as-is to ``tokenizer.add_special_tokens``.
        max_length: Value assigned to ``tokenizer.model_max_length``.

    Returns:
        The configured tokenizer instance.
    """
    tok = AutoTokenizer.from_pretrained(model_name)

    # Fall back to the EOS token when the loaded tokenizer defines no padding token.
    pad_missing = tok.pad_token is None
    if pad_missing:
        tok.pad_token = tok.eos_token

    tok.add_special_tokens(special_tokens)
    tok.model_max_length = max_length
    return tok

# Checkpoint name and model_max_length used to configure the tokenizer.
model_name, max_length = 'bert-base-uncased', 512
special_tokens = get_special_tokens()
tokenizer = get_tokenizer(model_name, special_tokens, max_length)

In [18]:
from settings import LP_TASK_LINK_PRED

# Task identifier; reused later when building the results/logs directory names.
task_name = LP_TASK_LINK_PRED

# Tokenized link-prediction data; later cells index it by 'train' and 'test'.
bert_dataset = graph_dataset.get_link_prediction_data(
    task_type=task_name,
    distance=2,
    tokenizer=tokenizer,
)

In [10]:
def oversample_dataset(dataset, oversampling_ratio=0.7):
    """
    Build a list of sample indices in which the less frequent classes are oversampled.

    Every original index is kept exactly once. In addition, for each class,
    ``int(oversampling_ratio * (max_count - count))`` extra indices are drawn
    with replacement from that class, where ``max_count`` is the size of the
    most frequent class and ``count`` is the size of the class in question.
    The draws use NumPy's global random state (``np.random.choice``).

    Args:
        dataset: Object for which ``dataset[:]['labels'].numpy()`` is expected
            to yield an array with one class label per sample.
        oversampling_ratio: Non-negative fraction of the gap to the most
            frequent class that is filled with resampled indices
            (``0`` disables oversampling, ``1`` fully balances the classes).

    Returns:
        list: Indices grouped by class in ascending label order; within each
        class the original indices come first, followed by the resampled ones.
        An empty list is returned for a dataset without samples.

    Raises:
        ValueError: If ``oversampling_ratio`` is negative.
    """
    if oversampling_ratio < 0:
        raise ValueError(
            f"oversampling_ratio must be non-negative, got {oversampling_ratio}"
        )

    class_occurrences = dataset[:]['labels'].numpy()
    # Nothing to resample; also avoids calling max() on an empty count array.
    if class_occurrences.size == 0:
        return []

    unique_classes, counts = np.unique(class_occurrences, return_counts=True)
    max_count = counts.max()
    indices_with_oversamples = []
    for class_label, count in zip(unique_classes, counts):
        class_indices = np.where(class_occurrences == class_label)[0]
        indices_with_oversamples.extend(class_indices)
        # Fill only a fraction of the gap to the most frequent class.
        oversample_count = int(oversampling_ratio * (max_count - count))
        indices_with_oversamples.extend(np.random.choice(class_indices, oversample_count))

    return indices_with_oversamples

# Compute the oversampled index list for the training split, then overwrite the
# split's `inputs` attribute with the samples selected by those indices.
# NOTE(review): this mutates bert_dataset['train'] in place, so re-running this
# cell would oversample data that has already been oversampled — rebuild
# `bert_dataset` before running it again.
# NOTE(review): presumably `inputs` is the storage the split reads samples
# from — confirm against the dataset class.
ind_w_oversamples = oversample_dataset(bert_dataset['train'])
bert_dataset['train'].inputs = bert_dataset['train'][ind_w_oversamples]

In [14]:
from collections import Counter
# Label distribution of the train and test splits, displayed as a (train, test) pair.
tuple(Counter(bert_dataset[split][:]['labels'].tolist()) for split in ('train', 'test'))

(Counter({1: 44719, 0: 44628}), Counter({1: 10797, 0: 10785}))

In [20]:
from transformers import BertForSequenceClassification

# The link-prediction labels in this notebook are 0 and 1, and training is
# evaluated with compute_metrics_binary_classification, so the classification
# head needs exactly two outputs (it was previously created with three).
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
# The tokenizer was extended with additional special tokens, so resize the
# embedding matrix to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(30526, 768)

In [16]:
from sklearn.metrics import (
    roc_auc_score, 
    f1_score, 
    precision_score, 
    recall_score
)
import torch.nn.functional as F

def compute_metrics_multi_classification(pred):
    """Compute evaluation metrics for a multi-class classifier.

    Args:
        pred: Prediction object exposing ``label_ids`` (true labels) and
            ``predictions`` (raw logits, one column per class).

    Returns:
        dict: ``accuracy``, one-vs-rest ``roc_auc`` computed on the softmax
        probabilities, ``f1_macro``, ``f1_micro`` and macro-averaged
        ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    raw_scores = pred.predictions
    y_pred = raw_scores.argmax(-1)
    # Turn the logits into per-class probabilities for the ROC AUC computation.
    class_probs = F.softmax(torch.tensor(raw_scores), dim=-1).numpy()

    metrics = {'accuracy': (y_pred == y_true).mean()}
    metrics['roc_auc'] = roc_auc_score(y_true, class_probs, multi_class='ovr')
    metrics['f1_macro'] = f1_score(y_true, y_pred, average='macro')
    metrics['f1_micro'] = f1_score(y_true, y_pred, average='micro')
    metrics['precision'] = precision_score(y_true, y_pred, average='macro')
    metrics['recall'] = recall_score(y_true, y_pred, average='macro')
    return metrics

def compute_metrics_binary_classification(pred):
    """Compute evaluation metrics for a binary (labels 0/1) classifier.

    Args:
        pred: Prediction object exposing ``label_ids`` (true labels) and
            ``predictions`` (raw logits with one column per class; column 1
            is taken as the positive class).

    Returns:
        dict: ``accuracy``, ``roc_auc`` computed from the softmax probability
        of class 1, ``f1_macro``, ``f1_micro`` and macro-averaged
        ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    logits = torch.tensor(pred.predictions)
    probabilities = F.softmax(logits, dim=-1).numpy()
    # For binary targets roc_auc_score needs a 1-D score per sample (the
    # probability of the class with the greater label), not the full
    # (n_samples, n_classes) matrix that the multi-class variant passes.
    positive_scores = probabilities[:, 1]

    acc = (preds == labels).mean()
    roc = roc_auc_score(labels, positive_scores)
    f1_macro = f1_score(labels, preds, average='macro')
    f1_micro = f1_score(labels, preds, average='micro')
    precision = precision_score(labels, preds, average='macro')
    recall = recall_score(labels, preds, average='macro')

    return {
        'accuracy': acc,
        'roc_auc': roc,
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
        'precision': precision,
        'recall': recall
    }

In [None]:
from transformers import TrainingArguments, Trainer
import os

# Checkpoints go under results/<dataset>/<task>, logs under logs/<dataset>/<task>.
output_dir, logs_dir = (
    os.path.join(root_dir, dataset_name, task_name)
    for root_dir in ('results', 'logs')
)
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=logs_dir,
    logging_steps=1000,
    # Evaluate and checkpoint on the same step schedule: load_best_model_at_end
    # requires save_steps to be a round multiple of eval_steps.
    eval_strategy='steps',
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    # fp16 mixed precision is only usable on a CUDA device; enable it when one
    # is present so the notebook still runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

# Fine-tune on the train split; the test split is used for the periodic
# evaluations, scored with the binary-classification metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics_binary_classification,
    eval_dataset=bert_dataset['test'],
    train_dataset=bert_dataset['train'],
)

trainer.train()

: 