In [None]:
import json
import glob
from os import listdir
from os.path import isfile, join
import os.path as osp
import copy
import os
import pickle
from datetime import datetime
import random
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.filtering.log.attributes import attributes_filter
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Dataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import NNConv
from torch_geometric.nn import GATv2Conv
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
def nextplot(force=False):
    """Begin a fresh plot.

    A new figure is opened when the code runs under a notebook kernel or when
    `force` is true. In any other session (e.g. plain IPython) the current
    figure is cleared instead.

    """
    # The kernel lookup is done unconditionally, before `force` is consulted.
    running_in_kernel = "IPKernelApp" in get_ipython().config
    if not (force or running_in_kernel):
        plt.clf()  # reuse the existing figure
        return
    plt.figure()  # open a brand-new figure

In [None]:
class SkinbaronDataset(Dataset):
    """Dataset that serves pre-processed graphs stored as ``data_<idx>.pt`` files.

    The list of processed files is not discovered by the class itself; it is
    read from the module-level variable ``processed_graphs``, which therefore
    has to exist before an instance is created.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def processed_file_names(self):
        """Return the externally supplied list of processed graph files."""
        return processed_graphs

    def len(self):
        """Number of graphs, i.e. the number of processed files."""
        file_names = self.processed_file_names
        return len(file_names)

    def get(self, idx):
        """Load and return the graph saved as ``data_<idx>.pt`` in ``processed_dir``."""
        graph_path = osp.join(self.processed_dir, f'data_{idx}.pt')
        return torch.load(graph_path)

In [None]:
# define GCN (ECC), and GAT neural network architectures
class ECCCONV(nn.Module):
    """Edge-Conditioned Convolutional (ECC) network with a per-edge readout.

    One ``NNConv`` layer (plus a bias-free linear skip connection) updates the
    node features; every edge is then scored from the concatenated features
    of its two end nodes.

    Args:
        size_in: number of input features per node.
        size_out: number of features per node after the convolution.
        edge_size: number of features per edge.
        tmax: the readout produces ``tmax - 1`` values per edge.
    """

    def __init__(self, size_in, size_out, edge_size, tmax):
        super(ECCCONV, self).__init__()
        nn1 = nn.Linear(edge_size, size_in * size_out)
        self.conv1 = NNConv(size_in, size_out, nn1)
        self.lin1 = nn.Linear(size_in, size_out, bias=False)
        self.readout = nn.Linear(2 * size_out, tmax - 1)

    def forward(self, graph):
        """Return sigmoid scores of shape ``(num_edges, tmax - 1)``.

        ``graph`` must expose ``x``, ``edge_index`` and ``edge_attr``. Note that
        ``graph.x`` is overwritten with the updated node features.
        """
        graph.x = F.elu(self.conv1(graph.x, graph.edge_index, graph.edge_attr) + self.lin1(graph.x))
        # Gather both end nodes of every edge at once instead of looping edge by
        # edge; this also handles graphs without edges and no longer depends on
        # a global `tmax` for the final shape (the readout layer fixes it).
        source_nodes, target_nodes = graph.edge_index[0], graph.edge_index[1]
        s_t_nodes = torch.cat((graph.x[source_nodes], graph.x[target_nodes]), dim=1)
        return torch.sigmoid(self.readout(s_t_nodes))

In [None]:
# define architecture of GAE
class GATL1noSelf(nn.Module):
    """Graph Attention network (GAT) with one layer and without self loops.

    Args:
        size_in: number of input features per node.
        size_out: output channels per attention head.
        edge_size: number of features per edge.
        num_head: number of attention heads.
        tmax: the readout produces ``tmax - 1`` values per edge.
        dropout_prob: dropout probability passed to the attention layer.
    """

    def __init__(self, size_in, size_out, edge_size, num_head, tmax, dropout_prob):
        super(GATL1noSelf, self).__init__()
        self.conv1 = GATv2Conv(in_channels=size_in, out_channels=size_out, heads=num_head, dropout=dropout_prob,
                               add_self_loops=False, edge_dim=edge_size)
        self.readout = nn.Linear(2 * num_head * size_out, tmax - 1)

    def forward(self, graph):
        """Return sigmoid scores of shape ``(num_edges, tmax - 1)``.

        ``graph`` must expose ``x``, ``edge_index`` and ``edge_attr``. Note that
        ``graph.x`` is overwritten with the updated node features.
        """
        graph.x = F.elu(self.conv1(graph.x, graph.edge_index, graph.edge_attr))
        # Score all edges in one batched call instead of one edge at a time; the
        # output shape comes from the readout layer, not from a global `tmax`.
        source_nodes, target_nodes = graph.edge_index[0], graph.edge_index[1]
        s_t_nodes = torch.cat((graph.x[source_nodes], graph.x[target_nodes]), dim=1)
        return torch.sigmoid(self.readout(s_t_nodes))

class GATL1withSelf(nn.Module):
    """Graph Attention network (GAT) with one layer and self loops.

    The attention layer is created with ``fill_value='mean'`` and without
    disabling self loops.

    Args:
        size_in: number of input features per node.
        size_out: output channels per attention head.
        edge_size: number of features per edge.
        num_head: number of attention heads.
        tmax: the readout produces ``tmax - 1`` values per edge.
        dropout_prob: dropout probability passed to the attention layer.
    """

    def __init__(self, size_in, size_out, edge_size, num_head, tmax, dropout_prob):
        super(GATL1withSelf, self).__init__()
        self.conv1 = GATv2Conv(in_channels=size_in, out_channels=size_out, heads=num_head, edge_dim=edge_size,
                               dropout=dropout_prob, fill_value='mean')
        self.readout = nn.Linear(2 * num_head * size_out, tmax - 1)

    def forward(self, graph):
        """Return sigmoid scores of shape ``(num_edges, tmax - 1)``.

        ``graph`` must expose ``x``, ``edge_index`` and ``edge_attr``. Note that
        ``graph.x`` is overwritten with the updated node features.
        """
        graph.x = F.elu(self.conv1(graph.x, graph.edge_index, graph.edge_attr))
        # Score all edges in one batched call instead of one edge at a time; the
        # output shape comes from the readout layer, not from a global `tmax`.
        source_nodes, target_nodes = graph.edge_index[0], graph.edge_index[1]
        s_t_nodes = torch.cat((graph.x[source_nodes], graph.x[target_nodes]), dim=1)
        return torch.sigmoid(self.readout(s_t_nodes))
    

class GATL2noSelf(nn.Module):
    """Graph Attention network (GAT) with two layers and without self loops.

    Args:
        size_in: number of input features per node.
        size_out: output channels per head of the last layer.
        size_hid1: output channels per head of the first layer.
        edge_size: number of features per edge.
        num_head: number of attention heads (used by both layers).
        tmax: the readout produces ``tmax - 1`` values per edge.
        dropout_prob: dropout probability passed to the attention layers.
    """

    def __init__(self, size_in, size_out, size_hid1, edge_size, num_head, tmax, dropout_prob):
        super(GATL2noSelf, self).__init__()
        self.conv1 = GATv2Conv(in_channels=size_in, out_channels=size_hid1, heads=num_head, dropout=dropout_prob,
                               add_self_loops=False, edge_dim=edge_size)
        self.conv2 = GATv2Conv(in_channels=num_head * size_hid1, out_channels=size_out, heads=num_head, dropout=dropout_prob,
                               add_self_loops=False, edge_dim=edge_size)
        self.readout = nn.Linear(2 * num_head * size_out, tmax - 1)

    def forward(self, graph):
        """Return sigmoid scores of shape ``(num_edges, tmax - 1)``.

        ``graph`` must expose ``x``, ``edge_index`` and ``edge_attr``. Note that
        ``graph.x`` is overwritten with the updated node features.
        """
        graph.x = F.elu(self.conv1(graph.x, graph.edge_index, graph.edge_attr))
        graph.x = F.elu(self.conv2(graph.x, graph.edge_index, graph.edge_attr))
        # Score all edges in one batched call instead of one edge at a time; the
        # output shape comes from the readout layer, not from a global `tmax`.
        source_nodes, target_nodes = graph.edge_index[0], graph.edge_index[1]
        s_t_nodes = torch.cat((graph.x[source_nodes], graph.x[target_nodes]), dim=1)
        return torch.sigmoid(self.readout(s_t_nodes))
    
class GATL2withSelf(nn.Module):
    """Graph Attention network (GAT) with two layers and self loops.

    Both attention layers are created with ``fill_value='mean'`` and without
    disabling self loops.

    Args:
        size_in: number of input features per node.
        size_out: output channels per head of the last layer.
        size_hid1: output channels per head of the first layer.
        edge_size: number of features per edge.
        num_head: number of attention heads (used by both layers).
        tmax: the readout produces ``tmax - 1`` values per edge.
        dropout_prob: dropout probability passed to the attention layers.
    """

    def __init__(self, size_in, size_out, size_hid1, edge_size, num_head, tmax, dropout_prob):
        super(GATL2withSelf, self).__init__()
        self.conv1 = GATv2Conv(in_channels=size_in, out_channels=size_hid1, heads=num_head, edge_dim=edge_size,
                               dropout=dropout_prob, fill_value='mean')
        self.conv2 = GATv2Conv(in_channels=num_head * size_hid1, out_channels=size_out, heads=num_head, edge_dim=edge_size,
                               dropout=dropout_prob, fill_value='mean')
        self.readout = nn.Linear(2 * num_head * size_out, tmax - 1)

    def forward(self, graph):
        """Return sigmoid scores of shape ``(num_edges, tmax - 1)``.

        ``graph`` must expose ``x``, ``edge_index`` and ``edge_attr``. Note that
        ``graph.x`` is overwritten with the updated node features.
        """
        graph.x = F.elu(self.conv1(graph.x, graph.edge_index, graph.edge_attr))
        graph.x = F.elu(self.conv2(graph.x, graph.edge_index, graph.edge_attr))
        # Score all edges in one batched call instead of one edge at a time; the
        # output shape comes from the readout layer, not from a global `tmax`.
        source_nodes, target_nodes = graph.edge_index[0], graph.edge_index[1]
        s_t_nodes = torch.cat((graph.x[source_nodes], graph.x[target_nodes]), dim=1)
        return torch.sigmoid(self.readout(s_t_nodes))
    
class GATL3noSelf(nn.Module):
    """Graph Attention network (GAT) with three layers and without self loops.

    Args:
        size_in: number of input features per node.
        size_out: output channels per head of the last layer.
        size_hid1: output channels per head of the first layer.
        size_hid2: output channels per head of the second layer.
        edge_size: number of features per edge.
        num_head: number of attention heads (used by all layers).
        tmax: the readout produces ``tmax - 1`` values per edge.
        dropout_prob: dropout probability passed to the attention layers.
    """

    def __init__(self, size_in, size_out, size_hid1, size_hid2, edge_size, num_head, tmax, dropout_prob):
        super(GATL3noSelf, self).__init__()
        self.conv1 = GATv2Conv(in_channels=size_in, out_channels=size_hid1, heads=num_head, dropout=dropout_prob,
                               add_self_loops=False, edge_dim=edge_size)
        self.conv2 = GATv2Conv(in_channels=num_head * size_hid1, out_channels=size_hid2, heads=num_head, dropout=dropout_prob,
                               add_self_loops=False, edge_dim=edge_size)
        self.conv3 = GATv2Conv(in_channels=num_head * size_hid2, out_channels=size_out, heads=num_head, dropout=dropout_prob,
                               add_self_loops=False, edge_dim=edge_size)
        self.readout = nn.Linear(2 * num_head * size_out, tmax - 1)

    def forward(self, graph):
        """Return sigmoid scores of shape ``(num_edges, tmax - 1)``.

        ``graph`` must expose ``x``, ``edge_index`` and ``edge_attr``. Note that
        ``graph.x`` is overwritten with the updated node features.
        """
        graph.x = F.elu(self.conv1(graph.x, graph.edge_index, graph.edge_attr))
        graph.x = F.elu(self.conv2(graph.x, graph.edge_index, graph.edge_attr))
        graph.x = F.elu(self.conv3(graph.x, graph.edge_index, graph.edge_attr))
        # Score all edges in one batched call instead of one edge at a time; the
        # output shape comes from the readout layer, not from a global `tmax`.
        source_nodes, target_nodes = graph.edge_index[0], graph.edge_index[1]
        s_t_nodes = torch.cat((graph.x[source_nodes], graph.x[target_nodes]), dim=1)
        return torch.sigmoid(self.readout(s_t_nodes))
    
class GATL3withSelf(nn.Module):
    """Graph Attention network (GAT) with three layers and self loops.

    All attention layers are created with ``fill_value='mean'`` and without
    disabling self loops.

    Args:
        size_in: number of input features per node.
        size_out: output channels per head of the last layer.
        size_hid1: output channels per head of the first layer.
        size_hid2: output channels per head of the second layer.
        edge_size: number of features per edge.
        num_head: number of attention heads (used by all layers).
        tmax: the readout produces ``tmax - 1`` values per edge.
        dropout_prob: dropout probability passed to the attention layers.
    """

    def __init__(self, size_in, size_out, size_hid1, size_hid2, edge_size, num_head, tmax, dropout_prob):
        super(GATL3withSelf, self).__init__()
        self.conv1 = GATv2Conv(in_channels=size_in, out_channels=size_hid1, heads=num_head, edge_dim=edge_size,
                               dropout=dropout_prob, fill_value='mean')
        self.conv2 = GATv2Conv(in_channels=num_head * size_hid1, out_channels=size_hid2, heads=num_head, edge_dim=edge_size,
                               dropout=dropout_prob, fill_value='mean')
        self.conv3 = GATv2Conv(in_channels=num_head * size_hid2, out_channels=size_out, heads=num_head, edge_dim=edge_size,
                               dropout=dropout_prob, fill_value='mean')
        self.readout = nn.Linear(2 * num_head * size_out, tmax - 1)

    def forward(self, graph):
        """Return sigmoid scores of shape ``(num_edges, tmax - 1)``.

        ``graph`` must expose ``x``, ``edge_index`` and ``edge_attr``. Note that
        ``graph.x`` is overwritten with the updated node features.
        """
        graph.x = F.elu(self.conv1(graph.x, graph.edge_index, graph.edge_attr))
        graph.x = F.elu(self.conv2(graph.x, graph.edge_index, graph.edge_attr))
        graph.x = F.elu(self.conv3(graph.x, graph.edge_index, graph.edge_attr))
        # Score all edges in one batched call instead of one edge at a time; the
        # output shape comes from the readout layer, not from a global `tmax`.
        source_nodes, target_nodes = graph.edge_index[0], graph.edge_index[1]
        s_t_nodes = torch.cat((graph.x[source_nodes], graph.x[target_nodes]), dim=1)
        return torch.sigmoid(self.readout(s_t_nodes))

In [None]:
# set the device to GPU or CPU (only if GPU is not available we will use CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# For reproducibility of the result: seed every random number generator in use.
# Seeding NumPy alone does not affect PyTorch, which draws the initial model
# weights (and dropout masks) from its own generator.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
# Define binary cross entropy loss.
batch_loss = nn.BCELoss()
#batch_loss = nn.MSELoss() # in case we use sum of time and control-flow in m-array
targat_path = r"D:\Final master thesis evaluation\exp1\data"
processed_Pattern = r"D:\Final master thesis evaluation\exp1\data\processed\*.pt"
case_id_target = r'D:\Final master thesis evaluation\exp1\case_id_list.pkl'
# Output locations of the twelve model variants. The three lists are generated
# from one list of names so that they cannot drift apart; raw strings avoid
# relying on backslash sequences such as "\F" or "\m" being left untouched.
# The resulting paths are identical to the previously hand-written ones.
models_dir = r"D:\Final master thesis evaluation\exp1\models"
model_names = ['GATl1h1nl', 'GATL1h4nl', 'GATL2h1nl', 'GATL2h4nl', 'GATL3h1nl', 'GATL3h4nl',
               'GATL1h1wl', 'GATL1h4wl', 'GATL2h1wl', 'GATL2h4wl', 'GATL3h1wl', 'GATL3h4wl']
models_target = [models_dir + "\\" + name + "_param.pt" for name in model_names]
loss_df_target = [models_dir + "\\" + name + "_loss.pt" for name in model_names]
models_txt_target = [models_dir + "\\" + name + ".txt" for name in model_names]
# creating dataset
# (`processed_graphs` must be defined before the dataset is instantiated)
processed_graphs = glob.glob(processed_Pattern)
dataset = SkinbaronDataset(root=targat_path)
# train/validation/test split
train_dataset = dataset[0:9000]
validation_dataset = dataset[9000:12000]
test_dataset = dataset[12000:15000]
print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of validation graphs: {len(validation_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')
# Retrieving the case_id list from saved file on the disk.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# files that were produced by this project.
with open(case_id_target, "rb") as case_id_file:
    case_id_list = pickle.load(case_id_file)
# find the length of the longest trace based on the edge_m_array attribute
tmax = int(dataset.get(0).edge_m_array.shape[1]) + 1
dropout_prob = 0.0

In [None]:
# Train the twelve GAT variants one after another. For every variant the layer
# sizes are derived from the node-feature width, a text log is written to
# models_txt_target[i] and the learned weights are saved to models_target[i].
number_heads = [1,4,1,4,1,4,1,4,1,4,1,4]
# settings shared by all runs
batch_size = 64  # minibatch size
lr = 0.01
weight_decay = 5e-4
epochs = 100
# The feature sizes are the same for every run, so the first graph is read from
# disk once instead of several times per model.
sample_graph = dataset.get(0)
size_in = sample_graph.x.shape[1]
edge_size = sample_graph.edge_attr.shape[1]
loader = DataLoader(train_dataset, batch_size=batch_size)
for i in range(len(number_heads)):
    # initialize the model
    num_head = number_heads[i]
    size_out = int(2 * size_in / num_head)
    if i in (0, 1):
        model = GATL1noSelf(size_in, size_out, edge_size, num_head, tmax, dropout_prob)
    elif i in (2, 3):
        size_hid1 = int(1.5 * size_in / num_head)
        model = GATL2noSelf(size_in, size_out, size_hid1, edge_size, num_head, tmax, dropout_prob)
    elif i in (4, 5):
        size_hid1 = int(1.33 * size_in / num_head)
        size_hid2 = int(1.67 * size_in / num_head)
        model = GATL3noSelf(size_in, size_out, size_hid1, size_hid2, edge_size, num_head, tmax, dropout_prob)
    elif i in (6, 7):
        model = GATL1withSelf(size_in, size_out, edge_size, num_head, tmax, dropout_prob)
    elif i in (8, 9):
        size_hid1 = int(1.5 * size_in / num_head)
        model = GATL2withSelf(size_in, size_out, size_hid1, edge_size, num_head, tmax, dropout_prob)
    elif i in (10, 11):
        size_hid1 = int(1.33 * size_in / num_head)
        size_hid2 = int(1.67 * size_in / num_head)
        model = GATL3withSelf(size_in, size_out, size_hid1, size_hid2, edge_size, num_head, tmax, dropout_prob)
    # move to GPU (if available)
    model = model.to(device)
    # initialize the optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # start a fresh log file for this model ("w" truncates an existing one)
    with open(models_txt_target[i], "w") as log_file:
        # Print model's state_dict
        print("Model's state_dict:", file=log_file)
        for param_tensor in model.state_dict():
            print(param_tensor, "\t", model.state_dict()[param_tensor].size(), file=log_file)
        # Print optimizer's state_dict
        print("Optimizer's state_dict:", file=log_file)
        for var_name in optimizer.state_dict():
            print(var_name, "\t", optimizer.state_dict()[var_name], file=log_file)
    # training of the model
    train_hist = {}
    train_hist['loss'] = []
    # Initialize training
    for layer in model.children():
        if hasattr(layer, 'reset_parameters'):
            layer.reset_parameters()
    model.train()
    # training loop
    begin_time = datetime.now()
    # The log is opened once per model rather than once per batch; it is flushed
    # after every epoch so that progress can still be followed while training.
    with open(models_txt_target[i], "a") as log_file:
        for epoch in range(epochs):
            print(epoch)
            for idx, data_batch in enumerate(loader):
                data_batch = data_batch.to(device)
                optimizer.zero_grad()
                # call the module itself (not .forward) so that module hooks run
                loss = batch_loss(model(data_batch).float(), data_batch.edge_m_array.float())
                loss.backward()
                optimizer.step()
                train_hist['loss'].append(loss.item())
                print('[Epoch %4d/%4d] [Batch %4d/%4d] Loss: % 2.2e' % (epoch + 1, epochs,idx + 1, len(loader),
                                                                        loss.item()), file=log_file)
            log_file.flush()
        finish_time = datetime.now()
        print('Training time:', file=log_file)
        print(finish_time - begin_time, file=log_file)
    # save learned parameters
    torch.save(model.state_dict(), models_target[i])

In [None]:
# get anomaly scores produced on validation dataset
# For every trained variant: rebuild the architecture, load its saved weights,
# compute one loss value per validation graph and store the values together
# with the case ids in loss_df_target[i].
number_heads = [1,4,1,4,1,4,1,4,1,4,1,4]
# The feature sizes are the same for every model; read the first graph once.
sample_graph = dataset.get(0)
size_in = sample_graph.x.shape[1]
edge_size = sample_graph.edge_attr.shape[1]
# one graph per batch, so that each loss value belongs to exactly one case
batch_size = 1
loader = DataLoader(validation_dataset, batch_size=batch_size)
validation_id_list = case_id_list[9000:12000]
for i in range(len(number_heads)):
    # initialize the model
    num_head = number_heads[i]
    size_out = int(2 * size_in / num_head)
    if i in (0, 1):
        model = GATL1noSelf(size_in, size_out, edge_size, num_head, tmax, dropout_prob)
    elif i in (2, 3):
        size_hid1 = int(1.5 * size_in / num_head)
        model = GATL2noSelf(size_in, size_out, size_hid1, edge_size, num_head, tmax, dropout_prob)
    elif i in (4, 5):
        size_hid1 = int(1.33 * size_in / num_head)
        size_hid2 = int(1.67 * size_in / num_head)
        model = GATL3noSelf(size_in, size_out, size_hid1, size_hid2, edge_size, num_head, tmax, dropout_prob)
    elif i in (6, 7):
        model = GATL1withSelf(size_in, size_out, edge_size, num_head, tmax, dropout_prob)
    elif i in (8, 9):
        size_hid1 = int(1.5 * size_in / num_head)
        model = GATL2withSelf(size_in, size_out, size_hid1, edge_size, num_head, tmax, dropout_prob)
    elif i in (10, 11):
        size_hid1 = int(1.33 * size_in / num_head)
        size_hid2 = int(1.67 * size_in / num_head)
        model = GATL3withSelf(size_in, size_out, size_hid1, size_hid2, edge_size, num_head, tmax, dropout_prob)
    # Scoring runs on the CPU (the model and the batches are never moved), so
    # the weights are mapped to the CPU explicitly; this also makes weights that
    # were saved from a GPU loadable on a machine without one.
    model.load_state_dict(torch.load(models_target[i], map_location='cpu'))
    # inference mode: disable dropout and skip gradient bookkeeping
    model.eval()
    # get the result in a dataframe
    loss_list = []
    begin_time = datetime.now()
    with torch.no_grad():
        for data_batch in loader:
            loss = batch_loss(model(data_batch).float(), data_batch.edge_m_array.float())
            loss_value = loss.item()
            # anything that is not strictly positive is stored as 0
            loss_list.append(loss_value if loss_value > 0 else 0)
    finish_time = datetime.now()
    with open(models_txt_target[i], "a") as log_file:
        print('Anomaly score computation time for', len(validation_dataset), 'cases:', file=log_file)
        print(finish_time - begin_time, file=log_file)
    GAT2_dictionary = {'case_id': validation_id_list, 'Loss': loss_list}
    loss_df = pd.DataFrame(GAT2_dictionary)
    loss_df.to_pickle(loss_df_target[i])

In [None]:
# import label dataframe
label_csv = pd.read_csv(r'D:\Final master thesis evaluation\exp1\large-0.1-1.csv')
label_csv.drop("Unnamed: 0", axis=1, inplace=True)
label_csv["case_id"] = pd.to_numeric(label_csv["case_id"])
# (Two `astype` calls whose results were discarded have been removed; they did
# not change the frame.)
# Validation cases are those with 9000 < case_id < 12001. Both bounds are
# applied in one mask built from the frame that is being filtered.
in_validation_range = (label_csv['case_id'] > 9000) & (label_csv['case_id'] < 12001)
validation_csv = label_csv.loc[in_validation_range]
label_csv1 = validation_csv.loc[validation_csv['label']== 'normal']
normal_class = len(label_csv1)
# every case that is not labelled 'normal' counts as an anomaly; the total is
# taken from the data instead of assuming exactly 3000 validation cases
anom_class = len(validation_csv) - normal_class
print('numnber of anomalies in the log:', anom_class)
label_csv.head(1)

In [None]:
# For every model: rank the validation cases by loss (highest first) and find
# the cut-off position that maximises the F1 score of the anomaly class. The
# cases up to and including position j are treated as alarms.
# Printed per model:
# [j, precision, recall, f1, precision_on_normal, recall_on_normal,
#  f1_on_normal, macro_f1], or None when no cut-off reaches a positive F1.
for i in range(len(loss_df_target)):
    loss_df = pd.read_pickle(loss_df_target[i])
    loss_df["case_id"] = pd.to_numeric(loss_df["case_id"])
    result = pd.merge(loss_df, validation_csv, on=["case_id"])
    sorted_result = result.sort_values(by=['Loss'], ascending = False, ignore_index = True)
    sorted_list = sorted_result['label'].tolist()
    # 1 = anomaly (any label other than 'normal'), 0 = normal
    prediction_array = np.array([0 if label == 'normal' else 1 for label in sorted_list], dtype=int)
    # totals come from the merged data rather than from a hard-coded 3000
    n_cases = len(prediction_array)
    n_anomalies = int(prediction_array.sum())
    # anomalies_up_to[j] = number of anomalies among the first j+1 ranked cases
    anomalies_up_to = np.cumsum(prediction_array)
    best_score = 0
    # reset per model so that a result of a previous model is never reported
    best_cut = None
    for j in range(n_cases):
        positives = j+1
        true_positives = int(anomalies_up_to[j])
        if true_positives == 0:
            # F1 is 0 here and can never beat best_score
            continue
        false_positives = positives - true_positives
        negatives = n_cases - positives
        false_negatives = n_anomalies - true_positives
        true_negatives = negatives - false_negatives
        precision = true_positives/(true_positives+false_positives)
        recall = true_positives/(true_positives+false_negatives)
        f1_score = 2*precision*recall/(precision+recall)
        if f1_score > best_score:
            best_score = f1_score
            # metrics of the normal class; a ratio with an empty denominator is
            # reported as 0.0 instead of NaN
            predicted_normal = true_negatives+false_negatives
            actual_normal = true_negatives+false_positives
            precision_on_normal = true_negatives/predicted_normal if predicted_normal else 0.0
            recall_on_normal = true_negatives/actual_normal if actual_normal else 0.0
            normal_sum = precision_on_normal+recall_on_normal
            f1_score_on_normal = 2*precision_on_normal*recall_on_normal/normal_sum if normal_sum else 0.0
            macro_f1_score = (f1_score + f1_score_on_normal)/2
            best_cut = [j, precision, recall, f1_score, precision_on_normal, recall_on_normal,
                        f1_score_on_normal, macro_f1_score]
    print(best_cut)

In [None]:
# get anomaly scores produced on test dataset for a single model:
# the one-layer, one-head GAT without self loops (entry 0 of the model lists)
model_index = 0
num_head = 1
# initialize the model
sample_graph = dataset.get(0)
size_in = sample_graph.x.shape[1]
size_out = int(2 * size_in / num_head)
edge_size = sample_graph.edge_attr.shape[1]
model = GATL1noSelf(size_in, size_out, edge_size, num_head, tmax, dropout_prob)
# Scoring runs on the CPU (the model and the batches are never moved), so the
# weights are mapped to the CPU explicitly; this also makes weights that were
# saved from a GPU loadable on a machine without one.
model.load_state_dict(torch.load(models_target[model_index], map_location='cpu'))
# inference mode: disable dropout and skip gradient bookkeeping
model.eval()
# get the result in a dataframe
# one graph per batch, so that each loss value belongs to exactly one case
batch_size = 1
loader = DataLoader(test_dataset, batch_size=batch_size)
test_id_list = case_id_list[12000:15000]
loss_list = []
begin_time = datetime.now()
with torch.no_grad():
    for data_batch in loader:
        loss = batch_loss(model(data_batch).float(), data_batch.edge_m_array.float())
        loss_value = loss.item()
        # anything that is not strictly positive is stored as 0
        loss_list.append(loss_value if loss_value > 0 else 0)
finish_time = datetime.now()
# The timing is appended to the log of the model that was actually evaluated
# (it used to go to the log of entry 8), and the message now names the test set.
with open(models_txt_target[model_index], "a") as log_file:
    print('Anomaly score computation time (test set) for', len(test_dataset), 'cases:', file=log_file)
    print(finish_time - begin_time, file=log_file)
GAT2_dictionary = {'case_id': test_id_list, 'Loss': loss_list}
loss_df = pd.DataFrame(GAT2_dictionary)
loss_df.to_pickle(r'D:\Final master thesis evaluation\exp1\models\GATL1h1nl_test_loss.pt')

In [None]:
# import label dataframe
label_csv = pd.read_csv(r'D:\Final master thesis evaluation\exp1\large-0.1-1.csv')
label_csv.drop("Unnamed: 0", axis=1, inplace=True)
label_csv["case_id"] = pd.to_numeric(label_csv["case_id"])
# (Two `astype` calls whose results were discarded have been removed; they did
# not change the frame.)
# test cases are those with case_id > 12000
test_csv = label_csv.loc[label_csv['case_id'] > 12000]
label_csv1 = test_csv.loc[test_csv['label']== 'normal']
normal_class = len(label_csv1)
# every case that is not labelled 'normal' counts as an anomaly; the total is
# taken from the data instead of assuming exactly 3000 test cases
anom_class = len(test_csv) - normal_class
print('numnber of anomalies in the log:', anom_class)
label_csv.head(1)

In [None]:
# Rank the test cases by anomaly score (loss, highest first) and search for the cut-off
# position that maximises the F1 score of the anomaly class. Every case at or above the
# cut-off is treated as an alarm, every case below it as normal.
# raw string: the backslashes of the Windows path must not be read as escape sequences
loss_df = pd.read_pickle(r'D:\Final master thesis evaluation\exp1\models\GATL1h1nl_test_loss.pt')
loss_df["case_id"] = pd.to_numeric(loss_df["case_id"])
result = pd.merge(loss_df, test_csv, on=["case_id"])
sorted_result = result.sort_values(by=['Loss'], ascending=False, ignore_index=True)
sorted_list = sorted_result['label'].tolist()
# ground truth in ranking order: 1 = anomalous case, 0 = normal case
predictions = [0 if label == 'normal' else 1 for label in sorted_list]
prediction_array = np.array(predictions)
# use the number of merged rows rather than a hard-coded 3000
total_cases = len(sorted_list)
# running count of anomalies among the first j+1 ranked cases (avoids re-summing slices)
cumulative_anomalies = np.cumsum(prediction_array)
total_anomalies = cumulative_anomalies[-1] if total_cases else 0
best_score = 0
best_cut = None  # stays None when no cut-off reaches a positive F1 score
for j in range(total_cases):
    positives = j + 1
    true_positives = cumulative_anomalies[j]
    if true_positives == 0:
        # precision and recall are both 0, so F1 is undefined (0/0) and cannot be the best
        continue
    false_positives = positives - true_positives
    negatives = total_cases - positives
    false_negatives = total_anomalies - true_positives
    true_negatives = negatives - false_negatives
    precision = true_positives / positives
    recall = true_positives / (true_positives + false_negatives)
    f1_score = 2 * precision * recall / (precision + recall)
    if f1_score > best_score:
        best_score = f1_score
        # metrics of the normal class; empty denominators are reported as 0 instead of NaN
        precision_on_normal = true_negatives / negatives if negatives else 0.0
        normal_cases = true_negatives + false_positives
        recall_on_normal = true_negatives / normal_cases if normal_cases else 0.0
        normal_denominator = precision_on_normal + recall_on_normal
        f1_score_on_normal = (2 * precision_on_normal * recall_on_normal / normal_denominator
                              if normal_denominator else 0.0)
        macro_f1_score = (f1_score + f1_score_on_normal) / 2
        # [cut index, precision, recall, F1, normal precision, normal recall, normal F1, macro F1]
        best_cut = [j, precision, recall, f1_score, precision_on_normal, recall_on_normal,
                    f1_score_on_normal, macro_f1_score]
print(best_cut)

Ensemble? Overlap between the top-ranked anomalies of two models on the validation set

In [None]:
# Compare two models on the validation set: count the true anomalies among their
# top-ranked cases and measure how many case ids their top-k rankings share.
# Validation cases are the ones with 9000 < case_id < 12001. The selection is built
# directly from label_csv so the cell does not depend on a previously defined validation_csv.
validation_csv = label_csv.loc[(label_csv['case_id'] > 9000) & (label_csv['case_id'] < 12001)]
label_csv1 = validation_csv.loc[validation_csv['label'] == 'normal']
normal_class = len(label_csv1)
# total taken from the selection itself instead of assuming exactly 3000 validation cases
anom_class = len(validation_csv) - normal_class
print('numnber of anomalies in the log:', anom_class)
loss1_df = pd.read_pickle(loss_df_target[8])
loss2_df = pd.read_pickle(loss_df_target[0])
loss1_df["case_id"] = pd.to_numeric(loss1_df["case_id"])
loss2_df["case_id"] = pd.to_numeric(loss2_df["case_id"])
result1 = pd.merge(loss1_df, validation_csv, on=["case_id"])
result2 = pd.merge(loss2_df, validation_csv, on=["case_id"])
# highest anomaly score first
sorted_result1 = result1.sort_values(by=['Loss'], ascending=False, ignore_index=True)
sorted_result2 = result2.sort_values(by=['Loss'], ascending=False, ignore_index=True)
first_labels = sorted_result1['label'].tolist()
second_labels = sorted_result2['label'].tolist()
# cut-off positions used as "best cut" for the first / second ranking
# (presumably taken from the best-F1 search on the validation set - TODO confirm)
best_cut_first = 361
best_cut_second = 335
first_labels_limited = first_labels[0:best_cut_second]
second_labels_limited = second_labels[0:best_cut_second]
third_labels_limited = first_labels[0:best_cut_first]
# number of true anomalies inside each of the three top-k selections
for limited_labels in (first_labels_limited, second_labels_limited, third_labels_limited):
    predictions = [0 if label == 'normal' else 1 for label in limited_labels]
    prediction_array = np.array(predictions)
    print(np.sum(prediction_array), 'anomalies detected')
sorted_list1 = sorted_result1['case_id'].tolist()
sorted_list2 = sorted_result2['case_id'].tolist()
# case ids in the top 100 / 200 / 300 and in the best-cut selection of each ranking
first_sad1 = set(sorted_list1[0:100])
first_sad2 = set(sorted_list2[0:100])
first_dosad1 = set(sorted_list1[0:200])
first_dosad2 = set(sorted_list2[0:200])
first_sesad1 = set(sorted_list1[0:300])
first_sesad2 = set(sorted_list2[0:300])
first_all1 = set(sorted_list1[0:best_cut_first])
first_all2 = set(sorted_list2[0:best_cut_second])
common_sad = first_sad1.intersection(first_sad2)
common_dosad = first_dosad1.intersection(first_dosad2)
common_sesad = first_sesad1.intersection(first_sesad2)
common_all = first_all1.intersection(first_all2)
print('Number of common cases in top 100 anomalous scores:', len(common_sad))
print('Number of common cases in top 200 anomalous scores:', len(common_dosad))
print('Number of common cases in top 300 anomalous scores:', len(common_sesad))
print('Number of common cases for the best cut:', len(common_all))

Convergence: training and validation loss after every training epoch

In [None]:
# Convergence experiment for two models (model index 0 -> GATL1noSelf, model index 8 ->
# GATL2withSelf). Each model is trained one epoch at a time; after every epoch its
# parameters are saved and its loss on the training and validation data is recorded.
# The first pass starts from freshly reset parameters, the 100 following passes resume
# from the parameters saved by the previous pass.
# raw strings: the backslashes of the Windows paths must not be read as escape sequences
convergence_list = [r'D:\Final master thesis evaluation\exp2\convergence_models\GATL1h1nl_param.pt',
                    r'D:\Final master thesis evaluation\exp2\convergence_models\GATL2h1wl_param.pt']
loss_list_target = [r'D:\Final master thesis evaluation\exp2\convergence_models\GATL1h1nl_training_loss.pt',
                    r'D:\Final master thesis evaluation\exp2\convergence_models\GATL1h1nl_validation_loss.pt',
                    r'D:\Final master thesis evaluation\exp2\convergence_models\GATL2h1wl_training_loss.pt',
                    r'D:\Final master thesis evaluation\exp2\convergence_models\GATL2h1wl_validation_loss.pt']
first_model_training_loss = []
second_model_training_loss = []
training_convergence_list = [first_model_training_loss, second_model_training_loss]
first_model_validation_loss = []
second_model_validation_loss = []
validation_convergence_list = [first_model_validation_loss, second_model_validation_loss]
number_heads = [1,4,1,4,1,4,1,4,1,4,1,4]
model_indices = [0,8]
# optimizer settings and schedule shared by every pass
lr = 0.01
weight_decay = 5e-4
epochs = 1            # training epochs per pass
resumed_passes = 100  # passes that continue from the saved parameters


def summed_positive_loss(model, loader):
    """Return the sum of the batch losses of `model` over `loader`.

    Batch losses that are not strictly positive (including NaN) contribute 0.
    The model is put into evaluation mode and autograd is disabled, and every
    batch is moved to `device` so the computation also works when the model
    lives on the GPU.
    """
    model.eval()
    total = 0
    with torch.no_grad():
        for data_batch in loader:
            data_batch = data_batch.to(device)
            loss = batch_loss(model.forward(data_batch).float(), data_batch.edge_m_array.float())
            if loss > 0:
                total += loss.item()
    return total


for pass_number in range(resumed_passes + 1):
    resume = pass_number > 0
    if resume:
        # progress counter of the resumed passes: 0 .. resumed_passes - 1
        training_epochs = pass_number - 1
        print(training_epochs)
    for i in model_indices:
        # position of this model in convergence_list and in the two result lists
        slot = 0 if i == 0 else 1
        # initialize the model
        num_head = number_heads[i]
        sample_graph = dataset.get(0)
        size_in = sample_graph.x.shape[1]
        size_out = int(2 * size_in / num_head)
        edge_size = sample_graph.edge_attr.shape[1]
        if i == 0:
            model = GATL1noSelf(size_in, size_out, edge_size, num_head, tmax, dropout_prob)
        elif i == 8:
            size_hid1 = int(1.5 * size_in / num_head)
            model = GATL2withSelf(size_in, size_out, size_hid1, edge_size, num_head, tmax, dropout_prob)
        else:
            raise ValueError(f'no architecture is configured for model index {i}')
        if resume:
            model.load_state_dict(torch.load(convergence_list[slot]))
        # setting the minibatch size
        batch_size = 64
        loader = DataLoader(train_dataset, batch_size=batch_size)
        # move to GPU (if available)
        model = model.to(device)
        # initialize the optimizer
        # NOTE(review): a new AdamW instance is created on every pass, so its moment
        # estimates restart each epoch - confirm that this is intended.
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        train_hist = {}
        train_hist['loss'] = []
        if not resume:
            # first pass only: start from freshly initialised parameters
            for layer in model.children():
                if hasattr(layer, 'reset_parameters'):
                    layer.reset_parameters()
        model.train()
        # training loop
        for epoch in range(epochs):
            for idx, data_batch in enumerate(loader):
                data_batch = data_batch.to(device)
                optimizer.zero_grad()
                loss = batch_loss(model.forward(data_batch).float(), data_batch.edge_m_array.float())
                loss.backward()
                optimizer.step()
                train_hist['loss'].append(loss.item())
        # save the learned parameters so the next pass can resume from them
        torch.save(model.state_dict(), convergence_list[slot])
        # NOTE: the training data is still iterated in mini-batches of 64 here, so the
        # recorded value is the sum of the batch losses divided by the number of training
        # cases; the cell that builds the plotting arrays multiplies these values by 64.
        training_convergence_list[slot].append(summed_positive_loss(model, loader) / len(train_dataset))
        # the validation loss is computed case by case
        batch_size = 1
        loader = DataLoader(validation_dataset, batch_size=batch_size)
        validation_convergence_list[slot].append(summed_positive_loss(model, loader) / len(validation_dataset))

In [None]:
# Persist the recorded loss curves. loss_list_target holds, in order: training loss of
# the first model, validation loss of the first model, training loss of the second
# model, validation loss of the second model.
losses_by_target = [
    (0, training_convergence_list[0]),
    (2, training_convergence_list[1]),
    (1, validation_convergence_list[0]),
    (3, validation_convergence_list[1]),
]
for target_index, recorded_losses in losses_by_target:
    # the context manager closes the file even when pickling fails
    with open(loss_list_target[target_index], "wb") as loss_file:
        pickle.dump(recorded_losses, loss_file)

In [None]:
# Turn the recorded per-epoch losses of both models into numpy arrays for plotting.
# The training curves are multiplied by 64, presumably to undo the division by the number
# of cases for losses that were collected per mini-batch of 64 - TODO confirm.
training_scale = 64
first_training_losses, second_training_losses = training_convergence_list[0], training_convergence_list[1]
first_validation_losses, second_validation_losses = validation_convergence_list[0], validation_convergence_list[1]
training_convergence1 = np.array(first_training_losses) * training_scale
training_convergence2 = np.array(second_training_losses) * training_scale
validation_convergence1 = np.array(first_validation_losses)
validation_convergence2 = np.array(second_validation_losses)

In [None]:
# Plot the training and validation loss of the GATL1H1NL model per training epoch.
# The x axis is derived from the number of recorded epochs, so the plot keeps working
# when the training run is shorter or longer than a fixed, hard-coded length.
x_values = np.arange(1, len(training_convergence1) + 1)
# draw on the explicitly sized figure; opening another figure afterwards would leave the
# sized one empty and put the curves on a default-sized figure instead
fig, ax = plt.subplots(figsize=(24, 18), dpi=300)
ax.set_title("Training, and validation loss: GATL1H1NL model")
ax.plot(x_values, training_convergence1, color="blue", label='Average batch training loss')
ax.plot(x_values, validation_convergence1, color="Orange", label='Average Validation loss')
ax.legend()
ax.set_xlabel("Training Epochs")
ax.set_ylabel("Loss")
plt.show()

In [None]:
# Plot the training and validation loss of the GATL2h1WL model per training epoch.
# The x axis is derived from the number of recorded epochs, so the plot keeps working
# when the training run is shorter or longer than a fixed, hard-coded length.
x_values = np.arange(1, len(training_convergence2) + 1)
# draw on the explicitly sized figure; opening another figure afterwards would leave the
# sized one empty and put the curves on a default-sized figure instead
fig, ax = plt.subplots(figsize=(24, 18), dpi=300)
ax.set_title("Training, and validation loss: GATL2h1WL model")
ax.plot(x_values, training_convergence2, color="blue", label='Average batch training loss')
ax.plot(x_values, validation_convergence2, color="Orange", label='Average Validation loss')
ax.legend()
ax.set_xlabel("Training Epochs")
ax.set_ylabel("Loss")
plt.show()