In [None]:
import json
import glob
from os import listdir
from os.path import isfile, join
import os.path as osp
import copy
import os
import pickle
from datetime import datetime
import random
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.filtering.log.attributes import attributes_filter
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Dataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import NNConv
from torch_geometric.nn import GATv2Conv
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
class SkinbaronDataset(Dataset):
    """Read-only dataset over graphs that were already pre-processed to disk.

    Graph ``idx`` is loaded from ``data_<idx>.pt`` inside ``self.processed_dir``.
    The list of processed files comes from the module-level ``processed_graphs``
    variable, which therefore has to be defined before the dataset is created.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super(SkinbaronDataset, self).__init__(root, transform, pre_transform, pre_filter)

    @property
    def processed_file_names(self):
        """Return the per-graph files found on disk.

        Only files whose name starts with ``data_`` are kept. The processed
        directory may also contain other ``.pt`` files (torch_geometric can
        store ``pre_transform.pt`` / ``pre_filter.pt`` there); counting those
        would make ``len()`` larger than the number of loadable graphs.
        """
        return [path for path in processed_graphs
                if osp.basename(path).startswith('data_')]

    def len(self):
        """Number of graphs, i.e. number of ``data_*.pt`` files."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load and return the graph stored as ``data_<idx>.pt``."""
        return torch.load(osp.join(self.processed_dir, f'data_{idx}.pt'))

In [None]:
# define GCN (ECC), and GAT neural network architectures
class ECCCONV(nn.Module):
    """Edge-Conditioned Convolution (ECC) network that scores every edge.

    A single ``NNConv`` layer plus a bias-free linear skip connection embeds
    the nodes. Each edge is then described by the concatenated embeddings of
    its source and target node and mapped by a linear read-out to
    ``tmax - 1`` sigmoid outputs.
    """

    def __init__(self, size_in, size_out, edge_size, tmax):
        """Build the layers.

        Args:
            size_in: number of input features per node.
            size_out: number of hidden features per node after the convolution.
            edge_size: number of features per edge (input of the edge network).
            tmax: the read-out produces ``tmax - 1`` values per edge.
        """
        super(ECCCONV, self).__init__()
        # Edge network: maps edge features to a (size_in x size_out) weight matrix.
        nn1 = nn.Linear(edge_size, size_in * size_out)
        self.conv1 = NNConv(size_in, size_out, nn1)
        self.lin1 = nn.Linear(size_in, size_out, bias=False)
        self.readout = nn.Linear(2 * size_out, tmax - 1)

    def forward(self, graph):
        """Return per-edge predictions of shape ``(num_edges, tmax - 1)``.

        Args:
            graph: object exposing ``x``, ``edge_index`` (2 x num_edges) and
                ``edge_attr``; a mini-batch of graphs works the same way.

        The input is left untouched: node embeddings are kept in a local
        variable instead of being written back to ``graph.x``, so the same
        graph can be passed through the model more than once.
        """
        node_embeddings = F.elu(
            self.conv1(graph.x, graph.edge_index, graph.edge_attr) + self.lin1(graph.x)
        )
        source_nodes = graph.edge_index[0]
        target_nodes = graph.edge_index[1]
        # Gather both endpoints of every edge at once; row i describes edge i.
        # This also handles graphs without edges (result has zero rows).
        edge_features = torch.cat(
            (node_embeddings[source_nodes], node_embeddings[target_nodes]), dim=1
        )
        # The read-out width fixes the second dimension, so no reshape that
        # depends on a global ``tmax`` is needed.
        return torch.sigmoid(self.readout(edge_features))

In [None]:
# Locations of the experiment artefacts (absolute, machine-specific paths).
targat_path = r'D:\Final master thesis evaluation\small-fifteen\data'
processed_Pattern = r"D:\Final master thesis evaluation\small-fifteen\data\processed\*.pt"
case_id_target = r'D:\Final master thesis evaluation\small-fifteen\case_id_list.pkl'
models_target = r'D:\Final master thesis evaluation\small-fifteen\ECC_param.pt'
loss_df_target = r'D:\Final master thesis evaluation\small-fifteen\ECC_loss.pt'
# set the device to GPU or CPU (only if GPU is not available we will use CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# For reproducibility of the result: seed every random number generator in use.
# Seeding NumPy alone is not enough because the model weights are initialised
# by torch (note: the optimizer used for training is Adam).
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
# Define binary cross entropy loss.
batch_loss = nn.BCELoss()
# creating dataset (SkinbaronDataset reads the module-level `processed_graphs`)
processed_graphs = glob.glob(processed_Pattern)
dataset = SkinbaronDataset(root=targat_path)
# train/test split; graphs 9000-11999 are not used in this cell
train_dataset = dataset[0:9000]
test_dataset = dataset[12000:15000]
print(f'Number of training graphs: {len(train_dataset)}')
print(f'Number of test graphs: {len(test_dataset)}')
# Retrieving the case_id list from saved file on the disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(case_id_target, "rb") as case_id_file:
    case_id_list = pickle.load(case_id_file)
# find the length of the longest trace based on the edge_m_array attribute
tmax = int(dataset.get(0).edge_m_array.shape[1]) + 1
dropout_prob = 0.0

In [None]:
# initialize ECC model
num_head = 1
# Load one graph once and read all layer sizes from it (each dataset.get call
# deserialises a file from disk).
sample_graph = dataset.get(0)
size_in = sample_graph.x.shape[1]
size_out = int(2 * size_in / num_head)
edge_size = sample_graph.edge_attr.shape[1]
model = ECCCONV(size_in, size_out, edge_size, tmax)
# setting the minibatch size
batch_size = 8
loader = DataLoader(train_dataset, batch_size=batch_size)
# move to GPU (if available)
model = model.to(device)
# initialize the optimizer
lr = 0.001
weight_decay = 0
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
# training of the model
epochs = 100
train_hist = {'loss': []}  # one entry per mini-batch, over all epochs
# Initialize training: re-initialise every layer so re-running this cell
# starts from fresh weights.
for layer in model.children():
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()
model.train()
# training loop
for epoch in range(epochs):
    epoch_losses = []
    for idx, data_batch in enumerate(loader):
        data_batch = data_batch.to(device)
        optimizer.zero_grad()
        # Call the module itself (not .forward) so that nn.Module hooks run.
        loss = batch_loss(model(data_batch).float(), data_batch.edge_m_array.float())
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
    train_hist['loss'].extend(epoch_losses)
    if epoch_losses:
        mean_loss = sum(epoch_losses) / len(epoch_losses)
        print(f'[Epoch {epoch + 1:4d}/{epochs:4d}] mean loss: {mean_loss:.2e}')
    else:
        print(f'[Epoch {epoch + 1:4d}/{epochs:4d}] no training batches')
# save the learned parameters
torch.save(model.state_dict(), models_target)

In [None]:
 get anomaly scores produced on test dataset only for the best hyperparameter configuration
test_loss_path = r'D:\Final master thesis evaluation\small-fifteen\ECC_loss.pt'
label_csv = pd.read_csv(r'D:\Final master thesis evaluation\small-fifteen\small-0.1-1.csv')
label_csv.drop("Unnamed: 0", axis=1, inplace=True)
label_csv["case_id"] = pd.to_numeric(label_csv["case_id"])
label_csv.case_id.astype(str).astype(int)
label_csv.label.astype(str)
test_csv = label_csv.loc[label_csv['case_id'] > 12000]
# get the result in a dataframe
batch_size = 1
loader = DataLoader(test_dataset, batch_size=batch_size)
test_id_list = case_id_list[12000:15000]
loss_list = []
for data_batch in loader:
    loss = batch_loss(model.forward(data_batch).float(), data_batch.edge_m_array.float())
    if loss > 0:
        loss_list.append(loss.item())
    else:
        loss_list.append(0)  
GAT2_dictionary = {'case_id': test_id_list, 'Loss': loss_list}
loss_df = pd.DataFrame(GAT2_dictionary)
loss_df.to_pickle(test_loss_path)
loss_df["case_id"] = pd.to_numeric(loss_df["case_id"])
result = pd.merge(loss_df, test_csv, on=["case_id"])
sorted_result = result.sort_values(by=['Loss'], ascending = False, ignore_index = True)
sorted_list = sorted_result['label'].tolist()
predictions = []
for j in range (len(sorted_list)):
    if sorted_list[j] == 'normal':
        predictions.append(0)
    else:
        predictions.append(1)
prediction_array = np.array(predictions)
best_score = 0
for j in range (len(sorted_list)):
    current_alarms = prediction_array[0:j+1]
    current_normals = prediction_array[j+1:]
    positives = j+1
    true_positives = np.sum(current_alarms)
    false_positives = positives - true_positives
    negatives = 3000 - positives
    false_negatives = np.sum(current_normals)
    true_negatives = negatives - false_negatives
    precision = true_positives/(true_positives+false_positives)
    recall = true_positives/(true_positives+false_negatives)
    f1_score = 2*precision*recall/(precision+recall)
    if f1_score > best_score:
        best_score = f1_score
        precision_on_normal = true_negatives/(true_negatives+false_negatives)
        recall_on_normal = true_negatives/(true_negatives+false_positives)
        f1_score_on_normal = 2*precision_on_normal*recall_on_normal/(precision_on_normal+recall_on_normal)
        macro_f1_score = (f1_score + f1_score_on_normal)/2
        best_result = [precision, recall, f1_score, precision_on_normal, recall_on_normal,
                    f1_score_on_normal, macro_f1_score]
        best_cut = [j, false_positives, false_negatives]
print(best_result)
print(best_cut)