In [None]:
import json
import glob
from os import listdir
from os.path import isfile, join
import os.path as osp
import copy
import os
import pickle
from datetime import datetime
import random
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.filtering.log.attributes import attributes_filter
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Dataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import NNConv
from torch_geometric.nn import GATv2Conv
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
class SkinbaronDataset(Dataset):
    """Dataset that serves one pre-computed graph file per index.

    Graphs are read from ``<processed_dir>/data_<idx>.pt``.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        """Forward all arguments unchanged to the base ``Dataset`` constructor."""
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def processed_file_names(self):
        """Names of the processed graph files.

        NOTE(review): ``processed_graphs`` is a module-level name that is not
        defined in this part of the notebook; it must be assigned before the
        dataset is instantiated -- confirm where it is set.
        """
        return processed_graphs

    def len(self):
        """Number of graphs, i.e. the number of processed file names."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load and return the graph stored in ``data_<idx>.pt``."""
        graph_path = osp.join(self.processed_dir, f'data_{idx}.pt')
        return torch.load(graph_path)

In [None]:
def getListofallkeys(dict):
    """Return the keys of the given mapping as a new list.

    The parameter name shadows the built-in ``dict``; it is left unchanged so
    that existing keyword callers keep working.
    """
    return [key for key in dict.keys()]

In [None]:
# Method for getting metadata from the event log
def logStatistics(event_log):
    """Collect the event-log metadata needed to build the graph features.

    Parameters
    ----------
    event_log
        Indexable event log: ``event_log[i]`` is a trace and
        ``event_log[i][j]`` is an event that exposes ``keys()``.

    Returns
    -------
    tuple
        ``(num_cases, max_len, act_array, act_enc, categorical_attributes,
        cardinality_list, cat_encoder_list)`` where

        * ``num_cases`` is the number of traces in the log,
        * ``max_len`` is the length of the longest trace,
        * ``act_array`` is a NumPy array with the distinct activity labels,
        * ``act_enc`` is a ``OneHotEncoder`` fitted on ``act_array``,
        * ``categorical_attributes`` are the attribute names of the first
          event of the first trace without ``'case_id'`` and
          ``'concept:name'``,
        * ``cardinality_list`` / ``cat_encoder_list`` hold, per attribute, the
          number of distinct values and a ``OneHotEncoder`` fitted on them.

    Raises
    ------
    ValueError
        If the first event has no ``'case_id'`` or ``'concept:name'`` key
        (raised by ``list.remove``).
    """
    def fit_one_hot(values):
        # Fit an encoder on a 1-D array of values, one value per sample row.
        encoder = OneHotEncoder(handle_unknown='ignore')
        encoder.fit(values.reshape(-1, 1))
        return encoder

    # Distinct activity labels of the whole log and their one-hot encoder.
    activity_counts = attributes_filter.get_attribute_values(event_log, "concept:name")
    act_array = np.array(list(activity_counts.keys()))
    act_enc = fit_one_hot(act_array)

    # The attribute names are read from the first event of the first trace only.
    categorical_attributes = getListofallkeys(event_log[0][0])
    categorical_attributes.remove('case_id')
    categorical_attributes.remove('concept:name')

    # Per attribute: number of distinct values in the log and a fitted encoder.
    cardinality_list = []
    cat_encoder_list = []
    for attribute in categorical_attributes:
        value_counts = attributes_filter.get_attribute_values(event_log, attribute)
        distinct_values = list(value_counts.keys())
        cardinality_list.append(len(distinct_values))
        cat_encoder_list.append(fit_one_hot(np.array(distinct_values)))

    # Length of the longest case in the log (0 when the log has no traces).
    num_cases = len(event_log)
    max_len = max((len(event_log[position]) for position in range(num_cases)), default=0)

    return num_cases, max_len, act_array, act_enc, categorical_attributes, cardinality_list, cat_encoder_list

In [None]:
# Method for creating the node feature matrix for one trace; additionally, it creates the m_array dict and the pair_dict.
def build_node_feature_matrix(current_trace, activity_label_array, act_encoder, max_trace_length):
    """Build the node features and the edge occurrence arrays of one trace.

    Every distinct activity label of the trace becomes one node; every
    distinct directly-follows pair of labels becomes one edge.

    Parameters
    ----------
    current_trace
        Indexable sequence of events; each event provides
        ``get('concept:name')``.
    activity_label_array
        All activity labels of the log; only its length is used (width of the
        one-hot node features).
    act_encoder
        Fitted encoder whose ``transform(...)`` result offers ``toarray()``.
    max_trace_length
        Length of the longest trace; each m-array has
        ``max_trace_length - 1`` entries.

    Returns
    -------
    tuple
        ``(pair_dict, node_feature, m_array_dict, act_set)`` where

        * ``pair_dict`` maps ``(label, next_label)`` to the list of positions
          at which that directly-follows pair occurs,
        * ``node_feature`` is a float32 array with one one-hot row per node,
        * ``m_array_dict`` maps ``(source_index, target_index)`` to an
          ``np.intc`` array that is 1 at every position of the pair,
        * ``act_set`` is the list of node labels; its positions are the node
          indices used in ``m_array_dict``.

    Notes
    -----
    Nodes are listed in order of first occurrence in the trace, so the result
    is reproducible between runs (iterating a ``set`` of strings is not).
    Labels are collected from every event, so a trace with a single event
    yields one node and no edges instead of an empty node matrix.
    """
    trace_length = len(current_trace)
    # Width of a one-hot encoded activity label.
    dim = len(activity_label_array)

    # Activity label of every event, in trace order.
    labels = [current_trace[position].get('concept:name') for position in range(trace_length)]

    # Directly-follows pairs -> positions of their occurrence in the trace.
    pair_dict = {}
    for position in range(trace_length - 1):
        activity_tuple = (labels[position], labels[position + 1])
        pair_dict.setdefault(activity_tuple, []).append(position)

    # Unique labels in first-occurrence order; dict keys keep insertion order.
    act_set = list(dict.fromkeys(labels))
    # Label -> node index, replaces repeated linear ``list.index`` searches.
    node_index = {label: row for row, label in enumerate(act_set)}

    # One one-hot encoded row per node.
    node_feature = np.zeros((len(act_set), dim), dtype=np.float32)
    for row, label in enumerate(act_set):
        node_feature[row] = act_encoder.transform(np.array(label).reshape(-1, 1)).toarray()

    # One m-array per edge: 1 at each position where the pair occurs.
    m_array_dict = {}
    for (source_label, target_label), positions in pair_dict.items():
        m_array = np.zeros(max_trace_length - 1, dtype=np.intc)
        m_array[positions] = 1
        m_array_dict[(node_index[source_label], node_index[target_label])] = m_array

    return pair_dict, node_feature, m_array_dict, act_set

In [None]:
# Method for creating the p_array dict; its structure is similar to the structure of the m_array dict:
# the keys are exactly the same.
def build_p_array_dict(current_trace, m_array, pairs, act_set, max_trace_length, categorical, cardinality, cat_encoder_list):
    """Build, per edge, the flat array of categorical attribute changes.

    Parameters
    ----------
    current_trace
        Indexable sequence of events; each event provides ``get(name)``.
    m_array
        Mapping keyed by ``(source_index, target_index)``; only the keys are used.
    pairs
        Maps ``(source_label, target_label)`` to the positions at which that
        directly-follows pair occurs in the trace.
    act_set
        Node labels; ``act_set[index]`` is the label of node ``index``.
    max_trace_length
        Length of the longest trace; each p-array has
        ``(max_trace_length - 1) * sum(cardinality)`` entries.
    categorical, cardinality, cat_encoder_list
        Parallel sequences: attribute name, number of one-hot columns of the
        attribute, and its fitted encoder.

    Returns
    -------
    dict
        Same keys as ``m_array``; each value is a float32 array made of one
        block of ``sum(cardinality)`` entries per trace position. For an
        occurrence at position ``i`` the slot of an attribute holds

        * the one-hot vector of the target value if the source value is NaN,
        * ``target_onehot - source_onehot`` if the two values differ,
        * zeros if the target value is NaN or both values are equal.
    """
    # No extra dimension for timestamp or for binary, numerical, ordinal attributes.
    block_width = sum(cardinality)
    changes_by_edge = {}

    for edge in m_array:
        flat_changes = np.zeros((max_trace_length - 1) * block_width, dtype=np.float32)
        source_label = act_set[edge[0]]
        target_label = act_set[edge[1]]

        for position in pairs[(source_label, target_label)]:
            offset = 0  # start of the current attribute inside the position block
            for attr_no, attr_name in enumerate(categorical):
                width = cardinality[attr_no]
                start = position * block_width + offset
                stop = start + width
                offset += width

                value_before = current_trace[position].get(attr_name)
                value_after = current_trace[position + 1].get(attr_name)

                # A NaN target value leaves the slot at zero.
                if str(value_after) == 'nan':
                    continue

                encoder = cat_encoder_list[attr_no]
                after_onehot = encoder.transform(np.array(value_after).reshape(-1, 1)).toarray()

                if str(value_before) == 'nan':
                    # Nothing to subtract: the change is the new value itself.
                    flat_changes[start:stop] = after_onehot
                elif value_after != value_before:
                    before_onehot = encoder.transform(np.array(value_before).reshape(-1, 1)).toarray()
                    flat_changes[start:stop] = after_onehot - before_onehot

        # Link the finished array to the current edge key.
        changes_by_edge[edge] = flat_changes

    return changes_by_edge

In [None]:
# Root folder of the experiment; every other location below is derived from it,
# so the resulting strings are identical to spelling each path out in full.
interest_path = r'D:\Final master thesis evaluation\exp1'
# Data folder of the experiment (variable name kept as-is).
targat_path = interest_path + r'\data'
# Folder that receives one `data_<idx>.pt` graph file per case.
processed_path = targat_path + r'\processed'
# Glob pattern matching the processed graph files.
processed_Pattern = processed_path + r'\*.pt'
# Pickle file that stores the list of case ids.
case_id_target = interest_path + r'\case_id_list.pkl'
# Glob pattern matching the XES event logs in the experiment folder.
log_pattern = interest_path + r'\*.xes'
logs = glob.glob(log_pattern)

In [None]:
# Import the first event log matched by `log_pattern`; fail with a clear
# message instead of a bare IndexError when nothing was found.
if not logs:
    raise FileNotFoundError(f'No .xes event log matches the pattern: {log_pattern}')
event_log = xes_importer.apply(logs[0])
# get required meta data
num_cases, tmax, act_array, act_enc, categorical_attributes, cardinality_list, cat_encoder_list = logStatistics(event_log)
# torch.save does not create missing folders, so make sure the target exists.
os.makedirs(processed_path, exist_ok=True)
# Row widths of the per-edge arrays; needed to give traces without any edge
# correctly shaped (0, width) tensors.
m_array_width = max(tmax - 1, 0)
edge_attr_width = m_array_width * sum(cardinality_list)
# core part of feature eng.: one graph file per case, named by running index
case_id_list = []
idx = 0
for i in range(len(event_log)):
    case = event_log[i]
    pairs, node_feature_matrix, m_array_dict, activity_set = build_node_feature_matrix(case, act_array, act_enc, tmax)
    p_array_dict = build_p_array_dict(case, m_array_dict, pairs, activity_set, tmax, categorical_attributes,
                                      cardinality_list, cat_encoder_list)
    case_id_list.append(case.attributes.get('concept:name'))
    # Fix one edge order and use it for all three edge tensors.
    edge_keys = list(p_array_dict)
    # Shape (num_edges, 2); the explicit reshape keeps this true for zero edges.
    edge_index = torch.tensor([list(key) for key in edge_keys], dtype=torch.long).reshape(len(edge_keys), 2)
    if edge_keys:
        # Stack the NumPy rows once instead of converting every scalar via Python lists.
        edge_attr = torch.from_numpy(np.stack([p_array_dict[key] for key in edge_keys])).float()
        edge_m_array = torch.from_numpy(np.stack([m_array_dict[key] for key in edge_keys])).to(torch.uint8)
    else:
        edge_attr = torch.zeros((0, edge_attr_width), dtype=torch.float)
        edge_m_array = torch.zeros((0, m_array_width), dtype=torch.uint8)
    x = torch.from_numpy(node_feature_matrix).float()
    # edge_index is transposed to shape (2, num_edges) for the graph object.
    graph = Data(x=x, edge_index=edge_index.t().contiguous(), edge_attr=edge_attr, edge_m_array=edge_m_array)
    torch.save(graph, osp.join(processed_path, f'data_{idx}.pt'))
    idx += 1
# Store the case ids in graph order; the context manager closes the file even
# if pickling fails.
with open(case_id_target, "wb") as case_id_file:
    pickle.dump(case_id_list, case_id_file)