In [1]:
import os
import re
import glob
import random 
import statistics
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from collections import Counter
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestClassifier

In [2]:
def load_event_log(dataset_path=None, n_rows=1000):
    print('Reading', dataset_path)
    try:
        event_log = pd.read_csv(dataset_path, nrows=n_rows)
    except:
        return None
    h = event_log.columns.values.tolist()
    # check if a dataset contains a tuple of case_id, activity, and timestamp
    if ('case:concept:name' in h) and ('concept:name' in h) and ('time:timestamp' in h):
        # pre-process an event log
        return event_log.fillna(np.nan).replace([np.nan], [''])
    else:
        return None


In [3]:
# feature_extraction: 
# input: expect a vector of values in a column
# output: set of features
def local_feature_extraction(value, round_digits=3):
    patterns = ['[a-z]', '[A-Z]', '\d', '\s', '[^a-zA-Z\d\s]']
    f = {}
    if len(value) == 0:
        f['f_chars'] = 0 
        f['f_words'] = 0
        for p in patterns:
            f['f_{}'.format(p)] = 0
    else:
        # length: length of a value
        f['f_chars'] = len(value)
        # words: number of words in a value
        f['f_words'] = len(re.findall(r'\w+', value))
        # The following code find the frequency of each pattern in patterns in a value
        for p in patterns:
            f['f_{}'.format(p)] = round(len(re.findall(p, value)) / len(value), round_digits)
    return f

def feature_extraction(values, round_digits=3):
    # set type of values string
    values = values.astype(str)
    # local features
    f_local = [local_feature_extraction(value) for value in values]
    # convert it into a DF to easily calculate mean of each feature
    f = pd.DataFrame.from_dict(f_local)
    f = f.apply(np.mean, axis=0)
    # global features
    # count the occurence of each value in values
    counts = Counter(values).values()
    if len(counts) > 1:
        # find the mean and variance of counts
        # append global features
        f['f_ratio_unique_values'] = round(len(set(values)) / len(values), round_digits)
        f['f_mean_unique_values'] = round(statistics.mean(counts), round_digits)
    else:
        f['f_ratio_unique_values'] = 1
        f['f_mean_unique_values'] = 1
    return f.to_numpy()


In [4]:
datasets = glob.glob('../datasets/*.csv', recursive=False)
event_logs = [load_event_log(d, n_rows=1000) for d in datasets]
# remove None from event_logs
event_logs = [e for e in event_logs if e is not None]
print('Datasets found in the dataset dir:', len(datasets))
print('Datasets that are ready for evaluation:', len(event_logs))

Reading ../datasets/BPIC2015_2.csv
Reading ../datasets/BPIC2013_incident_management.csv
Reading ../datasets/BPIC2018.csv
Reading ../datasets/BPIC2013_problem_management_open_problems.csv
Reading ../datasets/BPIC2016_Clicks_NOT_Logged_In.csv
Reading ../datasets/BPIC2015_5.csv
Reading ../datasets/BPIC2015_4.csv
Reading ../datasets/BPIC2016_Clicks_Logged_In.csv
Reading ../datasets/BPIC2014_change_log.csv
Reading ../datasets/Production_Data.csv
Reading ../datasets/BPIC2011_Dutch_academic_hospital.csv
Reading ../datasets/BPIC2017.csv
Reading ../datasets/BPIC2013_problem_management_closed_problems.csv
Reading ../datasets/BPIC2015_3.csv
Reading ../datasets/BPIC2012_loan_application_process.csv
Reading ../datasets/BPIC2019_purchase_order_handling_process.csv
Reading ../datasets/BPI2016_Clicks_Logged_In.csv
Reading ../datasets/BPIC2015_1.csv
Reading ../datasets/BPI2016_Clicks_NOT_Logged_In.csv
Datasets found in the dataset dir: 19
Datasets that are ready for evaluation: 13


In [5]:
tmp = [e.apply(lambda x: feature_extraction(x), axis=0) for e in event_logs]

In [6]:
# relabel: if a label is a key label, keep it. Otherwise, relabel it as 'other'
def relabel(labels):
    return np.array([label if re.search(r'case:concept:name|concept:name|time:timestamp', label) else 'other' \
                     for label in labels])

# flatten: flatten an array of arrays
# https://stackoverflow.com/a/952952/7184459
def flatten(x):
    return [x_elem_elem for x_elem in x for x_elem_elem in x_elem]

In [9]:
n_splits = 2
n_repeats = 10
n_estimators = 100

# extract labels and features
y = np.array([np.array(relabel(e.columns.values)) for e in tmp], dtype=object)
X = np.array([np.transpose(e).values for e in tmp], dtype=object)
# performance evaluation using CV
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats)
for train_index, test_index in rkf.split(range(len(X))):
    clf = RandomForestClassifier(n_estimators=n_estimators)
    X_train, X_test = flatten(X[train_index]), flatten(X[test_index])
    y_train, y_test = flatten(y[train_index]), flatten(y[test_index])
    clf.fit(X_train, y_train)
    # evaluate the classifier
    y_predicted = clf.predict(X_test)
    print(classification_report(y_test, y_predicted))

                   precision    recall  f1-score   support

case:concept:name       0.67      0.29      0.40         7
     concept:name       0.67      0.29      0.40         7
            other       0.95      0.99      0.97       278
   time:timestamp       1.00      0.43      0.60         7

         accuracy                           0.95       299
        macro avg       0.82      0.50      0.59       299
     weighted avg       0.94      0.95      0.94       299

                   precision    recall  f1-score   support

case:concept:name       1.00      0.17      0.29         6
     concept:name       0.50      0.17      0.25         6
            other       0.92      0.97      0.94       112
   time:timestamp       0.75      1.00      0.86         6

         accuracy                           0.90       130
        macro avg       0.79      0.58      0.58       130
     weighted avg       0.89      0.90      0.88       130

                   precision    recall  f1-score  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

case:concept:name       1.00      0.29      0.44         7
     concept:name       0.00      0.00      0.00         7
            other       0.90      0.98      0.94       115
   time:timestamp       0.78      1.00      0.88         7

         accuracy                           0.90       136
        macro avg       0.67      0.57      0.57       136
     weighted avg       0.86      0.90      0.86       136

                   precision    recall  f1-score   support

case:concept:name       0.67      0.67      0.67         6
     concept:name       1.00      0.33      0.50         6
            other       0.97      0.99      0.98       275
   time:timestamp       1.00      0.83      0.91         6

         accuracy                           0.97       293
        macro avg       0.91      0.71      0.76       293
     weighted avg       0.97      0.97      0.97       293



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

case:concept:name       0.50      0.14      0.22         7
     concept:name       0.00      0.00      0.00         7
            other       0.95      0.99      0.97       270
   time:timestamp       0.78      1.00      0.88         7

         accuracy                           0.95       291
        macro avg       0.56      0.53      0.52       291
     weighted avg       0.92      0.95      0.93       291

                   precision    recall  f1-score   support

case:concept:name       0.75      0.50      0.60         6
     concept:name       0.50      0.17      0.25         6
            other       0.92      0.98      0.95       120
   time:timestamp       1.00      0.67      0.80         6

         accuracy                           0.91       138
        macro avg       0.79      0.58      0.65       138
     weighted avg       0.90      0.91      0.90       138

                   precision    recall  f1-score  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

case:concept:name       0.67      0.29      0.40         7
     concept:name       0.50      0.43      0.46         7
            other       0.95      0.96      0.95       176
   time:timestamp       0.70      1.00      0.82         7

         accuracy                           0.92       197
        macro avg       0.70      0.67      0.66       197
     weighted avg       0.91      0.92      0.91       197

                   precision    recall  f1-score   support

case:concept:name       0.67      0.33      0.44         6
     concept:name       0.00      0.00      0.00         6
            other       0.95      0.99      0.97       214
   time:timestamp       1.00      0.67      0.80         6

         accuracy                           0.94       232
        macro avg       0.65      0.50      0.55       232
     weighted avg       0.92      0.94      0.93       232

                   precision    recall  f1-score  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

case:concept:name       0.67      0.57      0.62         7
     concept:name       0.67      0.29      0.40         7
            other       0.97      0.99      0.98       291
   time:timestamp       1.00      0.71      0.83         7

         accuracy                           0.96       312
        macro avg       0.82      0.64      0.71       312
     weighted avg       0.95      0.96      0.95       312

                   precision    recall  f1-score   support

case:concept:name       1.00      0.33      0.50         6
     concept:name       0.00      0.00      0.00         6
            other       0.91      0.97      0.94        99
   time:timestamp       0.67      1.00      0.80         6

         accuracy                           0.89       117
        macro avg       0.64      0.58      0.56       117
     weighted avg       0.85      0.89      0.86       117



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
