In [1]:
import os
import re
import glob
import random 
import statistics
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from collections import Counter
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import RandomForestClassifier

In [2]:
def load_event_log(dataset_path=None, n_rows=1000):
    """Read the first rows of a CSV event log and blank out missing values.

    Parameters
    ----------
    dataset_path : path-like or None
        Location of the CSV file; handed to ``pd.read_csv`` as is.
    n_rows : int, default 1000
        Maximum number of rows to read (``nrows`` of ``pd.read_csv``).

    Returns
    -------
    pandas.DataFrame or None
        The event log with every missing value replaced by an empty string,
        or ``None`` when the file cannot be read or when it lacks any of the
        columns ``case:concept:name``, ``concept:name`` and ``time:timestamp``.
    """
    print('Reading', dataset_path)
    try:
        event_log = pd.read_csv(dataset_path, nrows=n_rows)
    except Exception as err:
        # Loading is best-effort: an unreadable file is skipped, not fatal.
        # Catching Exception (instead of a bare except) lets KeyboardInterrupt
        # and SystemExit propagate, so a long run can still be aborted.
        print('Skipping {}: {}'.format(dataset_path, err))
        return None
    # An event log must provide the (case id, activity, timestamp) triple.
    required_columns = {'case:concept:name', 'concept:name', 'time:timestamp'}
    if not required_columns.issubset(event_log.columns.values.tolist()):
        return None
    # Missing cells become '' rather than NaN.
    return event_log.fillna(np.nan).replace([np.nan], [''])

In [3]:
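# Feature extraction helpers:
# - local_feature_extraction: input is a single string value; output is a dict of per-value features
# - feature_extraction: input is a vector of values in a column; output is the aggregated set of features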
def local_feature_extraction(value, round_digits=3):
    r"""Compute character-level features for a single string value.

    Parameters
    ----------
    value : str
        The text to analyse.
    round_digits : int, default 3
        Number of decimals kept for the character-class ratios.

    Returns
    -------
    dict
        ``f_chars`` (length of ``value``), ``f_words`` (number of ``\w+``
        matches) and, for each regex in ``patterns``, ``f_<regex>``: the
        number of matches divided by the length of ``value``. Every entry is
        0 when ``value`` is empty. Keys are always inserted in the same order.
    """
    # Raw strings: '\d' and '\s' inside a plain literal are invalid escape
    # sequences (SyntaxWarning on Python 3.12+). The resulting pattern text,
    # and therefore every 'f_<regex>' key, is unchanged.
    patterns = [r'[a-z]', r'[A-Z]', r'\d', r'\s', r'[^a-zA-Z\d\s]']
    n_chars = len(value)
    if n_chars == 0:
        # Nothing to count, and the ratios below would divide by zero.
        features = {'f_chars': 0, 'f_words': 0}
        for pattern in patterns:
            features['f_{}'.format(pattern)] = 0
        return features
    features = {
        # length of the value
        'f_chars': n_chars,
        # number of words in the value
        'f_words': len(re.findall(r'\w+', value)),
    }
    # relative frequency of each character class in the value
    for pattern in patterns:
        n_matches = len(re.findall(pattern, value))
        features['f_{}'.format(pattern)] = round(n_matches / n_chars, round_digits)
    return features

def feature_extraction(values, round_digits=3):
    """Aggregate per-value text features over a column into one vector.

    Parameters
    ----------
    values : array-like with ``astype`` (e.g. a DataFrame column)
        The values of one column; each is converted to ``str`` first.
    round_digits : int, default 3
        Decimals kept when rounding, both for the per-value features and for
        the two column-level features.

    Returns
    -------
    numpy.ndarray
        The mean of every per-value feature produced by
        ``local_feature_extraction``, followed by ``f_ratio_unique_values``
        and ``f_mean_unique_values``.
    """
    # treat every value as text
    values = values.astype(str)
    # Local features. round_digits is forwarded so the caller's precision also
    # applies to the per-value ratios (it was previously ignored for them).
    f_local = [
        local_feature_extraction(value, round_digits=round_digits)
        for value in values
    ]
    # one row per value, one column per feature -> column-wise mean
    f = pd.DataFrame(f_local).mean(axis=0)
    # Global features: how often each distinct value occurs.
    counts = Counter(values).values()
    if len(counts) > 1:
        # f_ratio_unique_values: number of distinct values / number of values
        f['f_ratio_unique_values'] = round(len(counts) / len(values), round_digits)
        # f_mean_unique_values: mean number of appearances per distinct value
        f['f_mean_unique_values'] = round(statistics.mean(counts), round_digits)
    else:
        # NOTE(review): besides empty input, this branch also covers a column
        # holding one distinct value repeated many times, which therefore gets
        # the same (1, 1) as an all-unique column - confirm this is intended.
        f['f_ratio_unique_values'] = 1
        f['f_mean_unique_values'] = 1
    return f.to_numpy()


In [4]:
# Sort the matches: glob returns paths in arbitrary (filesystem-dependent)
# order, which would make the order of event_logs differ between machines.
datasets = sorted(glob.glob('../datasets/*.csv', recursive=False))
event_logs = [load_event_log(d, n_rows=1000) for d in datasets]
# Drop the datasets load_event_log rejected (it returns None for those).
event_logs = [e for e in event_logs if e is not None]
print('Datasets found in the dataset dir:', len(datasets))
print('Datasets that are ready for evaluation:', len(event_logs))

Reading ../datasets/BPIC2015_2.csv
Reading ../datasets/BPIC2013_incident_management.csv
Reading ../datasets/BPIC2018.csv
Reading ../datasets/BPIC2013_problem_management_open_problems.csv
Reading ../datasets/BPIC2016_Clicks_NOT_Logged_In.csv
Reading ../datasets/BPIC2015_5.csv
Reading ../datasets/BPIC2015_4.csv
Reading ../datasets/BPIC2016_Clicks_Logged_In.csv
Reading ../datasets/BPIC2014_change_log.csv
Reading ../datasets/Production_Data.csv
Reading ../datasets/BPIC2011_Dutch_academic_hospital.csv
Reading ../datasets/BPIC2017.csv
Reading ../datasets/BPIC2013_problem_management_closed_problems.csv
Reading ../datasets/BPIC2015_3.csv
Reading ../datasets/BPIC2012_loan_application_process.csv
Reading ../datasets/BPIC2019_purchase_order_handling_process.csv
Reading ../datasets/BPI2016_Clicks_Logged_In.csv
Reading ../datasets/BPIC2015_1.csv
Reading ../datasets/BPI2016_Clicks_NOT_Logged_In.csv
Datasets found in the dataset dir: 19
Datasets that are ready for evaluation: 13


In [5]:
# One result per event log: feature_extraction applied to every column.
tmp = [log.apply(feature_extraction, axis=0) for log in event_logs]