In [1]:
import os
import re
import glob
import time
import random 
import pickle
import signal
import statistics
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RepeatedKFold
# for CV
# process mining
import pm4py
from pm4py.objects.conversion.log import converter as log_converter
# miners
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
# performance metrics
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator

In [2]:
def load_event_log(dataset_path=None, n_rows=1000):
    print('Reading', dataset_path)
    try:
        event_log = pd.read_csv(dataset_path, nrows=n_rows)
    except:
        return None
    h = event_log.columns.values.tolist()
    # check if a dataset contains a tuple of case_id, activity, and timestamp
    if ('case:concept:name' in h) and ('concept:name' in h) and ('time:timestamp' in h):
        # pre-process an event log
        return event_log.fillna(np.nan).replace([np.nan], [''])
    else:
        return None

In [3]:
datasets = glob.glob('../datasets/*.csv', recursive=False)
event_logs = [load_event_log(d, n_rows=1000) for d in datasets]
# remove None from event_logs
event_logs = [e for e in event_logs if e is not None]
print('Datasets found in the dataset dir:', len(datasets))
print('Datasets that are ready for evaluation:', len(event_logs))

Reading ../datasets/BPIC2015_2.csv
Reading ../datasets/BPIC2013_incident_management.csv
Reading ../datasets/BPIC2020_Prepaid_Travel_Costs.csv
Reading ../datasets/BPIC2018.csv
Reading ../datasets/BPIC2013_problem_management_open_problems.csv
Reading ../datasets/BPIC2016_Clicks_NOT_Logged_In.csv
Reading ../datasets/BPIC2015_5.csv
Reading ../datasets/BPIC2015_4.csv
Reading ../datasets/BPIC2016_Clicks_Logged_In.csv
Reading ../datasets/BPIC2014_change_log.csv
Reading ../datasets/Production_Data.csv
Reading ../datasets/BPIC2011_Dutch_academic_hospital.csv
Reading ../datasets/BPIC2017.csv
Reading ../datasets/BPIC2013_problem_management_closed_problems.csv
Reading ../datasets/BPIC2015_3.csv
Reading ../datasets/BPIC2012_loan_application_process.csv
Reading ../datasets/BPIC2019_purchase_order_handling_process.csv
Reading ../datasets/BPI2016_Clicks_Logged_In.csv
Reading ../datasets/BPIC2015_1.csv
Reading ../datasets/BPI2016_Clicks_NOT_Logged_In.csv
Datasets found in the dataset dir: 20
Datasets t

In [4]:
def measure_score(d=None, key_indices=None, miner='inductive_miner', n_splits=2,
                  model_discovery_timeout_in_sec=5, evaluation_timeout_in_sec=60):
    def discover_model(event_log, miner):
        # default parameters are used
        if miner == 'inductive_miner':
            net, im, fm = inductive_miner.apply(event_log,
                {pm4py.algo.discovery.inductive.variants.im.algorithm.Parameters.NOISE_THRESHOLD: 0.2},
                pm4py.algo.discovery.inductive.algorithm.Variants.IM)
        elif miner == 'heuristics_miner':
            net, im, fm = heuristics_miner.apply(event_log, {
                heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH: 0.5,
                heuristics_miner.Variants.CLASSIC.value.Parameters.AND_MEASURE_THRESH: 0.65,
                heuristics_miner.Variants.CLASSIC.value.Parameters.MIN_ACT_COUNT: 1,
                heuristics_miner.Variants.CLASSIC.value.Parameters.MIN_DFG_OCCURRENCES: 1,
                heuristics_miner.Variants.CLASSIC.value.Parameters.DFG_PRE_CLEANING_NOISE_THRESH: 0.05,
                heuristics_miner.Variants.CLASSIC.value.Parameters.LOOP_LENGTH_TWO_THRESH: 2})
        return net, im, fm

    def evaluate_score(event_log, net, im, fm, metric):
        fitness = replay_fitness_evaluator.apply(event_log, net, im, fm, 
                                                 variant=replay_fitness_evaluator.Variants.TOKEN_BASED)['log_fitness']
        precision = precision_evaluator.apply(event_log, net, im, fm, 
                                              variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
        generalization = generalization_evaluator.apply(event_log, net, im, fm)
        simplicity = simplicity_evaluator.apply(net)
        return fitness + precision + generalization + simplicity
    
    def timeout_handler(signum, frame):
        raise Exception('timeout')
        
    def cross_validation(event_log_df, train, test):
        # dataset need be sorted by timestamp (https://pm4py.fit.fraunhofer.de/documentation#item-import-csv)
        #  print("%s %s" % (train, test))
        train_log_df = event_log_df.iloc[train]
        train_log_df = train_log_df.sort_values('time:timestamp')
        #  print('train_log_df')
        #  print(set(train_log_df['case:concept:name']))
        test_log_df = event_log_df.iloc[test]
        test_log_df = test_log_df.sort_values('time:timestamp')
        #  print('test_log_df')
        #  print(set(test_log_df['case:concept:name']))
        train_log = log_converter.apply(train_log_df, \
                                        variant=log_converter.Variants.TO_EVENT_LOG)
        test_log = log_converter.apply(test_log_df, \
                                       variant=log_converter.Variants.TO_EVENT_LOG)
    
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), \
              'discovering a process model...')
        
        # discover_model may take time. hence, we set a timeout for it
        # https://stackoverflow.com/a/494273/7184459
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(model_discovery_timeout_in_sec)
        try:
            net, im, fm = discover_model(train_log, miner)
        except Exception:
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'gave up discovering a model')
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'score evaluated:', 0)
            return 0
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'process model discovered')
        # reset a timer when a model discovery finishes before the deadline
        signal.alarm(0)
        # set another timer for evaluation
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(evaluation_timeout_in_sec)
        try:
            score = evaluate_score(test_log, net, im, fm, metric)
        except Exception:
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'gave up evaluating a model')
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'score evaluated:', 0)
            return 0
        signal.alarm(0)
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'score evaluated:', score)
        return score
    
    # evaluate the goodness of such an assumption with CV
    gkf = GroupKFold(n_splits=n_splits)
    X = list(d.index)
    d.columns.values[2] = 'case:concept:name' # treat d.columns.values[2] as case_id
    groups = list(d['case:concept:name']) 
    if len(set(groups)) < n_splits:
        print("couldn't group events well with the given case_id candidate. score: 0")
        return 0
    else:
        scores = [cross_validation(d, train, test) for train, test in gkf.split(X, groups=groups)]
        score = round(np.mean(scores), ndigits=3)
        print('final score:', score)
    return score

In [5]:
def infer_case_id_column(event_log):
    # 0. exclude known columns (i.e. activity and timestamp) from a given event log
    column_names = set(event_log.columns.values.tolist())
    known_columns = set(['concept:name', 'time:timestamp'])
    case_id_candidates = list(column_names - known_columns)
    e = event_log.loc[:, case_id_candidates]
    # 1. calculate g and gr
    def g(column_values):
        return 1 - len(set(column_values)) / len(column_values)
    gs = e.apply(lambda x: g(x), axis=0).values
    avg_g = np.mean(gs)
    grs = 1 - np.abs(gs - avg_g)
    # 2. evaluate the four metrics
    
    start_time = time.time()
    scores = [measure_score(event_log.loc[:, list(known_columns) + [c]]) for c in case_id_candidates]
    scores = ((scores + grs) / 5).tolist()
    computation_time = time.time() - start_time
    # 3. return the index of the maximum score as the case_id's column 
    return {
        'computation_time': computation_time,
        'is_correct': event_log.columns.values.tolist()[scores.index(max(scores))] == 'case:concept:name'
    }

In [6]:
results = [infer_case_id_column(event_log) for event_log in event_logs]

with open(r'results_A_Andaloussi.pickle', 'wb') as output_file:
    pickle.dump(results, output_file)

couldn't group events well with the given case_id candidate. score: 0
2022-08-08 04:34:57 discovering a process model...
2022-08-08 04:34:57 process model discovered
2022-08-08 04:34:57 gave up evaluating a model
2022-08-08 04:34:57 score evaluated: 0
2022-08-08 04:34:57 discovering a process model...
2022-08-08 04:35:02 gave up discovering a model
2022-08-08 04:35:02 score evaluated: 0
final score: 0.0
2022-08-08 04:35:02 discovering a process model...
2022-08-08 04:35:02 process model discovered
2022-08-08 04:35:02 gave up evaluating a model
2022-08-08 04:35:02 score evaluated: 0
2022-08-08 04:35:02 discovering a process model...
2022-08-08 04:35:02 process model discovered
2022-08-08 04:35:02 gave up evaluating a model
2022-08-08 04:35:02 score evaluated: 0
final score: 0.0
2022-08-08 04:35:02 discovering a process model...
2022-08-08 04:35:07 gave up discovering a model
2022-08-08 04:35:07 score evaluated: 0
2022-08-08 04:35:07 discovering a process model...
2022-08-08 04:35:12 gav

In [7]:
results

[{'computation_time': 156.94144010543823, 'is_correct': False},
 {'computation_time': 0.9789524078369141, 'is_correct': False},
 {'computation_time': 6.929790258407593, 'is_correct': False},
 {'computation_time': 7.271513223648071, 'is_correct': False},
 {'computation_time': 0.8399205207824707, 'is_correct': False},
 {'computation_time': 173.53237318992615, 'is_correct': False},
 {'computation_time': 171.80070900917053, 'is_correct': False},
 {'computation_time': 498.7450203895569, 'is_correct': False},
 {'computation_time': 7.851968050003052, 'is_correct': False},
 {'computation_time': 1.0385560989379883, 'is_correct': False},
 {'computation_time': 149.25721526145935, 'is_correct': False},
 {'computation_time': 3.214446783065796, 'is_correct': False},
 {'computation_time': 6.648387432098389, 'is_correct': False},
 {'computation_time': 181.50213479995728, 'is_correct': False}]