In [1]:
import re
import glob
import pandas as pd
import numpy as np
from datetime import datetime 
from itertools import product
from SetSimilaritySearch import all_pairs
# for CV
from sklearn.model_selection import GroupKFold
# process mining
import pm4py
from pm4py.objects.conversion.log import converter as log_converter
# miners
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
# performance metrics
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator

In [2]:
def similarity_check(d=None, key_indices=None, min_cosine_sim=0.5, similarity_threshold=0.1):
    """Check whether the cases of a candidate event log resemble each other.

    The columns of ``d`` selected by ``key_indices`` are interpreted, from
    left to right, as case id, timestamp and activity. Every case is reduced
    to the set of its activities, and the mean cosine similarity over the
    pairs reported by ``all_pairs`` is compared with ``min_cosine_sim``.

    Parameters
    ----------
    d : pandas.DataFrame
        Source table; it is not modified.
    key_indices : sequence of int
        Positional indices of the case id, timestamp and activity columns.
    min_cosine_sim : float
        The mean similarity must be strictly greater than this value.
    similarity_threshold : float
        Threshold handed to ``all_pairs``; the mean is taken only over the
        pairs that ``all_pairs`` reports for this threshold.

    Returns
    -------
    bool
        True when the mean similarity exceeds ``min_cosine_sim``. When there
        are no cases or no reported pairs the similarity counts as 0.
    """
    column_names = ['case:concept:name', 'time:timestamp', 'concept:name']
    # Relabel a private copy through the public API; writing into
    # ``columns.values`` mutates the backing array of an immutable Index.
    event_log_df = d.iloc[:, key_indices].copy()
    labels = list(event_log_df.columns)
    labels[:len(column_names)] = column_names
    event_log_df.columns = labels

    # one set of activities per case id
    sets = event_log_df.groupby('case:concept:name')['concept:name']\
        .apply(list).apply(set).tolist()

    cosine_sim = 0
    if sets:
        pairs = list(all_pairs(sets, similarity_func_name="cosine",
                               similarity_threshold=similarity_threshold))
        if pairs:
            # the similarity is the third element of every reported pair
            cosine_sim = np.mean([pair[2] for pair in pairs])

    return not (cosine_sim <= min_cosine_sim)

In [3]:
def measure_score(d=None, key_indices=None, miner='inductive_miner', metric='Buijs2014', n_splits=3):
    """Score how well three columns of ``d`` work as an event log.

    The columns selected by ``key_indices`` are assumed to be, from left to
    right, case id, timestamp and activity. The assumption is evaluated with
    ``GroupKFold`` cross-validation grouped by case id: a process model is
    discovered on the training rows and scored on the held-out rows.

    Parameters
    ----------
    d : pandas.DataFrame
        Source table; it is not modified.
    key_indices : sequence of int
        Positional indices of the case id, timestamp and activity columns.
    miner : {'inductive_miner', 'heuristics_miner'}
        Discovery algorithm.
    metric : {'fitness', 'precision', 'generalization', 'simplicity', 'Buijs2014', 'average'}
        A single quality dimension, or a weighted combination of all four.
    n_splits : int
        Number of cross-validation folds.

    Returns
    -------
    number
        Mean of the fold scores rounded to three decimals, or 0 when
        splitting, discovery or evaluation raises an exception.

    Raises
    ------
    ValueError
        If ``miner`` or ``metric`` is not one of the supported names.
    """
    # Weights apply to [fitness, precision, generalization, simplicity].
    composite_weights = {
        # A paper "Quality dimensions in process discovery: The importance of
        # fitness, precision, generalization and simplicity" proposed giving
        # 10 times more weight to replay fitness than to the other three.
        # 10 x + 3x = 1 => x = 1 / 13
        'Buijs2014': [10/13, 1/13, 1/13, 1/13],
        'average': [0.25, 0.25, 0.25, 0.25],
    }
    single_metrics = ('fitness', 'precision', 'generalization', 'simplicity')
    supported_miners = ('inductive_miner', 'heuristics_miner')

    # Reject unsupported names up front: inside the CV loop they would only
    # surface as an unbound local and be reported as a score of 0.
    if miner not in supported_miners:
        raise ValueError('unknown miner %r; expected one of %s'
                         % (miner, ', '.join(supported_miners)))
    if metric not in single_metrics and metric not in composite_weights:
        raise ValueError('unknown metric %r; expected one of %s'
                         % (metric, ', '.join(single_metrics + tuple(composite_weights))))

    def discover_model(event_log, miner):
        """Discover a model from ``event_log``; returns ``(net, im, fm)``."""
        if miner == 'inductive_miner':
            net, im, fm = inductive_miner.apply(event_log,
                {pm4py.algo.discovery.inductive.variants.im.algorithm.Parameters.NOISE_THRESHOLD: 0.2},
                pm4py.algo.discovery.inductive.algorithm.Variants.IM)
        else:
            hm_params = heuristics_miner.Variants.CLASSIC.value.Parameters
            net, im, fm = heuristics_miner.apply(event_log, {
                hm_params.DEPENDENCY_THRESH: 0.5,
                hm_params.AND_MEASURE_THRESH: 0.65,
                hm_params.MIN_ACT_COUNT: 1,
                hm_params.MIN_DFG_OCCURRENCES: 1,
                hm_params.DFG_PRE_CLEANING_NOISE_THRESH: 0.05,
                hm_params.LOOP_LENGTH_TWO_THRESH: 2})
        return net, im, fm

    def evaluate_score(event_log, net, im, fm, metric):
        """Evaluate the model on ``event_log`` for the requested ``metric``."""
        # Token-based variants are used for fitness and precision.
        def fitness():
            return replay_fitness_evaluator.apply(event_log, net, im, fm,
                    variant=replay_fitness_evaluator.Variants.TOKEN_BASED)['log_fitness']

        def precision():
            return precision_evaluator.apply(event_log, net, im, fm,
                    variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)

        def generalization():
            return generalization_evaluator.apply(event_log, net, im, fm)

        def simplicity():
            return simplicity_evaluator.apply(net)

        evaluators = {
            'fitness': fitness,
            'precision': precision,
            'generalization': generalization,
            'simplicity': simplicity,
        }
        if metric in evaluators:
            return evaluators[metric]()
        # composite metric: weighted sum of all four dimensions
        values = [fitness(), precision(), generalization(), simplicity()]
        return np.dot(composite_weights[metric], values)

    def cross_validation(event_log_df, train, test):
        """Discover on the ``train`` rows and score on the ``test`` rows."""
        # dataset need be sorted by timestamp (https://pm4py.fit.fraunhofer.de/documentation#item-import-csv)
        train_log_df = event_log_df.iloc[train].sort_values('time:timestamp')
        test_log_df = event_log_df.iloc[test].sort_values('time:timestamp')
        train_log = log_converter.apply(train_log_df,
                                        variant=log_converter.Variants.TO_EVENT_LOG)
        test_log = log_converter.apply(test_log_df,
                                       variant=log_converter.Variants.TO_EVENT_LOG)

        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
              'discovering a process model...')
        net, im, fm = discover_model(train_log, miner)
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'process model discovered')
        score = evaluate_score(test_log, net, im, fm, metric)
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'score evaluated:', score)
        return score

    # assume the columns as case_id, timestamp, and activity from left
    column_names = ['case:concept:name', 'time:timestamp', 'concept:name']
    # Relabel a private copy through the public API; writing into
    # ``columns.values`` mutates the backing array of an immutable Index.
    event_log_df = d.iloc[:, key_indices].copy()
    labels = list(event_log_df.columns)
    labels[:len(column_names)] = column_names
    event_log_df.columns = labels

    # evaluate the goodness of such an assumption with CV
    gkf = GroupKFold(n_splits=n_splits)
    X = list(event_log_df.index)
    groups = list(event_log_df['case:concept:name'])
    try:
        scores = [cross_validation(event_log_df, train, test)
                  for train, test in gkf.split(X, groups=groups)]
        score = round(np.mean(scores), ndigits=3)
    except Exception as exc:
        # A bad column assignment is expected to make mining fail, so it is
        # scored 0. KeyboardInterrupt is deliberately not caught so that a
        # long run can still be interrupted.
        print('scoring failed:', repr(exc))
        score = 0
    print('final score:', score)
    return score

In [4]:
# glob returns paths in arbitrary order; sort them so that datasets[0]
# refers to the same file on every run
dataset_names = sorted(glob.glob('../datasets/*.csv', recursive=False))
# only the first n_rows rows of each file are loaded
n_rows = 1000
datasets = [pd.read_csv(d, nrows=n_rows) for d in dataset_names]

In [5]:
# the following cells analyse the first loaded dataset
dataset = datasets[0]

dataset

Unnamed: 0,monitoringResource,org:resource,activityNameNL,concept:name,question,dateFinished,action_code,activityNameEN,planned,lifecycle:transition,...,case:last_phase,case:case_type,case:startDate,case:requestComplete,case:SUMleges,case:IDofConceptCase,case:termName,case:landRegisterID,dueDate,dateStop
0,4634935,560530,registratie datum binnenkomst aanvraag,01_HOOFD_010,EMPTY,2013-01-10 12:07:35,01_HOOFD_010,register submission date request,2012-03-23 10:21:14+01:00,complete,...,Zaak afgehandeld,557669,2012-03-21 00:00:00+01:00,False,,,,,,
1,4634935,560530,OLO berichtenverkeer actief,01_HOOFD_011,False,2013-01-10 12:07:35,01_HOOFD_011,OLO messaging active,2012-03-23 10:26:05+01:00,complete,...,Zaak afgehandeld,557669,2012-03-21 00:00:00+01:00,False,,,,,,
2,4634935,560530,versturen ontvangstbevestiging,01_HOOFD_020,True,2013-01-10 12:07:35,01_HOOFD_020,send confirmation receipt,,complete,...,Zaak afgehandeld,557669,2012-03-21 00:00:00+01:00,False,,,,,,
3,4634935,560530,fase aanvraag ontvangen,01_HOOFD_015,EMPTY,2013-01-10 12:07:35,01_HOOFD_015,phase application received,,complete,...,Zaak afgehandeld,557669,2012-03-21 00:00:00+01:00,False,,,,,,
4,4634935,560530,aanvrager is belanghebbende,03_GBH_005,True,2013-01-10 12:07:35,03_GBH_005,applicant is stakeholder,,complete,...,Zaak afgehandeld,557669,2012-03-21 00:00:00+01:00,False,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,560521,560532,verlengen proceduretermijn,06_VD_010,False,2012-04-25 10:43:10,06_VD_010,extend procedure term,,complete,...,Zaak afgehandeld,557669,2012-04-23 00:00:00+02:00,True,465.7575,,,,,
996,560521,560532,registratie datum publicatie,01_HOOFD_101,25-4-2012 10:43:07,2012-04-25 10:43:10,01_HOOFD_101,registration date publication,,complete,...,Zaak afgehandeld,557669,2012-04-23 00:00:00+02:00,True,465.7575,,,,,
997,560521,560532,WAW vergunningsaspect,01_HOOFD_130,False,2012-04-25 10:43:10,01_HOOFD_130,WAW permit aspect,,complete,...,Zaak afgehandeld,557669,2012-04-23 00:00:00+02:00,True,465.7575,,,,,
998,560521,560532,behandelen deelzaken volledigheid,01_HOOFD_110,EMPTY,2012-06-20 11:42:30,01_HOOFD_110,treat subcases completeness,2012-04-26 10:43:07+02:00,complete,...,Zaak afgehandeld,557669,2012-04-23 00:00:00+02:00,True,465.7575,,,,,


In [6]:
# flag every column whose distinct values all contain at least one digit
tmp = [
    all(bool(re.search(r'\d', str(value))) for value in set(column))
    for _, column in dataset.items()
]
# positional indices of the flagged columns
timestamp_cand = list(np.flatnonzero(tmp))
timestamp_cand

[0, 1, 3, 5, 6, 10, 12, 13, 19, 20]

In [7]:
# a case_id should also contain a digit, so only the digit-bearing columns are considered
digit_columns = dataset.iloc[:, timestamp_cand]
# the same case_id must appear multiple times: keep the columns whose
# distinct values occur more than once on average (after rounding)
tmp = digit_columns.apply(
    lambda column: round(np.mean(column.value_counts()), ndigits=0) > 1,
    axis=0,
)
# translate the surviving positions back into indices of the full dataset
surviving_positions = np.where(tmp)[0]
case_id_cand = list(np.array(timestamp_cand)[surviving_positions])
case_id_cand

[0, 1, 3, 5, 6, 10, 12, 13, 19, 20]

In [8]:
# activity
def _repeats_on_average(column):
    """Return True when the distinct values of ``column`` occur more than once on average (after rounding)."""
    return round(np.mean(column.value_counts()), ndigits=0) > 1

# the same activity must appear multiple times; any column may qualify
tmp = dataset.apply(_repeats_on_average, axis=0)
activity_cand = list(np.where(tmp)[0])
activity_cand

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24]

In [9]:
# enumerate every (case_id, timestamp, activity) combination of candidate indices;
# one column cannot play two roles, so only triples of distinct indices are kept
key_index_cand = [
    [case_id, timestamp, activity]
    for case_id, timestamp, activity in product(case_id_cand, timestamp_cand, activity_cand)
    if len({case_id, timestamp, activity}) >= 3
]

In [10]:
# run the cheap similarity pre-check on every candidate triple
passes_check = [
    similarity_check(dataset, key_indices, min_cosine_sim=0.5)
    for key_indices in key_index_cand
]
# keep only the triples that passed; the scoring cell iterates over tmp
tmp = np.array(key_index_cand)[np.array(passes_check)]

In [None]:
# score every candidate that survived the similarity pre-check;
# each call mines and evaluates process models, so this cell is slow
scores = [
    measure_score(dataset, candidate)
    for candidate in tmp
]

2022-07-12 14:53:35 discovering a process model...
2022-07-12 15:05:07 process model discovered


replaying log with TBR, completed variants ::   0%|          | 0/441 [00:00<?, ?it/s]

2022-07-12 17:55:33 score evaluated: 0.7358719988016268
2022-07-12 17:55:33 discovering a process model...
2022-07-12 18:07:24 process model discovered


replaying log with TBR, completed variants ::   0%|          | 0/2 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/261 [00:00<?, ?it/s]

In [None]:
# positions of the columns that carry the standard event log names
h = list(dataset.columns)
correct_labels = [
    h.index(name)
    for name in ('case:concept:name', 'time:timestamp', 'concept:name')
]
correct_labels

In [None]:
# reference score for the columns located by name in correct_labels
measure_score(dataset, correct_labels)

In [None]:
# candidate scores in ascending order
sorted(scores)