In [1]:
# Check the statistics of key columns

In [67]:
import re
import glob
import statistics
import numpy as np
import pandas as pd
import pm4py
from random import sample
from itertools import product
from collections import Counter
from SetSimilaritySearch import all_pairs

In [2]:
def load_event_log(dataset_path=None, n_rows=1000):
    print('Reading', dataset_path)
    try:
        event_log = pd.read_csv(dataset_path, nrows=n_rows)
    except:
        return None
    h = event_log.columns.values.tolist()
    # check if a dataset contains a tuple of case_id, activity, and timestamp
    if ('case:concept:name' in h) and ('concept:name' in h) and ('time:timestamp' in h):
        # pre-process an event log
        return event_log.fillna(np.nan).replace([np.nan], [''])
    else:
        return None

In [3]:
datasets = glob.glob('../datasets/*.csv', recursive=False)
event_logs = [load_event_log(d, n_rows=1000) for d in datasets]
# remove None from event_logs
event_logs = [e for e in event_logs if e is not None]
print('Datasets found in the dataset dir:', len(datasets))
print('Datasets that are ready for evaluation:', len(event_logs))

Reading ../datasets/BPIC2015_2.csv
Reading ../datasets/BPIC2013_incident_management.csv
Reading ../datasets/BPIC2020_Prepaid_Travel_Costs.csv
Reading ../datasets/BPIC2018.csv
Reading ../datasets/BPIC2013_problem_management_open_problems.csv
Reading ../datasets/BPIC2016_Clicks_NOT_Logged_In.csv
Reading ../datasets/BPIC2015_5.csv
Reading ../datasets/BPIC2015_4.csv
Reading ../datasets/BPIC2016_Clicks_Logged_In.csv
Reading ../datasets/BPIC2014_change_log.csv
Reading ../datasets/BPIC2011_Dutch_academic_hospital.csv
Reading ../datasets/BPIC2017.csv
Reading ../datasets/BPIC2013_problem_management_closed_problems.csv
Reading ../datasets/tmp.csv
Reading ../datasets/BPIC2015_3.csv
Reading ../datasets/test.csv
Reading ../datasets/BPIC2012_loan_application_process.csv
Reading ../datasets/BPIC2019_purchase_order_handling_process.csv
Reading ../datasets/BPI2016_Clicks_Logged_In.csv
Reading ../datasets/BPIC2015_1.csv
Reading ../datasets/BPI2016_Clicks_NOT_Logged_In.csv
Datasets found in the dataset d

In [4]:
# check number of unique values
[event_logs[e].apply(lambda x: statistics.mean(Counter(x).values()), axis=0) for e in range(len(event_logs))]

[monitoringResource         200.000000
 org:resource               166.666667
 activityNameNL               7.936508
 concept:name                 7.042254
 question                    37.037037
 dateFinished                 7.751938
 action_code                  7.042254
 activityNameEN               7.936508
 planned                      2.114165
 lifecycle:transition      1000.000000
 time:timestamp               1.972387
 case:Includes_subCases     500.000000
 case:concept:name           55.555556
 case:Responsible_actor     250.000000
 case:endDate                58.823529
 case:caseStatus           1000.000000
 case:parts                 111.111111
 case:caseProcedure         500.000000
 case:last_phase           1000.000000
 case:case_type            1000.000000
 case:startDate              66.666667
 case:requestComplete       500.000000
 case:SUMleges               71.428571
 case:IDofConceptCase       200.000000
 case:termName              500.000000
 case:landRegisterID     

In [6]:
# check number of unique values
[event_logs[e].apply(lambda x: len(set(x)), axis=0) for e in range(len(event_logs))]

[monitoringResource          5
 org:resource                6
 activityNameNL            126
 concept:name              142
 question                   27
 dateFinished              129
 action_code               142
 activityNameEN            126
 planned                   473
 lifecycle:transition        1
 time:timestamp            507
 case:Includes_subCases      2
 case:concept:name          18
 case:Responsible_actor      4
 case:endDate               17
 case:caseStatus             1
 case:parts                  9
 case:caseProcedure          2
 case:last_phase             1
 case:case_type              1
 case:startDate             15
 case:requestComplete        2
 case:SUMleges              14
 case:IDofConceptCase        5
 case:termName               2
 case:landRegisterID         1
 dueDate                     1
 dateStop                    1
 dtype: int64,
 org:group                 57
 resource country          13
 organization country       7
 org:resource             1

In [7]:
# what do activities look like?
[event_logs[e]['concept:name'] for e in range(len(event_logs))]

[0      01_HOOFD_010
 1      01_HOOFD_011
 2      01_HOOFD_020
 3      01_HOOFD_015
 4        03_GBH_005
            ...     
 995       06_VD_010
 996    01_HOOFD_101
 997    01_HOOFD_130
 998    01_HOOFD_110
 999    01_HOOFD_180
 Name: concept:name, Length: 1000, dtype: object,
 0       Accepted
 1       Accepted
 2         Queued
 3       Accepted
 4         Queued
          ...    
 995     Accepted
 996     Accepted
 997    Completed
 998       Queued
 999     Accepted
 Name: concept:name, Length: 1000, dtype: object,
 0         mail income
 1          mail valid
 2          mail valid
 3          mail valid
 4          initialize
             ...      
 995     begin editing
 996         calculate
 997    finish editing
 998     begin editing
 999         calculate
 Name: concept:name, Length: 1000, dtype: object,
 0       Accepted
 1       Accepted
 2       Accepted
 3       Accepted
 4       Accepted
          ...    
 995     Accepted
 996    Completed
 997     Accepted
 998  

In [8]:
# How much unique activity appears?
[len(set(event_logs[e]['concept:name'])) / len(event_logs[e]['concept:name']) for e in range(len(event_logs))]

[0.142,
 0.003,
 0.021,
 0.003,
 0.136,
 0.105,
 0.134,
 0.023,
 0.004,
 0.097,
 0.024,
 0.021,
 0.111]

In [9]:
# what about case_id?
[len(set(event_logs[e]['case:concept:name'])) / len(event_logs[e]['case:concept:name']) for e in range(len(event_logs))]

[0.018,
 0.051,
 0.019,
 0.308,
 0.018,
 0.024,
 0.008,
 0.029,
 0.158,
 0.028,
 0.037,
 0.023,
 0.02]

In [10]:
# what about timestamp?
[len(set(event_logs[e]['time:timestamp'])) / len(event_logs[e]['time:timestamp']) for e in range(len(event_logs))]

[0.507,
 0.981,
 0.842,
 0.998,
 0.111,
 0.107,
 0.135,
 1.0,
 0.998,
 0.83,
 0.943,
 0.555,
 0.092]

In [15]:
def similarity_check(event_log_df=None, n_samples=100):
    n_cases = len(set(event_log_df['case:concept:name']))
    if n_cases > n_samples:
        # sample n_sample cases
        sampled_case_ids = sample(list(set(event_log_df['case:concept:name'])), n_samples)
        event_log_df = event_log_df[event_log_df['case:concept:name'].isin(sampled_case_ids)]
    
    # get the sequence of activities by case_ids
    sets = event_log_df.groupby('case:concept:name')['concept:name']\
        .apply(list).apply(set).tolist()
    if len(sets) > 0:
        pairs = list(all_pairs(sets, similarity_func_name="cosine", similarity_threshold=0.1))
        #  print('pairs:', pairs)
        if len(pairs) == 0:
            cosine_sim = 0
        else:
            cosine_sim = round(np.mean([list(pairs[i])[2] for i in range(len(pairs))]), ndigits=2)
    else:
        cosine_sim = 0

    return cosine_sim

In [16]:
[similarity_check(event_log_df=e) for e in event_logs]

[0.65, 0.98, 0.9, 0.82, 0.68, 0.75, 0.4, 0.81, 0.9, 0.64, 0.59, 0.89, 0.76]

In [18]:
[event_logs[e].apply(lambda x: any(x == ''), axis=0) for e in range(len(event_logs))]

[monitoringResource        False
 org:resource              False
 activityNameNL            False
 concept:name              False
 question                  False
 dateFinished              False
 action_code               False
 activityNameEN            False
 planned                    True
 lifecycle:transition      False
 time:timestamp            False
 case:Includes_subCases    False
 case:concept:name         False
 case:Responsible_actor    False
 case:endDate               True
 case:caseStatus           False
 case:parts                False
 case:caseProcedure         True
 case:last_phase           False
 case:case_type            False
 case:startDate            False
 case:requestComplete      False
 case:SUMleges              True
 case:IDofConceptCase       True
 case:termName              True
 case:landRegisterID        True
 dueDate                    True
 dateStop                   True
 dtype: bool,
 org:group                False
 resource country         Fals

In [17]:
event_logs[0]

Unnamed: 0,monitoringResource,org:resource,activityNameNL,concept:name,question,dateFinished,action_code,activityNameEN,planned,lifecycle:transition,...,case:last_phase,case:case_type,case:startDate,case:requestComplete,case:SUMleges,case:IDofConceptCase,case:termName,case:landRegisterID,dueDate,dateStop
0,4634935,560530,registratie datum binnenkomst aanvraag,01_HOOFD_010,EMPTY,2013-01-10 12:07:35,01_HOOFD_010,register submission date request,2012-03-23 10:21:14+01:00,complete,...,Zaak afgehandeld,557669,2012-03-21 00:00:00+01:00,False,,,,,,
1,4634935,560530,OLO berichtenverkeer actief,01_HOOFD_011,False,2013-01-10 12:07:35,01_HOOFD_011,OLO messaging active,2012-03-23 10:26:05+01:00,complete,...,Zaak afgehandeld,557669,2012-03-21 00:00:00+01:00,False,,,,,,
2,4634935,560530,versturen ontvangstbevestiging,01_HOOFD_020,True,2013-01-10 12:07:35,01_HOOFD_020,send confirmation receipt,,complete,...,Zaak afgehandeld,557669,2012-03-21 00:00:00+01:00,False,,,,,,
3,4634935,560530,fase aanvraag ontvangen,01_HOOFD_015,EMPTY,2013-01-10 12:07:35,01_HOOFD_015,phase application received,,complete,...,Zaak afgehandeld,557669,2012-03-21 00:00:00+01:00,False,,,,,,
4,4634935,560530,aanvrager is belanghebbende,03_GBH_005,True,2013-01-10 12:07:35,03_GBH_005,applicant is stakeholder,,complete,...,Zaak afgehandeld,557669,2012-03-21 00:00:00+01:00,False,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,560521,560532,verlengen proceduretermijn,06_VD_010,False,2012-04-25 10:43:10,06_VD_010,extend procedure term,,complete,...,Zaak afgehandeld,557669,2012-04-23 00:00:00+02:00,True,465.7575,,,,,
996,560521,560532,registratie datum publicatie,01_HOOFD_101,25-4-2012 10:43:07,2012-04-25 10:43:10,01_HOOFD_101,registration date publication,,complete,...,Zaak afgehandeld,557669,2012-04-23 00:00:00+02:00,True,465.7575,,,,,
997,560521,560532,WAW vergunningsaspect,01_HOOFD_130,False,2012-04-25 10:43:10,01_HOOFD_130,WAW permit aspect,,complete,...,Zaak afgehandeld,557669,2012-04-23 00:00:00+02:00,True,465.7575,,,,,
998,560521,560532,behandelen deelzaken volledigheid,01_HOOFD_110,EMPTY,2012-06-20 11:42:30,01_HOOFD_110,treat subcases completeness,2012-04-26 10:43:07+02:00,complete,...,Zaak afgehandeld,557669,2012-04-23 00:00:00+02:00,True,465.7575,,,,,


In [25]:
net, initial_marking, final_marking = pm4py.discover_petri_net_inductive(event_logs[0])
net2, initial_marking2, final_marking2 = pm4py.discover_petri_net_inductive(event_logs[0])

In [57]:
# source and target: Place or Transition class
[x for x in net2.arcs]

[(230ecec9-f04d-4d18-b3cb-f39f6250e825, '01_HOOFD_809')->p_46,
 p_45->(617a544a-ea3f-49a6-9520-d3fa76f92ada, '01_HOOFD_440_2'),
 p_45->(6f18d6a0-7317-416e-b447-cca35461fdc4, '10_UOV_040'),
 p_45->(ef8f7490-a73e-4aab-aeed-fa37b15e7a2c, '15_NGV_010'),
 (init_loop_12, None)->p_39,
 p_45->(ecfeb254-7d62-4a29-ab7d-1eb1b25d833f, '16_LGSD_010'),
 (10ae8d54-41d4-45ad-813b-6511e73bad49, '06_VD_035')->p_46,
 p_45->(ab90fb59-8d7f-4936-9fce-d7f849be0725, '01_BB_775'),
 (eadf1148-b3eb-48dc-98b2-67fa585bf26c, '01_HOOFD_100')->p_42,
 p_45->(7629c820-f6d2-419a-acf6-eefa1f04cf08, '01_BB_765'),
 p_45->(a98e0f71-2075-421e-be1d-17f67a76039b, '01_HOOFD_195'),
 p_9->(22cd1cbf-7877-43ad-8462-986ce37c1a4a, '01_HOOFD_490_5a'),
 p_45->(8755850e-9134-475e-9e25-20d85b477b6e, '01_HOOFD_110_2'),
 p_22->(32b72cf7-6674-4389-990a-245bc95065be, '01_HOOFD_012'),
 p_45->(e6204784-ddf6-4d00-ab00-5dc732421eea, '01_BB_766'),
 (24434952-3b8d-4120-8d8f-62dd05f09fb1, '10_UOV_030')->p_46,
 (tauSplit_8, None)->p_35,
 (b6fedc3e-1

In [68]:
def save_bpmn(dataset):
    # extract the dataset name
    dataset_name = re.match('../datasets/(.+).csv', dataset).groups()[0]
    # read the event log
    event_log = load_event_log(dataset, n_rows=1000)
    # discover a process model
    pm4py.discover_petri_net_inductive(event_log)
    # convert it from petri nets to a process tree
    pt = pm4py.convert.convert_to_process_tree(net, initial_marking, final_marking)
    # convert the process tree to a bpmn
    bpmn = pm4py.convert.convert_to_bpmn(pt)
    # save it
    print('../plots/' + dataset_name + '.bpmn')
    pm4py.write.write_bpmn(bpmn, '../plots/' + dataset_name + '.bpmn')

datasets = glob.glob('../datasets/*.csv', recursive=False)
[save_bpmn(d) for d in datasets]

Reading ../datasets/BPIC2015_2.csv
../plots/BPIC2015_2.bpmn


ExecutableNotFound: failed to execute PosixPath('dot'), make sure the Graphviz executables are on your systems' PATH