In [None]:
import networkx as nx
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import pm4py as pm4
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
import pandas as pd
from statistics import mean

##### Generate subgroup logs

In [None]:
# Read the three exported event logs: the wv1 and wv2 extracts plus the full log.
elog_data_first, elog_data_second, elog_data_full = map(
    pd.read_csv,
    (
        'data/export/cv19_event_log_wv1.csv',
        'data/export/cv19_event_log_wv2.csv',
        'data/export/cv19_event_log_full.csv',
    ),
)

# Parse the 'start' column of the first and second logs into datetimes
# (the full log's 'start' column is left exactly as read from the CSV).
for wave_log in (elog_data_first, elog_data_second):
    wave_log['start'] = pd.to_datetime(wave_log['start'])

In [None]:
# Cohort table; later cells read its 'pid', 'sex', 'wave' and 'n_morbid' columns.
base_cohort = pd.read_csv('data/export/cv19_cohort.csv')

In [None]:
# Count the non-null 'ooh' values per pid in the full log ...
ooh_per_pid = elog_data_full.groupby('pid')['ooh'].count()
el_full_ooh = ooh_per_pid.reset_index()
# ... and display the 80th percentile of those per-pid counts.
el_full_ooh['ooh'].quantile(0.8)

In [None]:
# Attach each pid's 'sex' from the cohort table to both event logs (left join on 'pid').
#
# A 'sex' column left over from an earlier run of this cell is dropped first. Without
# that, re-running the cell would merge 'sex' a second time, pandas would rename the
# clashing columns to 'sex_x' / 'sex_y', and the later `.sex` lookups would fail.
# NOTE(review): a pid listed more than once in base_cohort would duplicate that
# pid's event rows in the merged logs - confirm 'pid' is unique in the cohort file.
cohort_sex = base_cohort[['pid', 'sex']]
elog_data_first = pd.merge(elog_data_first.drop(columns='sex', errors='ignore'), cohort_sex, how='left', on='pid')
elog_data_second = pd.merge(elog_data_second.drop(columns='sex', errors='ignore'), cohort_sex, how='left', on='pid')

In [None]:
# Distribution of 'sex' in the cohort table.
base_cohort['sex'].value_counts()

In [None]:
# Distribution of 'sex' over the event rows of the first log (after the cohort merge).
elog_data_first['sex'].value_counts()

In [None]:
# Distribution of 'sex' over the event rows of the second log (after the cohort merge).
elog_data_second['sex'].value_counts()

In [None]:
# Missing-value count per column of the first log.
elog_data_first.isna().sum()

In [None]:
# Frequency of each 'n_morbid' value within every 'wave' group of the cohort.
base_cohort.groupby('wave')['n_morbid'].value_counts()

In [None]:
# Show the column labels of the first log.
elog_data_first.columns

In [None]:
# Per-pid count of non-null 'ooh' values, computed separately for each log.
el_ooh_1 = elog_data_first.groupby('pid')['ooh'].count().reset_index().rename(columns={'ooh': 'ooh_count'})
el_ooh_2 = elog_data_second.groupby('pid')['ooh'].count().reset_index().rename(columns={'ooh': 'ooh_count'})

# Broadcast the count back onto every event row of the matching log.
# An 'ooh_count' column left over from an earlier run of this cell is dropped first.
# Without that, re-running would yield 'ooh_count_x' / 'ooh_count_y' instead of a
# single 'ooh_count' column.
elog_data_first = pd.merge(elog_data_first.drop(columns='ooh_count', errors='ignore'),
                           el_ooh_1[['pid', 'ooh_count']], how='left', on='pid')
elog_data_second = pd.merge(elog_data_second.drop(columns='ooh_count', errors='ignore'),
                            el_ooh_2[['pid', 'ooh_count']], how='left', on='pid')

In [None]:
# Empty results table; each cross-log conformance cell appends one row to it.
result_columns = [
    'subset', 'level',
    'L1W2_lf', 'L1W2_prec', 'L1W2_gen',
    'L2W1_lf', 'L2W1_prec', 'L2W1_gen',
    'm_lf', 'm_prec', 'm_gen',
    'GED',
    'L1W2_percfit', 'L2W1_percfit', 'm_percfit',
]
results_df = pd.DataFrame(columns=result_columns)

In [None]:
def _as_pm4py_frame(events, activity_column):
    """Return `events` renamed to pm4py's standard column names and formatted by pm4py.

    'pid' becomes the case id, `activity_column` becomes the activity
    ('concept:name') and 'start' becomes the timestamp; the renamed frame is
    then passed through `pm4.format_dataframe`.
    """
    renamed = events.rename(columns={"pid": "case:concept:name", activity_column: "concept:name",
                                     "start": "time:timestamp"})
    return pm4.format_dataframe(renamed, case_id='case:concept:name', activity_key='concept:name',
                                timestamp_key='time:timestamp', timest_format='%Y-%m-%d %H:%M:%S')


# Naming: f/s = first/second log, p/a = 'provider_event' / 'act_code' used as the activity.
elog_data_fp = _as_pm4py_frame(elog_data_first, "provider_event")
elog_data_sp = _as_pm4py_frame(elog_data_second, "provider_event")
elog_data_fa = _as_pm4py_frame(elog_data_first, "act_code")
elog_data_sa = _as_pm4py_frame(elog_data_second, "act_code")

#### Subsets

In [None]:
# NOTE: the `_a75` / `_int` suffixes do not describe the filters applied here:
# the `_a75` frames hold the rows with sex == 'F', the `_int` frames those with sex == 'M'.

### Sex f
el_data_fp_a75 = elog_data_fp.loc[elog_data_fp['sex'] == 'F']
el_data_sp_a75 = elog_data_sp.loc[elog_data_sp['sex'] == 'F']
el_data_fa_a75 = elog_data_fa.loc[elog_data_fa['sex'] == 'F']
el_data_sa_a75 = elog_data_sa.loc[elog_data_sa['sex'] == 'F']

### Sex m
el_data_fp_int = elog_data_fp.loc[elog_data_fp['sex'] == 'M']
el_data_sp_int = elog_data_sp.loc[elog_data_sp['sex'] == 'M']
el_data_fa_int = elog_data_fa.loc[elog_data_fa['sex'] == 'M']
el_data_sa_int = elog_data_sa.loc[elog_data_sa['sex'] == 'M']

In [None]:
# Convert every subset dataframe into a pm4py event log, keeping the same suffixes.
# sex == 'F' subsets
el_fp_a75, el_sp_a75, el_fa_a75, el_sa_a75 = (
    pm4.convert_to_event_log(subset_frame)
    for subset_frame in (el_data_fp_a75, el_data_sp_a75, el_data_fa_a75, el_data_sa_a75)
)

# sex == 'M' subsets
el_fp_int, el_sp_int, el_fa_int, el_sa_int = (
    pm4.convert_to_event_log(subset_frame)
    for subset_frame in (el_data_fp_int, el_data_sp_int, el_data_fa_int, el_data_sa_int)
)

##### Discover and save Petri Nets with Inductive Miner

In [None]:
# Discover a Petri net from `el_fp_a75` with the inductive miner (noise threshold 0.3),
# display it, and save it as PNML; the cross-log conformance cells read this file back.
pnml_path = 'process_mining/petri_nets/el_fp_a75.pnml'
net, initial_marking, final_marking = pm4.discover_petri_net_inductive(
    el_fp_a75,
    noise_threshold=0.3,
    multi_processing=True,
)
pm4.view_petri_net(net, initial_marking, final_marking)
pm4.write_pnml(net, initial_marking, final_marking, pnml_path)

In [None]:
# Discover a Petri net from `el_sp_a75` with the inductive miner (noise threshold 0.3),
# display it, and save it as PNML; the cross-log conformance cells read this file back.
pnml_path = 'process_mining/petri_nets/el_sp_a75.pnml'
net, initial_marking, final_marking = pm4.discover_petri_net_inductive(
    el_sp_a75,
    noise_threshold=0.3,
    multi_processing=True,
)
pm4.view_petri_net(net, initial_marking, final_marking)
pm4.write_pnml(net, initial_marking, final_marking, pnml_path)

In [None]:
# Discover a Petri net from `el_fa_a75` with the inductive miner (noise threshold 0.3),
# display it, and save it as PNML; the cross-log conformance cells read this file back.
pnml_path = 'process_mining/petri_nets/el_fa_a75.pnml'
net, initial_marking, final_marking = pm4.discover_petri_net_inductive(
    el_fa_a75,
    noise_threshold=0.3,
    multi_processing=True,
)
pm4.view_petri_net(net, initial_marking, final_marking)
pm4.write_pnml(net, initial_marking, final_marking, pnml_path)

In [None]:
# Discover a Petri net from `el_sa_a75` with the inductive miner (noise threshold 0.3),
# display it, and save it as PNML; the cross-log conformance cells read this file back.
pnml_path = 'process_mining/petri_nets/el_sa_a75.pnml'
net, initial_marking, final_marking = pm4.discover_petri_net_inductive(
    el_sa_a75,
    noise_threshold=0.3,
    multi_processing=True,
)
pm4.view_petri_net(net, initial_marking, final_marking)
pm4.write_pnml(net, initial_marking, final_marking, pnml_path)

In [None]:
# Discover a Petri net from `el_fp_int` with the inductive miner (noise threshold 0.3),
# display it, and save it as PNML; the cross-log conformance cells read this file back.
pnml_path = 'process_mining/petri_nets/el_fp_int.pnml'
net, initial_marking, final_marking = pm4.discover_petri_net_inductive(
    el_fp_int,
    noise_threshold=0.3,
    multi_processing=True,
)
pm4.view_petri_net(net, initial_marking, final_marking)
pm4.write_pnml(net, initial_marking, final_marking, pnml_path)

In [None]:
# Discover a Petri net from `el_sp_int` with the inductive miner (noise threshold 0.3),
# display it, and save it as PNML; the cross-log conformance cells read this file back.
pnml_path = 'process_mining/petri_nets/el_sp_int.pnml'
net, initial_marking, final_marking = pm4.discover_petri_net_inductive(
    el_sp_int,
    noise_threshold=0.3,
    multi_processing=True,
)
pm4.view_petri_net(net, initial_marking, final_marking)
pm4.write_pnml(net, initial_marking, final_marking, pnml_path)

In [None]:
# Discover a Petri net from `el_fa_int` with the inductive miner (noise threshold 0.3),
# display it, and save it as PNML; the cross-log conformance cells read this file back.
pnml_path = 'process_mining/petri_nets/el_fa_int.pnml'
net, initial_marking, final_marking = pm4.discover_petri_net_inductive(
    el_fa_int,
    noise_threshold=0.3,
    multi_processing=True,
)
pm4.view_petri_net(net, initial_marking, final_marking)
pm4.write_pnml(net, initial_marking, final_marking, pnml_path)

In [None]:
# Discover a Petri net from `el_sa_int` with the inductive miner (noise threshold 0.3),
# display it, and save it as PNML; the cross-log conformance cells read this file back.
pnml_path = 'process_mining/petri_nets/el_sa_int.pnml'
net, initial_marking, final_marking = pm4.discover_petri_net_inductive(
    el_sa_int,
    noise_threshold=0.3,
    multi_processing=True,
)
pm4.view_petri_net(net, initial_marking, final_marking)
pm4.write_pnml(net, initial_marking, final_marking, pnml_path)

#### Cross-log conformance checking

In [None]:
# Reload the saved provider-level nets for the sex == 'F' subset:
# net1 is the model written from el_fp_a75 (first log), net2 the one from el_sp_a75 (second log).
net1, im1, fm1 = pm4.read_pnml('process_mining/petri_nets/el_fp_a75.pnml')
net2, im2, fm2 = pm4.read_pnml('process_mining/petri_nets/el_sp_a75.pnml')

In [None]:
# "L1W2": the first log (el_fp_a75) evaluated against the second log's model (net2).
fitness_L1W2, prec_L1W2, gen_L1W2 = (
    pm4.fitness_token_based_replay(el_fp_a75, net2, im2, fm2),
    pm4.precision_token_based_replay(el_fp_a75, net2, im2, fm2),
    generalization_evaluator.apply(el_fp_a75, net2, im2, fm2),
)

# "L2W1": the second log (el_sp_a75) evaluated against the first log's model (net1).
fitness_L2W1, prec_L2W1, gen_L2W1 = (
    pm4.fitness_token_based_replay(el_sp_a75, net1, im1, fm1),
    pm4.precision_token_based_replay(el_sp_a75, net1, im1, fm1),
    generalization_evaluator.apply(el_sp_a75, net1, im1, fm1),
)

In [None]:
# Turn both Petri nets into networkx graphs and measure their graph edit distance.
nx_pn, nx_pn2 = (
    pm4.convert_petri_net_to_networkx(net1, im1, fm1),
    pm4.convert_petri_net_to_networkx(net2, im2, fm2),
)
# The search is capped at 30 seconds, so `ged` may be the best distance found
# within that time rather than the exact minimum.
ged = nx.graph_edit_distance(nx_pn, nx_pn2, timeout=30)

In [None]:
# Inspect the fitness result; later cells read its 'log_fitness' and 'perc_fit_traces' entries.
fitness_L2W1

In [None]:
# Result row for the provider-level comparison of the sex == 'F' subset.
# The row used to be labelled '>75 years', but el_fp_a75 / el_sp_a75 are built from
# the rows with sex == 'F' (see the "Subsets" cell), so it is labelled by sex instead.
# Values follow the column order of results_df: both replay directions
# (fitness, precision, generalization), their means, the graph edit distance,
# then the percentage of fitting traces per direction and its mean.
res_l = [
    'Sex F', 'Provider',
    round(fitness_L1W2['log_fitness'], 3), round(prec_L1W2, 3), round(gen_L1W2, 3),
    round(fitness_L2W1['log_fitness'], 3), round(prec_L2W1, 3), round(gen_L2W1, 3),
    round(mean([fitness_L1W2['log_fitness'], fitness_L2W1['log_fitness']]), 3),
    round(mean([prec_L1W2, prec_L2W1]), 3),
    round(mean([gen_L1W2, gen_L2W1]), 3),
    int(ged),
    round(fitness_L1W2['perc_fit_traces'], 3),
    round(fitness_L2W1['perc_fit_traces'], 3),
    round(mean([fitness_L1W2['perc_fit_traces'], fitness_L2W1['perc_fit_traces']]), 3),
]
# Append as the next row (the index equals the current row count).
results_df.loc[len(results_df)] = res_l

In [None]:
# Show the results collected so far.
results_df

In [None]:
# Activity-level cross-log comparison for the sex == 'F' subset
# (el_fa_a75 / el_sa_a75 are the sex == 'F' rows of the first / second log).
# net1 is the model saved from the first log, net2 the model saved from the second.
net1, im1, fm1 = pm4.read_pnml('process_mining/petri_nets/el_fa_a75.pnml')
net2, im2, fm2 = pm4.read_pnml('process_mining/petri_nets/el_sa_a75.pnml')

# "L1W2": first log evaluated against the second log's model.
fitness_L1W2 = pm4.fitness_token_based_replay(el_fa_a75, net2, im2, fm2)
prec_L1W2 = pm4.precision_token_based_replay(el_fa_a75, net2, im2, fm2)
gen_L1W2 = generalization_evaluator.apply(el_fa_a75, net2, im2, fm2)

# "L2W1": second log evaluated against the first log's model.
fitness_L2W1 = pm4.fitness_token_based_replay(el_sa_a75, net1, im1, fm1)
prec_L2W1 = pm4.precision_token_based_replay(el_sa_a75, net1, im1, fm1)
gen_L2W1 = generalization_evaluator.apply(el_sa_a75, net1, im1, fm1)

# Graph edit distance between the two nets; the search is capped at 30 seconds,
# so the value may be the best distance found in that time rather than the exact minimum.
nx_pn = pm4.convert_petri_net_to_networkx(net1, im1, fm1)
nx_pn2 = pm4.convert_petri_net_to_networkx(net2, im2, fm2)
ged = nx.graph_edit_distance(nx_pn, nx_pn2, timeout=30)

# The row used to be labelled '>75 years', but these logs are the sex == 'F' subset
# (see the "Subsets" cell), so it is labelled by sex instead.
res_l = [
    'Sex F', 'Activity',
    round(fitness_L1W2['log_fitness'], 3), round(prec_L1W2, 3), round(gen_L1W2, 3),
    round(fitness_L2W1['log_fitness'], 3), round(prec_L2W1, 3), round(gen_L2W1, 3),
    round(mean([fitness_L1W2['log_fitness'], fitness_L2W1['log_fitness']]), 3),
    round(mean([prec_L1W2, prec_L2W1]), 3),
    round(mean([gen_L1W2, gen_L2W1]), 3),
    int(ged),
    round(fitness_L1W2['perc_fit_traces'], 3),
    round(fitness_L2W1['perc_fit_traces'], 3),
    round(mean([fitness_L1W2['perc_fit_traces'], fitness_L2W1['perc_fit_traces']]), 3),
]
# Append as the next row (the index equals the current row count).
results_df.loc[len(results_df)] = res_l

In [None]:
# Show the results collected so far.
results_df

In [None]:
# Provider-level cross-log comparison for the sex == 'M' subset
# (el_fp_int / el_sp_int are the sex == 'M' rows of the first / second log).
# net1 is the model saved from the first log, net2 the model saved from the second.
net1, im1, fm1 = pm4.read_pnml('process_mining/petri_nets/el_fp_int.pnml')
net2, im2, fm2 = pm4.read_pnml('process_mining/petri_nets/el_sp_int.pnml')

# "L1W2": first log evaluated against the second log's model.
fitness_L1W2 = pm4.fitness_token_based_replay(el_fp_int, net2, im2, fm2)
prec_L1W2 = pm4.precision_token_based_replay(el_fp_int, net2, im2, fm2)
gen_L1W2 = generalization_evaluator.apply(el_fp_int, net2, im2, fm2)

# "L2W1": second log evaluated against the first log's model.
fitness_L2W1 = pm4.fitness_token_based_replay(el_sp_int, net1, im1, fm1)
prec_L2W1 = pm4.precision_token_based_replay(el_sp_int, net1, im1, fm1)
gen_L2W1 = generalization_evaluator.apply(el_sp_int, net1, im1, fm1)

# Graph edit distance between the two nets; the search is capped at 30 seconds,
# so the value may be the best distance found in that time rather than the exact minimum.
nx_pn = pm4.convert_petri_net_to_networkx(net1, im1, fm1)
nx_pn2 = pm4.convert_petri_net_to_networkx(net2, im2, fm2)
ged = nx.graph_edit_distance(nx_pn, nx_pn2, timeout=30)

# The row used to be labelled 'Intensive Therapy', but these logs are the sex == 'M'
# subset (see the "Subsets" cell), so it is labelled by sex instead.
res_l = [
    'Sex M', 'Provider',
    round(fitness_L1W2['log_fitness'], 3), round(prec_L1W2, 3), round(gen_L1W2, 3),
    round(fitness_L2W1['log_fitness'], 3), round(prec_L2W1, 3), round(gen_L2W1, 3),
    round(mean([fitness_L1W2['log_fitness'], fitness_L2W1['log_fitness']]), 3),
    round(mean([prec_L1W2, prec_L2W1]), 3),
    round(mean([gen_L1W2, gen_L2W1]), 3),
    int(ged),
    round(fitness_L1W2['perc_fit_traces'], 3),
    round(fitness_L2W1['perc_fit_traces'], 3),
    round(mean([fitness_L1W2['perc_fit_traces'], fitness_L2W1['perc_fit_traces']]), 3),
]
# Append as the next row (the index equals the current row count).
results_df.loc[len(results_df)] = res_l

In [None]:
# Show the results collected so far.
results_df

In [None]:
# Activity-level cross-log comparison for the sex == 'M' subset
# (el_fa_int / el_sa_int are the sex == 'M' rows of the first / second log).
# net1 is the model saved from the first log, net2 the model saved from the second.
net1, im1, fm1 = pm4.read_pnml('process_mining/petri_nets/el_fa_int.pnml')
net2, im2, fm2 = pm4.read_pnml('process_mining/petri_nets/el_sa_int.pnml')

# "L1W2": first log evaluated against the second log's model.
fitness_L1W2 = pm4.fitness_token_based_replay(el_fa_int, net2, im2, fm2)
prec_L1W2 = pm4.precision_token_based_replay(el_fa_int, net2, im2, fm2)
gen_L1W2 = generalization_evaluator.apply(el_fa_int, net2, im2, fm2)

# "L2W1": second log evaluated against the first log's model.
fitness_L2W1 = pm4.fitness_token_based_replay(el_sa_int, net1, im1, fm1)
prec_L2W1 = pm4.precision_token_based_replay(el_sa_int, net1, im1, fm1)
gen_L2W1 = generalization_evaluator.apply(el_sa_int, net1, im1, fm1)

# Graph edit distance between the two nets; the search is capped at 30 seconds,
# so the value may be the best distance found in that time rather than the exact minimum.
nx_pn = pm4.convert_petri_net_to_networkx(net1, im1, fm1)
nx_pn2 = pm4.convert_petri_net_to_networkx(net2, im2, fm2)
ged = nx.graph_edit_distance(nx_pn, nx_pn2, timeout=30)

# The row used to be labelled 'Intensive Therapy', but these logs are the sex == 'M'
# subset (see the "Subsets" cell), so it is labelled by sex instead.
res_l = [
    'Sex M', 'Activity',
    round(fitness_L1W2['log_fitness'], 3), round(prec_L1W2, 3), round(gen_L1W2, 3),
    round(fitness_L2W1['log_fitness'], 3), round(prec_L2W1, 3), round(gen_L2W1, 3),
    round(mean([fitness_L1W2['log_fitness'], fitness_L2W1['log_fitness']]), 3),
    round(mean([prec_L1W2, prec_L2W1]), 3),
    round(mean([gen_L1W2, gen_L2W1]), 3),
    int(ged),
    round(fitness_L1W2['perc_fit_traces'], 3),
    round(fitness_L2W1['perc_fit_traces'], 3),
    round(mean([fitness_L1W2['perc_fit_traces'], fitness_L2W1['perc_fit_traces']]), 3),
]
# Append as the next row (the index equals the current row count).
results_df.loc[len(results_df)] = res_l

In [None]:
# Show the final results table.
results_df