This notebook takes the generated event log in CSV format and classifies the event attributes according to their process characteristics. Furthermore, it calculates the coefficient of variation (CV), which is used to measure the degree of variability.

In [None]:
import numpy as np
from psycopg2 import connect
import pandas as pd
import pm4py
import numpy as np
import pandasql as ps
from pm4py.objects.conversion.log import converter as log_converter
from scipy.stats import variation
from scipy import stats
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.visualization.dfg import visualizer as dfg_visualization
from pm4py.statistics.eventually_follows.log import get as efg_get
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import sklearn.preprocessing as sk
from scipy.stats import chi2_contingency
import math
import statistics
import pingouin as pg
import graphviz
from statsmodels.stats import multitest
from statsmodels.stats.contingency_tables import SquareTable as ST
import sys

In [None]:
# Load the event log from the CSV file into a DataFrame.
final_pm = pd.read_csv("Kidney_Failure_Log.csv")
# Distinct case identifiers ("hadm_id") that occur in the log.
hadms = list(final_pm["hadm_id"].unique())

In [None]:
# Converter parameters that name 'hadm_id' as the case id.
# NOTE(review): `parameters` is not passed to any call in this cell - confirm whether it is still needed.
parameters = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'hadm_id'}
# Declare which columns hold the case id, the activity and the timestamp.
event_log = pm4py.format_dataframe(final_pm, case_id='hadm_id', activity_key='department', timestamp_key='intime')
# Convert the formatted DataFrame into a pm4py event log object.
log = pm4py.convert_to_event_log(event_log)

In [None]:
# Retrieve all process variants and remove variants occurring < 20 times,
# because their sample size is too small.
from pm4py.algo.filtering.log.variants import variants_filter

# Minimum number of cases a variant needs in order to be kept.
min_variant_cases = 20

variants = variants_filter.get_variants(log)
variants = list(variants.keys())

# Label every case with its variant: the comma-joined sequence of its
# departments, in the row order of final_pm.
var = final_pm.groupby('hadm_id')['department'].apply(list).reset_index()
var["department"] = var['department'].apply(lambda x: ','.join(map(str, x)))
var = var.rename({"department":"variant"}, axis=1)
final_pm_var = final_pm.merge(var, how="left", on="hadm_id")

# One row per case, then count the cases per variant label.
var_count= final_pm_var.drop_duplicates("hadm_id").groupby("variant").count()
to_drop = list(var_count.loc[var_count["hadm_id"] < min_variant_cases].reset_index()["variant"])

# Filter instead of calling list.remove(): remove() raises ValueError as soon
# as one rare label has no identical entry in `variants`, e.g. when the
# variant keys are tuples of activities rather than comma-joined strings.
# Tuple/list keys are compared through their comma-joined form; the entries
# that are kept stay in their original type.
rare_variants = set(to_drop)
variants = [
    v for v in variants
    if (','.join(map(str, v)) if isinstance(v, (tuple, list)) else v) not in rare_variants
]
# NOTE(review): the test loops further down rebind `variants` per relation, so
# this filtered list is not what those loops iterate over - confirm intent.

In [None]:
def classify_attributes(proc_c):
    """Label every row of *proc_c* as static, semi-dynamic or dynamic.

    A row is "static" when "numberOfActivities" equals 1 and
    "numberOfTraceOccurence (Mean)" equals 1, "semi-dynamic" when
    "numberOfActivities" is greater than 1 and the mean trace occurrence
    equals 1, and "dynamic" in every other case.

    The label is written into the "class" column of *proc_c* itself, and the
    same (mutated) frame is returned.
    """
    for label, record in proc_c.iterrows():
        single_activity = record["numberOfActivities"] == 1
        once_per_trace = record["numberOfTraceOccurence (Mean)"] == 1
        if single_activity and once_per_trace:
            kind = "static"
        else:
            several_activities = record["numberOfActivities"] > 1
            if several_activities and once_per_trace:
                kind = "semi-dynamic"
            else:
                kind = "dynamic"
        proc_c.at[label, "class"] = kind
    return proc_c

In [None]:
# Name of the column that holds the activity of each event.
activity = "department"
# Name of the column that holds the case id.
case_id = "hadm_id"
# Columns that are removed before the attributes are classified.
columns_to_drop = ['Unnamed: 0','subject_id','transfer_id','intime','outtime']

In [None]:
#Classify event attributes, so that dynamic event attributes can be identified
final_pm = final_pm.drop(columns_to_drop, axis=1)

activities = final_pm[activity].unique()

matrix = pd.DataFrame(data=None, columns=activities)

# Identify which attributes are recorded at which activity: an attribute gets
# flag 1 for an activity when more than `presence_threshold` of that
# activity's events carry a non-null value, otherwise flag 0.
presence_threshold = 0.05
attribute_columns = [col for col in final_pm.columns if col != activity]

# Rows are collected in a list and turned into a frame once;
# DataFrame.append() no longer exists in pandas >= 2.0.
card_rows = []
for dep in activities:
    dep_data = final_pm.loc[final_pm[activity] == dep]
    filled = dep_data[attribute_columns].notnull().sum()
    flags = (filled > len(dep_data) * presence_threshold).astype(int)
    record = flags.to_dict()
    record[activity] = dep
    card_rows.append(record)

# One row per activity, columns in the same order as final_pm.
att_card = pd.DataFrame(card_rows, columns=final_pm.columns)

att_card.drop(case_id, axis=1, inplace=True)

# For each attribute: the number of activities it appears at and the mean
# number of occurrences within a trace.

# Non-null values per case and attribute.
number_trace_occurence = final_pm.groupby(case_id).agg({lambda x: x.notnull().sum()})

#drop concept:name
number_trace_occurence.drop(activity, axis=1, inplace=True)

number_trace_occurence.columns = number_trace_occurence.columns.droplevel(1)

# Cases in which an attribute never occurs must not pull the mean down.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
number_trace_occurence = number_trace_occurence.replace(0, np.nan)

number_trace_occurence = number_trace_occurence.mean()

number_trace_occurence = number_trace_occurence.rename("numberOfTraceOccurence (Mean)")

# Number of distinct activities at which each attribute has a non-null value.
# Built in one step from a dict, so no dtype-less empty Series is created.
number_of_activities = pd.Series(
    {
        col: final_pm[[activity, col]].dropna()[activity].nunique()
        for col in final_pm.columns
        if col != case_id and col != activity
    },
    name="numberOfActivities",
)

process_characteristics = pd.concat([number_of_activities, number_trace_occurence], axis=1)

# Heuristic typing: an attribute whose distinct values make up less than 5% of
# its non-null values is treated as categorical, otherwise as continuous.
for col in final_pm.columns:
    if (final_pm[col].nunique()/final_pm[col].count() < 0.05):
        process_characteristics.loc[col, "type"] = "categorical"
    else:
        process_characteristics.loc[col, "type"] = "continuous"

# The case id and the activity column are not attributes to classify.
process_characteristics = process_characteristics.drop(labels=[case_id, activity])

# classify_attributes() writes the "class" column into process_characteristics
# and returns that same frame.
x = classify_attributes(process_characteristics)

x = x.reset_index()

# The former index holds the attribute names; the column is called "Activity".
x = x.rename({"index":"Activity"}, axis=1)

# Explicit copy: a column is added below, and assigning to a column subset of
# `x` would trigger pandas' SettingWithCopy warning.
attribute_classes = x[["Activity", "class", "type"]].copy()

attribute_classes["CV"] = 0

deps = ["Emergency Department", "Pre-ICU Medicine", "Pre-ICU Cardiology", "Cardiac ICU", "Medical ICU", "Surgical ICU", "Post-ICU Cardiology", "Post-ICU Medicine", "Post-ICU Surgery", "Discharged"]

# Only dynamic attributes are tested later, split by their value type.
is_dynamic = attribute_classes["class"] == "dynamic"

attribute_list_con = list(attribute_classes.loc[is_dynamic & (attribute_classes["type"] == "continuous")]["Activity"])

attribute_list_cat = list(attribute_classes.loc[is_dynamic & (attribute_classes["type"] == "categorical")]["Activity"])

In [None]:
# Discover the directly-follows graph of the log. The following cells use it as
# a mapping from (source activity, target activity) pairs to a frequency.
dfg = dfg_discovery.apply(log)

In [None]:
# Optional pruning of the directly-follows graph: drop every relation that
# touches "Pre-ICU Surgery" or that was observed at most 30 times.
l = [
    rel for rel in dfg
    if "Pre-ICU Surgery" in rel[0] or "Pre-ICU Surgery" in rel[1] or dfg[rel] <= 30
]
for rel in l:
    del dfg[rel]

efg_graph = efg_get.apply(log)

# Same optional pruning for the eventually-follows graph.
l = [
    rel for rel in efg_graph
    if "Pre-ICU Surgery" in rel[0] or "Pre-ICU Surgery" in rel[1] or efg_graph[rel] <= 30
]
for rel in l:
    del efg_graph[rel]

# Relations that are still part of the directly-follows graph are removed from
# the eventually-follows graph.
l = [rel for rel in efg_graph if rel in dfg]
for rel in l:
    del efg_graph[rel]

In [None]:
# At this point we have the directly-follows and eventually-follows relations,
# the information about which attribute is used at which activity (att_card),
# and the process characteristics of every attribute (attribute_classes).
# Next step: perform the statistical tests.

In [None]:
def consecutive_hadms(df, act_1, act_2):
    """Return the events of cases in which the two activities are adjacent rows.

    Only rows whose "department" is *act_1* or *act_2* are considered. Walking
    through them in order, a case ("hadm_id") is selected when one of its
    later rows has an index label exactly one greater than the index label of
    the first row seen for that case. The result contains the filtered rows of
    all selected cases.
    """
    subset = df[df["department"].isin([act_1, act_2])]
    current_case = 0
    first_label = 0
    adjacent_cases = []
    for label, case in zip(subset.index, subset["hadm_id"]):
        if case != current_case:
            # A new case starts: remember where its first row sits.
            current_case = case
            first_label = label
        elif label - first_label == 1:
            adjacent_cases.append(case)
    return subset[subset["hadm_id"].isin(adjacent_cases)]

In [None]:
def eventually_follow_hadms(df, act_1, act_2):
    """Return the events of cases that contain at least two of the given activities' rows in a row.

    Only rows whose "department" is *act_1* or *act_2* are considered. A case
    ("hadm_id") is selected when, walking through these rows in order, a row
    belongs to the same case as the row before it. The result contains the
    filtered rows of all selected cases.
    """
    subset = df[df["department"].isin([act_1, act_2])]
    previous_case = 0
    repeated_cases = []
    for case in subset["hadm_id"]:
        if case == previous_case:
            repeated_cases.append(case)
        else:
            previous_case = case
    return subset[subset["hadm_id"].isin(repeated_cases)]

In [None]:
def stat_value_con(dep_1, dep_2, ea, df):
    """Run a Wilcoxon test on a continuous attribute between two departments.

    Only cases ("hadm_id") with exactly two rows carrying a non-null value of
    *ea* are used. The values recorded at *dep_1* and at *dep_2* are then
    compared with ``pg.wilcoxon``.

    Parameters:
        dep_1, dep_2: department names whose values are compared.
        ea: name of the event-attribute column.
        df: events with at least the columns "hadm_id", "department" and *ea*.

    Returns:
        A tuple ``(p, cles, rbc, n, mean_1, mean_2, std_1, std_2)`` where
        ``n`` is the number of values at *dep_1*. All eight entries are NaN
        when either department has fewer than 8 values. When the test itself
        fails, ``(1, 0, 0, 0, 0, 0, 0, 0)`` is returned so that the result is
        never counted as significant.
    """
    observed = df.loc[~df[ea].isna()]
    summary = observed.groupby("hadm_id").count()
    # "> 1 and < 3": keep the cases with exactly two observed values.
    paired = summary.loc[(summary["department"] > 1) & (summary["department"] < 3)]
    hadms_wo_na = list(paired.reset_index()["hadm_id"])
    df = df.loc[df["hadm_id"].isin(hadms_wo_na)]

    df1 = df.loc[(df["department"] == dep_1) & (~df[ea].isna())]
    df2 = df.loc[(df["department"] == dep_2) & (~df[ea].isna())]
    l1 = list(df1[ea])
    l2 = list(df2[ea])

    # Too few observations for a meaningful test.
    if len(l1) < 8 or len(l2) < 8:
        return (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)
    try:
        # Run the test once and read all statistics from the same result row;
        # .iloc[0] is used because the result is indexed by label.
        result = pg.wilcoxon(l1, l2)
        p = result["p-val"].iloc[0]
        cles = result["CLES"].iloc[0]
        rbc = result["RBC"].iloc[0]
        return (p, cles, rbc, len(l1), df1[ea].mean(), df2[ea].mean(), df1[ea].std(), df2[ea].std())
    except Exception:
        # Best effort: a failing test yields a non-significant placeholder.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return (1, 0, 0, 0, 0, 0, 0, 0)

In [None]:
# Wilcoxon tests of the continuous dynamic attributes for every
# directly-follows relation: once over all cases ('ALL') and once per variant.
# Records are collected in lists and converted once at the end;
# DataFrame.append() no longer exists in pandas >= 2.0.
con_columns = ['Act_1', 'Act_2', 'E_At', 'P', 'RBC', 'abs(RBC)', 'var', '#Patients', 'M1', 'M2', 'ST1', 'ST2', 'Directly']
con_all_rows = []
df_con_rows = []
for rel in dfg:
    # Extract the variants from the consecutive df
    consecutive_df = consecutive_hadms(final_pm_var, rel[0], rel[1])
    variants = consecutive_df["variant"].unique()
    # Keep the continuous attributes that are flagged at both activities
    # (the two 0/1 flags sum to 2).
    att_list = att_card.loc[att_card["department"].isin([rel[0], rel[1]])].sum().to_frame().reset_index()
    att_list = att_list.rename({"index":"e_At", 0:"cardinality"}, axis=1)
    att_list = att_list.loc[(att_list["cardinality"] == 2) & (att_list["e_At"].isin(attribute_list_con))].reset_index()
    for e_at in att_list["e_At"]:
        # Significance level divided by the number of attributes tested for
        # this relation (Bonferroni correction).
        alpha = 0.05 / len(att_list)
        # None stands for the test over all cases, reported with var 'ALL'.
        for var in [None, *variants]:
            df_var = consecutive_df if var is None else consecutive_df.loc[consecutive_df["variant"] == var]
            p, cles, rbc, num_p, m1, m2, st1, st2 = stat_value_con(rel[0], rel[1], e_at, df_var)
            record = {'Act_1': rel[0], 'Act_2': rel[1], 'E_At': e_at, 'P': p, "RBC": rbc, 'abs(RBC)': abs(rbc), 'var' : 'ALL' if var is None else var, '#Patients' : num_p, 'M1':m1, 'M2':m2, 'ST1':st1, 'ST2':st2, 'Directly':True}
            con_all_rows.append(record)
            if(p <= alpha):
                df_con_rows.append(record)

# con_All: every test result; df_con: only the significant ones.
con_All = pd.DataFrame(con_all_rows, columns=con_columns)
df_con = pd.DataFrame(df_con_rows, columns=con_columns)
       

In [None]:
# Wilcoxon tests of the continuous dynamic attributes for every
# eventually-follows relation; the results are added to con_All / df_con.
# Records are collected in lists and concatenated once;
# DataFrame.append() no longer exists in pandas >= 2.0.
efg_con_rows = []
efg_con_significant = []
for rel in efg_graph:
    # Extract the variants from the eventually-follows df
    consecutive_df = eventually_follow_hadms(final_pm_var, rel[0], rel[1])
    variants = consecutive_df["variant"].unique()
    # Keep the continuous attributes that are flagged at both activities.
    att_list = att_card.loc[att_card["department"].isin([rel[0], rel[1]])].sum().to_frame().reset_index()
    att_list = att_list.rename({"index":"e_At", 0:"cardinality"}, axis=1)
    att_list = att_list.loc[(att_list["cardinality"] == 2) & (att_list["e_At"].isin(attribute_list_con))].reset_index()
    for e_at in att_list["e_At"]:
        # Bonferroni-corrected significance level for this relation.
        alpha = 0.05 / len(att_list)
        # NOTE(review): the per-variant tests used to run only when the 'ALL'
        # test was significant; they now always run, as in the other three
        # test loops of this notebook. Confirm this is the intended design.
        # None stands for the test over all cases, reported with var 'ALL'.
        for var in [None, *variants]:
            df_var = consecutive_df if var is None else consecutive_df.loc[consecutive_df["variant"] == var]
            p, cles, rbc, num_p, m1, m2, st1, st2 = stat_value_con(rel[0], rel[1], e_at, df_var)
            record = {'Act_1': rel[0], 'Act_2': rel[1], 'E_At': e_at, 'P': p, "RBC": rbc, 'abs(RBC)': abs(rbc), 'var' : 'ALL' if var is None else var, '#Patients' : num_p, 'M1':m1, 'M2':m2, 'ST1':st1, 'ST2':st2, 'Directly':False}
            efg_con_rows.append(record)
            if(p <= alpha):
                efg_con_significant.append(record)

if efg_con_rows:
    con_All = pd.concat([con_All, pd.DataFrame(efg_con_rows)], ignore_index=True)
if efg_con_significant:
    df_con = pd.concat([df_con, pd.DataFrame(efg_con_significant)], ignore_index=True)

# Drop the tests that could not be computed (no p-value).
con_All = con_All.loc[~con_All["P"].isna()]

In [None]:
def stuart_maxwell(cons_df, dep1, dep2, att):
    """Test marginal homogeneity of a categorical attribute within cases.

    Cases ("hadm_id") with any missing value of *att* are discarded. For the
    remaining rows, taken in order, the value on the first row of a case is
    the source and the value on each later row of that case is the target. The
    source/target counts form a square table that is handed to statsmodels'
    ``SquareTable.homogeneity()``.

    Parameters:
        cons_df: events with the columns "hadm_id", "department" and *att*.
        dep1, dep2: not used by this function; kept for the callers.
        att: name of the categorical attribute column.

    Returns:
        ``(tab, p, chi2)``: the square frequency table (rows "Source",
        columns "Target"), the p-value and the test statistic.
    """
    graph_stats = cons_df[["hadm_id", "department", att]]
    # Drop every case that has at least one missing value of the attribute.
    to_remove = graph_stats.loc[graph_stats[att].isna()]["hadm_id"]
    graph_stats = graph_stats.loc[~graph_stats["hadm_id"].isin(to_remove)]
    abnormal_col = graph_stats.columns[2]
    categories = list(graph_stats[abnormal_col].value_counts().index)

    # Count the source -> target pairs in a dict instead of updating a
    # DataFrame cell for every row.
    transitions = {}
    curr_hadm = ""
    first_val = ""
    for hadm, value in zip(graph_stats["hadm_id"], graph_stats[abnormal_col]):
        if curr_hadm != hadm:
            curr_hadm = hadm
            first_val = value
        elif not (pd.isna(first_val) or pd.isna(value)):
            pair = (first_val, value)
            transitions[pair] = transitions.get(pair, 0) + 1

    # One record per category pair (pairs never observed get 0), built in a
    # single step; DataFrame.append() no longer exists in pandas >= 2.0.
    graph_cat = pd.DataFrame(
        [
            (source, target, transitions.get((source, target), 0))
            for source in categories
            for target in categories
        ],
        columns=["Source", "Target", "Frequency"],
    )
    tab = graph_cat.set_index(['Source', 'Target'])
    tab = tab.unstack()
    tab.columns = tab.columns.get_level_values(1)
    sqtab = ST(tab)
    test = sqtab.homogeneity()
    p = test.pvalue
    chi2 = test.statistic
    return tab, p, chi2


In [None]:
def stat_value_cat(dep_1, dep_2, ea, df):
    """Test a categorical attribute between two departments.

    Only cases ("hadm_id") with exactly two rows carrying a non-null value of
    *ea* are used. When the values at *dep_1* or at *dep_2* show fewer than two
    distinct categories, three NaNs are returned; otherwise the result of
    ``stuart_maxwell`` is reported.

    Returns:
        ``(p, chi2, num_p)`` where ``num_p`` is the number of non-null values
        at *dep_1*.
    """
    observed = df[df[ea].notna()]
    per_case = observed.groupby("hadm_id").count()
    # "> 1 and < 3": keep the cases with exactly two observed values.
    paired_cases = per_case[(per_case["department"] > 1) & (per_case["department"] < 3)]
    keep = list(paired_cases.reset_index()["hadm_id"])
    df = df[df["hadm_id"].isin(keep)]

    values_1 = df[(df["department"] == dep_1) & df[ea].notna()][ea]
    values_2 = df[(df["department"] == dep_2) & df[ea].notna()][ea]
    num_p = len(values_1)

    # The test needs at least two categories on both sides.
    if len(values_1.value_counts()) < 2 or len(values_2.value_counts()) < 2:
        return (np.nan, np.nan, np.nan)

    _, p, chi2 = stuart_maxwell(df, dep_1, dep_2, ea)
    return (p, chi2, num_p)

In [None]:
# Stuart-Maxwell tests of the categorical dynamic attributes for every
# directly-follows relation: once over all cases ('ALL') and once per variant.
# Records are collected in lists and converted once at the end;
# DataFrame.append() no longer exists in pandas >= 2.0.
cat_columns = ['Act_1', 'Act_2', 'E_At', 'P', 'Chi2', 'var', '#Patients', 'Directly']
cat_all_rows = []
df_cat_rows = []
for rel in dfg:
    # Extract the variants from the consecutive df
    consecutive_df = consecutive_hadms(final_pm_var, rel[0], rel[1])
    variants = consecutive_df["variant"].unique()
    # Keep the categorical attributes that are flagged at both activities
    # (the two 0/1 flags sum to 2).
    att_list = att_card.loc[att_card["department"].isin([rel[0], rel[1]])].sum().to_frame().reset_index()
    att_list = att_list.rename({"index":"e_At", 0:"cardinality"}, axis=1)
    att_list = att_list.loc[(att_list["cardinality"] == 2) & (att_list["e_At"].isin(attribute_list_cat))].reset_index()
    for e_at in att_list["e_At"]:
        # Bonferroni-corrected significance level for this relation.
        alpha = 0.05 / len(att_list)
        # Each test runs exactly once; the earlier extra call whose result was
        # discarded has been removed.
        # None stands for the test over all cases, reported with var 'ALL'.
        for var in [None, *variants]:
            df_var = consecutive_df if var is None else consecutive_df.loc[consecutive_df["variant"] == var]
            p, chi2, num_p = stat_value_cat(rel[0], rel[1], e_at, df_var)
            record = {'Act_1': rel[0], 'Act_2': rel[1], 'E_At': e_at, 'P': p, "Chi2": chi2, 'var' : 'ALL' if var is None else var, '#Patients' : num_p, 'Directly':True}
            cat_all_rows.append(record)
            if(p <= alpha):
                df_cat_rows.append(record)

# cat_All: every test result; df_cat: only the significant ones.
cat_All = pd.DataFrame(cat_all_rows, columns=cat_columns)
df_cat = pd.DataFrame(df_cat_rows, columns=cat_columns)
       

In [None]:
# Stuart-Maxwell tests of the categorical dynamic attributes for every
# eventually-follows relation; the results are added to cat_All / df_cat.
# Records are collected in lists and concatenated once;
# DataFrame.append() no longer exists in pandas >= 2.0.
efg_cat_rows = []
efg_cat_significant = []
for rel in efg_graph:
    # Extract the variants from the eventually-follows df
    consecutive_df = eventually_follow_hadms(final_pm_var, rel[0], rel[1])
    variants = consecutive_df["variant"].unique()
    # Keep the categorical attributes that are flagged at both activities.
    att_list = att_card.loc[att_card["department"].isin([rel[0], rel[1]])].sum().to_frame().reset_index()
    att_list = att_list.rename({"index":"e_At", 0:"cardinality"}, axis=1)
    att_list = att_list.loc[(att_list["cardinality"] == 2) & (att_list["e_At"].isin(attribute_list_cat))].reset_index()
    for e_at in att_list["e_At"]:
        # Bonferroni-corrected significance level for this relation.
        alpha = 0.05 / len(att_list)
        # Each test runs exactly once; the earlier extra call whose result was
        # discarded has been removed.
        # None stands for the test over all cases, reported with var 'ALL'.
        for var in [None, *variants]:
            df_var = consecutive_df if var is None else consecutive_df.loc[consecutive_df["variant"] == var]
            p, chi2, num_p = stat_value_cat(rel[0], rel[1], e_at, df_var)
            record = {'Act_1': rel[0], 'Act_2': rel[1], 'E_At': e_at, 'P': p, "Chi2": chi2, 'var' : 'ALL' if var is None else var, '#Patients' : num_p, 'Directly':False}
            efg_cat_rows.append(record)
            if(p <= alpha):
                efg_cat_significant.append(record)

if efg_cat_rows:
    cat_All = pd.concat([cat_All, pd.DataFrame(efg_cat_rows)], ignore_index=True)
if efg_cat_significant:
    df_cat = pd.concat([df_cat, pd.DataFrame(efg_cat_significant)], ignore_index=True)

# Drop the tests that could not be computed (no p-value).
cat_All = cat_All.loc[~cat_All["P"].isna()]
       

In [None]:
# Write all continuous-attribute test results that have a p-value to CSV.
con_All.to_csv("con_All.csv")

In [None]:
# Write all categorical-attribute test results that have a p-value to CSV.
cat_All.to_csv("cat_All.csv")

In [None]:
# "Con_E_At" is the part of the attribute name after its first space.
# `n` is passed by keyword because pandas >= 2.0 rejects it as a positional
# argument, and .str.get(1) yields a missing value (instead of raising
# KeyError) when a name contains no space.
df_cat["Con_E_At"] = df_cat["E_At"].str.split(' ', n=1).str.get(1)

In [None]:
# Write the significant categorical-attribute results to CSV.
df_cat.to_csv("df_cat.csv")

In [None]:
# Write the significant continuous-attribute results to CSV.
df_con.to_csv("df_con.csv")

In [None]:
# Write the event log, extended by the "variant" column, to CSV.
final_pm_var.to_csv("Kidney_Failure_Variant.csv")