This notebook takes the generated event log in CSV format and classifies the event attributes according to their process characteristics. It then calculates the coefficient of variation (CV), which is used to measure the degree of variety.

In [None]:
import numpy as np
from psycopg2 import connect
import pandas as pd
import pm4py
import numpy as np
import pandasql as ps
from scipy.stats import variation
import matplotlib.cm as cm
import matplotlib
import sklearn.preprocessing as sk

In [None]:
# Load the generated event log; later cells expect it to contain the columns named by `activity` and `case_id`.
final_pm = pd.read_csv("Heart_Failure_Log.csv")

In [None]:
# Optional: drop columns that are not needed for the analysis.
final_pm = final_pm.drop(
    columns=["subject_id", "transfer_id", "intime", "outtime", "Unnamed: 0", "stay_id"]
)

In [None]:
# Name of the column that holds the activity label of each event.
activity = "department"

In [None]:
# Name of the column that holds the case identifier (later cells group by it to form traces).
case_id = "hadm_id"

In [None]:
# Distinct activity labels found in the log.
activities = final_pm[activity].unique()

In [None]:
# Empty frame with one column per activity.
# NOTE(review): `matrix` is not referenced by any later cell visible here - confirm it is still needed.
matrix = pd.DataFrame(data=None, columns=activities)

In [None]:
# Identify which attributes are recorded for which activity.
# For every activity, an attribute is flagged 1 when it is non-null in more than
# `presence_threshold` of that activity's events, and 0 otherwise.
presence_threshold = 0.05
att_rows = []
for dep in activities:
    print(dep)
    dep_data = final_pm.loc[final_pm[activity] == dep]
    # Non-null count per attribute; the activity column is excluded here and
    # re-added below as the label of the row.
    non_null = dep_data.drop(columns=[activity]).notna().sum()
    flags = (non_null > len(dep_data) * presence_threshold).astype(int).to_dict()
    flags[activity] = dep
    att_rows.append(flags)
# DataFrame.append was removed in pandas 2.0, so the rows are collected and the
# frame is built once. Columns follow the order of final_pm; rows get a fresh
# 0..n-1 index (one row per activity).
att_card = pd.DataFrame(att_rows, columns=final_pm.columns)
    


In [None]:
# The case identifier is not an attribute of an activity; remove its column in place.
att_card.drop(columns=[case_id], inplace=True)

In [None]:
# Persist the activity/attribute presence table.
att_card.to_csv("attributesForActivity.csv")

In [None]:
# For each attribute: the number of activities it occurs in and its mean number of occurrences per trace

In [None]:
# Per case, count the non-null values of every column. Because the lambda is passed
# inside a collection, the result carries two-level column labels; the second level
# is removed again further down via droplevel(1).
number_trace_occurence = final_pm.groupby(case_id).agg({lambda x: x.notnull().sum()})

In [None]:
# Drop the activity column (whichever column `activity` names); it is excluded from the attribute statistics.
number_trace_occurence.drop(activity, axis=1, inplace=True)

In [None]:
# Flatten the two-level column labels produced by agg: keep the attribute name, discard the second level.
number_trace_occurence.columns = number_trace_occurence.columns.droplevel(1)

In [None]:
# Average the per-case counts over all cases, giving one value per attribute.
number_trace_occurence = number_trace_occurence.mean()

In [None]:
# Name the Series; classify_attributes() later reads this exact column label.
number_trace_occurence = number_trace_occurence.rename("numberOfTraceOccurence (Mean)")

In [None]:
# Number of distinct activities each attribute occurs in; filled per attribute by the loop that follows.
# An explicit dtype avoids the version-dependent default dtype (and the
# DeprecationWarning pandas 1.x emits) for an empty Series created without one.
number_of_activities = pd.Series([], name="numberOfActivities", dtype="int64")

In [None]:
# For every attribute, count the distinct activities in which it carries a value.
for col in final_pm.columns:
    if col == case_id or col == activity:
        continue
    has_value = final_pm[col].notna()
    number_of_activities[col] = final_pm.loc[has_value, activity].nunique()

In [None]:
# Combine both measures side by side into one table indexed by attribute name.
process_characteristics = pd.concat([number_of_activities, number_trace_occurence], axis=1)

In [None]:
# Label a column "categorical" when its distinct values make up less than 5 %
# of its non-null entries, otherwise "continuous".
for col in final_pm.columns:
    distinct_share = final_pm[col].nunique() / final_pm[col].count()
    process_characteristics.loc[col, "type"] = (
        "categorical" if distinct_share < 0.05 else "continuous"
    )

In [None]:
# The type loop runs over every column of final_pm, so it also adds rows for the
# case-id and activity columns; remove those two rows again.
process_characteristics = process_characteristics.drop(labels=[case_id, activity])

In [None]:
# Alias only (no copy); `x` is reassigned from classify_attributes() a few cells below.
x = process_characteristics

In [None]:
def classify_attributes(proc_c):
    """Add a ``"class"`` column that labels every attribute's process characteristic.

    Parameters
    ----------
    proc_c : pandas.DataFrame
        One row per attribute with the columns ``"numberOfActivities"`` and
        ``"numberOfTraceOccurence (Mean)"``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``proc_c`` (modified in place), with ``"class"`` set to

        * ``"static"``       - exactly one activity and a mean occurrence of at most 1,
        * ``"semi-dynamic"`` - more than one activity and a mean occurrence of at most 1,
        * ``"dynamic"``      - everything else (including rows with missing values).
    """
    n_activities = proc_c["numberOfActivities"]
    at_most_once = proc_c["numberOfTraceOccurence (Mean)"] <= 1

    # Start from the fallback label so the column exists even for an empty frame,
    # then overwrite the rows that match one of the two stricter rules.
    proc_c["class"] = "dynamic"
    proc_c.loc[(n_activities == 1) & at_most_once, "class"] = "static"
    proc_c.loc[(n_activities > 1) & at_most_once, "class"] = "semi-dynamic"
    return proc_c

In [None]:
# classify_attributes writes the "class" column into process_characteristics and returns that same frame.
x = classify_attributes(process_characteristics)

In [None]:
# Move the attribute names out of the index into a regular column (renamed in the next cell).
x = x.reset_index()

In [None]:
# NOTE: despite its name, the "Activity" column holds attribute (column) names of the log, not activity labels.
x = x.rename({"index":"Activity"}, axis=1)

In [None]:
# Take an independent copy: later cells add and fill a "CV" column, which would
# trigger SettingWithCopyWarning if this were only a column selection of x.
attribute_classes = x[["Activity", "class", "type"]].copy()

In [None]:
# Coefficient of variation per attribute; only rows classified as dynamic are filled in later.
# Initialised as float so that writing the computed CVs does not change the column's dtype.
attribute_classes["CV"] = 0.0

In [None]:
# Work on a deep copy so the encoding and filtering below leave final_pm untouched.
recom = final_pm.copy(deep=True)

In [None]:
# Min-max scaler configured for the target range (1, 2).
# NOTE(review): `scaler` is never applied in the cells visible here - confirm whether normalisation was intended.
scaler = sk.MinMaxScaler(feature_range=(1,2))

In [None]:
# Encode the values of every non-continuous attribute as integer codes
# (1 = most frequent value, 2 = second most frequent, ...). Continuous
# attributes are left unchanged; no scaling is applied here.
#
# The value -> code mapping is built once per column and applied in a single
# pass. Replacing one value at a time starting from final_pm[col] would
# overwrite recom[col] on every iteration and keep only the last replacement.
attr_types = dict(zip(attribute_classes["Activity"], attribute_classes["type"]))
for col in recom.columns:
    if col == case_id or col == activity:
        continue
    if attr_types[col] == "continuous":
        continue
    codes = {}
    for i, value in enumerate(final_pm[col].value_counts().index, start=1):
        # Values that already equal 1 or 2 are kept unchanged but still consume a code.
        # NOTE(review): such a value can coincide with the code given to another
        # category - confirm that this is intended.
        if value != 1 and value != 2:
            codes[value] = i
    # Unmapped entries (NaN, or the untouched 1/2 values) pass through as they are.
    recom[col] = final_pm[col].map(lambda v, codes=codes: codes.get(v, v))


In [None]:
def co_var(x):
    """Return the coefficient of variation of ``x`` via ``scipy.stats.variation``.

    NaN entries are left out of the computation (``nan_policy="omit"``); all
    other options keep scipy's defaults. Used as the aggregation function of
    the per-case groupby further down.
    """
    return variation(x, nan_policy = "omit")

In [None]:
# Filter recom: remove the activity column and every attribute that is not classified as dynamic

In [None]:
# Remove the activity column first, then every attribute whose class is not "dynamic".
# The case-id column is never dropped; it is still needed to group by case below.
recom.drop(columns=[activity], inplace=True)
non_dynamic = [
    col
    for col in recom.columns
    if col not in (case_id, activity)
    and attribute_classes.loc[attribute_classes["Activity"] == col, "class"].iloc[0] != "dynamic"
]
recom.drop(columns=non_dynamic, inplace=True)

In [None]:
# Coefficient of variation of every remaining attribute within each case.
# Group by the configured case-id column rather than a hard-coded "hadm_id",
# so a different `case_id` setting is honoured here as well.
co_vars = recom.groupby(case_id).agg(co_var)

In [None]:
# Mean CV per attribute across all cases, largest first.
final_co_vars = co_vars.mean().sort_values(ascending=False)

In [None]:
# Copy the mean CV of every dynamic attribute into the "CV" column; all other rows keep their initial value.
is_dynamic = attribute_classes["class"] == "dynamic"
for attr in attribute_classes.loc[is_dynamic, "Activity"]:
    attribute_classes.loc[attribute_classes["Activity"] == attr, "CV"] = final_co_vars[attr]

In [None]:
# Persist the final table (attribute name, class, type, CV).
attribute_classes.to_csv("attributeClasses.csv")