In [1]:
import pandas as pd
import statistics
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

Import an event log as a pandas dataframe

In [2]:
df = pd.read_csv("small3insert.csv")
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

In [3]:
df.head()

Unnamed: 0,Case,Event,Activity,Timestamp,Resource,Resource_failure_rate,Resource_Pass/Fail,order,resource_anomaly_type,resource_parameter,trace_change_resource,variant_num
0,case_0,9367,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,0.026984,0,1,normal,,0,var_358
1,case_0,9368,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,0.018454,0,2,normal,,0,var_358
2,case_0,9369,Activity C,1970-01-01 11:00:00,Resource_Group1_res_0,0.002551,0,3,normal,,0,var_358
3,case_0,9370,Activity D,1970-01-01 12:00:00,Resource_Group2_res_2,0.029775,0,4,normal,,0,var_358
4,case_0,9371,Activity E,1970-01-01 13:00:00,Resource_Group0_res_0,0.018454,0,5,normal,,0,var_358


Create an event log that retains only the attributes caseid, activity, timestamp, resource and outcome. Rename the attributes as “caseid, activity, ts, resource, outcome” if names are different (note: for BPIC17 the outcome label coliumn is “(case) Accepted”, for the small log` is “resource_anomaly_type”)

In [4]:
df = df[["Case","Activity","Timestamp","Resource","resource_anomaly_type","Event"]]

In [5]:
df.columns = ["caseid","activity", "ts","resource","outcome","event"]

In [6]:
# In order to calculate the delta time we needed to check if the caseID is the same
list_ = []
for i in range(len(df)-1):
    if df.iloc[i+1]["caseid"] == df.iloc[i]["caseid"]:
        list_ = list_ + [df.iloc[i+1]["ts"]-df.iloc[i]["ts"] ]
    else:
        list_ = list_ + [list_[-1]]
list_ = list_ + [list_[-1]]
df["t"] = list_

In [7]:
df.head()

Unnamed: 0,caseid,activity,ts,resource,outcome,event,t
0,case_0,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,normal,9367,0 days 01:00:00
1,case_0,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,normal,9368,0 days 01:00:00
2,case_0,Activity C,1970-01-01 11:00:00,Resource_Group1_res_0,normal,9369,0 days 01:00:00
3,case_0,Activity D,1970-01-01 12:00:00,Resource_Group2_res_2,normal,9370,0 days 01:00:00
4,case_0,Activity E,1970-01-01 13:00:00,Resource_Group0_res_0,normal,9371,0 days 01:00:00


Create a function to extract the prefixes of length L from the event log. This function retains only the first L events of all cases that have at least L events    

In [8]:
def first_L_events(df,L):
    df_t = df.groupby(["caseid"]).agg(list).reset_index()[["caseid","event"]]

    list_len = df_t["event"].map(lambda x: len(x))
    df_t["len"] = list_len
    df_L = df_t.loc[df_t['len'] >= L] 
    df_L["event_prefix"] = df_L["event"].map(lambda x: x[0:L])
    return df_L[["caseid","event_prefix"]]
 
    
    

In [9]:
first_L_events(df, 5)

Unnamed: 0,caseid,event_prefix
0,case_0,"[9367, 9368, 9369, 9370, 9371]"
1,case_1,"[13225, 13226, 13227, 13227, 13227]"
2,case_10,"[1025, 1026, 1027, 1028, 1029]"
3,case_100,"[39553, 39554, 39555, 39556, 39557]"
4,case_1000,"[27951, 27952, 27953, 27954, 27955]"
...,...,...
4995,case_995,"[11080, 11081, 11082, 11083, 11084]"
4996,case_996,"[9536, 9537, 9538, 9539, 9540]"
4997,case_997,"[7567, 7568, 7569, 7570, 7571]"
4998,case_998,"[8949, 8950, 8951, 8952, 8953]"


Create a function that does aggregation encoding of a prefix extracted from the event log: Activity and resource are aggregated using frequency Timestamp are aggregated using average

In [10]:
def aggregation_encoding(df):
    def avg(x):
        avg = x[0]
        for i in x[1:]:
            avg = avg + i
        return avg/len(x)
    
    
    df_grouped = df.groupby(["caseid"]).agg(list)
    for activity in df["activity"].unique():
        df_grouped[activity] = df_grouped["activity"].map(lambda x : x.count(activity)/len(x))

    for activity in df["resource"].unique():
        df_grouped[activity] = df_grouped["resource"].map(lambda x : x.count(activity)/len(x))
    df_grouped["avg_t"] = df_grouped["t"].map(lambda x : avg(x))


    return df_grouped[df_grouped.columns[6:]].copy()
    

In [11]:
a = aggregation_encoding(df)
a.head()

Unnamed: 0_level_0,Activity A,Activity B,Activity C,Activity D,Activity E,Activity T,Activity F,Activity G,Activity R,Activity K,...,Activity I,Activity J,Activity Q,Resource_Group2_res_1,Resource_Group0_res_0,Resource_Group1_res_0,Resource_Group2_res_2,Resource_Group2_res_0,Resource_Group1_res_1,avg_t
caseid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
case_0,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.142857,0.571429,0.142857,0.142857,0.0,0.0,0 days 01:00:00
case_1,0.083333,0.083333,0.083333,0.083333,0.083333,0.0,0.083333,0.083333,0.083333,0.083333,...,0.0,0.0,0.0,0.166667,0.5,0.0,0.0,0.083333,0.25,0 days 00:50:00
case_10,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.142857,0.571429,0.142857,0.0,0.142857,0.0,0 days 01:00:00
case_100,0.1,0.1,0.1,0.1,0.1,0.0,0.1,0.0,0.0,0.1,...,0.0,0.0,0.0,0.1,0.4,0.3,0.0,0.1,0.1,0 days 01:00:00
case_1000,0.1,0.1,0.1,0.1,0.1,0.0,0.1,0.0,0.1,0.1,...,0.0,0.0,0.0,0.1,0.4,0.3,0.0,0.1,0.1,0 days 01:00:00


Create a function that does index-based encoding of a extrcted from the evebnt log.
Activity and resource are one-hot encoded Timestamp is left as-is


In [15]:
def index_encoding(df):
    df_grouped= df.groupby(["caseid"]).agg(list)
    max_lenght = df_grouped["activity"].map(lambda x: len(x)).max()
    
    for i in range(0,max_lenght):
        for act in df["activity"].unique():
            df_grouped[f"{act}_{i+1}"] = df_grouped["activity"].map(lambda x : 1 if act in x else 0).copy()
        for resource in df["resource"].unique():
            df_grouped[f"{resource}_{i+1}"] = df_grouped["resource"].map(lambda x : 1 if resource in x else 0).copy() 
    
        t_i = []
    
        for index in range(0,len(df_grouped)):
            try:
                t_i.append(df_grouped.iloc[index]["ts"][i])
            except:
                t_i.append(0)
        
        df_grouped[f"t_{i+1}"] = t_i

    df_grouped["out"] = df_grouped["outcome"].map(lambda x : x[0])
    return df_grouped[df_grouped.columns[6:]].copy()

In [16]:
a  =index_encoding(df)
a.head()

Unnamed: 0_level_0,Activity A_1,Activity B_1,Activity C_1,Activity D_1,Activity E_1,Activity T_1,Activity F_1,Activity G_1,Activity R_1,Activity K_1,...,Activity J_13,Activity Q_13,Resource_Group2_res_1_13,Resource_Group0_res_0_13,Resource_Group1_res_0_13,Resource_Group2_res_2_13,Resource_Group2_res_0_13,Resource_Group1_res_1_13,t_13,out
caseid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
case_0,1,1,1,1,1,1,1,0,0,0,...,0,0,1,1,1,1,0,0,0,normal
case_1,1,1,1,1,1,0,1,1,1,1,...,0,0,1,1,0,0,1,1,0,insert
case_10,1,1,1,1,1,1,1,0,0,0,...,0,0,1,1,1,0,1,0,0,normal
case_100,1,1,1,1,1,0,1,0,0,1,...,0,0,1,1,1,0,1,1,0,normal
case_1000,1,1,1,1,1,0,1,0,1,1,...,0,0,1,1,1,0,1,1,0,normal


Train a decision tree (DT) and a randomforest (RF) from the scikit-learn package for outcome label prediction for the event log for prefix lengths L=2 and L=5.  Split 70-30 for training-testing (no temporal splitting or cross-validation needed!).

### Random Forest

In [17]:
df_rf = aggregation_encoding(df)

In [18]:
df_rf["avg_t"] = df_rf["avg_t"].map(lambda x : x.total_seconds())

In [19]:
df_grouped= df.groupby(["caseid"]).agg(list)
df_rf["out"] = df_grouped["outcome"].map(lambda x : x[0])

In [20]:
events = first_L_events(df,2)

In [22]:
XY = df_rf[df_rf.index.isin(events["caseid"])]

In [23]:
labels = XY.pop('out').copy()
data = XY.copy()

In [24]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3)

In [25]:
clf = RandomForestClassifier(max_depth=5, oob_score=True)
clf.fit(X_train, y_train)

In [26]:
prediction = clf.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score

In [28]:
accuracy_score(y_test, prediction)

1.0

### Decision Tree

In [29]:
clf = RandomForestClassifier(max_depth=5, oob_score=True,n_estimators=1)
clf.fit(X_train, y_train)

In [30]:
prediction = clf.predict(X_test)

In [31]:
accuracy_score(y_test, prediction)

0.9626666666666667