In [1]:
# import libraties
import pandas as pd
import numpy as np
import json
import pickle

# https://pm4py.fit.fraunhofer.de/documentation
import pm4py
from pm4py.objects.log.util.log import project_traces
from pm4py.objects.log.util import interval_lifecycle
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
from pm4py.objects.log.obj import EventLog, Trace

import warnings
warnings.filterwarnings("ignore")

In [2]:
# function to project the trace
def project_nth(log, index):
    print(str(project_traces(log)[index]))

In [3]:
# read data in csv 
trace = pd.read_csv('../data/BPIC2020_CSV/filterd_TravelPermits.csv')
trace.head()

Unnamed: 0,case,event,startTime,completeTime,dec_id_5,dec_id_6,dec_id_3,dec_id_4,dec_id_1,dec_id_2,...,Task_6,org:resource,event_id,org:role,concept:name,time:timestamp,@@index,@@case_index,start_timestamp,case:concept:name
0,travel permit 10022,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,2018-02-20 13:51:27+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10031_0,EMPLOYEE,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,0,0,2018-02-20 13:51:27+00:00,travel permit 10022
1,travel permit 10022,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,2018-02-20 13:51:34+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10030_0,ADMINISTRATION,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,1,0,2018-02-20 13:51:34+00:00,travel permit 10022
2,travel permit 10022,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,2018-02-20 16:27:33+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10029_0,BUDGET OWNER,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,2,0,2018-02-20 16:27:33+00:00,travel permit 10022
3,travel permit 10022,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,2018-02-21 12:58:49+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,st_step 10028_0,SUPERVISOR,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,3,0,2018-02-21 12:58:49+00:00,travel permit 10022
4,travel permit 10022,Start trip,2018-03-17 00:00:00+00:00,2018-03-17 00:00:00+00:00,,,,,declaration 10026,,...,,STAFF MEMBER,rv_travel permit 10022_6,EMPLOYEE,Start trip,2018-03-17 00:00:00+00:00,4,0,2018-03-17 00:00:00+00:00,travel permit 10022


# Comman Data Pre Processing

In [4]:
# remove columns where missing > 30%

# get persentage of missing values in a column
percent_missing = trace.isnull().sum() * 100 / len(trace)
missing_value_df = pd.DataFrame({'column_name': trace.columns,
                                 'percent_missing': percent_missing})

# extract list of coumns which are more then 30%
miss_ls = list(percent_missing[percent_missing>30].keys())

# keep the remaining columns where missing data is less then 30%
trace = trace.loc[:, ~trace.columns.isin(miss_ls)]
trace.head()

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,OrganizationalEntity,org:resource,event_id,org:role,concept:name,time:timestamp,@@index,@@case_index,start_timestamp,case:concept:name
0,travel permit 10022,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,2018-02-20 13:51:27+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,organizational unit 65466,STAFF MEMBER,st_step 10031_0,EMPLOYEE,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,0,0,2018-02-20 13:51:27+00:00,travel permit 10022
1,travel permit 10022,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,2018-02-20 13:51:34+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,organizational unit 65466,STAFF MEMBER,st_step 10030_0,ADMINISTRATION,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,1,0,2018-02-20 13:51:34+00:00,travel permit 10022
2,travel permit 10022,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,2018-02-20 16:27:33+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,organizational unit 65466,STAFF MEMBER,st_step 10029_0,BUDGET OWNER,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,2,0,2018-02-20 16:27:33+00:00,travel permit 10022
3,travel permit 10022,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,2018-02-21 12:58:49+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,organizational unit 65466,STAFF MEMBER,st_step 10028_0,SUPERVISOR,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,3,0,2018-02-21 12:58:49+00:00,travel permit 10022
4,travel permit 10022,Start trip,2018-03-17 00:00:00+00:00,2018-03-17 00:00:00+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,organizational unit 65466,STAFF MEMBER,rv_travel permit 10022_6,EMPLOYEE,Start trip,2018-03-17 00:00:00+00:00,4,0,2018-03-17 00:00:00+00:00,travel permit 10022


In [5]:
# adding time features month and year
trace['startTime'] = pd.to_datetime(trace['startTime'])
trace['completeTime'] = pd.to_datetime(trace['completeTime'])
trace['year'] = trace['startTime'].dt.year
trace['month'] = trace['startTime'].dt.month

# converting dataframe to event log
trace_log = pm4py.format_dataframe(trace, case_id='case', activity_key='event', timestamp_key='completeTime', start_timestamp_key='startTime')
trace_log = pm4py.convert_to_event_log(trace_log)

# add other time features
trace_log = interval_lifecycle.assign_lead_cycle_time(trace_log)

### Keeping traces only till rejection event or complete if not rejected

- No bucketing/clustering/binding is applied (to do if needed)
- we extracted traces before the rejection for decleration, because we are intrested in prediction for the traces before that only 
- some process do not follow the above path but those are considered as accapted

In [6]:
# extract traces only till the decleration is rejected(included), otherwise include it payment handled 
prefix_traces = []
for trace in trace_log:
    trace_end_flag = False
    for i,event in enumerate(trace):
        if "Declaration REJECTED" in event['event']:
            trace_end_flag = True
            i+=1
            break
        if "Payment Handled" in event['event']:
            trace_end_flag = True
    if trace_end_flag:
        prefix_traces.append(Trace(trace[:i], attributes = trace.attributes))
prefix_traces = EventLog(prefix_traces)

In [7]:
len(prefix_traces)

5131

### Extracting target variables

In [8]:
# convert extracted traces to dataframe
trace = pm4py.convert_to_dataframe(prefix_traces)
trace.head()

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,year,month,concept:name,time:timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name
0,travel permit 10022,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,2018-02-20 13:51:27+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,0.0,0.0,0.0,0.0,1.0,travel permit 10022
1,travel permit 10022,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,2018-02-20 13:51:34+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,0.0,7.0,7.0,7.0,0.0,travel permit 10022
2,travel permit 10022,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,2018-02-20 16:27:33+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,0.0,9366.0,9366.0,9359.0,0.0,travel permit 10022
3,travel permit 10022,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,2018-02-21 12:58:49+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,0.0,32842.0,32842.0,23476.0,0.0,travel permit 10022
4,travel permit 10022,Start trip,2018-03-17 00:00:00+00:00,2018-03-17 00:00:00+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,3,Start trip,2018-03-17 00:00:00+00:00,0.0,659313.0,659313.0,626471.0,0.0,travel permit 10022


In [9]:
# saving original trace dataframe for one hot encoding 
original_df = trace.copy()

# train test split

- Do temporal train test split and preprocess them seperately so that no data leakage will be present
- we split data based on 70% 30 % (Temporal split)
- classifier will be trained with all cases that started before a given date T1 which would represent a current point in time in a real-life scenario, and the testing is done only on cases that start afterwards.

In [10]:
# get completion time and sort
completion_time_ls = list(trace.groupby(['case'])['completeTime'].max())
completion_time_ls = sorted(completion_time_ls)

In [11]:
# split on 70% max time 
train_split_portion = 0.70
val_split_seperation = 0.85
total_data = len(completion_time_ls)
train_len = int(train_split_portion*total_data)
val_len = int(val_split_seperation*total_data)
last_train_completion_time = completion_time_ls[train_len]
last_val_completion_time = completion_time_ls[val_len]
val_start_time = last_train_completion_time
last_train_completion_time,last_val_completion_time

(Timestamp('2018-09-18 09:50:18+0000', tz='UTC'),
 Timestamp('2018-11-08 13:31:56+0000', tz='UTC'))

In [12]:
# take all traces where start dates are after the last_train_completion_time
dtype_list = list(trace.dtypes) # get original types of the columns
train_df = pd.DataFrame(columns = trace.columns)
test_df = pd.DataFrame(columns = trace.columns)
val_df = pd.DataFrame(columns = trace.columns)
train_count,test_count,val_count = 0,0,0
intersecting_traces = []
for name, group in trace.groupby(['case'],as_index=False):
    if group['completeTime'].iloc[-1] <= last_train_completion_time:
        train_df = train_df.append(group)
        train_count+=1
    elif (group['startTime'].iloc[0] >= last_train_completion_time) and (group['completeTime'].iloc[-1] <= last_val_completion_time):
        val_df = val_df.append(group)
        val_count+=1        
    elif group['startTime'].iloc[0] >= last_val_completion_time:
        test_df = test_df.append(group)
        test_count+=1
    else:
        intersecting_traces.append(group)


# converting train and test to their original data types
for i,col in enumerate(train_df.columns):
    train_df[col] = train_df[col].astype(dtype_list[i])

for i,col in enumerate(test_df.columns):
    test_df[col] = test_df[col].astype(dtype_list[i])

for i,col in enumerate(val_df.columns):
    val_df[col] = val_df[col].astype(dtype_list[i])


print("train, val and test count")
print(train_count,val_count,test_count)

train, val and test count
3592 157 167


In [13]:
# loss of traces due to temporal intersection
# these are the traces which started and intersecting with split times
len(intersecting_traces)

1215

# Define common variables

In [14]:
# trace length and saving path
t_length = 10
save_path_base = '../data/training_data/'

# ====================================
# Preparing Train Data

- define trace length and df_type (train or test)
- Working on train and test seperately, so run whole code for the same trace length for train and then for test
- the repeating code will be modularized when developing python file

In [15]:
# permits is just for the variable name, as below code is using permit as dataframe name
df_type = 'train'
permits = train_df

# Extracting Prefix till specified length

- Trace length ( 10 for now - no specific reason, process is not big so randomly chosen 10 / to do)
- so basically all the traces of length less than equal to 10 are chosen, 
    - We need to deal if vector length is not same (10 in our case) - we performed padding (adding zeros at the end) during encoding for those traces
    - so we have traces with length 1,2,... 10 
    - any trace with length less then equal to 10 can be predicted using our model

In [16]:
# converting dataframe to event log
trace_log = pm4py.format_dataframe(permits, case_id='case', activity_key='event', timestamp_key='completeTime', start_timestamp_key='startTime')
trace_log = pm4py.convert_to_event_log(trace_log)

### Extracting target variable

- In our case target variable is REJECTED Declaration, in the process whoever rejects the Declaration (Administrartor, pre-approvar, etc.) it is considered as Declaration REJECTED so using it to identify our target variable

- One of our assumption is - If the decleration is not submitted by Employee and the payment is handeled directly, it is also considered as Declaration Accapted (so basically all the Declarations which is not rejected by employee are considerd as Accapted )

- we are not dealing with situations where it got rejected twice. For now we are only interested in 1st time rejection


In [17]:
# to extract target varaible, 
# if event starts with the name decleration rejected it is considered as rejected
declerations = []
for trace in trace_log:
    flag = False
    for i,event in enumerate(trace):
        if "Declaration REJECTED" in event['event']:
            flag = True
            break
    
    if flag:
        declerations.append(1)
    else:
        declerations.append(0)

In [18]:
# extract traces only till the decleration is rejected (excluded), otherwise complete trace 
prefix_traces = []
for trace in trace_log:
    for i,event in enumerate(trace):
        if "Declaration REJECTED" in event['event']:
            break
    prefix_traces.append(Trace(trace[:i], attributes = trace.attributes))
prefix_traces = EventLog(prefix_traces)

In [19]:
# generate prefixes, note that we need to add the casts to EventLog and Trace to make sure that the result is a PM4Py EventLog object
trace_prefixes = EventLog([Trace(trace[0:t_length], attributes = trace.attributes) for trace in prefix_traces])

In [20]:
# check the trace length
print([len(trace) for trace in prefix_traces][0:15])
print([len(trace) for trace in trace_prefixes][0:15])

[16, 16, 8, 8, 6, 17, 8, 10, 8, 9, 8, 15, 10, 8, 7]
[10, 10, 8, 8, 6, 10, 8, 10, 8, 9, 8, 10, 10, 8, 7]


In [21]:
# check if all good
project_nth(trace_prefixes, 0)

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Permit APPROVED by BUDGET OWNER', 'Permit FINAL_APPROVED by SUPERVISOR', 'Start trip', 'End trip', 'Declaration SUBMITTED by EMPLOYEE', 'Declaration APPROVED by ADMINISTRATION', 'Declaration SUBMITTED by EMPLOYEE', 'Declaration APPROVED by ADMINISTRATION']


In [22]:
# to check which traces are not of our desired length (these will be padded while preparing the the training data)
for i,trace in enumerate(trace_prefixes):
    if len(trace)!=t_length:
        print(i, len(trace))
        break

2 8


In [23]:
# convert logs to dataframe
# final base dataframe
df = pm4py.convert_to_dataframe(trace_prefixes)
df.head(5)

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,year,month,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,concept:name,time:timestamp,case:concept:name
0,travel permit 10022,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,2018-02-20 13:51:27+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,0.0,0.0,0.0,0.0,1.0,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,travel permit 10022
1,travel permit 10022,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,2018-02-20 13:51:34+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,0.0,7.0,7.0,7.0,0.0,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,travel permit 10022
2,travel permit 10022,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,2018-02-20 16:27:33+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,0.0,9366.0,9366.0,9359.0,0.0,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,travel permit 10022
3,travel permit 10022,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,2018-02-21 12:58:49+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,0.0,32842.0,32842.0,23476.0,0.0,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,travel permit 10022
4,travel permit 10022,Start trip,2018-03-17 00:00:00+00:00,2018-03-17 00:00:00+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,3,0.0,659313.0,659313.0,626471.0,0.0,Start trip,2018-03-17 00:00:00+00:00,travel permit 10022


In [24]:
df.head(5)

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,year,month,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,concept:name,time:timestamp,case:concept:name
0,travel permit 10022,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,2018-02-20 13:51:27+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,0.0,0.0,0.0,0.0,1.0,Permit SUBMITTED by EMPLOYEE,2018-02-20 13:51:27+00:00,travel permit 10022
1,travel permit 10022,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,2018-02-20 13:51:34+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,0.0,7.0,7.0,7.0,0.0,Permit APPROVED by ADMINISTRATION,2018-02-20 13:51:34+00:00,travel permit 10022
2,travel permit 10022,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,2018-02-20 16:27:33+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,0.0,9366.0,9366.0,9359.0,0.0,Permit APPROVED by BUDGET OWNER,2018-02-20 16:27:33+00:00,travel permit 10022
3,travel permit 10022,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,2018-02-21 12:58:49+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,2,0.0,32842.0,32842.0,23476.0,0.0,Permit FINAL_APPROVED by SUPERVISOR,2018-02-21 12:58:49+00:00,travel permit 10022
4,travel permit 10022,Start trip,2018-03-17 00:00:00+00:00,2018-03-17 00:00:00+00:00,travel permit 10022,395,False,travel permit number 10023,-50.662542,2134.047941,...,2018,3,0.0,659313.0,659313.0,626471.0,0.0,Start trip,2018-03-17 00:00:00+00:00,travel permit 10022


## Check last activity distribution for all trace length

- Run the below code when trace length is 10
- because it will give you all traces ending at 10,8,6,4 lengths
- if trace is smaller then nth (4,6,8,10) length we keep the last activity if the trace

In [25]:
# save distribution
event_distribution_LIST = []
# event_distribution_LIST['10'],event_distribution_LIST['8'],event_distribution_LIST['6'],event_distribution_LIST['4'] = [],[],[],[]
trace_lengths = [4,6,8,10]
for name, group in df.groupby(['case'],sort=False):
    # print(name)
    event_list = list(group['event'])
    for t_len in trace_lengths:
        index = t_len-1
        try:
            event_distribution_LIST.append([event_list[index],t_len])
        except:
            event_distribution_LIST.append([event_list[-1],t_len])

In [26]:
# create dataframe
act_distribution = pd.DataFrame(event_distribution_LIST, columns=['Activities','Trace length'])
act_distribution.head()

Unnamed: 0,Activities,Trace length
0,Permit FINAL_APPROVED by SUPERVISOR,4
1,End trip,6
2,Declaration APPROVED by ADMINISTRATION,8
3,Declaration APPROVED by ADMINISTRATION,10
4,Request For Payment SUBMITTED by EMPLOYEE,4


In [27]:
# value count and normalized
df_dist = act_distribution.groupby('Trace length')['Activities'].value_counts(normalize=True)
df_dist = df_dist.mul(100)
df_dist = df_dist.rename('Percent count').reset_index()
df_dist.head()

Unnamed: 0,Trace length,Activities,Percent count
0,4,Start trip,39.86637
1,4,Permit FINAL_APPROVED by SUPERVISOR,19.682628
2,4,End trip,12.834076
3,4,Request For Payment SUBMITTED by EMPLOYEE,11.219376
4,4,Permit FINAL_APPROVED by DIRECTOR,4.008909


In [28]:
df_dist.to_csv('../last_activity_distribution_against_tracelength.csv',index='False')

In [35]:
# plot the distribution
import plotly.express as px
fig = px.bar(df_dist, x="Trace length", y="Percent count", color="Activities", title="Last Activity v/s Trace Length Distribution")
fig.update_layout(xaxis_title="Trace Length (in minutes)", yaxis_title="Percentage(%) Count of Activities")
fig.update_xaxes(tickvals=[4,6,8,10])
fig.show()
fig.write_html("../docs/activity_vs_trace_length_distribution.html")

## Feature Selection ( Based on Data Exploration )

In [149]:
# passed features we want to extract

# str_ev_attr	String attributes at the event level: these are hot-encoded into features that may assume value 0 or value 1.
# str_tr_attr	String attributes at the trace level: these are hot-encoded into features that may assume value 0 or value 1.
# num_ev_attr	Numeric attributes at the event level: these are encoded by including the last value of the attribute among the events of the trace.
# num_tr_attr	Numeric attributes at trace level: these are encoded by including the numerical value.
# str_evsucc_attr	Successions related to the string attributes values at the event level: for example, if we have a trace [A,B,C], it might be important to include not only the presence of the single values A, B and C as features; but also the presence of the directly-follows couples (A,B) and (B,C).
# ================================================

str_ev_attr = ['concept:name']
str_tr_attr = ['OrganizationalEntity','month']
num_ev_attr = ['@@approx_bh_partial_lead_time','@@approx_bh_this_wasted_time']
num_tr_attr = []

# Data encoding

- preparing input data to be passed, 
- there are different encodings we can go with
    - boolean encoding (one hot encoding if the activity present or not)
    - frequency encoding (count of activity)
    - simple index encoding (n events one hot encoding)
    - latest payload encoding (with trace attributes)
    - index payload encoding (n events one hot encoding + with trace attributes)
    - complex index based encoding (static feature + nevents encoding + event features )
    - lstm encoding (m x n)
    
=========  

- Since the complex index based encoding covers the above 3 ( we can simply filter columns to get those, also feature importance (random forest) can ignore those which are not imporant ) we basically generated 4 different encodings -
    - boolean encoding (one hot encoding if the activity present or not)
    - frequency encoding (count of activity)
    - complex index based encoding (static feature + nevents encoding + event features )
    - lstm encoding (m x n)

=======
- No need of paddings in boolean and frequency encoding as the activities are fixed \

**PADDING** 
- for one hot encoded attributes (categorical values) the desired vector length is  - t_length * ohe_length (Ex. 10*number of activities) 
- for numerical values the desired length is t_length

**Note** - For now process changes are not been considered, if a new activity arrives in future

## Common functions for Encodings 

In [150]:
# function to save the data
def save_data(X,y,feature_names, save_path):
    data_dict = {}
    data_dict['X'] = X
    data_dict['y'] = y
    data_dict['feature_names'] = feature_names

    # save pickle
    with open(save_path, 'wb') as handle:
        pickle.dump(data_dict, handle)

In [151]:
# load the data from pickle
def load_data(load_path):
    with open(load_path, 'rb') as handle:
        data = pickle.load(handle)
    return data

In [152]:
# function to get the one hot encoded vectors of categorical values
def get_ohe_dict(categorical_vars, df):
    ohe_dict = {}
    for var in categorical_vars:
        var_dict = {}
        var_data = sorted(df[var].unique())
        var_len = len(var_data)
        for i,cat in enumerate(var_data):
            var_dict[cat] = [0]*var_len
            var_dict[cat][i] = 1

        ohe_dict[var] = var_dict

    return ohe_dict

In [153]:
# padding function for ohe encoding
def cat_padding(vec, t_length, attr_length):
    desired_length = t_length*attr_length
    vec_length = len(vec)
    if vec_length != desired_length:
        pad_vec = [0]*(desired_length-vec_length)
        vec.extend(pad_vec)
    return vec

In [154]:
# padding function for non-ohe encoding
def num_padding(vec, t_length):
    vec_length = len(vec)
    if vec_length != t_length:
        pad_vec = [0]*(t_length-vec_length)
        vec.extend(pad_vec)
    return vec

In [155]:
# create one hot encoding dict fot categorical variables
# variables which we want to be one hot encoded
categorical_vars = str_ev_attr + str_tr_attr
ohe_dict = get_ohe_dict(categorical_vars, original_df)
categorical_vars

['concept:name', 'OrganizationalEntity', 'month']

## Boolean encoding

- can be easily extended with adding more attributes of traces other then activities, but for now kept it simple and as discussed in course

In [156]:
# here for each trace we extract ohe vector for activity and sum them up and if count is greater then 1 we make them 1 
# because this encoding only provides info, if the activity was there or not

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(ohe_dict[cat_atr][ca])

        # make it a non frequency vector (if count is greater then 1 make it 1)
        for i,num in enumerate(str_ev_vec):
            if num>1:
                str_ev_vec[i]=1

        feature_vec.extend(list(str_ev_vec))

    data.append(feature_vec)

    # data.append(feature_vec)

In [157]:
# saving data 
encode_name = 'boolean_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## Frequency encoding

In [158]:
# here for each trace we extract ohe vector for activity and sum them up 
# because this encoding only provides count of how many times the activity appears

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(ohe_dict[cat_atr][ca])

        feature_vec.extend(list(str_ev_vec))

    data.append(feature_vec)

    # data.append(feature_vec)

In [159]:
# save results
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## Complex index based encoding - static feature (trace attributes) + n events encoding + event features

In [160]:
# here for each trace we put events encoded in order and there aatributes along with padding to make vector length same
# similatly for trace attributes but since it is trace attributes that is only for once

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes along with paddings 

    for cat_atr in str_ev_attr:
        str_ev_vec = []
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        for ca in group[cat_atr]:
            str_ev_vec.extend(ohe_dict[cat_atr][ca])
        
        # padding
        str_ev_vec = cat_padding(str_ev_vec, t_length, attr_length)
        feature_vec.extend(str_ev_vec)


    for num_atr in num_ev_attr:
        num_ev_vec = []
        num_ev_vec.extend(list(group[num_atr]))

        # padding
        num_ev_vec = num_padding(num_ev_vec, t_length)
        feature_vec.extend(num_ev_vec)



    # add categorical and numerical trace attributes
    for num_t_atr in num_tr_attr:
        feature_vec.extend(group[num_t_atr].iloc[0])


    for cat_t_atr in str_tr_attr:
        feature_vec.extend(ohe_dict[cat_t_atr][group[cat_t_atr].iloc[0]])

    # add vector to data
    data.append(feature_vec)


In [161]:
# check if all vector lengths are same 
vec_len = len(data[0])
for i, d in enumerate(data):
    if len(d)!=vec_len:
        print(i, len(d))

In [162]:
# save results
encode_name = 'complexindex_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## LSTM encoding

In [163]:
# here we create sequence of each trace
# so the dimentions will be (number of examples * trace_length * feature_length )

data = []

for id, group in df.groupby(['case:concept:name']):
    
    feature_vec = []

    for index, row in group.iterrows():

        row_vec = []

        for cat_atr in str_ev_attr:
            row_vec.extend(ohe_dict[cat_atr][row[cat_atr]])

        for num_atr in num_ev_attr:
            row_vec.append(row[num_atr])


        # add categorical and numerical trace attributes
        for num_t_atr in num_tr_attr:
            row_vec.append(group[num_t_atr].iloc[0])

        for cat_t_atr in str_tr_attr:
            row_vec.extend(ohe_dict[cat_t_atr][group[cat_t_atr].iloc[0]])

        feature_vec.append(row_vec)
    

    # add vector to data
    data.append(feature_vec)


In [164]:
# converting to array
data = np.array([np.array(ls) for ls in data])

In [165]:
# shape we want for all the traces
feature_len = len(data[0][0])
desired_shape = (t_length,feature_len)
desired_shape

(10, 87)

In [166]:
# padding data to make equal shape of vectors
padded_data = []
for case in data:
    pd_case = np.zeros(desired_shape)
    pd_case[:case.shape[0],:case.shape[1]] = case
    padded_data.append(pd_case)

padded_data = np.array(padded_data)
padded_data.shape

(3592, 10, 87)

In [167]:
# save results
encode_name = 'lstm_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(padded_data, declerations, ohe_dict ,save_path)

# =============================================
# Preparing Test Data

- exactly same, just using **train encoding** to create features
- define trace length and df_type (train or test)
- Working on train and test seperately, so run whole code for the same trace length for train and then for test

In [168]:
df_type = 'test'
permits = test_df

In [169]:
test_df.head()

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,year,month,concept:name,time:timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name
872,travel permit 11323,Permit SUBMITTED by EMPLOYEE,2018-11-09 14:34:55+00:00,2018-11-09 14:34:55+00:00,travel permit 11323,74,True,travel permit number 11324,3.870184,52.303706,...,2018,11,Permit SUBMITTED by EMPLOYEE,2018-11-09 14:34:55+00:00,0.0,0.0,0.0,0.0,1.0,travel permit 11323
873,travel permit 11323,Permit APPROVED by ADMINISTRATION,2018-11-09 14:38:05+00:00,2018-11-09 14:38:05+00:00,travel permit 11323,74,True,travel permit number 11324,3.870184,52.303706,...,2018,11,Permit APPROVED by ADMINISTRATION,2018-11-09 14:38:05+00:00,0.0,190.0,190.0,190.0,0.0,travel permit 11323
874,travel permit 11323,Permit FINAL_APPROVED by SUPERVISOR,2018-11-19 09:50:56+00:00,2018-11-19 09:50:56+00:00,travel permit 11323,74,True,travel permit number 11324,3.870184,52.303706,...,2018,11,Permit FINAL_APPROVED by SUPERVISOR,2018-11-19 09:50:56+00:00,0.0,198961.0,198961.0,198771.0,0.0,travel permit 11323
875,travel permit 11323,Start trip,2018-11-20 00:00:00+00:00,2018-11-20 00:00:00+00:00,travel permit 11323,74,True,travel permit number 11324,3.870184,52.303706,...,2018,11,Start trip,2018-11-20 00:00:00+00:00,0.0,224705.0,224705.0,25744.0,0.0,travel permit 11323
876,travel permit 11323,End trip,2018-11-21 00:00:00+00:00,2018-11-21 00:00:00+00:00,travel permit 11323,74,True,travel permit number 11324,3.870184,52.303706,...,2018,11,End trip,2018-11-21 00:00:00+00:00,0.0,260705.0,260705.0,36000.0,0.0,travel permit 11323


### Adding some stastitics features in event log using pm4py

In [170]:
# converting dataframe to event log
trace_log = pm4py.format_dataframe(permits, case_id='case', activity_key='event', timestamp_key='completeTime', start_timestamp_key='startTime')
trace_log = pm4py.convert_to_event_log(trace_log)

### Extracting target variable

- In our case target variable is REJECTED Declaration, in the process whoever rejects the Declaration (Administrartor, pre-approvar, etc.) it is considered as Declaration REJECTED so using it to identify our target variable

- One of our assumption is - If the decleration is not submitted by Employee and the payment is handeled directly, it is also considered as Declaration Accapted (so basically all the Declarations which is not rejected by employee are considerd as Accapted )

- we are not dealing with situations where it got rejected twice. For now we are only interested in 1st time rejection


In [171]:
# to extract target varaible, 
# if event starts with the name decleration rejected it is considered as rejected
declerations = []
for trace in trace_log:
    flag = False
    for i,event in enumerate(trace):
        if "Declaration REJECTED" in event['event']:
            flag = True
            break
    
    if flag:
        declerations.append(1)
    else:
        declerations.append(0)

In [172]:
len(declerations)

167

In [173]:
# extract traces only till the decleration is rejected (excluded), otherwise complete trace 
prefix_traces = []
for trace in trace_log:
    for i,event in enumerate(trace):
        if "Declaration REJECTED" in event['event']:
            break
    prefix_traces.append(Trace(trace[:i], attributes = trace.attributes))
prefix_traces = EventLog(prefix_traces)

In [174]:
# generate prefixes, note that we need to add the casts to EventLog and Trace to make sure that the result is a PM4Py EventLog object
trace_prefixes = EventLog([Trace(trace[0:t_length], attributes = trace.attributes) for trace in prefix_traces])

In [175]:
# check the trace length
print([len(trace) for trace in prefix_traces][0:15])
print([len(trace) for trace in trace_prefixes][0:15])

[8, 6, 6, 8, 12, 10, 16, 10, 8, 8, 12, 8, 10, 6, 8]
[8, 6, 6, 8, 10, 10, 10, 10, 8, 8, 10, 8, 10, 6, 8]


In [176]:
# check if all good
project_nth(trace_prefixes, 0)

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Permit FINAL_APPROVED by SUPERVISOR', 'Start trip', 'End trip', 'Declaration SUBMITTED by EMPLOYEE', 'Declaration APPROVED by ADMINISTRATION', 'Declaration FINAL_APPROVED by SUPERVISOR']


In [177]:
# to check which traces are not of our desired length (these will be padded while preparing the the training data)
for i,trace in enumerate(trace_prefixes):
    if len(trace)!=t_length:
        print(i, len(trace))
        break

0 8


In [178]:
# convert logs to dataframe
# final base dataframe
df = pm4py.convert_to_dataframe(trace_prefixes)
df.head(5)

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,year,month,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,concept:name,time:timestamp,case:concept:name
0,travel permit 11323,Permit SUBMITTED by EMPLOYEE,2018-11-09 14:34:55+00:00,2018-11-09 14:34:55+00:00,travel permit 11323,74,True,travel permit number 11324,3.870184,52.303706,...,2018,11,0.0,0.0,0.0,0.0,1.0,Permit SUBMITTED by EMPLOYEE,2018-11-09 14:34:55+00:00,travel permit 11323
1,travel permit 11323,Permit APPROVED by ADMINISTRATION,2018-11-09 14:38:05+00:00,2018-11-09 14:38:05+00:00,travel permit 11323,74,True,travel permit number 11324,3.870184,52.303706,...,2018,11,0.0,190.0,190.0,190.0,0.0,Permit APPROVED by ADMINISTRATION,2018-11-09 14:38:05+00:00,travel permit 11323
2,travel permit 11323,Permit FINAL_APPROVED by SUPERVISOR,2018-11-19 09:50:56+00:00,2018-11-19 09:50:56+00:00,travel permit 11323,74,True,travel permit number 11324,3.870184,52.303706,...,2018,11,0.0,198961.0,198961.0,198771.0,0.0,Permit FINAL_APPROVED by SUPERVISOR,2018-11-19 09:50:56+00:00,travel permit 11323
3,travel permit 11323,Start trip,2018-11-20 00:00:00+00:00,2018-11-20 00:00:00+00:00,travel permit 11323,74,True,travel permit number 11324,3.870184,52.303706,...,2018,11,0.0,224705.0,224705.0,25744.0,0.0,Start trip,2018-11-20 00:00:00+00:00,travel permit 11323
4,travel permit 11323,End trip,2018-11-21 00:00:00+00:00,2018-11-21 00:00:00+00:00,travel permit 11323,74,True,travel permit number 11324,3.870184,52.303706,...,2018,11,0.0,260705.0,260705.0,36000.0,0.0,End trip,2018-11-21 00:00:00+00:00,travel permit 11323


## Feature Selection ( Based on Data Exploration )

In [179]:
# passed features we want to extract

# str_ev_attr	String attributes at the event level: these are hot-encoded into features that may assume value 0 or value 1.
# str_tr_attr	String attributes at the trace level: these are hot-encoded into features that may assume value 0 or value 1.
# num_ev_attr	Numeric attributes at the event level: these are encoded by including the last value of the attribute among the events of the trace.
# num_tr_attr	Numeric attributes at trace level: these are encoded by including the numerical value.
# str_evsucc_attr	Successions related to the string attributes values at the event level: for example, if we have a trace [A,B,C], it might be important to include not only the presence of the single values A, B and C as features; but also the presence of the directly-follows couples (A,B) and (B,C).
# ================================================

str_ev_attr = ['concept:name']
str_tr_attr = ['OrganizationalEntity','month']
num_ev_attr = ['@@approx_bh_partial_lead_time','@@approx_bh_this_wasted_time']
num_tr_attr = []

## Boolean encoding

- can be easily extended with adding more attributes of traces other then activities, but for now kept it simple and as discussed in course

In [180]:
encode_name = 'boolean_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

In [181]:
# here for each trace we extract ohe vector for activity and sum them up and if count is greater then 1 we make them 1 
# because this encoding only provides info, if the activity was there or not

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(ohe_dict[cat_atr][ca])

        # make it a non frequency vector (if count is greater then 1 make it 1)
        for i,num in enumerate(str_ev_vec):
            if num>1:
                str_ev_vec[i]=1

        feature_vec.extend(list(str_ev_vec))

    data.append(feature_vec)

    # data.append(feature_vec)

In [182]:
encode_name = 'boolean_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## Frequency encoding

In [183]:
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

In [184]:
# here for each trace we extract ohe vector for activity and sum them up 
# because this encoding only provides count of how many times the activity appears

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(ohe_dict[cat_atr][ca])

        feature_vec.extend(list(str_ev_vec))

    data.append(feature_vec)

    # data.append(feature_vec)

In [185]:
# save results
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## Complex index based encoding - static feature (trace attributes) + n events encoding + event features

In [186]:
encode_name = 'complex_index_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

FileNotFoundError: [Errno 2] No such file or directory: '../data/training_data/complex_index_encode_train_trace_len_10.pickle'

In [187]:
# here for each trace we put events encoded in order and there aatributes along with padding to make vector length same
# similatly for trace attributes but since it is trace attributes that is only for once

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes along with paddings 

    for cat_atr in str_ev_attr:
        str_ev_vec = []
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        for ca in group[cat_atr]:
            str_ev_vec.extend(ohe_dict[cat_atr][ca])
        
        # padding
        str_ev_vec = cat_padding(str_ev_vec, t_length, attr_length)
        feature_vec.extend(str_ev_vec)


    for num_atr in num_ev_attr:
        num_ev_vec = []
        num_ev_vec.extend(list(group[num_atr]))

        # padding
        num_ev_vec = num_padding(num_ev_vec, t_length)
        feature_vec.extend(num_ev_vec)



    # add categorical and numerical trace attributes
    for num_t_atr in num_tr_attr:
        feature_vec.extend(group[num_t_atr].iloc[0])


    for cat_t_atr in str_tr_attr:
        feature_vec.extend(ohe_dict[cat_t_atr][group[cat_t_atr].iloc[0]])

    # add vector to data
    data.append(feature_vec)


In [188]:
# check if all vector lengths are same 
vec_len = len(data[0])

for i, d in enumerate(data):
    if len(d)!=vec_len:
        print(i, len(d))

In [189]:
# save results
encode_name = 'complex_index_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## LSTM encoding

In [190]:
encode_name = 'lstm_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

In [191]:
# here we create sequence of each trace
# so the dimentions will be (number of examples * trace_length * feature_length )

data = []

for id, group in df.groupby(['case:concept:name']):
    
    feature_vec = []

    for index, row in group.iterrows():

        row_vec = []

        for cat_atr in str_ev_attr:
            row_vec.extend(ohe_dict[cat_atr][row[cat_atr]])

        for num_atr in num_ev_attr:
            row_vec.append(row[num_atr])


        # add categorical and numerical trace attributes
        for num_t_atr in num_tr_attr:
            row_vec.append(group[num_t_atr].iloc[0])

        for cat_t_atr in str_tr_attr:
            row_vec.extend(ohe_dict[cat_t_atr][group[cat_t_atr].iloc[0]])

        feature_vec.append(row_vec)
    

    # add vector to data
    data.append(feature_vec)


In [192]:
# converting to array
data = np.array([np.array(ls) for ls in data])

In [193]:
feature_len = len(data[0][0])
desired_shape = (t_length,feature_len)
desired_shape

(10, 87)

In [194]:
# padding data to make equal shape of vectors
padded_data = []
for case in data:
    pd_case = np.zeros(desired_shape)
    pd_case[:case.shape[0],:case.shape[1]] = case
    padded_data.append(pd_case)

padded_data = np.array(padded_data)
padded_data.shape

(167, 10, 87)

In [195]:
# save results
encode_name = 'lstm_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

# =============================================
# Preparing Val Data

In [196]:
df_type = 'val'
permits = val_df

In [197]:
val_df.head()

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,year,month,concept:name,time:timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name
616,travel permit 10898,Permit SUBMITTED by EMPLOYEE,2018-09-27 12:35:05+00:00,2018-09-27 12:35:05+00:00,travel permit 10898,482,True,travel permit number 10899,44.996875,347.303505,...,2018,9,Permit SUBMITTED by EMPLOYEE,2018-09-27 12:35:05+00:00,0.0,0.0,0.0,0.0,1.0,travel permit 10898
617,travel permit 10898,Permit APPROVED by ADMINISTRATION,2018-09-27 12:35:10+00:00,2018-09-27 12:35:10+00:00,travel permit 10898,482,True,travel permit number 10899,44.996875,347.303505,...,2018,9,Permit APPROVED by ADMINISTRATION,2018-09-27 12:35:10+00:00,0.0,5.0,5.0,5.0,0.0,travel permit 10898
618,travel permit 10898,Permit FINAL_APPROVED by SUPERVISOR,2018-09-28 15:45:28+00:00,2018-09-28 15:45:28+00:00,travel permit 10898,482,True,travel permit number 10899,44.996875,347.303505,...,2018,9,Permit FINAL_APPROVED by SUPERVISOR,2018-09-28 15:45:28+00:00,0.0,47423.0,47423.0,47418.0,0.0,travel permit 10898
619,travel permit 10898,Start trip,2018-10-08 00:00:00+00:00,2018-10-08 00:00:00+00:00,travel permit 10898,482,True,travel permit number 10899,44.996875,347.303505,...,2018,10,Start trip,2018-10-08 00:00:00+00:00,0.0,231895.0,231895.0,184472.0,0.0,travel permit 10898
620,travel permit 10898,End trip,2018-10-12 00:00:00+00:00,2018-10-12 00:00:00+00:00,travel permit 10898,482,True,travel permit number 10899,44.996875,347.303505,...,2018,10,End trip,2018-10-12 00:00:00+00:00,0.0,375895.0,375895.0,144000.0,0.0,travel permit 10898


### Adding some stastitics features in event log using pm4py

In [198]:
# converting dataframe to event log
trace_log = pm4py.format_dataframe(permits, case_id='case', activity_key='event', timestamp_key='completeTime', start_timestamp_key='startTime')
trace_log = pm4py.convert_to_event_log(trace_log)

### Extracting target variable

- In our case target variable is REJECTED Declaration, in the process whoever rejects the Declaration (Administrartor, pre-approvar, etc.) it is considered as Declaration REJECTED so using it to identify our target variable

- One of our assumption is - If the decleration is not submitted by Employee and the payment is handeled directly, it is also considered as Declaration Accapted (so basically all the Declarations which is not rejected by employee are considerd as Accapted )

- we are not dealing with situations where it got rejected twice. For now we are only interested in 1st time rejection


In [199]:
# to extract target varaible, 
# if event starts with the name decleration rejected it is considered as rejected
declerations = []
for trace in trace_log:
    flag = False
    for i,event in enumerate(trace):
        if "Declaration REJECTED" in event['event']:
            flag = True
            break
    
    if flag:
        declerations.append(1)
    else:
        declerations.append(0)

In [200]:
len(declerations)

157

In [201]:
# extract traces only till the decleration is rejected (excluded), otherwise complete trace 
prefix_traces = []
for trace in trace_log:
    for i,event in enumerate(trace):
        if "Declaration REJECTED" in event['event']:
            break
    prefix_traces.append(Trace(trace[:i], attributes = trace.attributes))
prefix_traces = EventLog(prefix_traces)

In [202]:
# generate prefixes, note that we need to add the casts to EventLog and Trace to make sure that the result is a PM4Py EventLog object
trace_prefixes = EventLog([Trace(trace[0:t_length], attributes = trace.attributes) for trace in prefix_traces])

In [203]:
# check the trace length
print([len(trace) for trace in prefix_traces][0:15])
print([len(trace) for trace in trace_prefixes][0:15])

[8, 9, 8, 8, 8, 7, 8, 10, 10, 8, 7, 13, 8, 6, 7]
[8, 9, 8, 8, 8, 7, 8, 10, 10, 8, 7, 10, 8, 6, 7]


In [204]:
# check if all good
project_nth(trace_prefixes, 0)

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Permit FINAL_APPROVED by SUPERVISOR', 'Start trip', 'End trip', 'Declaration SUBMITTED by EMPLOYEE', 'Declaration APPROVED by ADMINISTRATION', 'Declaration FINAL_APPROVED by SUPERVISOR']


In [205]:
# to check which traces are not of our desired length (these will be padded while preparing the the training data)
for i,trace in enumerate(trace_prefixes):
    if len(trace)!=t_length:
        print(i, len(trace))
        break

0 8


In [206]:
# convert logs to dataframe
# final base dataframe
df = pm4py.convert_to_dataframe(trace_prefixes)
df.head(5)

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,year,month,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,concept:name,time:timestamp,case:concept:name
0,travel permit 10898,Permit SUBMITTED by EMPLOYEE,2018-09-27 12:35:05+00:00,2018-09-27 12:35:05+00:00,travel permit 10898,482,True,travel permit number 10899,44.996875,347.303505,...,2018,9,0.0,0.0,0.0,0.0,1.0,Permit SUBMITTED by EMPLOYEE,2018-09-27 12:35:05+00:00,travel permit 10898
1,travel permit 10898,Permit APPROVED by ADMINISTRATION,2018-09-27 12:35:10+00:00,2018-09-27 12:35:10+00:00,travel permit 10898,482,True,travel permit number 10899,44.996875,347.303505,...,2018,9,0.0,5.0,5.0,5.0,0.0,Permit APPROVED by ADMINISTRATION,2018-09-27 12:35:10+00:00,travel permit 10898
2,travel permit 10898,Permit FINAL_APPROVED by SUPERVISOR,2018-09-28 15:45:28+00:00,2018-09-28 15:45:28+00:00,travel permit 10898,482,True,travel permit number 10899,44.996875,347.303505,...,2018,9,0.0,47423.0,47423.0,47418.0,0.0,Permit FINAL_APPROVED by SUPERVISOR,2018-09-28 15:45:28+00:00,travel permit 10898
3,travel permit 10898,Start trip,2018-10-08 00:00:00+00:00,2018-10-08 00:00:00+00:00,travel permit 10898,482,True,travel permit number 10899,44.996875,347.303505,...,2018,10,0.0,231895.0,231895.0,184472.0,0.0,Start trip,2018-10-08 00:00:00+00:00,travel permit 10898
4,travel permit 10898,End trip,2018-10-12 00:00:00+00:00,2018-10-12 00:00:00+00:00,travel permit 10898,482,True,travel permit number 10899,44.996875,347.303505,...,2018,10,0.0,375895.0,375895.0,144000.0,0.0,End trip,2018-10-12 00:00:00+00:00,travel permit 10898


## Feature Selection ( Based on Data Exploration )

In [207]:
# passed features we want to extract

# str_ev_attr	String attributes at the event level: these are hot-encoded into features that may assume value 0 or value 1.
# str_tr_attr	String attributes at the trace level: these are hot-encoded into features that may assume value 0 or value 1.
# num_ev_attr	Numeric attributes at the event level: these are encoded by including the last value of the attribute among the events of the trace.
# num_tr_attr	Numeric attributes at trace level: these are encoded by including the numerical value.
# str_evsucc_attr	Successions related to the string attributes values at the event level: for example, if we have a trace [A,B,C], it might be important to include not only the presence of the single values A, B and C as features; but also the presence of the directly-follows couples (A,B) and (B,C).
# ================================================

str_ev_attr = ['concept:name']
str_tr_attr = ['OrganizationalEntity','month']
num_ev_attr = ['@@approx_bh_partial_lead_time','@@approx_bh_this_wasted_time']
num_tr_attr = []

## Boolean encoding

- can be easily extended with adding more attributes of traces other then activities, but for now kept it simple and as discussed in course

In [208]:
encode_name = 'boolean_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

In [209]:
# here for each trace we extract ohe vector for activity and sum them up and if count is greater then 1 we make them 1 
# because this encoding only provides info, if the activity was there or not

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(ohe_dict[cat_atr][ca])

        # make it a non frequency vector (if count is greater then 1 make it 1)
        for i,num in enumerate(str_ev_vec):
            if num>1:
                str_ev_vec[i]=1

        feature_vec.extend(list(str_ev_vec))

    data.append(feature_vec)

    # data.append(feature_vec)

In [210]:
encode_name = 'boolean_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## Frequency encoding

In [211]:
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

In [212]:
# here for each trace we extract ohe vector for activity and sum them up 
# because this encoding only provides count of how many times the activity appears

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(ohe_dict[cat_atr][ca])

        feature_vec.extend(list(str_ev_vec))

    data.append(feature_vec)

    # data.append(feature_vec)

In [213]:
# save results
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## Complex index based encoding - static feature (trace attributes) + n events encoding + event features

In [214]:
encode_name = 'complex_index_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

FileNotFoundError: [Errno 2] No such file or directory: '../data/training_data/complex_index_encode_train_trace_len_10.pickle'

In [215]:
# here for each trace we put events encoded in order and there aatributes along with padding to make vector length same
# similatly for trace attributes but since it is trace attributes that is only for once

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes along with paddings 

    for cat_atr in str_ev_attr:
        str_ev_vec = []
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        for ca in group[cat_atr]:
            str_ev_vec.extend(ohe_dict[cat_atr][ca])
        
        # padding
        str_ev_vec = cat_padding(str_ev_vec, t_length, attr_length)
        feature_vec.extend(str_ev_vec)


    for num_atr in num_ev_attr:
        num_ev_vec = []
        num_ev_vec.extend(list(group[num_atr]))

        # padding
        num_ev_vec = num_padding(num_ev_vec, t_length)
        feature_vec.extend(num_ev_vec)



    # add categorical and numerical trace attributes
    for num_t_atr in num_tr_attr:
        feature_vec.extend(group[num_t_atr].iloc[0])


    for cat_t_atr in str_tr_attr:
        feature_vec.extend(ohe_dict[cat_t_atr][group[cat_t_atr].iloc[0]])

    # add vector to data
    data.append(feature_vec)


In [216]:
# check if all vector lengths are same 
vec_len = len(data[0])

for i, d in enumerate(data):
    if len(d)!=vec_len:
        print(i, len(d))

In [217]:
# save results
encode_name = 'complex_index_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## LSTM encoding

In [218]:
encode_name = 'lstm_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

In [219]:
# here we create sequence of each trace
# so the dimentions will be (number of examples * trace_length * feature_length )

data = []

for id, group in df.groupby(['case:concept:name']):
    
    feature_vec = []

    for index, row in group.iterrows():

        row_vec = []

        for cat_atr in str_ev_attr:
            row_vec.extend(ohe_dict[cat_atr][row[cat_atr]])

        for num_atr in num_ev_attr:
            row_vec.append(row[num_atr])


        # add categorical and numerical trace attributes
        for num_t_atr in num_tr_attr:
            row_vec.append(group[num_t_atr].iloc[0])

        for cat_t_atr in str_tr_attr:
            row_vec.extend(ohe_dict[cat_t_atr][group[cat_t_atr].iloc[0]])

        feature_vec.append(row_vec)
    

    # add vector to data
    data.append(feature_vec)


In [220]:
# converting to array
data = np.array([np.array(ls) for ls in data])

In [221]:
feature_len = len(data[0][0])
desired_shape = (t_length,feature_len)
desired_shape

(10, 87)

In [222]:
# padding data to make equal shape of vectors
padded_data = []
for case in data:
    pd_case = np.zeros(desired_shape)
    pd_case[:case.shape[0],:case.shape[1]] = case
    padded_data.append(pd_case)

padded_data = np.array(padded_data)
padded_data.shape

(157, 10, 87)

In [223]:
# save results
encode_name = 'lstm_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

# =================== END =================



# Extra

## Preparing data for task Declaration REJECTED vs Accapted by Administrator/Employee

- keeping all the traces \
or
Extract traces only where decleration is submitted by employee is in a trace ( we are not considering other traces where decision of accatped, rejected is not being made /discuss )
- create target variables, if application is rejected or not
- get traces which starts from permit submitted by employee to the trace decleration is submitted by employee (discuss)
- filter traces of length n (10,15,20 decide) 
- choose encoding and decide features to involve which we belive will be there before the activity decleration is submitted


### Filterings

1. Filtering on timeframe (from this date to that)
2. Filter on case performance (traces finished withing 10 days)
3. Filter on start and end activities (give list of start and end)
4. Filter on variants (keeping only frequent trace flows like - [a,b,c,d] and [a d b c], or 0.4 threshold etc)
5. Filter on attributes values (selection and projection of traces)
6. Filter on numeric attribute values (from declared amount 500 to 1000)
7. Between Filter (filtering the activities from say permit apply to permit accapted)
8. case size (number of activiries in case)


### Statistics

1. Throughput Time (time to complete traces)
2. Case Arrival/Dispersion Ratio (arrival time between two traces (not events))
3. Performance Spectrum (time between activities)
4. Cycle Time and Waiting Time (cycle time - time between activities, lead Time - the overall time in which the instance was worked, from the start to the end,accumelative )
5. Sojourn Time - time taken for an activity to complete from the end of previous activity
6. other stats if needed


In [7]:
# # To extract target varaible, 
# # if event starts with the name decleration rejected it is considered as rejected application
# declerations = []
# for trace in trace_log:
#     flag = False
#     for i,event in enumerate(trace):
#         if "Declaration REJECTED" in event['event']:
#             flag = True
#             break
#     if flag:
#         declerations.append(1)
#     else:
#         declerations.append(0)

In [None]:
# prefix_data = pd.DataFrame(columns = trace.columns)
# for name, group in trace.groupby(['case']):
#     for i in range(len(group)):
#         if "Declaration REJECTED" in group.iloc[i]['event']:
#             break
#     temp = group.head(i)
#     prefix_data = prefix_data.append(group.head(i))
#     break
# prefix_data.head()

In [None]:
# permits.head()
# permits['startTime'] = pd.to_datetime(permits['startTime'])
# permits['completeTime'] = pd.to_datetime(permits['completeTime'])

In [None]:
# # add year and month column
# permits['year'] = permits['startTime'].dt.year
# permits['month'] = permits['startTime'].dt.month

In [None]:
# # time to complete the activity
# permits['act_completionTime'] = permits['completeTime'] - permits['startTime']
# permits['act_completionTime'] = permits['act_completionTime'].dt.total_seconds()

In [None]:
# # get minimum time for earch trace (when trace started)
# temp_df = pd.DataFrame(permits.groupby(['case'])['startTime'].min())
# temp_df['index'] = temp_df.index
# temp_df.reset_index(drop=True, inplace=True)
# temp_df.columns = ['startTime_min','case']

# # merge with permits
# permits = permits.merge(temp_df,on=['case'])

In [None]:
# # get max time for earch trace (when trace ended)
# temp_df = pd.DataFrame(permits.groupby(['case'])['completeTime'].max())
# temp_df['index'] = temp_df.index
# temp_df.reset_index(drop=True, inplace=True)
# temp_df.columns = ['completeTime_max','case']

# # merge with permits
# permits = permits.merge(temp_df,on=['case'])

In [None]:
# # calculate trace time
# permits['processCompletionTime']=permits['completeTime_max']-permits['startTime_min']
# permits['processCompletionTime'] = permits['processCompletionTime'].dt.total_seconds()

In [None]:
# # calculate time taken between activities and cumulative time
# act_time_taken=[]
# act_inc_time = []
# for name,group in permits.groupby(['case'],sort=False):
#     act_time_taken.append([name,pd.to_datetime(0) - pd.to_datetime(0)])
#     act_inc_time.append([name,pd.to_datetime(0) - pd.to_datetime(0)])
#     act_time=pd.to_datetime(0) - pd.to_datetime(0)
#     for i in range(len(group)-1):
#         st_time_1 = group.iloc[i]['startTime']
#         st_time_2 = group.iloc[i+1]['startTime']
#         diff = st_time_2-st_time_1
#         act_time+=diff
#         act_time_taken.append([name,diff])
#         act_inc_time.append([name,act_time])

# time_diff_df = pd.DataFrame(act_time_taken,columns=['case','act_time_diff'])
# time_inc_df = pd.DataFrame(act_inc_time,columns=['case','act_inc_time'])

# # add with permits
# permits['next_act_time_diff'] = time_diff_df['act_time_diff']
# permits['next_act_time_diff'] = permits['next_act_time_diff'].dt.total_seconds()

# permits['act_inc_time'] = time_inc_df['act_inc_time']
# permits['act_inc_time'] = permits['act_inc_time'].dt.total_seconds()

In [None]:
# permits.head()

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource,year,month,act_completionTime,startTime_min,completeTime_max,processCompletionTime,next_act_time_diff,act_inc_time
0,travel permit 10066,Permit SUBMITTED by EMPLOYEE,2018-02-21 16:28:17,2018-02-21 16:28:17,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,2018,2,0.0,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,0.0,0.0
1,travel permit 10066,Permit APPROVED by ADMINISTRATION,2018-02-21 16:28:19,2018-02-21 16:28:19,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,2018,2,0.0,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,2.0,2.0
2,travel permit 10066,Start trip,2018-02-22 00:00:00,2018-02-22 00:00:00,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,2018,2,0.0,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,27101.0,27103.0
3,travel permit 10066,End trip,2018-02-22 00:00:00,2018-02-22 00:00:00,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,2018,2,0.0,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,0.0,27103.0
4,travel permit 10066,Permit FINAL_APPROVED by SUPERVISOR,2018-02-22 08:27:05,2018-02-22 08:27:05,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,2018,2,0.0,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,30425.0,57528.0


In [21]:
# # Create dataframe for Target variable and combine with above trace dataframe
# cases = list(df.groupby(['case'],sort=False)['id'].first().keys())
# df_dict = dict()
# df_dict['case'] = cases
# df_dict['declerations'] = declerations
# temp_df = pd.DataFrame(df_dict)

# # merge target variable with permits
# trace = df.merge(temp_df,on=['case'])
# trace.head()

In [None]:
# from pm4py.statistics.sojourn_time.log import get as soj_time_get
# soj_time = soj_time_get.apply(trace_log, parameters={soj_time_get.Parameters.TIMESTAMP_KEY: "time:timestamp", soj_time_get.Parameters.START_TIMESTAMP_KEY: "startTime"})
# print(soj_time)

In [None]:
# # to get all trace cases
# for i, trace in enumerate(trace_log):
#     print(trace[0]['case'])

In [None]:
# # throughput time (time to complete traces) all in seconds
# all_case_durations = pm4py.get_all_case_durations(trace_log)
# all_case_durations

In [None]:
# # arrival between cases
# case_arrival_ratio = pm4py.get_case_arrival_average(trace_log)
# case_arrival_ratio/(60*60)

In [None]:
# from pm4py.objects.log.util import interval_lifecycle
# enriched_log = interval_lifecycle.assign_lead_cycle_time(trace_log)
# enriched_log

### boolean bigram encoding

In [None]:
# data_2gram, feature_names = log_to_features.apply(trace_prefixes, 
#                                                   parameters={"str_ev_attr": [], 
#                                                         "str_tr_attr": [], 
#                                                         "num_ev_attr": [], 
#                                                         "num_tr_attr": [], 
#                                                         "str_evsucc_attr": ["concept:name"]})
# feature_names

In [None]:
# data_2gram = np.asarray(data_2gram)

In [None]:
# project_nth(trace_prefixes, 0)

In [None]:
# print(data_2gram[0])

In [None]:
# train_data = {}
# train_data['X']=data_2gram
# train_data['y'] = declerations
# train_data['feature_names'] = feature_names

In [None]:
# # save json
# save_path = '../data/training_data/bigram_boolean_encode.json'
# with open(save_path, 'w', encoding='utf-8') as f:
#     json.dump(train_data, f, ensure_ascii=False, indent=4)

In [None]:
# between_log = pm4py.filter_between(dec_sub_traces, "Permit SUBMITTED by EMPLOYEE", "Declaration SUBMITTED by EMPLOYEE")

In [None]:
# project_nth(between_log, 232)

In [None]:
# # we can get any attribute of trace using this
# activities = pm4py.get_event_attribute_values(trace_log, "concept:name")
# resources = pm4py.get_event_attribute_values(trace_log, "org:resource")
# resources

In [None]:
# remove Return Decleration event
# train_raw = pm4py.filter_event_attribute_values(trace_log, "concept:name", "Declaration REJECTED by EMPLOYEE", level = "event", retain=False)

In [None]:
# to display most frequent feature
# # look at the unique vectors and their occurrence frequency/count
# # just to check which trace was most frequent, just analysis
# dist_features = np.unique(data, return_counts= True, axis = 0)
# print(dist_features)


# # display most freuent feature
# most_freq_feature = dist_features[0][np.argmax(dist_features[1])]
# most_freq_feature


# # order doesn't matter here
# for i in range(0, len(most_freq_feature)):
#     if most_freq_feature[i]!=0:
#         print(feature_names[i].split("@")[1], end=' , ')

In [None]:
# # printing all activities in trace 
# for trace in trace_log[0]:
#     print(trace["concept:name"])

In [None]:
# # although the split was 70:30, but since it is sequence data, traces can have different number of data points, so actual number is different 
# test_data_percentage = (test_count/train_count)*100
# print("trace test percentage")
# print(test_data_percentage)

In [None]:
# to extract only traces where Declaration SUBMITTED by EMPLOYEE is present
# dec_sub_traces = []
# for trace in trace_log:
#     if len(list(filter(lambda e: e["concept:name"] == "Declaration SUBMITTED by EMPLOYEE" ,trace))) > 0:
#         dec_sub_traces.append(Trace(trace, attributes = trace.attributes))

# dec_sub_traces = EventLog(dec_sub_traces)