In [2]:
# import libraries
import pandas as pd
import numpy as np
import json
import pickle

# https://pm4py.fit.fraunhofer.de/documentation
import pm4py
from pm4py.objects.log.util.log import project_traces
from pm4py.objects.log.util import interval_lifecycle
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
from pm4py.objects.log.obj import EventLog, Trace

import warnings
warnings.filterwarnings("ignore")

In [6]:
# Inputs for trace length 6: the saved predictions of the best-performing
# model and the original test split they were produced for.
prediction_load_path = '../data/processed/complex_6_random_forest_model_predictions.csv'
test_data_path = '../data/training_data/original_data/test_trace_len_6.csv'
# Read both CSV files in one pass; the order matches the unpacking targets.
predicted_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (prediction_load_path, test_data_path)
)

In [7]:
# Display the loaded predictions (columns: accepted, rejected, predicted_class).
predicted_df

Unnamed: 0,accepted,rejected,predicted_class
0,0.667477,0.332523,0
1,0.734522,0.265478,0
2,0.696144,0.303856,0
3,0.727405,0.272595,0
4,0.660755,0.339245,0
...,...,...,...
764,0.744639,0.255361,0
765,0.758548,0.241452,0
766,0.759927,0.240073,0
767,0.771811,0.228189,0


In [14]:
# Attach one case identifier to every prediction row, then broadcast the
# predictions onto the event-level test data.
# NOTE(review): the pairing is purely positional -- it assumes row i of
# predicted_df belongs to the i-th distinct case of test_df (in order of first
# appearance). Confirm against the code that wrote the predictions file.
cases = list(test_df.groupby('case', sort=False)['id'].first().keys())
if len(cases) != len(predicted_df):
    # Same exception type pandas raises on a length-mismatched column
    # assignment, but with a message that names the actual problem.
    raise ValueError(
        f"test data has {len(cases)} distinct cases but predicted_df has "
        f"{len(predicted_df)} rows; cannot pair predictions with cases by position"
    )
predicted_df['case'] = cases

# Merge the predictions into the event rows. Skipped when the prediction
# column is already present, so re-running this cell does not merge a second
# time (that would create suffixed *_x / *_y columns and break the
# 'predicted_class' lookups in the cells below).
if 'predicted_class' not in test_df.columns:
    test_df = test_df.merge(predicted_df, on=['case'])

In [27]:
# Rows whose 'predicted_class' differs from the 'declerations' column,
# re-indexed from zero and written to CSV.
is_mismatch = test_df['declerations'] != test_df['predicted_class']
wrong_predictions = test_df[is_mismatch].reset_index(drop=True)
wrong_predictions.to_csv(
    '../data/processed/complex_6_random_forest_model_predictions_wrong.csv',
    index=False,
)
wrong_predictions.head()

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,concept:name,time:timestamp,case:concept:name,declerations,accepted,rejected,predicted_class
0,travel permit 10316,Permit SUBMITTED by EMPLOYEE,2018-09-06 09:53:00+00:00,2018-09-06 09:53:00+00:00,travel permit 10316,974.740383456614,True,travel permit number 10317,792.894294,2404.280654,...,0.0,0.0,1.0,Permit SUBMITTED by EMPLOYEE,2018-09-06 09:53:00+00:00,travel permit 10316,1,0.727405,0.272595,0
1,travel permit 10316,Permit APPROVED by ADMINISTRATION,2018-09-06 09:53:09+00:00,2018-09-06 09:53:09+00:00,travel permit 10316,974.740383456614,True,travel permit number 10317,792.894294,2404.280654,...,9.0,9.0,0.0,Permit APPROVED by ADMINISTRATION,2018-09-06 09:53:09+00:00,travel permit 10316,1,0.727405,0.272595,0
2,travel permit 10316,Permit FINAL_APPROVED by SUPERVISOR,2018-09-11 11:25:17+00:00,2018-09-11 11:25:17+00:00,travel permit 10316,974.740383456614,True,travel permit number 10317,792.894294,2404.280654,...,113537.0,113528.0,0.0,Permit FINAL_APPROVED by SUPERVISOR,2018-09-11 11:25:17+00:00,travel permit 10316,1,0.727405,0.272595,0
3,travel permit 10316,Request For Payment SUBMITTED by EMPLOYEE,2018-09-17 09:46:50+00:00,2018-09-17 09:46:50+00:00,travel permit 10316,974.740383456614,True,travel permit number 10317,792.894294,2404.280654,...,251630.0,138093.0,0.0,Request For Payment SUBMITTED by EMPLOYEE,2018-09-17 09:46:50+00:00,travel permit 10316,1,0.727405,0.272595,0
4,travel permit 10316,Request For Payment APPROVED by ADMINISTRATION,2018-09-17 09:48:48+00:00,2018-09-17 09:48:48+00:00,travel permit 10316,974.740383456614,True,travel permit number 10317,792.894294,2404.280654,...,251748.0,118.0,0.0,Request For Payment APPROVED by ADMINISTRATION,2018-09-17 09:48:48+00:00,travel permit 10316,1,0.727405,0.272595,0


In [28]:
# Number of distinct cases among the wrong predictions.
wrong_case_ids = wrong_predictions['case'].unique()
len(wrong_case_ids)

153

In [29]:
# Rows whose 'predicted_class' equals the 'declerations' column,
# re-indexed from zero and written to CSV.
is_match = test_df['declerations'] == test_df['predicted_class']
correct_predictions = test_df[is_match].reset_index(drop=True)
correct_predictions.to_csv(
    '../data/processed/complex_6_random_forest_model_predictions_correct.csv',
    index=False,
)
correct_predictions.head()

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,concept:name,time:timestamp,case:concept:name,declerations,accepted,rejected,predicted_class
0,travel permit 10215,Permit SUBMITTED by EMPLOYEE,2018-09-05 08:37:34+00:00,2018-09-05 08:37:34+00:00,travel permit 10215,1957,True,travel permit number 10216,224.765145,1915.760495,...,0.0,0.0,1.0,Permit SUBMITTED by EMPLOYEE,2018-09-05 08:37:34+00:00,travel permit 10215,0,0.667477,0.332523,0
1,travel permit 10215,Permit APPROVED by ADMINISTRATION,2018-09-05 08:38:28+00:00,2018-09-05 08:38:28+00:00,travel permit 10215,1957,True,travel permit number 10216,224.765145,1915.760495,...,54.0,54.0,0.0,Permit APPROVED by ADMINISTRATION,2018-09-05 08:38:28+00:00,travel permit 10215,0,0.667477,0.332523,0
2,travel permit 10215,Permit APPROVED by BUDGET OWNER,2018-09-05 08:55:23+00:00,2018-09-05 08:55:23+00:00,travel permit 10215,1957,True,travel permit number 10216,224.765145,1915.760495,...,1069.0,1015.0,0.0,Permit APPROVED by BUDGET OWNER,2018-09-05 08:55:23+00:00,travel permit 10215,0,0.667477,0.332523,0
3,travel permit 10215,Permit FINAL_APPROVED by SUPERVISOR,2018-09-06 15:06:29+00:00,2018-09-06 15:06:29+00:00,travel permit 10215,1957,True,travel permit number 10216,224.765145,1915.760495,...,59335.0,58266.0,0.0,Permit FINAL_APPROVED by SUPERVISOR,2018-09-06 15:06:29+00:00,travel permit 10215,0,0.667477,0.332523,0
4,travel permit 10215,Start trip,2018-11-06 00:00:00+00:00,2018-11-06 00:00:00+00:00,travel permit 10215,1957,True,travel permit number 10216,224.765145,1915.760495,...,1578146.0,1518811.0,0.0,Start trip,2018-11-06 00:00:00+00:00,travel permit 10215,0,0.667477,0.332523,0


In [30]:
# Number of distinct cases among the correct predictions.
correct_case_ids = correct_predictions['case'].unique()
len(correct_case_ids)

616

## Check last activity distribution for all trace lengths

- Run the code below when the trace length is 10,
- because it will give you all traces ending at lengths 10, 8, 6 and 4.
- If a trace is shorter than the n-th (4, 6, 8, 10) length, we keep the last activity of the trace.

### Correct predictions

In [38]:
# Collect, for every correctly predicted case, the activity found at each
# requested trace length as [activity, trace_length] pairs.
event_distribution_LIST = []
# Only trace length 6 is analysed in this notebook run.
trace_lengths = [6]
# Grouping by the plain column name (not a one-element list) keeps the group
# key a scalar; the key itself is not needed here.
for _, group in correct_predictions.groupby('case', sort=False):
    event_list = list(group['event'])
    for t_len in trace_lengths:
        # Take the t_len-th event; when the trace is shorter than t_len, fall
        # back to its last event. An explicit clamp replaces the former bare
        # `except:`, which would also have hidden unrelated errors.
        index = min(t_len, len(event_list)) - 1
        event_distribution_LIST.append([event_list[index], t_len])

In [39]:
# Turn the collected [activity, trace length] pairs into a two-column frame.
distribution_columns = ['Activities', 'Trace length']
act_distribution = pd.DataFrame(data=event_distribution_LIST, columns=distribution_columns)
act_distribution.head()

Unnamed: 0,Activities,Trace length
0,End trip,6
1,Declaration SUBMITTED by EMPLOYEE,6
2,Request For Payment SUBMITTED by EMPLOYEE,6
3,End trip,6
4,Declaration SUBMITTED by EMPLOYEE,6


In [40]:
# Share of each activity within every trace length, expressed in percent
# and flattened back into regular columns.
df_dist = (
    act_distribution
    .groupby('Trace length')['Activities']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent count')
    .reset_index()
)
df_dist.head()

Unnamed: 0,Trace length,Activities,Percent count
0,6,Declaration SUBMITTED by EMPLOYEE,31.331169
1,6,End trip,24.188312
2,6,Request For Payment FINAL_APPROVED by SUPERVISOR,11.688312
3,6,Request For Payment APPROVED by ADMINISTRATION,9.090909
4,6,Send Reminder,4.707792


In [41]:
# Bar chart of the percentage per activity, coloured by activity, with the
# trace length on the x-axis.
import plotly.express as px
fig = px.bar(
    df_dist,
    x="Trace length",
    y="Percent count",
    color="Activities",
    title="Last activity vs trace length distribution",
)
fig.show()
# Optional HTML export, currently disabled:
# fig.write_html("../docs/activity_vs_trace_length_distribution.html")

### Wrong predictions

In [42]:
# Collect, for every wrongly predicted case, the activity found at each
# requested trace length as [activity, trace_length] pairs.
event_distribution_LIST = []
# Only trace length 6 is analysed in this notebook run.
trace_lengths = [6]
# Grouping by the plain column name (not a one-element list) keeps the group
# key a scalar; the key itself is not needed here.
for _, group in wrong_predictions.groupby('case', sort=False):
    event_list = list(group['event'])
    for t_len in trace_lengths:
        # Take the t_len-th event; when the trace is shorter than t_len, fall
        # back to its last event. An explicit clamp replaces the former bare
        # `except:`, which would also have hidden unrelated errors.
        index = min(t_len, len(event_list)) - 1
        event_distribution_LIST.append([event_list[index], t_len])

In [43]:
# Turn the collected [activity, trace length] pairs into a two-column frame.
distribution_columns = ['Activities', 'Trace length']
act_distribution = pd.DataFrame(data=event_distribution_LIST, columns=distribution_columns)
act_distribution.head()

Unnamed: 0,Activities,Trace length
0,Request For Payment SUBMITTED by EMPLOYEE,6
1,Send Reminder,6
2,Declaration SUBMITTED by EMPLOYEE,6
3,Declaration SUBMITTED by EMPLOYEE,6
4,End trip,6


In [44]:
# Share of each activity within every trace length, expressed in percent
# and flattened back into regular columns.
df_dist = (
    act_distribution
    .groupby('Trace length')['Activities']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent count')
    .reset_index()
)
df_dist.head()

Unnamed: 0,Trace length,Activities,Percent count
0,6,Declaration SUBMITTED by EMPLOYEE,35.294118
1,6,End trip,28.75817
2,6,Request For Payment APPROVED by ADMINISTRATION,7.189542
3,6,Permit FINAL_APPROVED by SUPERVISOR,4.575163
4,6,Request For Payment FINAL_APPROVED by SUPERVISOR,4.575163


In [45]:
# Bar chart of the percentage per activity, coloured by activity, with the
# trace length on the x-axis.
import plotly.express as px
fig = px.bar(
    df_dist,
    x="Trace length",
    y="Percent count",
    color="Activities",
    title="Last activity vs trace length distribution",
)
fig.show()
# Optional HTML export, currently disabled:
# fig.write_html("../docs/activity_vs_trace_length_distribution.html")

# ===================== END ====================== ######