In [30]:
# import libraries
import json
import pickle
import random
import warnings

import numpy as np
import pandas as pd
import plotly.express as px

# https://pm4py.fit.fraunhofer.de/documentation
import pm4py
from pm4py.objects.log.util.log import project_traces
from pm4py.objects.log.util import interval_lifecycle
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
from pm4py.objects.log.obj import EventLog, Trace
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")

In [21]:
# Load the stored predictions of the best-performing model for trace length 6
# together with the matching test set.
prediction_load_path = '../data/processed/complex_6_decision_tree_model_predictions.csv'
test_data_path = '../data/training_data/original_data/test_trace_len_6.csv'
predicted_df, test_df = [
    pd.read_csv(csv_path) for csv_path in (prediction_load_path, test_data_path)
]

In [28]:
# One row per case (in order of first appearance) holding the first non-null
# 'declerations' value of that case, followed by the resulting class balance.
first_label_per_case = test_df.groupby(['case'], sort=False)['declerations'].first()
pred_df = first_label_per_case.reset_index()
pred_df['declerations'].value_counts()

0    690
1    171
Name: declerations, dtype: int64

In [40]:
# Seed Python's global RNG so the simulated baseline below is reproducible.
# (`random` is imported in the notebook's import cell.)
random.seed(43)

In [67]:
# Simulate ten baseline labels: 0 when the uniform draw is <= 0.75, otherwise 1.
# Re-seeding here makes the cell give the same labels every time it is run.
random.seed(43)
base_pred = [0 if random.random() <= 0.75 else 1 for _ in range(10)]

In [68]:
# Display the simulated baseline labels built in the previous cell.
base_pred

[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]

In [31]:
# Constant baseline: predict class 0 for every case and score it against the true labels.
# Integer labels keep the prediction dtype aligned with the integer 'declerations' column.
df_baseline = np.zeros(len(pred_df), dtype=int)
y_true = pred_df['declerations']

# Class 1 is never predicted, so its precision (and F1) is undefined; `zero_division=0`
# states the 0.0 fallback explicitly instead of relying on the global warnings filter.
# NOTE(review): F-score is macro-averaged while precision/recall use scikit-learn's
# default binary averaging (positive class 1) -- confirm this mix is intended.
base_val = pd.Series({
    'Model': 'Baseline',
    'F-score': f1_score(y_true, df_baseline, average='macro', zero_division=0),
    'Precision': precision_score(y_true, df_baseline, zero_division=0),
    'Recall': recall_score(y_true, df_baseline, zero_division=0),
    'Accuracy': accuracy_score(y_true, df_baseline),
})
base_val

Model        Decision Tree (Val)
F-score                 0.444874
Precision                    0.0
Recall                       0.0
Accuracy                0.801394
dtype: object

In [6]:
# Attach each case's prediction to all of its events.
# Predictions are matched to cases by position: row i of predicted_df is assumed to belong
# to the i-th distinct case of test_df (order of first appearance)
# -- TODO confirm against the notebook that wrote the prediction file.
cases = list(test_df.groupby(['case'], sort=False)['id'].first().keys())
if len(cases) != len(predicted_df):
    raise ValueError(
        f"predicted_df has {len(predicted_df)} rows but test_df contains {len(cases)} cases"
    )
predicted_df['case'] = cases

# Merge only when the prediction columns are not in test_df yet. Without this guard,
# re-running the cell merges a second time and pandas renames the duplicated columns
# with _x/_y suffixes, which breaks later lookups such as test_df['predicted_class'].
prediction_cols = [col for col in predicted_df.columns if col != 'case']
if not set(prediction_cols).issubset(test_df.columns):
    # many_to_one: many event rows per case on the left, exactly one prediction row per case.
    test_df = test_df.merge(predicted_df, on=['case'], validate='many_to_one')

In [7]:
# Keep the event rows whose case was misclassified and persist them for later inspection.
# NOTE(review): the output file name says "random_forest" although the predictions were
# loaded from a "decision_tree" file -- confirm the intended file name.
is_misclassified = test_df['predicted_class'] != test_df['declerations']
wrong_predictions = test_df[is_misclassified].reset_index(drop=True)
wrong_predictions.to_csv(
    '../data/processed/complex_6_random_forest_model_predictions_wrong.csv',
    index=False,
)
wrong_predictions.head()

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,concept:name,time:timestamp,case:concept:name,declerations,accepted,rejected,predicted_class
0,travel permit 10293,Permit SUBMITTED by EMPLOYEE,2018-09-05 16:41:02+00:00,2018-09-05 16:41:02+00:00,travel permit 10293,484,False,travel permit number 10294,-463.442149,1384.079011,...,0.0,0.0,1.0,Permit SUBMITTED by EMPLOYEE,2018-09-05 16:41:02+00:00,travel permit 10293,0,0.0,1.0,1
1,travel permit 10293,Permit APPROVED by ADMINISTRATION,2018-09-05 16:41:04+00:00,2018-09-05 16:41:04+00:00,travel permit 10293,484,False,travel permit number 10294,-463.442149,1384.079011,...,2.0,2.0,0.0,Permit APPROVED by ADMINISTRATION,2018-09-05 16:41:04+00:00,travel permit 10293,0,0.0,1.0,1
2,travel permit 10293,Permit FINAL_APPROVED by SUPERVISOR,2018-09-06 13:27:35+00:00,2018-09-06 13:27:35+00:00,travel permit 10293,484,False,travel permit number 10294,-463.442149,1384.079011,...,24393.0,24391.0,0.0,Permit FINAL_APPROVED by SUPERVISOR,2018-09-06 13:27:35+00:00,travel permit 10293,0,0.0,1.0,1
3,travel permit 10293,Start trip,2018-12-03 00:00:00+00:00,2018-12-03 00:00:00+00:00,travel permit 10293,484,False,travel permit number 10294,-463.442149,1384.079011,...,2233138.0,2208745.0,0.0,Start trip,2018-12-03 00:00:00+00:00,travel permit 10293,0,0.0,1.0,1
4,travel permit 10293,End trip,2018-12-06 00:00:00+00:00,2018-12-06 00:00:00+00:00,travel permit 10293,484,False,travel permit number 10294,-463.442149,1384.079011,...,2341138.0,108000.0,0.0,End trip,2018-12-06 00:00:00+00:00,travel permit 10293,0,0.0,1.0,1


In [8]:
# Number of distinct misclassified cases (missing values counted, as with len(unique())).
wrong_predictions['case'].nunique(dropna=False)

282

In [9]:
# Keep the event rows whose case was classified correctly and persist them for later inspection.
# NOTE(review): the output file name says "random_forest" although the predictions were
# loaded from a "decision_tree" file -- confirm the intended file name.
is_correct = test_df['predicted_class'] == test_df['declerations']
correct_predictions = test_df[is_correct].reset_index(drop=True)
correct_predictions.to_csv(
    '../data/processed/complex_6_random_forest_model_predictions_correct.csv',
    index=False,
)
correct_predictions.head()

Unnamed: 0,case,event,startTime,completeTime,id,RequestedAmount_0,Overspent,travel permit number,OverspentAmount,RequestedBudget,...,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,concept:name,time:timestamp,case:concept:name,declerations,accepted,rejected,predicted_class
0,travel permit 10215,Permit SUBMITTED by EMPLOYEE,2018-09-05 08:37:34+00:00,2018-09-05 08:37:34+00:00,travel permit 10215,1957,True,travel permit number 10216,224.765145,1915.760495,...,0.0,0.0,1.0,Permit SUBMITTED by EMPLOYEE,2018-09-05 08:37:34+00:00,travel permit 10215,0,1.0,0.0,0
1,travel permit 10215,Permit APPROVED by ADMINISTRATION,2018-09-05 08:38:28+00:00,2018-09-05 08:38:28+00:00,travel permit 10215,1957,True,travel permit number 10216,224.765145,1915.760495,...,54.0,54.0,0.0,Permit APPROVED by ADMINISTRATION,2018-09-05 08:38:28+00:00,travel permit 10215,0,1.0,0.0,0
2,travel permit 10215,Permit APPROVED by BUDGET OWNER,2018-09-05 08:55:23+00:00,2018-09-05 08:55:23+00:00,travel permit 10215,1957,True,travel permit number 10216,224.765145,1915.760495,...,1069.0,1015.0,0.0,Permit APPROVED by BUDGET OWNER,2018-09-05 08:55:23+00:00,travel permit 10215,0,1.0,0.0,0
3,travel permit 10215,Permit FINAL_APPROVED by SUPERVISOR,2018-09-06 15:06:29+00:00,2018-09-06 15:06:29+00:00,travel permit 10215,1957,True,travel permit number 10216,224.765145,1915.760495,...,59335.0,58266.0,0.0,Permit FINAL_APPROVED by SUPERVISOR,2018-09-06 15:06:29+00:00,travel permit 10215,0,1.0,0.0,0
4,travel permit 10215,Start trip,2018-11-06 00:00:00+00:00,2018-11-06 00:00:00+00:00,travel permit 10215,1957,True,travel permit number 10216,224.765145,1915.760495,...,1578146.0,1518811.0,0.0,Start trip,2018-11-06 00:00:00+00:00,travel permit 10215,0,1.0,0.0,0


In [10]:
# Number of distinct correctly classified cases (missing values counted, as with len(unique())).
correct_predictions['case'].nunique(dropna=False)

487

## Check last activity distribution for all trace lengths

- Run the code below when the trace length is 10,
- because it will give you all traces ending at lengths 10, 8, 6 and 4.
- If a trace is shorter than the nth (4, 6, 8, 10) length, we keep the last activity of the trace.

### Correct predictions

In [12]:
# Collect, for every correctly predicted case, the activity found at each requested
# trace length. Cases with fewer events than the requested length contribute their
# final activity instead.
trace_lengths = [6]
event_distribution_LIST = []
# Grouping by the scalar key 'case' (not the list ['case']) yields the same groups
# without the length-1-list grouper behaviour change in newer pandas versions.
for _, group in correct_predictions.groupby('case', sort=False):
    event_list = list(group['event'])
    for t_len in trace_lengths:
        # Clamp the position to the last event explicitly; the previous bare `except`
        # produced the same fallback but would also have hidden unrelated errors.
        position = min(t_len, len(event_list)) - 1
        event_distribution_LIST.append([event_list[position], t_len])

In [13]:
# Turn the collected [activity, trace length] pairs into a two-column frame.
distribution_columns = ['Activities', 'Trace length']
act_distribution = pd.DataFrame(data=event_distribution_LIST, columns=distribution_columns)
act_distribution.head()

Unnamed: 0,Activities,Trace length
0,End trip,6
1,Request For Payment APPROVED by ADMINISTRATION,6
2,Request For Payment APPROVED by ADMINISTRATION,6
3,End trip,6
4,Request For Payment FINAL_APPROVED by SUPERVISOR,6


In [14]:
# Share of each activity per trace length, expressed as a percentage.
df_dist = (
    act_distribution
    .groupby('Trace length')['Activities']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent count')
    .reset_index()
)
df_dist.head()

Unnamed: 0,Trace length,Activities,Percent count
0,6,Declaration SUBMITTED by EMPLOYEE,32.238193
1,6,End trip,24.845996
2,6,Request For Payment FINAL_APPROVED by SUPERVISOR,12.936345
3,6,Request For Payment APPROVED by ADMINISTRATION,7.186858
4,6,Send Reminder,4.106776


In [15]:
# Bar chart of the percentage of each last activity per trace length (correct predictions).
# `px` (plotly.express) is imported in the notebook's import cell.
fig = px.bar(
    df_dist,
    x="Trace length",
    y="Percent count",
    color="Activities",
    title="Last activity vs trace length distribution",
)
fig.show()
# Optional export: fig.write_html("../docs/activity_vs_trace_length_distribution.html")

### Wrong predictions

In [16]:
# Collect, for every misclassified case, the activity found at each requested
# trace length. Cases with fewer events than the requested length contribute their
# final activity instead.
trace_lengths = [6]
event_distribution_LIST = []
# Grouping by the scalar key 'case' (not the list ['case']) yields the same groups
# without the length-1-list grouper behaviour change in newer pandas versions.
for _, group in wrong_predictions.groupby('case', sort=False):
    event_list = list(group['event'])
    for t_len in trace_lengths:
        # Clamp the position to the last event explicitly; the previous bare `except`
        # produced the same fallback but would also have hidden unrelated errors.
        position = min(t_len, len(event_list)) - 1
        event_distribution_LIST.append([event_list[position], t_len])

In [17]:
# Turn the collected [activity, trace length] pairs into a two-column frame.
distribution_columns = ['Activities', 'Trace length']
act_distribution = pd.DataFrame(data=event_distribution_LIST, columns=distribution_columns)
act_distribution.head()

Unnamed: 0,Activities,Trace length
0,Declaration SUBMITTED by EMPLOYEE,6
1,Request For Payment SUBMITTED by EMPLOYEE,6
2,Request For Payment SUBMITTED by EMPLOYEE,6
3,End trip,6
4,Declaration SUBMITTED by EMPLOYEE,6


In [18]:
# Share of each activity per trace length, expressed as a percentage.
df_dist = (
    act_distribution
    .groupby('Trace length')['Activities']
    .value_counts(normalize=True)
    .mul(100)
    .rename('Percent count')
    .reset_index()
)
df_dist.head()

Unnamed: 0,Trace length,Activities,Percent count
0,6,Declaration SUBMITTED by EMPLOYEE,31.914894
1,6,End trip,25.531915
2,6,Request For Payment APPROVED by ADMINISTRATION,11.347518
3,6,Permit FINAL_APPROVED by SUPERVISOR,6.028369
4,6,Request For Payment FINAL_APPROVED by SUPERVISOR,5.673759


In [19]:
# Bar chart of the percentage of each last activity per trace length (wrong predictions).
# `px` (plotly.express) is imported in the notebook's import cell.
fig = px.bar(
    df_dist,
    x="Trace length",
    y="Percent count",
    color="Activities",
    title="Last activity vs trace length distribution",
)
fig.show()
# Optional export: fig.write_html("../docs/activity_vs_trace_length_distribution.html")

# ===================== END ====================== ######