In [186]:
# import libraties
import pandas as pd
import numpy as np
import json
import pickle

# https://pm4py.fit.fraunhofer.de/documentation
import pm4py
from pm4py.objects.log.obj import EventLog, Trace
from pm4py.objects.log.util.log import project_traces
from pm4py.objects.log.util import interval_lifecycle
from pm4py.statistics.sojourn_time.log import get as soj_time_get
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features

import warnings
warnings.filterwarnings("ignore")

In [187]:
# function to project the trace
def project_nth(log, index):
    print(str(project_traces(log)[index]))

In [188]:
# read data in csv 
trace = pd.read_csv('../data/Travel Permits (filtered).csv')

# to read from xes file
# data_path = '../data/Travel Permits Filtered.xes'
# trace_log = pm4py.read_xes(data_path)
# trace_log = pm4py.format_dataframe(trace_log, case_id='case:id', activity_key='concept:name', timestamp_key='time:timestamp')
# trace_log = pm4py.convert_to_event_log(trace_log)
# trace_log

In [189]:
# removing redundant columns from data and converting time to pandas datetime object
imp_cols = ['case', 'event', 'startTime', 'completeTime', 'OrganizationalEntity',
       'TotalDeclared', 'Overspent',
       'RequestedBudget', 'OverspentAmount', 'org:resource']
trace = trace[imp_cols]
trace['startTime'] = pd.to_datetime(trace['startTime'])
trace['completeTime'] = pd.to_datetime(trace['completeTime'])

# train test split

- Do temporal train test split and preprocess them seperately so that no data leakage will be present
- we split data based on 70% 30 % (Temporal split)
- classifier will be trained with all cases that started before a given date T1 which would represent a current point in time in a real-life scenario, and the testing is done only on cases that start afterwards.

In [190]:
trace.head()

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource
0,travel permit 76455,Start trip,2016-10-05 00:00:00,2016-10-05 00:00:00,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER
1,travel permit 76455,End trip,2016-10-05 00:00:00,2016-10-05 00:00:00,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER
2,travel permit 76455,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10,2017-04-06 13:32:10,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER
3,travel permit 76455,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28,2017-04-06 13:32:28,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER
4,travel permit 76455,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14,2017-04-07 13:38:14,organizational unit 65458,39.664561,False,41.613445,0.0,STAFF MEMBER


In [191]:
# get completion time and sort
completion_time_ls = list(trace.groupby(['case'])['completeTime'].max())
completion_time_ls = sorted(completion_time_ls)

In [192]:
# split on 70% max time 
split_portion = 0.70
total_data = len(completion_time_ls)
train_len = int(split_portion*total_data)
last_train_completion_time = completion_time_ls[train_len]
last_train_completion_time

Timestamp('2018-09-20 17:31:17')

In [193]:
# take all traces where start dates are after the last_train_completion_time
train_df = pd.DataFrame(columns = trace.columns)
test_df = pd.DataFrame(columns = trace.columns)
train_count,test_count = 0,0
for name, group in trace.groupby(['case']):
    if group['startTime'].iloc[0] < last_train_completion_time:
        train_df = train_df.append(group)
        train_count+=1
    else:
        test_df = test_df.append(group)
        test_count+=1

In [194]:
# although the split was 70:30, but since it is sequence data, traces can have different number of data points, so actual number is different 
test_data_percentage = (test_count/train_count)*100
print("train and test count")
print(train_count,test_count)
print("trace test percentage")
print(test_data_percentage)

train and test count
2825 460
trace test percentage
16.283185840707965


# Define comman variables

In [361]:
# trace length and saving path
t_length = 4
save_path_base = '../data/training_data/'

# ====================================
# Preparing Train Data

- define trace length and df_type (train or test)
- Working on train and test seperately, so run whole code for the same trace length for train and then for test
- the repeating code will be modularized when developing python file

In [362]:
# permits is just for the variable name, as below code is using permit as dataframe name
df_type = 'train'
permits = train_df

## Preparing data for task Declaration REJECTED vs Accapted by Administrator/Employee

- keeping all the traces \
or
Extract traces only where decleration is submitted by employee is in a trace ( we are not considering other traces where decision of accatped, rejected is not being made /discuss )
- create target variables, if application is rejected or not
- get traces which starts from permit submitted by employee to the trace decleration is submitted by employee (discuss)
- filter traces of length n (10,15,20 decide) 
- choose encoding and decide features to involve which we belive will be there before the activity decleration is submitted


### Filterings

1. Filtering on timeframe (from this date to that)
2. Filter on case performance (traces finished withing 10 days)
3. Filter on start and end activities (give list of start and end)
4. Filter on variants (keeping only frequent trace flows like - [a,b,c,d] and [a d b c], or 0.4 threshold etc)
5. Filter on attributes values (selection and projection of traces)
6. Filter on numeric attribute values (from declared amount 500 to 1000)
7. Between Filter (filtering the activities from say permit apply to permit accapted)
8. case size (number of activiries in case)


### Statistics

1. Throughput Time (time to complete traces)
2. Case Arrival/Dispersion Ratio (arrival time between two traces (not events))
3. Performance Spectrum (time between activities)
4. Cycle Time and Waiting Time (cycle time - time between activities, lead Time - the overall time in which the instance was worked, from the start to the end,accumelative )
5. Sojourn Time - time taken for an activity to complete from the end of previous activity
6. other stats if needed


# Feature Engineering

In [363]:
permits.head()
permits['startTime'] = pd.to_datetime(permits['startTime'])
permits['completeTime'] = pd.to_datetime(permits['completeTime'])

In [364]:
# add year and month column
permits['year'] = permits['startTime'].dt.year
permits['month'] = permits['startTime'].dt.month

In [365]:
# time to complete the activity
permits['act_completionTime'] = permits['completeTime'] - permits['startTime']
permits['act_completionTime'] = permits['act_completionTime'].dt.total_seconds()

In [366]:
# get minimum time for earch trace (when trace started)
temp_df = pd.DataFrame(permits.groupby(['case'])['startTime'].min())
temp_df['index'] = temp_df.index
temp_df.reset_index(drop=True, inplace=True)
temp_df.columns = ['startTime_min','case']

# merge with permits
permits = permits.merge(temp_df,on=['case'])

In [367]:
# get max time for earch trace (when trace ended)
temp_df = pd.DataFrame(permits.groupby(['case'])['completeTime'].max())
temp_df['index'] = temp_df.index
temp_df.reset_index(drop=True, inplace=True)
temp_df.columns = ['completeTime_max','case']

# merge with permits
permits = permits.merge(temp_df,on=['case'])

In [368]:
# calculate trace time
permits['processCompletionTime']=permits['completeTime_max']-permits['startTime_min']
permits['processCompletionTime'] = permits['processCompletionTime'].dt.total_seconds()

In [369]:
# calculate time taken between activities
act_time_taken=[]
for name,group in permits.groupby(['case'],sort=False):
    act_time_taken.append([name,pd.to_datetime(0) - pd.to_datetime(0)])
    for i in range(len(group)-1):
        st_time_1 = group.iloc[i]['startTime']
        st_time_2 = group.iloc[i+1]['startTime']
        diff = st_time_2-st_time_1
        act_time_taken.append([name,diff])

time_diff_df = pd.DataFrame(act_time_taken,columns=['case','act_time_diff'])

# add with permits
permits['next_act_time_diff'] = time_diff_df['act_time_diff']
permits['next_act_time_diff'] = permits['next_act_time_diff'].dt.total_seconds()

In [370]:
permits.head()

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource,year,month,act_completionTime,startTime_min,completeTime_max,processCompletionTime,next_act_time_diff
0,travel permit 10066,Permit SUBMITTED by EMPLOYEE,2018-02-21 16:28:17,2018-02-21 16:28:17,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,2018,2,0.0,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,0.0
1,travel permit 10066,Permit APPROVED by ADMINISTRATION,2018-02-21 16:28:19,2018-02-21 16:28:19,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,2018,2,0.0,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,2.0
2,travel permit 10066,Start trip,2018-02-22 00:00:00,2018-02-22 00:00:00,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,2018,2,0.0,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,27101.0
3,travel permit 10066,End trip,2018-02-22 00:00:00,2018-02-22 00:00:00,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,2018,2,0.0,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,0.0
4,travel permit 10066,Permit FINAL_APPROVED by SUPERVISOR,2018-02-22 08:27:05,2018-02-22 08:27:05,organizational unit 65460,71.195831,False,64.878503,-18.925474,STAFF MEMBER,2018,2,0.0,2018-02-21 16:28:17,2018-03-01 17:31:26,694989.0,30425.0


### Adding some stastitics features in event log using pm4py

In [371]:
# converting dataframe to event log
trace_log = pm4py.format_dataframe(permits, case_id='case', activity_key='event', timestamp_key='completeTime', start_timestamp_key='startTime')
trace_log = pm4py.convert_to_event_log(trace_log)

In [372]:
# adding lead and waiting time
# @@approx_bh_partial_lead_time	Incremental lead time associated to the event
# @@approx_bh_overall_wasted_time	Difference between the partial lead time and the partial cycle time values (for us same as above, as no cylce time for us)
# @@approx_bh_this_wasted_time	Wasted time ONLY with regards to the activity described by the ‘interval’ even (time difference between activities)

trace_log = interval_lifecycle.assign_lead_cycle_time(trace_log)

In [373]:
# time taken to start and end the activity
# found all zero based on data so nothing to add
soj_time = soj_time_get.apply(trace_log, parameters={soj_time_get.Parameters.TIMESTAMP_KEY: "time:timestamp", soj_time_get.Parameters.START_TIMESTAMP_KEY: "startTime"})
# soj_time

# Extracting target variable

- In our case target variable is REJECTED Declaration, in the process whoever rejects the Declaration (Administrartor, pre-approvar, etc.) it always goes through Declaration REJECTED by EMPLOYEE or it directly goes to Declaration REJECTED by MISSING, so using these two to identify our target variable

- One of our assumption is - If the decleration is not submitted by Employee and the payment is handeled directly, it is also considered as Declaration Accapted (so basically all the Declarations which is not rejected by employee are considerd as Accapted )

- we are not dealing with situations where it got rejected twice. For now we are only interested in 1st time rejection


<div>
<img src="../docs/huristic_net_BPI2020.png", width = 800, height = 800> 
</div>

In [374]:
declerations = [len(list(filter(lambda e: e["concept:name"] == ("Declaration REJECTED by EMPLOYEE" or "Declaration REJECTED by MISSING") ,trace))) > 0 for trace in trace_log]
declerations = [1 if dec != False else 0 for dec in declerations]
declerations[:20]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]

In [375]:
print(len(declerations))

2825


# Extracting Prefix 

- No bucketing/clustering/binding is applied (to do if needed)
- we extracted traces before the rejection for decleration, because we are intrested in prediction for the traces before that only 
- some process do not follow the above path but those are considered as accapted
- Trace length ( 10 for now - no specific reason, process is not big so randomly chosen 10 / to do)
- so basically all the traces of length less than equal to 10 are chosen, 
    - We need to deal if vector length is not same (10 in our case) - we performed padding (adding zeros at the end) during encoding for those traces
    - so we have traces with length 1,2,... 10 
    - any trace with length less then equal to 10 can be predicted using our model

In [376]:
# to extract only traces till the point decleration is rejected
prefix_traces = []
for trace in trace_log:
    for i,event in enumerate(trace):
        if "Declaration REJECTED" in event['event']:
            break
    prefix_traces.append(Trace(trace[:i], attributes = trace.attributes))
prefix_traces = EventLog(prefix_traces)

In [377]:
# check if all good
project_nth(prefix_traces, 3)

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Permit APPROVED by BUDGET OWNER', 'Permit FINAL_APPROVED by SUPERVISOR', 'Start trip', 'End trip', 'Declaration SUBMITTED by EMPLOYEE', 'Declaration APPROVED by ADMINISTRATION', 'Declaration APPROVED by BUDGET OWNER', 'Declaration FINAL_APPROVED by SUPERVISOR', 'Request Payment']


In [378]:
# generate prefixes, note that we need to add the casts to EventLog and Trace to make sure that the result is a PM4Py EventLog object
trace_prefixes = EventLog([Trace(trace[0:t_length], attributes = trace.attributes) for trace in trace_log])

In [379]:
# check the trace length
print([len(trace) for trace in trace_log][0:15])
print([len(trace) for trace in trace_prefixes][0:15])

[10, 10, 10, 12, 10, 12, 10, 12, 10, 13, 10, 12, 10, 10, 10]
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [380]:
# check if all good
project_nth(trace_prefixes, 98)

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Permit APPROVED by BUDGET OWNER', 'Permit FINAL_APPROVED by SUPERVISOR']


In [381]:
# to check which traces are not of our desired length (these will be padded while preparing the the training data)
for i,trace in enumerate(trace_prefixes):
    if len(trace)!=t_length:
        print(i, len(trace))
        break

## Feature selection/Filtering (Manual)

In [382]:
# convert logs to dataframe
df = pm4py.convert_to_dataframe(trace_prefixes)
df.head(5)

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource,...,time:timestamp,@@index,@@case_index,start_timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name
0,travel permit 10066,Permit SUBMITTED by EMPLOYEE,2018-02-21 16:28:17,2018-02-21 16:28:17,organizational unit 65460,1970-01-01 00:00:00.000000071+00:00,False,1970-01-01 00:00:00.000000064+00:00,1969-12-31 23:59:59.999999982+00:00,STAFF MEMBER,...,2018-02-21 16:28:17,0,0,2018-02-21 16:28:17,0.0,0.0,0.0,0.0,1.0,travel permit 10066
1,travel permit 10066,Permit APPROVED by ADMINISTRATION,2018-02-21 16:28:19,2018-02-21 16:28:19,organizational unit 65460,1970-01-01 00:00:00.000000071+00:00,False,1970-01-01 00:00:00.000000064+00:00,1969-12-31 23:59:59.999999982+00:00,STAFF MEMBER,...,2018-02-21 16:28:19,1,0,2018-02-21 16:28:19,0.0,2.0,2.0,2.0,0.0,travel permit 10066
2,travel permit 10066,Start trip,2018-02-22 00:00:00,2018-02-22 00:00:00,organizational unit 65460,1970-01-01 00:00:00.000000071+00:00,False,1970-01-01 00:00:00.000000064+00:00,1969-12-31 23:59:59.999999982+00:00,STAFF MEMBER,...,2018-02-22 00:00:00,2,0,2018-02-22 00:00:00,0.0,1903.0,1903.0,1901.0,0.0,travel permit 10066
3,travel permit 10066,End trip,2018-02-22 00:00:00,2018-02-22 00:00:00,organizational unit 65460,1970-01-01 00:00:00.000000071+00:00,False,1970-01-01 00:00:00.000000064+00:00,1969-12-31 23:59:59.999999982+00:00,STAFF MEMBER,...,2018-02-22 00:00:00,3,0,2018-02-22 00:00:00,0.0,1903.0,1903.0,0.0,0.0,travel permit 10066
4,travel permit 10077,Permit SUBMITTED by EMPLOYEE,2018-02-22 12:53:59,2018-02-22 12:53:59,organizational unit 65455,1970-01-01 00:00:00.000000658+00:00,False,1970-01-01 00:00:00.000001108+00:00,1969-12-31 23:59:59.999998627+00:00,STAFF MEMBER,...,2018-02-22 12:53:59,10,1,2018-02-22 12:53:59,0.0,0.0,0.0,0.0,1.0,travel permit 10077


In [383]:
df.columns

Index(['case', 'event', 'startTime', 'completeTime', 'OrganizationalEntity',
       'TotalDeclared', 'Overspent', 'RequestedBudget', 'OverspentAmount',
       'org:resource', 'year', 'month', 'act_completionTime', 'startTime_min',
       'completeTime_max', 'processCompletionTime', 'next_act_time_diff',
       'concept:name', 'time:timestamp', '@@index', '@@case_index',
       'start_timestamp', '@@approx_bh_partial_cycle_time',
       '@@approx_bh_partial_lead_time', '@@approx_bh_overall_wasted_time',
       '@@approx_bh_this_wasted_time', '@approx_bh_ratio_cycle_lead_time',
       'case:concept:name'],
      dtype='object')

In [384]:
# passed features we want to extract

# str_ev_attr	String attributes at the event level: these are hot-encoded into features that may assume value 0 or value 1.
# str_tr_attr	String attributes at the trace level: these are hot-encoded into features that may assume value 0 or value 1.
# num_ev_attr	Numeric attributes at the event level: these are encoded by including the last value of the attribute among the events of the trace.
# num_tr_attr	Numeric attributes at trace level: these are encoded by including the numerical value.
# str_evsucc_attr	Successions related to the string attributes values at the event level: for example, if we have a trace [A,B,C], it might be important to include not only the presence of the single values A, B and C as features; but also the presence of the directly-follows couples (A,B) and (B,C).
# ================================================

str_ev_attr = ['concept:name','org:resource']
str_tr_attr = ['OrganizationalEntity','month']
num_ev_attr = ['@@approx_bh_partial_lead_time','@@approx_bh_this_wasted_time']
num_tr_attr = []

# Data encoding

- preparing input data to be passed, 
- there are different encodings we can go with
    - boolean encoding (one hot encoding if the activity present or not)
    - frequency encoding (count of activity)
    - simple index encoding (n events one hot encoding)
    - latest payload encoding (with trace attributes)
    - index payload encoding (n events one hot encoding + with trace attributes)
    - complex index based encoding (static feature + nevents encoding + event features )
    - lstm encoding (m x n)
    
=========  

- Since the complex index based encoding covers the above 3 ( we can simply filter columns to get those, also feature importance (random forest) can ignore those which are not imporant ) we basically generated 4 different encodings -
    - boolean encoding (one hot encoding if the activity present or not)
    - frequency encoding (count of activity)
    - complex index based encoding (static feature + nevents encoding + event features )
    - lstm encoding (m x n)

=======
- No need of paddings in boolean and frequency encoding as the activities are fixed \

**PADDING** 
- for one hot encoded attributes (categorical values) the desired vector length is  - t_length * ohe_length (Ex. 10*number of activities) 
- for numerical values the desired length is t_length

**Note** - For now process changes are not been considered, if a new activity arrives in future

## Common functions for Encodings 

In [385]:
# function to save the data
def save_data(X,y,feature_names, save_path):
    data_dict = {}
    data_dict['X'] = X
    data_dict['y'] = y
    data_dict['feature_names'] = feature_names

    # save pickle
    with open(save_path, 'wb') as handle:
        pickle.dump(data_dict, handle)

In [386]:
# load the data from pickle
def load_data(load_path):
    with open(load_path, 'rb') as handle:
        data = pickle.load(handle)
    return data

In [387]:
# function to get the one hot encoded vectors of categorical values
def get_ohe_dict(categorical_vars, df):
    ohe_dict = {}
    for var in categorical_vars:
        var_dict = {}
        var_data = sorted(df[var].unique())
        var_len = len(var_data)
        for i,cat in enumerate(var_data):
            var_dict[cat] = [0]*var_len
            var_dict[cat][i] = 1

        ohe_dict[var] = var_dict

    return ohe_dict

In [388]:
# padding function for ohe encoding
def cat_padding(vec, t_length, attr_length):
    desired_length = t_length*attr_length
    vec_length = len(vec)
    if vec_length != desired_length:
        pad_vec = [0]*(desired_length-vec_length)
        vec.extend(pad_vec)
    return vec

In [389]:
# padding function for non-ohe encoding
def num_padding(vec, t_length):
    vec_length = len(vec)
    if vec_length != t_length:
        pad_vec = [0]*(t_length-vec_length)
        vec.extend(pad_vec)
    return vec

In [390]:
# create one hot encoding dict fot categorical variables
# variables which we want to be one hot encoded
categorical_vars = str_ev_attr + str_tr_attr
ohe_dict = get_ohe_dict(categorical_vars, df)
categorical_vars

['concept:name', 'org:resource', 'OrganizationalEntity', 'month']

## Boolean encoding

- can be easily extended with adding more attributes of traces other then activities, but for now kept it simple and as discussed in course

In [391]:
# here for each trace we extract ohe vector for activity and sum them up and if count is greater then 1 we make them 1 
# because this encoding only provides info, if the activity was there or not

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(ohe_dict[cat_atr][ca])

        # make it a non frequency vector (if count is greater then 1 make it 1)
        for i,num in enumerate(str_ev_vec):
            if num>1:
                str_ev_vec[i]=1

        feature_vec.extend(list(str_ev_vec))

    data.append(feature_vec)

    # data.append(feature_vec)

In [392]:
# saving data 
encode_name = 'boolean_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## Frequency encoding

In [393]:
# here for each trace we extract ohe vector for activity and sum them up 
# because this encoding only provides count of how many times the activity appears

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(ohe_dict[cat_atr][ca])

        feature_vec.extend(list(str_ev_vec))

    data.append(feature_vec)

    # data.append(feature_vec)

In [394]:
# save results
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## Complex index based encoding - static feature (trace attributes) + n events encoding + event features

In [395]:
# here for each trace we put events encoded in order and there aatributes along with padding to make vector length same
# similatly for trace attributes but since it is trace attributes that is only for once

data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes along with paddings 

    for cat_atr in str_ev_attr:
        str_ev_vec = []
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        for ca in group[cat_atr]:
            str_ev_vec.append(ohe_dict[cat_atr][ca])
        
        # padding
        str_ev_vec = cat_padding(str_ev_vec, t_length, attr_length)
        feature_vec.extend(str_ev_vec)


    for num_atr in num_ev_attr:
        num_ev_vec = []
        num_ev_vec.append(list(group[num_atr]))

        # padding
        num_ev_vec = num_padding(num_ev_vec, t_length)
        feature_vec.extend(num_ev_vec)



    # add categorical and numerical trace attributes
    for num_t_atr in num_tr_attr:
        feature_vec.extend(group[num_t_atr].iloc[0])


    for cat_t_atr in str_tr_attr:
        feature_vec.extend(ohe_dict[cat_t_atr][group[cat_t_atr].iloc[0]])

    # add vector to data
    data.append(feature_vec)


In [396]:
# check if all vector lengths are same 
vec_len = len(data[0])
for i, d in enumerate(data):
    if len(d)!=vec_len:
        print(i, len(d))

In [397]:
# save results
encode_name = 'complex_index_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## LSTM encoding

In [398]:
# here we create sequence of each trace
# so the dimentions will be (number of examples * trace_length * feature_length )

data = []

for id, group in df.groupby(['case:concept:name']):
    
    feature_vec = []

    for index, row in group.iterrows():

        row_vec = []

        for cat_atr in str_ev_attr:
            row_vec.extend(ohe_dict[cat_atr][row[cat_atr]])

        for num_atr in num_ev_attr:
            row_vec.append(row[num_atr])


        # add categorical and numerical trace attributes
        for num_t_atr in num_tr_attr:
            row_vec.append(group[num_t_atr].iloc[0])

        for cat_t_atr in str_tr_attr:
            row_vec.extend(ohe_dict[cat_t_atr][group[cat_t_atr].iloc[0]])

        feature_vec.append(row_vec)
    

    # add vector to data
    data.append(feature_vec)


In [399]:
# converting to array
data = np.array([np.array(ls) for ls in data])

In [400]:
# shape we want for all the traces
feature_len = len(data[0][0])
desired_shape = (t_length,feature_len)
desired_shape

(4, 49)

In [401]:
# padding data to make equal shape of vectors
padded_data = []
for case in data:
    pd_case = np.zeros(desired_shape)
    pd_case[:case.shape[0],:case.shape[1]] = case
    padded_data.append(pd_case)

padded_data = np.array(padded_data).shape

In [402]:
# save results
encode_name = 'lstm_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

# =============================================
# Preparing Test Data

- exactly same, just using **train encoding** to create features
- define trace length and df_type (train or test)
- Working on train and test seperately, so run whole code for the same trace length for train and then for test

In [403]:
df_type = 'test'
permits = test_df

In [404]:
test_df

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource,year,month,act_completionTime
31446,travel permit 10898,Permit SUBMITTED by EMPLOYEE,2018-09-27 12:35:05,2018-09-27 12:35:05,organizational unit 65458,264.493826,True,347.303505,44.996875,STAFF MEMBER,2018,9,0.0
31447,travel permit 10898,Permit APPROVED by ADMINISTRATION,2018-09-27 12:35:10,2018-09-27 12:35:10,organizational unit 65458,264.493826,True,347.303505,44.996875,STAFF MEMBER,2018,9,0.0
31448,travel permit 10898,Permit FINAL_APPROVED by SUPERVISOR,2018-09-28 15:45:28,2018-09-28 15:45:28,organizational unit 65458,264.493826,True,347.303505,44.996875,STAFF MEMBER,2018,9,0.0
31449,travel permit 10898,Start trip,2018-10-08 00:00:00,2018-10-08 00:00:00,organizational unit 65458,264.493826,True,347.303505,44.996875,STAFF MEMBER,2018,10,0.0
31450,travel permit 10898,End trip,2018-10-12 00:00:00,2018-10-12 00:00:00,organizational unit 65458,264.493826,True,347.303505,44.996875,STAFF MEMBER,2018,10,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33434,travel permit 58843,Declaration SUBMITTED by EMPLOYEE,2018-11-09 16:46:00,2018-11-09 16:46:00,organizational unit 65455,760.481906,False,1013.080859,-158.221068,STAFF MEMBER,2018,11,0.0
33435,travel permit 58843,Declaration APPROVED by ADMINISTRATION,2018-11-12 09:33:59,2018-11-12 09:33:59,organizational unit 65455,760.481906,False,1013.080859,-158.221068,STAFF MEMBER,2018,11,0.0
33436,travel permit 58843,Declaration FINAL_APPROVED by SUPERVISOR,2018-11-12 10:40:10,2018-11-12 10:40:10,organizational unit 65455,760.481906,False,1013.080859,-158.221068,STAFF MEMBER,2018,11,0.0
33437,travel permit 58843,Request Payment,2018-11-12 14:34:54,2018-11-12 14:34:54,organizational unit 65455,760.481906,False,1013.080859,-158.221068,SYSTEM,2018,11,0.0


# Feature Engineering

In [405]:
permits.head()
permits['startTime'] = pd.to_datetime(permits['startTime'])
permits['completeTime'] = pd.to_datetime(permits['completeTime'])

In [406]:
# add year and month column
permits['year'] = permits['startTime'].dt.year
permits['month'] = permits['startTime'].dt.month

In [407]:
# time to complete the activity
permits['act_completionTime'] = permits['completeTime'] - permits['startTime']
permits['act_completionTime'] = permits['act_completionTime'].dt.total_seconds()

In [408]:
# get minimum time for earch trace (when trace started)
temp_df = pd.DataFrame(permits.groupby(['case'])['startTime'].min())
temp_df['index'] = temp_df.index
temp_df.reset_index(drop=True, inplace=True)
temp_df.columns = ['startTime_min','case']

# merge with permits
permits = permits.merge(temp_df,on=['case'])

In [409]:
# get max time for earch trace (when trace ended)
temp_df = pd.DataFrame(permits.groupby(['case'])['completeTime'].max())
temp_df['index'] = temp_df.index
temp_df.reset_index(drop=True, inplace=True)
temp_df.columns = ['completeTime_max','case']

# merge with permits
permits = permits.merge(temp_df,on=['case'])

In [410]:
# calculate trace time
permits['processCompletionTime']=permits['completeTime_max']-permits['startTime_min']
permits['processCompletionTime'] = permits['processCompletionTime'].dt.total_seconds()

In [411]:
# calculate time taken between activities
act_time_taken=[]
for name,group in permits.groupby(['case'],sort=False):
    act_time_taken.append([name,pd.to_datetime(0) - pd.to_datetime(0)])
    for i in range(len(group)-1):
        st_time_1 = group.iloc[i]['startTime']
        st_time_2 = group.iloc[i+1]['startTime']
        diff = st_time_2-st_time_1
        act_time_taken.append([name,diff])

time_diff_df = pd.DataFrame(act_time_taken,columns=['case','act_time_diff'])

# add with permits
permits['next_act_time_diff'] = time_diff_df['act_time_diff']
permits['next_act_time_diff'] = permits['next_act_time_diff'].dt.total_seconds()

In [412]:
permits.head()

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource,year,month,act_completionTime,startTime_min,completeTime_max,processCompletionTime,next_act_time_diff
0,travel permit 10898,Permit SUBMITTED by EMPLOYEE,2018-09-27 12:35:05,2018-09-27 12:35:05,organizational unit 65458,264.493826,True,347.303505,44.996875,STAFF MEMBER,2018,9,0.0,2018-09-27 12:35:05,2018-11-05 17:31:18,3387373.0,0.0
1,travel permit 10898,Permit APPROVED by ADMINISTRATION,2018-09-27 12:35:10,2018-09-27 12:35:10,organizational unit 65458,264.493826,True,347.303505,44.996875,STAFF MEMBER,2018,9,0.0,2018-09-27 12:35:05,2018-11-05 17:31:18,3387373.0,5.0
2,travel permit 10898,Permit FINAL_APPROVED by SUPERVISOR,2018-09-28 15:45:28,2018-09-28 15:45:28,organizational unit 65458,264.493826,True,347.303505,44.996875,STAFF MEMBER,2018,9,0.0,2018-09-27 12:35:05,2018-11-05 17:31:18,3387373.0,97818.0
3,travel permit 10898,Start trip,2018-10-08 00:00:00,2018-10-08 00:00:00,organizational unit 65458,264.493826,True,347.303505,44.996875,STAFF MEMBER,2018,10,0.0,2018-09-27 12:35:05,2018-11-05 17:31:18,3387373.0,807272.0
4,travel permit 10898,End trip,2018-10-12 00:00:00,2018-10-12 00:00:00,organizational unit 65458,264.493826,True,347.303505,44.996875,STAFF MEMBER,2018,10,0.0,2018-09-27 12:35:05,2018-11-05 17:31:18,3387373.0,345600.0


### Adding some stastitics features in event log using pm4py

In [413]:
# converting dataframe to event log
trace_log = pm4py.format_dataframe(permits, case_id='case', activity_key='event', timestamp_key='completeTime', start_timestamp_key='startTime')
trace_log = pm4py.convert_to_event_log(trace_log)

In [414]:
# adding lead and waiting time
# @@approx_bh_partial_lead_time	Incremental lead time associated to the event
# @@approx_bh_overall_wasted_time	Difference between the partial lead time and the partial cycle time values (for us same as above, as no cylce time for us)
# @@approx_bh_this_wasted_time	Wasted time ONLY with regards to the activity described by the ‘interval’ even (time difference between activities)

trace_log = interval_lifecycle.assign_lead_cycle_time(trace_log)

In [415]:
# time taken to start and end the activity
# found all zero based on data so nothing to add
soj_time = soj_time_get.apply(trace_log, parameters={soj_time_get.Parameters.TIMESTAMP_KEY: "time:timestamp", soj_time_get.Parameters.START_TIMESTAMP_KEY: "startTime"})
soj_time

{'Permit SUBMITTED by EMPLOYEE': 0.0,
 'Permit APPROVED by ADMINISTRATION': 0.0,
 'Permit FINAL_APPROVED by SUPERVISOR': 0.0,
 'Start trip': 0.0,
 'End trip': 0.0,
 'Declaration SUBMITTED by EMPLOYEE': 0.0,
 'Declaration APPROVED by ADMINISTRATION': 0.0,
 'Declaration FINAL_APPROVED by SUPERVISOR': 0.0,
 'Request Payment': 0.0,
 'Payment Handled': 0.0,
 'Permit APPROVED by BUDGET OWNER': 0.0,
 'Declaration APPROVED by BUDGET OWNER': 0.0,
 'Declaration REJECTED by ADMINISTRATION': 0.0,
 'Declaration REJECTED by EMPLOYEE': 0.0,
 'Permit APPROVED by SUPERVISOR': 0.0,
 'Permit FINAL_APPROVED by DIRECTOR': 0.0,
 'Declaration APPROVED by SUPERVISOR': 0.0,
 'Declaration FINAL_APPROVED by DIRECTOR': 0.0,
 'Send Reminder': 0.0}

# Extracting target variable

In [416]:
declerations = [len(list(filter(lambda e: e["concept:name"] == ("Declaration REJECTED by EMPLOYEE" or "Declaration REJECTED by MISSING") ,trace))) > 0 for trace in trace_log]
declerations = [1 if dec != False else 0 for dec in declerations]
declerations[:20]

[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]

In [417]:
print(len(declerations))

460


# Extracting Prefix 

In [418]:
# to extract only traces till the point decleration is rejected
prefix_traces = []
for trace in trace_log:
    for i,event in enumerate(trace):
        if "Declaration REJECTED" in event['event']:
            break
    prefix_traces.append(Trace(trace[:i], attributes = trace.attributes))
prefix_traces = EventLog(prefix_traces)

In [419]:
# check if all good
project_nth(prefix_traces, 3)

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Permit APPROVED by BUDGET OWNER', 'Permit FINAL_APPROVED by SUPERVISOR', 'Start trip', 'End trip', 'Declaration SUBMITTED by EMPLOYEE', 'Declaration APPROVED by ADMINISTRATION', 'Declaration APPROVED by BUDGET OWNER', 'Declaration FINAL_APPROVED by SUPERVISOR', 'Request Payment']


In [420]:
# generate prefixes, note that we need to add the casts to EventLog and Trace to make sure that the result is a PM4Py EventLog object
trace_prefixes = EventLog([Trace(trace[0:t_length], attributes = trace.attributes) for trace in trace_log])

In [421]:
# check the trace length
print([len(trace) for trace in trace_log][0:15])
print([len(trace) for trace in trace_prefixes][0:15])

[10, 10, 10, 12, 10, 10, 10, 13, 10, 12, 12, 10, 10, 10, 13]
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [422]:
# check if all good
project_nth(trace_prefixes, 98)

['Permit SUBMITTED by EMPLOYEE', 'Permit APPROVED by ADMINISTRATION', 'Start trip', 'End trip']


In [423]:
# to check which traces are not of our desired length (these will be padded while preparing the the training data)
for i,trace in enumerate(trace_prefixes):
    if len(trace)!=t_length:
        print(i, len(trace))
        break

In [424]:
# convert logs to dataframe
df = pm4py.convert_to_dataframe(trace_prefixes)
df.head(5)

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource,...,time:timestamp,@@index,@@case_index,start_timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name
0,travel permit 10898,Permit SUBMITTED by EMPLOYEE,2018-09-27 12:35:05,2018-09-27 12:35:05,organizational unit 65458,1970-01-01 00:00:00.000000264+00:00,True,1970-01-01 00:00:00.000000347+00:00,1970-01-01 00:00:00.000000044+00:00,STAFF MEMBER,...,2018-09-27 12:35:05,0,0,2018-09-27 12:35:05,0.0,0.0,0.0,0.0,1.0,travel permit 10898
1,travel permit 10898,Permit APPROVED by ADMINISTRATION,2018-09-27 12:35:10,2018-09-27 12:35:10,organizational unit 65458,1970-01-01 00:00:00.000000264+00:00,True,1970-01-01 00:00:00.000000347+00:00,1970-01-01 00:00:00.000000044+00:00,STAFF MEMBER,...,2018-09-27 12:35:10,1,0,2018-09-27 12:35:10,0.0,5.0,5.0,5.0,0.0,travel permit 10898
2,travel permit 10898,Permit FINAL_APPROVED by SUPERVISOR,2018-09-28 15:45:28,2018-09-28 15:45:28,organizational unit 65458,1970-01-01 00:00:00.000000264+00:00,True,1970-01-01 00:00:00.000000347+00:00,1970-01-01 00:00:00.000000044+00:00,STAFF MEMBER,...,2018-09-28 15:45:28,2,0,2018-09-28 15:45:28,0.0,47423.0,47423.0,47418.0,0.0,travel permit 10898
3,travel permit 10898,Start trip,2018-10-08 00:00:00,2018-10-08 00:00:00,organizational unit 65458,1970-01-01 00:00:00.000000264+00:00,True,1970-01-01 00:00:00.000000347+00:00,1970-01-01 00:00:00.000000044+00:00,STAFF MEMBER,...,2018-10-08 00:00:00,3,0,2018-10-08 00:00:00,0.0,231895.0,231895.0,184472.0,0.0,travel permit 10898
4,travel permit 11220,Permit SUBMITTED by EMPLOYEE,2018-10-04 14:04:31,2018-10-04 14:04:31,organizational unit 65464,1970-01-01 00:00:00.000000264+00:00,False,1970-01-01 00:00:00.000000334+00:00,1969-12-31 23:59:59.999999907+00:00,STAFF MEMBER,...,2018-10-04 14:04:31,10,1,2018-10-04 14:04:31,0.0,0.0,0.0,0.0,1.0,travel permit 11220


## Feature selection/Filtering (Manual)

In [425]:
# convert logs to dataframe
df = pm4py.convert_to_dataframe(trace_prefixes)
df.head(5)

Unnamed: 0,case,event,startTime,completeTime,OrganizationalEntity,TotalDeclared,Overspent,RequestedBudget,OverspentAmount,org:resource,...,time:timestamp,@@index,@@case_index,start_timestamp,@@approx_bh_partial_cycle_time,@@approx_bh_partial_lead_time,@@approx_bh_overall_wasted_time,@@approx_bh_this_wasted_time,@approx_bh_ratio_cycle_lead_time,case:concept:name
0,travel permit 10898,Permit SUBMITTED by EMPLOYEE,2018-09-27 12:35:05,2018-09-27 12:35:05,organizational unit 65458,1970-01-01 00:00:00.000000264+00:00,True,1970-01-01 00:00:00.000000347+00:00,1970-01-01 00:00:00.000000044+00:00,STAFF MEMBER,...,2018-09-27 12:35:05,0,0,2018-09-27 12:35:05,0.0,0.0,0.0,0.0,1.0,travel permit 10898
1,travel permit 10898,Permit APPROVED by ADMINISTRATION,2018-09-27 12:35:10,2018-09-27 12:35:10,organizational unit 65458,1970-01-01 00:00:00.000000264+00:00,True,1970-01-01 00:00:00.000000347+00:00,1970-01-01 00:00:00.000000044+00:00,STAFF MEMBER,...,2018-09-27 12:35:10,1,0,2018-09-27 12:35:10,0.0,5.0,5.0,5.0,0.0,travel permit 10898
2,travel permit 10898,Permit FINAL_APPROVED by SUPERVISOR,2018-09-28 15:45:28,2018-09-28 15:45:28,organizational unit 65458,1970-01-01 00:00:00.000000264+00:00,True,1970-01-01 00:00:00.000000347+00:00,1970-01-01 00:00:00.000000044+00:00,STAFF MEMBER,...,2018-09-28 15:45:28,2,0,2018-09-28 15:45:28,0.0,47423.0,47423.0,47418.0,0.0,travel permit 10898
3,travel permit 10898,Start trip,2018-10-08 00:00:00,2018-10-08 00:00:00,organizational unit 65458,1970-01-01 00:00:00.000000264+00:00,True,1970-01-01 00:00:00.000000347+00:00,1970-01-01 00:00:00.000000044+00:00,STAFF MEMBER,...,2018-10-08 00:00:00,3,0,2018-10-08 00:00:00,0.0,231895.0,231895.0,184472.0,0.0,travel permit 10898
4,travel permit 11220,Permit SUBMITTED by EMPLOYEE,2018-10-04 14:04:31,2018-10-04 14:04:31,organizational unit 65464,1970-01-01 00:00:00.000000264+00:00,False,1970-01-01 00:00:00.000000334+00:00,1969-12-31 23:59:59.999999907+00:00,STAFF MEMBER,...,2018-10-04 14:04:31,10,1,2018-10-04 14:04:31,0.0,0.0,0.0,0.0,1.0,travel permit 11220


In [426]:
df.columns

Index(['case', 'event', 'startTime', 'completeTime', 'OrganizationalEntity',
       'TotalDeclared', 'Overspent', 'RequestedBudget', 'OverspentAmount',
       'org:resource', 'year', 'month', 'act_completionTime', 'startTime_min',
       'completeTime_max', 'processCompletionTime', 'next_act_time_diff',
       'concept:name', 'time:timestamp', '@@index', '@@case_index',
       'start_timestamp', '@@approx_bh_partial_cycle_time',
       '@@approx_bh_partial_lead_time', '@@approx_bh_overall_wasted_time',
       '@@approx_bh_this_wasted_time', '@approx_bh_ratio_cycle_lead_time',
       'case:concept:name'],
      dtype='object')

In [427]:
# passed features we want to extract

# str_ev_attr	String attributes at the event level: these are hot-encoded into features that may assume value 0 or value 1.
# str_tr_attr	String attributes at the trace level: these are hot-encoded into features that may assume value 0 or value 1.
# num_ev_attr	Numeric attributes at the event level: these are encoded by including the last value of the attribute among the events of the trace.
# num_tr_attr	Numeric attributes at trace level: these are encoded by including the numerical value.
# str_evsucc_attr	Successions related to the string attributes values at the event level: for example, if we have a trace [A,B,C], it might be important to include not only the presence of the single values A, B and C as features; but also the presence of the directly-follows couples (A,B) and (B,C).
# ================================================

str_ev_attr = ['concept:name','org:resource']
str_tr_attr = ['OrganizationalEntity','month']
num_ev_attr = ['@@approx_bh_partial_lead_time','@@approx_bh_this_wasted_time']
num_tr_attr = []

## Boolean encoding

- can be easily extended with adding more attributes of traces other then activities, but for now kept it simple and as discussed in course

In [428]:
encode_name = 'boolean_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

In [429]:
data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(ohe_dict[cat_atr][ca])

        # make it a non frequency vector (if count is greater then 1 make it 1)
        for i,num in enumerate(str_ev_vec):
            if num>1:
                str_ev_vec[i]=1

        feature_vec.extend(list(str_ev_vec))

    data.append(feature_vec)

    # data.append(feature_vec)

In [430]:
encode_name = 'boolean_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## Frequency encoding

In [431]:
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

In [432]:
data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes
    for cat_atr in str_ev_attr[:1]:
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        str_ev_vec = np.array([0]*attr_length)

        for ca in group[cat_atr]:
            str_ev_vec  = str_ev_vec + np.array(ohe_dict[cat_atr][ca])

        feature_vec.extend(list(str_ev_vec))

    data.append(feature_vec)

    # data.append(feature_vec)

In [433]:
# save results
encode_name = 'frequency_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## Complex index based encoding - static feature (trace attributes) + n events encoding + event features

In [434]:
encode_name = 'complex_index_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

In [435]:
data = []

for id, group in df.groupby(['case:concept:name']):
    feature_vec = []

    # add categorical and numerical event attributes along with paddings 

    for cat_atr in str_ev_attr:
        str_ev_vec = []
        attr_length = len(list(ohe_dict[cat_atr].values())[0])
        for ca in group[cat_atr]:
            str_ev_vec.append(ohe_dict[cat_atr][ca])
        
        # padding
        str_ev_vec = cat_padding(str_ev_vec, t_length, attr_length)
        feature_vec.extend(str_ev_vec)


    for num_atr in num_ev_attr:
        num_ev_vec = []
        num_ev_vec.append(list(group[num_atr]))

        # padding
        num_ev_vec = num_padding(num_ev_vec, t_length)
        feature_vec.extend(num_ev_vec)



    # add categorical and numerical trace attributes
    for num_t_atr in num_tr_attr:
        feature_vec.extend(group[num_t_atr].iloc[0])


    for cat_t_atr in str_tr_attr:
        feature_vec.extend(ohe_dict[cat_t_atr][group[cat_t_atr].iloc[0]])

    # add vector to data
    data.append(feature_vec)


In [436]:
# check if all vector lengths are same 
vec_len = len(data[0])

for i, d in enumerate(data):
    if len(d)!=vec_len:
        print(i, len(d))

In [437]:
# save results
encode_name = 'complex_index_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

## LSTM encoding

In [438]:
encode_name = 'lstm_encode_'
save_path = save_path_base + encode_name + 'train' +'_trace_len_'+str(t_length)+ '.pickle'
loaded_data = load_data(save_path)
ohe_dict = loaded_data['feature_names']

In [439]:
data = []

for id, group in df.groupby(['case:concept:name']):
    
    feature_vec = []

    for index, row in group.iterrows():

        row_vec = []

        for cat_atr in str_ev_attr:
            row_vec.extend(ohe_dict[cat_atr][row[cat_atr]])

        for num_atr in num_ev_attr:
            row_vec.append(row[num_atr])


        # add categorical and numerical trace attributes
        for num_t_atr in num_tr_attr:
            row_vec.append(group[num_t_atr].iloc[0])

        for cat_t_atr in str_tr_attr:
            row_vec.extend(ohe_dict[cat_t_atr][group[cat_t_atr].iloc[0]])

        feature_vec.append(row_vec)
    

    # add vector to data
    data.append(feature_vec)


In [440]:
# converting to array
data = np.array([np.array(ls) for ls in data])

In [441]:
feature_len = len(data[0][0])
desired_shape = (t_length,feature_len)
desired_shape

(4, 49)

In [442]:
# padding data to make equal shape of vectors
padded_data = []
for case in data:
    pd_case = np.zeros(desired_shape)
    pd_case[:case.shape[0],:case.shape[1]] = case
    padded_data.append(pd_case)

padded_data = np.array(padded_data).shape

In [443]:
# save results
encode_name = 'lstm_encode_'
save_path = save_path_base + encode_name + df_type +'_trace_len_'+str(t_length)+ '.pickle'
save_data(data, declerations, ohe_dict ,save_path)

# =================== END =================



# Extra

In [None]:
# from pm4py.statistics.sojourn_time.log import get as soj_time_get
# soj_time = soj_time_get.apply(trace_log, parameters={soj_time_get.Parameters.TIMESTAMP_KEY: "time:timestamp", soj_time_get.Parameters.START_TIMESTAMP_KEY: "startTime"})
# print(soj_time)

In [None]:
# # to get all trace cases
# for i, trace in enumerate(trace_log):
#     print(trace[0]['case'])

In [None]:
# # throughput time (time to complete traces) all in seconds
# all_case_durations = pm4py.get_all_case_durations(trace_log)
# all_case_durations

In [None]:
# # arrival between cases
# case_arrival_ratio = pm4py.get_case_arrival_average(trace_log)
# case_arrival_ratio/(60*60)

In [None]:
# from pm4py.objects.log.util import interval_lifecycle
# enriched_log = interval_lifecycle.assign_lead_cycle_time(trace_log)
# enriched_log

### boolean bigram encoding

In [None]:
data_2gram, feature_names = log_to_features.apply(trace_prefixes, 
                                                  parameters={"str_ev_attr": [], 
                                                        "str_tr_attr": [], 
                                                        "num_ev_attr": [], 
                                                        "num_tr_attr": [], 
                                                        "str_evsucc_attr": ["concept:name"]})
feature_names

In [None]:
data_2gram = np.asarray(data_2gram)

In [None]:
project_nth(trace_prefixes, 0)

In [None]:
print(data_2gram[0])

In [None]:
train_data = {}
train_data['X']=data_2gram
train_data['y'] = declerations
train_data['feature_names'] = feature_names

In [None]:
# save json
save_path = '../data/training_data/bigram_boolean_encode.json'
with open(save_path, 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

In [None]:
# between_log = pm4py.filter_between(dec_sub_traces, "Permit SUBMITTED by EMPLOYEE", "Declaration SUBMITTED by EMPLOYEE")

In [None]:
# project_nth(between_log, 232)

In [None]:
# we can get any attribute of trace using this
activities = pm4py.get_event_attribute_values(trace_log, "concept:name")
resources = pm4py.get_event_attribute_values(trace_log, "org:resource")
resources

In [None]:
# remove Return Decleration event
# train_raw = pm4py.filter_event_attribute_values(trace_log, "concept:name", "Declaration REJECTED by EMPLOYEE", level = "event", retain=False)

In [None]:
# to display most frequent feature
# # look at the unique vectors and their occurrence frequency/count
# # just to check which trace was most frequent, just analysis
# dist_features = np.unique(data, return_counts= True, axis = 0)
# print(dist_features)


# # display most freuent feature
# most_freq_feature = dist_features[0][np.argmax(dist_features[1])]
# most_freq_feature


# # order doesn't matter here
# for i in range(0, len(most_freq_feature)):
#     if most_freq_feature[i]!=0:
#         print(feature_names[i].split("@")[1], end=' , ')

In [None]:
# # printing all activities in trace 
# for trace in trace_log[0]:
#     print(trace["concept:name"])

In [None]:
# to extract only traces where Declaration SUBMITTED by EMPLOYEE is present
# dec_sub_traces = []
# for trace in trace_log:
#     if len(list(filter(lambda e: e["concept:name"] == "Declaration SUBMITTED by EMPLOYEE" ,trace))) > 0:
#         dec_sub_traces.append(Trace(trace, attributes = trace.attributes))

# dec_sub_traces = EventLog(dec_sub_traces)