In [None]:
import os
import psutil
import numpy as np  # import auxiliary library, typical idiom
import pandas as pd  # import the Pandas library, typical idiom
from pandas import read_csv
import statsmodels.api as sm
import time
import pm4py
from datetime import datetime
from datetime import date
from datetime import datetime
from datetime import timedelta

from numba import jit

from sklearn.linear_model import LinearRegression  # for linear regression
from sklearn import linear_model
from sklearn.cluster import KMeans  # for clustering
from sklearn.tree import DecisionTreeClassifier  # for decision tree mining
from sklearn.metrics import mean_absolute_error, confusion_matrix, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf 
import statsmodels.api as sm
from statsmodels.graphics.gofplots import ProbPlot
from matplotlib import pyplot
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [None]:
# Path of the raw event-log export; the whole file is loaded into `data`.
file_export = 'export2018.csv'
data = read_csv(file_export)

# Pre-processing

Pre-processing
* Visualization
1. Unix time
2. Encoding of categorical features
3. Temporal ordering
4. Additional features:
- Previous event
- Next event
- Day of the week
- Time of day
- Event duration
5. Separate 80-20 
- Visualization
6. Get rid of overlap

In [None]:
# Group the rows of each case together and order them by start time, so the
# neighbouring-row comparisons in the next cells stay within one case.
# NOTE(review): 'startTime' is only parsed with pd.to_datetime in a later cell,
# so this presumably sorts the raw CSV values -- confirm that their format
# sorts chronologically.
data = data.sort_values(['case', 'startTime'])

In [None]:
#Duration
def calculator_nb(case, startTime):
    """Return the completion time of every event.

    An event is considered complete when the next event of the same case
    starts. The last event of a case (including the very last row) has no
    successor and gets its own start time, i.e. a zero duration.

    The function is plain NumPy on purpose: the arrays hold Python objects,
    which numba cannot compile in nopython mode, and the previous loop never
    filled the final row.

    Parameters
    ----------
    case : array-like
        Case identifier per row; rows of one case must be adjacent and in
        chronological order.
    startTime : array-like
        Start time per row, same length as ``case``.

    Returns
    -------
    numpy.ndarray
        Object array with one completion time per row.
    """
    case = np.asarray(case)
    startTime = np.asarray(startTime)
    res = np.empty(len(case), dtype=object)
    if len(case) == 0:
        return res

    # True where row i and row i + 1 belong to the same case.
    same_case_next = case[1:] == case[:-1]
    res[:-1] = np.where(same_case_next, startTime[1:], startTime[:-1])
    res[-1] = startTime[-1]
    return res

data['completeTime'] = calculator_nb(data['case'].values, data['startTime'].values)

# The final row has no successor, so its completion time is its own start time.
# Address it by position instead of a hard-coded row label (previously 317373),
# which is only valid for one particular export.
data.iloc[-1, data.columns.get_loc('completeTime')] = data['startTime'].iloc[-1]

data['startTime'] = pd.to_datetime(data['startTime'])
data['completeTime'] = pd.to_datetime(data['completeTime'])

# Event duration in seconds (float).
duration = (data['completeTime'] - data['startTime']).dt.total_seconds()
data['duration'] = duration

In [None]:
#Next event
def calculator_nb(case, event):
    """Return, for every row, the event that follows it within the same case.

    Rows without a successor in their case (the last event of each case) are
    left as ``None``.

    Implemented with NumPy slicing instead of a numba-decorated loop: the
    arrays hold Python objects, which numba cannot compile in nopython mode.

    Parameters
    ----------
    case : array-like
        Case identifier per row; rows of one case must be adjacent and ordered.
    event : array-like
        Event name per row, same length as ``case``.

    Returns
    -------
    numpy.ndarray
        Object array holding the next event name or ``None``.
    """
    case = np.asarray(case)
    event = np.asarray(event)
    res = np.empty(len(case), dtype=object)  # object arrays start out as None
    if len(case) < 2:
        return res

    # True where row i and row i + 1 belong to the same case.
    same_case_next = case[1:] == case[:-1]
    res[:-1][same_case_next] = event[1:][same_case_next]
    return res

# Store the successor event of every row in a new column.
case_ids = data['case'].to_numpy()
event_names = data['event'].to_numpy()
data['next_event'] = calculator_nb(case_ids, event_names)

In [None]:
#Previous event
def calculator_nb(case, event):
    """Return, for every row, the event that precedes it within the same case.

    Rows without a predecessor in their case (the first event of each case)
    are left as ``None``.

    Implemented with NumPy slicing instead of a numba-decorated loop: the
    arrays hold Python objects, which numba cannot compile in nopython mode.

    Parameters
    ----------
    case : array-like
        Case identifier per row; rows of one case must be adjacent and ordered.
    event : array-like
        Event name per row, same length as ``case``.

    Returns
    -------
    numpy.ndarray
        Object array holding the previous event name or ``None``.
    """
    case = np.asarray(case)
    event = np.asarray(event)
    res = np.empty(len(case), dtype=object)  # object arrays start out as None
    if len(case) < 2:
        return res

    # True where row i and row i + 1 belong to the same case.
    same_case_next = case[1:] == case[:-1]
    res[1:][same_case_next] = event[:-1][same_case_next]
    return res

# Store the predecessor event of every row in a new column.
case_ids = data['case'].to_numpy()
event_names = data['event'].to_numpy()
data['prev_event'] = calculator_nb(case_ids, event_names)

In [None]:
# Rows without a neighbouring event get the literal label 'None' instead of a
# missing value.
for neighbour_col in ('next_event', 'prev_event'):
    data[neighbour_col] = data[neighbour_col].fillna('None')

In [None]:
#unix time
pd.set_option('display.float_format', lambda x: '%.3f' % x)


def _to_unix_seconds(timestamps):
    """Convert a datetime Series to whole seconds since 1970-01-01 00:00:00 UTC.

    Timezone-aware values are converted to UTC first; naive values are taken
    as UTC. Unlike ``time.mktime`` the result does not depend on the timezone
    or DST rules of the machine running the notebook, and the conversion is
    vectorised instead of one Python call per row.
    """
    if timestamps.dt.tz is not None:
        timestamps = timestamps.dt.tz_convert('UTC').dt.tz_localize(None)
    seconds = (timestamps - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    return seconds.astype(int)


data['startTime'] = pd.to_datetime(data['startTime'], dayfirst=True)
data["UNIX_starttime"] = _to_unix_seconds(data['startTime'])

data['completeTime'] = pd.to_datetime(data['completeTime'], dayfirst=True)
data["UNIX_completeTime"] = _to_unix_seconds(data['completeTime'])

In [None]:
# Day of the week of the event start (pandas convention: Monday=0 ... Sunday=6).
data['weekday'] = data['startTime'].dt.weekday

In [None]:
# Encoders created here are reused by later cells.
label_encoder = LabelEncoder()
ordinal_encoder = OrdinalEncoder()

# Integer code for every event name, stored next to the original column.
event_codes = ordinal_encoder.fit_transform(data[['event']])
data['enc_event'] = event_codes.astype(int)

In [None]:
# Keep the current row labels so the order of events within a process can
# still be recovered after re-sorting.
data['original index'] = data.index

# Sort chronologically. sort_values returns a new frame, so the result has to
# be assigned back -- otherwise the unshuffled train/test split below is not a
# temporal split. A stable sort keeps the existing order of rows that share a
# timestamp.
data = data.sort_values(by="UNIX_starttime", ignore_index=True, kind="mergesort")

In [None]:
#separation
from sklearn.model_selection import train_test_split

# The last 20% of the rows become the test set; rows are not shuffled, so the
# current row order decides which rows are held out.
train, test = train_test_split(data, shuffle=False, test_size=0.2)

In [None]:
#removing overlap - if case is in both datasets, remove

# Distinct case identifiers on each side of the split.
train_cases = train['case'].drop_duplicates().tolist()
test_cases = test['case'].drop_duplicates().tolist()

# Cases that have events in both the training and the test set.
shared_cases = set(train_cases).intersection(test_cases)
intersect_list = list(shared_cases)

In [None]:
# Remove every case that occurs on both sides of the split (works for any
# number of overlapping cases).
#
# This must happen BEFORE the label encoding below: intersect_list holds the
# original case identifiers, which no longer match once the 'case' column has
# been replaced by integer codes.
overlapping_cases = set(intersect_list)
train = train[~train['case'].isin(overlapping_cases)]
test = test[~test['case'].isin(overlapping_cases)]

# Un-encoded copies with the same rows and index labels as train/test.
org_train = train.copy()
org_test = test.copy()

# Label-encode every column. Train and test are encoded together so that the
# same value receives the same integer code in both frames.
# NOTE(review): this also replaces numeric columns (e.g. 'duration') by rank
# codes; use org_train / org_test wherever real values are needed.
df_ordinal_encoder = LabelEncoder()
n_train_rows = len(train)
encoded = pd.concat([train, test]).apply(df_ordinal_encoder.fit_transform)
train = encoded.iloc[:n_train_rows]
test = encoded.iloc[n_train_rows:]

# Duration prediction: every column except the target.
X_train_time = train.drop(columns='duration')
Y_train_time = train["duration"]
X_test_time = test.drop(columns='duration')
Y_test_time = test["duration"]

# Next-event prediction: the target is the column that is dropped from X.
X_train_event = train.drop(columns=["next_event"])
Y_train_event = train["next_event"]
X_test_event = test.drop(columns=['next_event'])
Y_test_event = test["next_event"]

In [None]:
#separation visualisation

# One point per event: start time on the x-axis, case on the y-axis, coloured
# by the encoded event type.
g = sns.scatterplot(
    data=data,
    x="UNIX_starttime",
    y="case",
    hue="enc_event",
    legend=False,
    palette='colorblind',
)

#add lines for separation - horizontal and vertical

In [None]:
# Display the column labels of the training frame.
train.columns

Index(['case', 'event', 'startTime', 'completeTime', 'penalty_JLP1',
       'penalty_JLP3', 'number_parcels', 'penalty_JLP2', 'penalty_JLP5',
       'year', 'penalty_JLP7', 'penalty_JLP6', 'redistribution',
       'amount_applied1', 'amount_applied0', 'amount_applied3',
       'amount_applied2', 'identity:id', 'penalty_V5', 'payment_actual0',
       'payment_actual2', 'payment_actual1', 'penalty_B5F', 'payment_actual3',
       'penalty_B16', 'penalty_GP1', 'basic payment', 'penalty_AGP', 'area',
       'selected_manually', 'penalty_B3', 'penalty_B2', 'selected_risk',
       'penalty_B5', 'penalty_AVBP', 'penalty_B4', 'penalty_B6', 'penalty_ABP',
       'penalty_AVGP', 'penalty_C4', 'greening', 'rejected',
       'cross_compliance', 'penalty_C9', 'penalty_AVJLP', 'penalty_CC',
       'penalty_AVUVP', 'penalty_BGK', 'penalty_C16', 'penalty_BGP',
       'department', 'small farmer', 'risk_factor', 'applicant',
       'penalty_AUVP', 'penalty_amount2', 'penalty_BGKV', 'penalty_amount3',
  

# Feature prediction for time and event based on KBest(z-scores)
Note: do not run this section unless you need to — it takes a significant amount of time.

In [None]:
from sklearn.feature_selection import SelectKBest

# The support mask returned by get_support() has one entry per column of the
# matrix that was fitted, so the selected names must be read from that matrix
# (X_train_time / X_train_event). Indexing train.columns with it shifts every
# name after the dropped target column; that is why the previously recorded
# output listed the dropped target itself.
# NOTE(review): SelectKBest uses its default score function here; for the
# numeric duration target a regression score is presumably the better fit --
# confirm.

# Ten best features for the duration target.
select = SelectKBest(k=10)
z = select.fit_transform(X_train_time, Y_train_time)
time_support = select.get_support()
print(X_train_time.columns[time_support].to_numpy())

# Ten best features for the event target.
select = SelectKBest(k=10)
z = select.fit_transform(X_train_event, Y_train_event)
event_support = select.get_support()
print(X_train_event.columns[event_support].to_numpy())

# Naive Baseline Models

In [None]:
# Naive event (needs restructuring)
# Independent copy of the test frame, so the helper columns added for the
# baseline do not end up in `test`.
data_baseline = test.copy(deep=True)

def calculator_pos(case):
    """Return the 1-based position of every event within its case.

    The n-th row seen for a given case identifier gets position n. Rows of a
    case do not have to be adjacent.

    Compared with the previous loop this no longer compares the first row with
    the last one (index -1 wrap-around) and no longer assumes that the final
    row continues the preceding case.

    Parameters
    ----------
    case : array-like
        Case identifier per row.

    Returns
    -------
    numpy.ndarray
        Object array of Python ints, one position per row.
    """
    res = np.empty(len(case), dtype=object)
    events_seen = {}
    for idx, case_id in enumerate(case):
        position = events_seen.get(case_id, 0) + 1
        events_seen[case_id] = position
        res[idx] = position
    return res

data_baseline["pos"] = calculator_pos(data_baseline['case'].values)

# Work with the un-encoded event names so that predictions and ground truth
# live in the same label space. org_train / org_test share their index labels
# with train / test, so .loc selects exactly the matching rows.
raw_train = org_train.loc[train.index]
raw_test = org_test.loc[data_baseline.index]

# Learn the baseline on the training data only: the most frequent event name
# at each trace position. (Grouping by position AND event and then dropping
# duplicates would keep the smallest event code, not the most frequent one.)
train_pos = pd.Series(calculator_pos(train['case'].values), index=train.index).astype(int)
event_by_pos = (
    raw_train['event'].astype(str)
    .groupby(train_pos)
    .agg(lambda names: names.mode().iloc[0])
)
ptenum = event_by_pos.to_dict()

# The predicted next event is the most frequent event at the following
# position; positions never seen in training predict the end marker 'None'.
next_pos = data_baseline['pos'].astype(int) + 1
data_baseline['predicted_event'] = next_pos.map(ptenum).fillna('None')
data_baseline_final = data_baseline

next_task = raw_test['next_event'].astype(str).tolist()
predicted_event = data_baseline_final['predicted_event'].astype(str).tolist()

naive_event_accuracy = accuracy_score(next_task, predicted_event)
print(naive_event_accuracy)

# assign() builds a new frame instead of writing into a filtered slice.
test = test.assign(naive_event=predicted_event)

In [None]:
# Naive time
# Use the un-encoded copies: the encoded frames hold integer codes instead of
# seconds in 'duration', and event names are the only key guaranteed to mean
# the same thing in train and test. org_train / org_test share their index
# labels with train / test.
raw_train = org_train.loc[train.index]
raw_test = org_test.loc[test.index]

# Number of events and total duration (seconds) per event type in training.
durations_by_event = raw_train.groupby("event")['duration']
events_count = durations_by_event.count()
event_duration_sum = durations_by_event.sum()

# Average duration per event type: the "model" that is mapped onto the test rows.
duration_per_event = event_duration_sum / events_count

# assign() builds a new frame instead of writing into a filtered slice.
test = test.assign(naive_time=raw_test['event'].map(duration_per_event))

# Random Forest

In [None]:
# Random forest event
SAMPLE_SIZE = 1000  # number of leading rows used for this quick experiment

DT = DecisionTreeClassifier()
RF = RandomForestClassifier()

DT_fit = DT.fit(X_train_event.iloc[:SAMPLE_SIZE], Y_train_event.iloc[:SAMPLE_SIZE])
# scikit-learn estimators need a 2-D feature matrix: select the single 'event'
# feature as a one-column DataFrame ([['event']]) rather than a 1-D Series.
RF_fit = RF.fit(X_train_event[['event']].iloc[:SAMPLE_SIZE], Y_train_event.iloc[:SAMPLE_SIZE])

DT_pred = DT_fit.predict(X_test_event.iloc[:SAMPLE_SIZE])
RF_pred = RF_fit.predict(X_test_event[['event']].iloc[:SAMPLE_SIZE])

# LSTM

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [None]:
# Second name for the training frame (same object, not a copy); the following
# cells read their features and targets through it.
listVal = train
listVal.columns

Index(['case', 'event', 'startTime', 'completeTime', 'penalty_JLP1',
       'penalty_JLP3', 'number_parcels', 'penalty_JLP2', 'penalty_JLP5',
       'year', 'penalty_JLP7', 'penalty_JLP6', 'redistribution',
       'amount_applied1', 'amount_applied0', 'amount_applied3',
       'amount_applied2', 'identity:id', 'penalty_V5', 'payment_actual0',
       'payment_actual2', 'payment_actual1', 'penalty_B5F', 'payment_actual3',
       'penalty_B16', 'penalty_GP1', 'basic payment', 'penalty_AGP', 'area',
       'selected_manually', 'penalty_B3', 'penalty_B2', 'selected_risk',
       'penalty_B5', 'penalty_AVBP', 'penalty_B4', 'penalty_B6', 'penalty_ABP',
       'penalty_AVGP', 'penalty_C4', 'greening', 'rejected',
       'cross_compliance', 'penalty_C9', 'penalty_AVJLP', 'penalty_CC',
       'penalty_AVUVP', 'penalty_BGK', 'penalty_C16', 'penalty_BGP',
       'department', 'small farmer', 'risk_factor', 'applicant',
       'penalty_AUVP', 'penalty_amount2', 'penalty_BGKV', 'penalty_amount3',
  

In [None]:
columnNames = ['case', 'event', 'UNIX_starttime', 'UNIX_completeTime', 'weekday'] # chose better features
# NOTE(review): 'UNIX_completeTime' together with 'UNIX_starttime' presumably
# reveals the duration that is being predicted -- confirm before relying on
# the reported error.

# Model inputs: the selected (encoded) feature columns as plain arrays.
listValSelected = listVal[columnNames].values
listValSelected_prediction = train[columnNames].values

# Targets in seconds, read from the un-encoded copy and aligned row by row with
# the inputs through the shared index labels. Taking 'duration' from the
# encoded frame would train on integer codes while evaluating in seconds.
listValDuration = org_train.loc[listVal.index, 'duration'].values
listValDuration_prediction = org_train.loc[train.index, 'duration'].values

In [None]:
# Every row is presented to the LSTM as a short sequence: one time step per
# selected column, with a single feature value per step.
n_steps = listValSelected.shape[1]
n_features = 1
X = listValSelected.reshape((listValSelected.shape[0], n_steps, n_features)).astype('float32')
y = listValDuration.astype('float32')

# define model
model = Sequential()
model.add(LSTM(50, input_shape=(n_steps, n_features), return_sequences=False, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(50, activation='relu'))
# A single linear output unit: the target is one duration per row. Without it
# the network emits 50 values per row and the flattened predictions no longer
# line up with the targets.
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# `workers` only applies to generator input and is rejected by newer Keras
# releases, so it is not passed for in-memory arrays.
model.fit(X, y, epochs=1, verbose=1)

# demonstrate prediction
x_input = listValSelected_prediction
x_input = x_input.reshape((x_input.shape[0], n_steps, n_features))



In [None]:
# Predict for every row of x_input; the inputs are cast to float32 first.
yhat = model.predict(x_input.astype('float32'), verbose=2)
# Number of reference durations; used to trim the flattened predictions when scoring.
lenVal_len = len(listValDuration_prediction) 


62857/62857 - 88s - 88s/epoch - 1ms/step


AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [None]:

# Mean absolute error between the reference durations and the flattened
# predictions, trimmed to the number of reference values.
print(mean_absolute_error(listValDuration_prediction, yhat.flatten()[:lenVal_len])) # an earlier run printed 504639.3534027153
# test_df["duration_prediction"] = yhat.flatten()[:len(test_df)]

504639.3534027153


# Neural Network

In [None]:
def normalize(df_name, col_name):
    """Log-scale and standardise one column of a DataFrame.

    Zeros are replaced by 0.01 before taking log10; the logged values are then
    centred on their mean and divided by their standard deviation plus 0.01
    (which keeps the division defined for constant columns).

    Returns a NumPy array with one value per row.
    """
    raw_values = df_name[col_name].to_numpy()
    nonzero_values = np.where(raw_values == 0, 0.01, raw_values)
    logged = np.log10(nonzero_values)

    epsilon = 0.01
    centred = logged - logged.mean()
    return centred / (logged.std() + epsilon)

In [None]:
def prepfeatures(df_name):
    """Build the feature matrix for the next-event network.

    Columns, in order: ``enc_event``, normalised ``duration``, normalised
    ``UNIX_starttime``, ordinal code of ``prev_event``, ``weekday``.

    The matrix is assembled with a single ``np.column_stack`` call instead of
    growing one small array per row with ``np.append``.

    NOTE(review): the shared ``ordinal_encoder`` is re-fitted on every call, so
    the ``prev_event`` codes of two different frames are only comparable when
    both contain the same set of values.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame providing the five source columns named above.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df_name), 5)``.
    """
    event = df_name['enc_event'].to_numpy()
    duration = normalize(df_name, 'duration')
    startTime = normalize(df_name, 'UNIX_starttime')
    weekday = df_name['weekday'].to_numpy()

    prev_event = df_name['prev_event'].to_numpy().reshape(-1, 1)
    prev_event = ordinal_encoder.fit_transform(prev_event).ravel()

    features = np.column_stack([event, duration, startTime, prev_event, weekday])
    return features.astype(float)

In [None]:
def preplabels(df_name):
    """Return the label-encoded ``next_event`` column as an ``(n, 1)`` array.

    The shared ``label_encoder`` is fitted on the given frame as part of the
    call.
    """
    next_events = df_name['next_event'].to_numpy()
    encoded = label_encoder.fit_transform(next_events)
    column_vector = encoded.reshape(-1, 1)
    return np.array(column_vector)

In [None]:
# Network inputs and targets built from the training frame.
features, labels = prepfeatures(train), preplabels(train)

In [None]:
# The layers below are addressed through the `keras` namespace, which the
# earlier cells never import (they only import individual names from it).
import keras

# One output unit per class; labels are integer codes 0..max, so the largest
# training label determines how many units are needed.
n_classes = int(labels.max()) + 1

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(5,)),
    keras.layers.Dense(10, activation='softplus'),
    keras.layers.Dense(15, activation='softplus'),
    keras.layers.Dense(20, activation='softplus'),
    keras.layers.Dropout(1/20),
    keras.layers.Dense(25, activation='softplus'),
    keras.layers.Dense(30, activation='softplus'),
    keras.layers.Dropout(1/30),
    keras.layers.Dense(35, activation='softplus'),
    # softmax turns the final layer into a probability distribution over the
    # classes, which is what sparse_categorical_crossentropy expects by default.
    keras.layers.Dense(n_classes, activation='softmax')
])

model.compile(optimizer='Adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [None]:
# Train the next-event network for five epochs on the training features/labels.
model.fit(features,labels,epochs=5,verbose=1)

In [None]:
# Network inputs and targets built from the test frame.
features_test, labels_test = prepfeatures(test), preplabels(test)

In [None]:
# model.predict returns one score per class for every row (a 2-D array), which
# cannot be stored in a single column. Keep the index of the highest-scoring
# class as the predicted (encoded) next event.
class_scores = model.predict(features_test)
test['neuralnet_event'] = np.argmax(class_scores, axis=1)

# Regression

In [None]:
def prepfeatures_OLS(df_name):
    """Build the feature matrix for the duration regression.

    Columns, in order: start time as seconds since the Unix epoch, ordinal
    code of ``event``, ``payment_actual0``, ``penalty_amount0``,
    ``number_parcels``, ``area``.

    The start time is converted with vectorised arithmetic. The previous
    per-row loop indexed the Series by label (``startTime[i]``), which breaks
    for frames whose index does not run from 0 to n-1 (e.g. a filtered test
    set), and wrote floats back into a datetime Series.

    NOTE(review): the shared ``ordinal_encoder`` is re-fitted on every call, so
    event codes of two different frames are only comparable when both contain
    the same set of events.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame providing ``startTime`` and the five other source columns.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df_name), 6)``.
    """
    start = pd.to_datetime(df_name['startTime'])
    if start.dt.tz is not None:
        # Compare timezone-aware values with the epoch in UTC.
        start = start.dt.tz_convert('UTC').dt.tz_localize(None)
    startTime = ((start - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)).to_numpy()

    event = df_name['event'].to_numpy().reshape(-1, 1)
    event = ordinal_encoder.fit_transform(event).ravel()

    payment_actual0 = df_name['payment_actual0'].to_numpy()
    penalty_amount0 = df_name['penalty_amount0'].to_numpy()
    number_parcels = df_name['number_parcels'].to_numpy()
    area = df_name['area'].to_numpy()

    X = np.column_stack([
        startTime,
        event,
        payment_actual0,
        penalty_amount0,
        number_parcels,
        area,
    ])
    return np.array(X, dtype=float)

In [None]:
def preplabels_OLS(df_name):
    """Return the ``duration`` column of the frame as a float NumPy array."""
    durations = df_name['duration'].to_numpy()
    return durations.astype(float)

In [None]:
# HuberRegressor is not among the notebook's imports, so bring it into scope
# here; without it this cell stops with a NameError.
from sklearn.linear_model import HuberRegressor

# Fit a robust linear regression of duration on the training features.
X = prepfeatures_OLS(train)
y = preplabels_OLS(train)

huber = HuberRegressor().fit(X, y)

X_test = prepfeatures_OLS(test)

# Predicted duration and absolute error per test row; the cell displays the
# mean absolute error.
test['regression_duration'] = huber.predict(X_test)
test['error'] = np.absolute(test['duration'] - test['regression_duration'])
test['error'].mean()