In [5]:
import os 
import pm4py
from pm4py.objects.conversion.log import converter as log_converter
import pandas as pd

curr_path = os.path.abspath('')
folder_path = os.path.join(curr_path, 'data')
filepath = os.path.join(folder_path, 'BPI_Challenge_2012.xes')
log = pm4py.read_xes(filepath)
df = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

In [439]:
unique_activities = pd.unique(pd_dataframe['concept:name'])
activity_encoder = dict(zip(unique_activities,range(3,len(unique_activities) + 3)))
activity_encoder['Start'] = 1
activity_encoder['End'] = 2
df['concept:encoded'] = df['concept:name'].apply(lambda x: activity_encoder[x])

In [440]:
import pandas as pd

def process_arrays(df, complete=False, W=False):
    proc_df = df
    if complete:
        proc_df = proc_df[proc_df['lifecycle:transition'] == 'COMPLETE']
    if W:
        proc_df = proc_df[proc_df['concept:name'].str.startswith('W_')]
    
    arrays = proc_df.groupby(['case:concept:name']).agg(list)
    arrays.reset_index(inplace=True)
    arrays['time:interarrival_min'] = arrays['time:timestamp'].apply(lambda x: [0] + [0] + [((x[i+1] - x[i]).total_seconds() / 60) for i in range(len(x)-1)] + [0])
    arrays['concept:encoded'] = arrays['concept:encoded'].apply(lambda x: [1] + x + [2])
    return arrays


In [441]:
arrays = process_arrays(df, complete=True, W=True)

In [442]:
from sklearn.model_selection import train_test_split, KFold
import numpy as np
# sampled_arrays = arrays['case:concept:name'].sample(n= 100000)

kf = KFold(n_splits=3)
splits = []
for train_index, test_index in kf.split(arrays['case:concept:name']):
    id_tr = arrays['case:concept:name'].iloc[train_index]
    id_te = arrays['case:concept:name'].iloc[test_index]
    splits.append((id_tr, id_te))

In [782]:
i = 0
id_train, id_test = splits[i]

In [783]:
from tensorflow import keras
from keras.utils import to_categorical
from tensorflow.keras import layers
import numpy as np
# Many to one + context


activity_decoder = {v:k for k,v in activity_encoder.items()}

def mto_lstm_prep(journey):
    inp = [journey[:i] for i in range(1,len(journey))]
    out = journey[1:]
    return (inp,out)

def make_data(id_indexes, arrays_df):
    X_j = []
    Y_j = []
    X_t = []
    Y_t = []

    selected = arrays_df[arrays_df["case:concept:name"].isin(id_indexes)]

    for index, row in selected.iterrows():
        j_inp, j_out = mto_lstm_prep(row['concept:encoded'])
        t_inp, t_out = mto_lstm_prep(row['time:interarrival_min'])
        X_j.extend(j_inp)
        X_t.extend(t_inp)
        Y_j.extend(j_out)
        Y_t.extend(t_out)
    X_j = keras.preprocessing.sequence.pad_sequences(X_j, padding='pre', maxlen=60)
    X_j = to_categorical(X_j)
    X_t = keras.preprocessing.sequence.pad_sequences(X_t, padding='pre', maxlen=60)
    Y_j = np.asarray(Y_j).astype("float32")
    Y_j = to_categorical(Y_j)
    Y_t = np.asarray(Y_t).astype("float32")
    return (X_j, X_t, Y_j, Y_t)

In [784]:
X_j_train, X_t_train, Y_j_train, Y_t_train = make_data(id_train.values, arrays)
X_j_test, X_t_test, Y_j_test, Y_t_test = make_data(id_test.values, arrays)

In [785]:
X_t_train.shape

(54873, 60)

In [786]:
X_j_train.shape

(54873, 60, 27)

## Only Next Activity Prediction VRNN 

In [787]:
import tensorflow as tf
lstm_dim = 128
timesteps = X_j_train.shape[1]
timefeat_dim = X_j_train.shape[2]
z_dim = 2

class Sample(layers.Layer):
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

input_j = layers.Input(shape=(timesteps,timefeat_dim))
lstm = layers.LSTM(lstm_dim)(input_j)


input_t = layers.Input(shape=(timesteps, 1))
t_lstm = layers.LSTM(32)(input_t)
t_output = layers.Dense(2, activation='relu')(t_lstm)

merged = layers.Concatenate()([lstm, t_lstm])

z_p_means = layers.Dense(z_dim)(merged)
z_p_log_var = layers.Dense(z_dim)(merged)

z_q_means = layers.Dense(z_dim)(merged)
z_q_log_var = layers.Dense(z_dim)(merged)

z = Sample()([z_q_means, z_q_log_var])

encoder = keras.Model([input_j, input_t], [z, z_q_means, z_q_log_var, z_p_means, z_p_log_var, t_output])
encoder.summary()

Model: "model_99"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_127 (InputLayer)          [(None, 60, 27)]     0                                            
__________________________________________________________________________________________________
input_128 (InputLayer)          [(None, 60, 1)]      0                                            
__________________________________________________________________________________________________
lstm_63 (LSTM)                  (None, 128)          79872       input_127[0][0]                  
__________________________________________________________________________________________________
lstm_64 (LSTM)                  (None, 32)           4352        input_128[0][0]                  
___________________________________________________________________________________________

## Activity Decoder

In [788]:
input_z = layers.Input(shape=(z_dim,))
output = layers.Dense(timefeat_dim, activation='softmax')(input_z)
output = layers.Dropout(0.5)(output)

decoder = keras.Model(input_z, output)
decoder.summary()

Model: "model_100"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_129 (InputLayer)       [(None, 2)]               0         
_________________________________________________________________
dense_256 (Dense)            (None, 27)                81        
_________________________________________________________________
dropout_54 (Dropout)         (None, 27)                0         
Total params: 81
Trainable params: 81
Non-trainable params: 0
_________________________________________________________________


## Time Decoder

In [789]:
input_z = layers.Input(shape=(z_dim,))
output = layers.Dense(1, activation="softplus")(input_z)
output = layers.Dropout(0.5)(output)

t_decoder = keras.Model(input_z, output)
t_decoder.summary()

Model: "model_101"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_130 (InputLayer)       [(None, 2)]               0         
_________________________________________________________________
dense_257 (Dense)            (None, 1)                 3         
_________________________________________________________________
dropout_55 (Dropout)         (None, 1)                 0         
Total params: 3
Trainable params: 3
Non-trainable params: 0
_________________________________________________________________


## Vanilla VRNN

In [790]:
import tensorflow.keras.backend as K
from keras.callbacks import Callback

def custom_KL(posterior_means, prior_means, posterior_log_var, prior_log_var):
    # KL of p under q    
    kl = prior_log_var - posterior_log_var + (tf.exp(posterior_log_var) + 
                                       tf.square(posterior_means - prior_means)) / tf.exp(prior_log_var) - 1
    kl = 0.5 * tf.reduce_sum(kl, axis=1)
    return kl

# total number of epochs
n_epochs = 20
# The number of epochs at which KL loss should be included
klstart = 1
# number of epochs over which KL scaling is increased from 0 to 1
kl_annealtime = 4

class AnnealingCallback(Callback):
    def __init__(self, weight):
        self.weight = weight
    def on_epoch_end (self, epoch, logs={}):
        if epoch > klstart :
            new_weight = min(K.get_value(self.weight) + (1./ kl_annealtime), 1.)
            K.set_value(self.weight, new_weight)
        print ("Current KL Weight is " + str(K.get_value(self.weight)))

class RVAE(keras.Model):
    def __init__(self, encoder, decoder, t_decoder, **kwargs):
        super(RVAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.t_decoder = t_decoder
    def compile(self, optimizer, beta):
        super(RVAE, self).compile()
        self.optimizer = optimizer
        self.beta = beta
        
    def train_step(self,data):
        if isinstance(data, tuple):
            journeys, times = data[0]
            output, t_output = data[1]
        
        with tf.GradientTape() as tape:
            z, pos_means, pos_log_var, prior_means, prior_log_var, t_output= self.encoder([journeys, times])
            pred_activity = self.decoder(z)
            pred_time = self.t_decoder(z)
            
            journey_loss = tf.reduce_mean(
                tf.reduce_sum(tf.keras.losses.categorical_crossentropy(output, pred_activity))
            )
            
            time_loss = tf.reduce_mean(
                tf.reduce_sum(tf.keras.losses.mean_absolute_error(t_output, pred_time))
            )
                        
            t_kl_divergence =   tf.reduce_mean(
                custom_KL(pos_means, prior_means, pos_log_var, prior_log_var)
            )
            


            total_loss = journey_loss + time_loss + self.beta * (t_kl_divergence)

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {
            "loss": total_loss,
            "journey_loss":journey_loss,
            "time_loss":time_loss,
            "t_kl_divergence": t_kl_divergence
        }
     
    def test_step(self, data):
        journeys, times = data[0]
        outputs, t_outputs = data[1]
        pred_activity, pred_time = self([journeys, times], training=False)
        journey_loss = tf.keras.losses.categorical_crossentropy(outputs, pred_activity)
        time_loss = tf.keras.losses.mean_absolute_error(t_outputs, pred_time)
        return {
            "journey_loss": journey_loss,
            "time_loss":time_loss,
        }
            
    def call(self, data):
        journeys, times = data
        z, pos_means, pos_log_var, prior_means, prior_log_var, t_output = self.encoder([journeys, times])
        return(self.decoder(z), self.t_decoder(z))

In [791]:
weight = K.variable(0.)
weight._trainable = False
#early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
rvae = RVAE(encoder, decoder, t_decoder)
rvae.compile(optimizer=keras.optimizers.Adam(lr=0.001), beta=weight)
history = rvae.fit([X_j_train, X_t_train], [Y_j_train, Y_t_train], validation_split=0.25, epochs=n_epochs, 
                   batch_size=32, callbacks=[AnnealingCallback(weight)])

Epoch 1/20
Current KL Weight is 0.0
Epoch 2/20
Current KL Weight is 0.0
Epoch 3/20
Current KL Weight is 0.25
Epoch 4/20
Current KL Weight is 0.5
Epoch 5/20
Current KL Weight is 0.75
Epoch 6/20
Current KL Weight is 1.0
Epoch 7/20
Current KL Weight is 1.0
Epoch 8/20
Current KL Weight is 1.0
Epoch 9/20
Current KL Weight is 1.0
Epoch 10/20
Current KL Weight is 1.0
Epoch 11/20
Current KL Weight is 1.0
Epoch 12/20
Current KL Weight is 1.0
Epoch 13/20
Current KL Weight is 1.0
Epoch 14/20
Current KL Weight is 1.0
Epoch 15/20
Current KL Weight is 1.0
Epoch 16/20
Current KL Weight is 1.0
Epoch 17/20
Current KL Weight is 1.0
Epoch 18/20
Current KL Weight is 1.0
Epoch 19/20
Current KL Weight is 1.0
Epoch 20/20
Current KL Weight is 1.0


In [792]:
from sklearn.metrics import f1_score, accuracy_score
preds, pred_times= rvae.predict([X_j_test, X_t_test])
pred_activities = np.argmax(preds, axis=1)
truth_activities = np.argmax(Y_j_test, axis=1)
print("f1:", f1_score(truth_activities, pred_activities, average='weighted'))
print("accuracy:", accuracy_score(truth_activities, pred_activities))

f1: 0.6430657475275449
accuracy: 0.6597543937054196


In [454]:
# Hidden dimension 2
# BPI 12
results1 = [85.11, 85.47, 84.96]

# BPI 12 Complete

results2 = [78.33, 78.77, 76.8]

# BPI12 W
results3  = [83.96, 84.55, 82.99]

# BPI12 W Complete
results4  = [66.09, 66.8, 64.78]

In [675]:
columns = ['BPI 12', 'BPI 12 COMPLETE', 'BPI 12 W', 'BPI 12 W COMPLETE']

In [681]:
res = pd.DataFrame({'BPI 12': results1, 'BPI 12 COMPLETE':results2, 'BPI 12 W':results3, 'BPI 12 W COMPLETE':results4}, index=range(1,4))
res

Unnamed: 0,BPI 12,BPI 12 COMPLETE,BPI 12 W,BPI 12 W COMPLETE
1,85.11,78.33,83.96,66.09
2,85.47,78.77,84.55,66.8
3,84.96,76.8,82.99,64.78


In [680]:
res.agg(['mean'])

Unnamed: 0,BPI 12,BPI 12 COMPLETE,BPI 12 W,BPI 12 W COMPLETE
mean,85.18,77.966667,83.833333,65.89


In [793]:
losses = tf.keras.losses.categorical_crossentropy(Y_j_test, preds) +\
tf.keras.losses.mean_absolute_error(Y_t_test, pred_times)

In [794]:
score = (losses - np.min(losses)) / (np.max(losses) - np.min(losses))

In [795]:
outlier_inds = np.argwhere(score >= np.median(score) +  3 * np.std(score))

In [796]:
prefix = np.argmax(X_j_test[outlier_inds.flatten()], axis=1)
pred_acc = np.argmax(Y_j_test[outlier_inds.flatten()], axis=1)

In [743]:
activity_decoder = {v:k for k,v in activity_encoder.items()}

In [744]:
print([activity_decoder[x] for x in prefix[0] if x >0])

KeyError: 49

In [797]:
targets = np.argmax(Y_j_test, axis=1)

In [798]:
len(np.argwhere(targets == 26))

89

In [799]:
np.sum(pred_acc == 26)

47

In [779]:
len(pred_acc)

720

In [780]:
np.sum(pred_acc == 2)

572

In [750]:
end_inds = np.argwhere(pred_acc == 2)

In [751]:
end_outliers = np.argmax(X_j_test[end_inds.flatten(),:,:], axis=2)

In [752]:
end_outliers

array([[ 0,  0,  0, ...,  0,  0,  1],
       [ 0,  0,  0, ...,  6, 12, 12],
       [ 0,  0,  0, ..., 12, 12, 12],
       ...,
       [ 0,  0,  0, ...,  6, 12, 12],
       [ 0,  0,  0, ..., 12, 12, 12],
       [ 0,  0,  0, ..., 12, 12, 14]])

In [673]:
end_outliers[40]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 23,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6])

In [667]:
pred_acc[20]

12

In [738]:
Y_j_test.shape

(27198, 27)

In [None]:
# first fold 89/27198, second fold 93/28451, 88/26422

In [781]:
activity_encoder

{'A_SUBMITTED': 3,
 'A_PARTLYSUBMITTED': 4,
 'A_PREACCEPTED': 5,
 'W_Completeren aanvraag': 6,
 'A_ACCEPTED': 7,
 'O_SELECTED': 8,
 'A_FINALIZED': 9,
 'O_CREATED': 10,
 'O_SENT': 11,
 'W_Nabellen offertes': 12,
 'O_SENT_BACK': 13,
 'W_Valideren aanvraag': 14,
 'A_REGISTERED': 15,
 'A_APPROVED': 16,
 'O_ACCEPTED': 17,
 'A_ACTIVATED': 18,
 'O_CANCELLED': 19,
 'W_Wijzigen contractgegevens': 20,
 'A_DECLINED': 21,
 'A_CANCELLED': 22,
 'W_Afhandelen leads': 23,
 'O_DECLINED': 24,
 'W_Nabellen incomplete dossiers': 25,
 'W_Beoordelen fraude': 26,
 'Start': 1,
 'End': 2}