# **Preprocessing of Datasets**

#### Set path variables and import functions

In [1]:
# Append the directory containing the src folder to sys.path so the `src.*` imports resolve.
import os
import sys

# The checkout location defaults to the original absolute path; set the
# ONLINE_PROCESS_CONCEPT_DRIFT_DIR environment variable to run the notebook from another location.
sys.path.append(
    os.environ.get(
        "ONLINE_PROCESS_CONCEPT_DRIFT_DIR",
        "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift",
    )
)

In [2]:
#Path variables for datasets
import os

# Project root. Defaults to the original absolute path; set the
# ONLINE_PROCESS_CONCEPT_DRIFT_DIR environment variable to point at another checkout.
directory = os.environ.get(
    "ONLINE_PROCESS_CONCEPT_DRIFT_DIR",
    "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift",
)

# Dataset sub-folders; they are passed together with `directory` to data_loader / save_event_log.
path_raw = "/data/raw/"
path_interim = "/data/interim/"
path_processed = "/data/processed/"

In [3]:
#Load package and functions to notebook
from src.data.data_manager import data_loader, save_event_log, delete_dataframes
from src.data.data_generator import next_activity
from src.features.preprocessing import outliers, missing_values, normalize_and_lowercase, generate_prefix_traces, early_fusion, create_multiindex, split_data

  from .autonotebook import tqdm as notebook_tqdm


### ----------------------------------------------------------
### Start of preprocessing

**Steps**

1. Order events by timestamp
2. Missing values
3. Outlier detection (IQR)
4. Convert numerical data to str
5. Early fusion approach
6. Tokenization
7. Padding
8. BERT feature encoder
9. Train/Val/Test split

**Load specific dataset into environment**

In [49]:
#Loads dataset and orders it by time and case
# NOTE(review): the names passed via `columns` differ from the column names reported by
# df.info() (time:timestamp, case:concept:name, concept:name, org:resource) — confirm how
# data_loader maps them.
df = data_loader(directory, path_raw, "BPI_Challenge_2018.xes" , columns=["Complete Timestamp", "Case ID", "Activity", "Resource"])
# Summarise column dtypes and memory usage of the loaded event log
df.info()


parsing log, completed traces :: 100%|██████████| 43809/43809 [01:47<00:00, 407.82it/s]


<class 'pandas.core.frame.DataFrame'>
Index: 2514266 entries, 136988 to 317373
Data columns (total 4 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   time:timestamp     object
 1   case:concept:name  object
 2   concept:name       object
 3   org:resource       object
dtypes: object(4)
memory usage: 95.9+ MB


**Clean dataset**

In [63]:
# Display the loaded event log (pandas renders a truncated head/tail preview)
df

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource
136988,2015-04-28 00:00:00,0002505cb62792e4,mail income,0;n/a
136989,2015-05-04 00:00:00,0002505cb62792e4,mail valid,0;n/a
136990,2015-05-04 00:00:00,0002505cb62792e4,mail valid,0;n/a
136991,2015-05-04 00:00:00,0002505cb62792e4,mail valid,0;n/a
136992,2015-06-12 11:50:18,0002505cb62792e4,initialize,Document processing automaton
...,...,...,...,...
317369,2017-04-25 14:06:50,fffadf8d083df3d5,begin editing,f9fe07
317370,2017-04-25 14:06:57,fffadf8d083df3d5,calculate,f9fe07
317371,2017-04-25 14:07:08,fffadf8d083df3d5,remove document,f9fe07
317372,2017-04-25 14:07:14,fffadf8d083df3d5,insert document,f9fe07


In [51]:
# Count the distinct values in the concept:name (activity) column
df['concept:name'].nunique()

41

In [7]:
#Save dataset as pkl file for faster processing
# NOTE(review): in this notebook `df` is loaded from "BPI_Challenge_2018.xes" but stored
# under the name "Helpdesk" — confirm the name is intended before re-running this cell.

save_event_log(directory, path_interim, df, "Helpdesk")

File saved as 2024-06-07_Helpdesk_next_activity.pkl


In [8]:
# Reload the event log from the interim folder under the name "Helpdesk".
# NOTE(review): the raw file loaded in this notebook is BPI_Challenge_2018.xes — confirm that
# "Helpdesk" is the intended dataset name here.
df = data_loader(directory, path_interim, "Helpdesk")

In [74]:
# Display the reloaded event log (pandas renders a truncated head/tail preview)
df

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource
136988,2015-04-28 00:00:00,0002505cb62792e4,mail income,0;n/a
136989,2015-05-04 00:00:00,0002505cb62792e4,mail valid,0;n/a
136990,2015-05-04 00:00:00,0002505cb62792e4,mail valid,0;n/a
136991,2015-05-04 00:00:00,0002505cb62792e4,mail valid,0;n/a
136992,2015-06-12 11:50:18,0002505cb62792e4,initialize,Document processing automaton
...,...,...,...,...
317369,2017-04-25 14:06:50,fffadf8d083df3d5,begin editing,f9fe07
317370,2017-04-25 14:06:57,fffadf8d083df3d5,calculate,f9fe07
317371,2017-04-25 14:07:08,fffadf8d083df3d5,remove document,f9fe07
317372,2017-04-25 14:07:14,fffadf8d083df3d5,insert document,f9fe07


In [38]:
def outliers2(dataframe, case_column):
    """Keep only the cases whose number of events lies in a plausible range.

    The number of rows per distinct value of ``case_column`` is counted and a
    case is retained when that count is greater than 1 and at most
    ``Q3 + 1.5 * IQR`` of the per-case counts. No lower IQR bound is applied.

    Args:
        dataframe: Event log with one row per event.
        case_column: Name of the column that identifies the case of each event.

    Returns:
        The rows of ``dataframe`` that belong to the retained cases.
    """
    # Number of events recorded for every case
    events_per_case = dataframe[case_column].value_counts()

    # Quartiles of the per-case event counts
    first_quartile, third_quartile = (
        events_per_case.quantile(level) for level in (0.25, 0.75)
    )

    # Upper fence of the IQR rule
    max_events = third_quartile + 1.5 * (third_quartile - first_quartile)

    # Cases with more than one event that do not exceed the upper fence
    within_range = (events_per_case > 1) & (events_per_case <= max_events)
    kept_cases = events_per_case.loc[within_range].index

    return dataframe[dataframe[case_column].isin(kept_cases)]

In [75]:
#To get dataframe short
# NOTE(review): the "long" cell in this notebook assigns df_clean as well; whichever of the
# two cells ran last determines the df_clean used by all following steps.

# Remove missing values
df_clean = missing_values(df, "case:concept:name")

# Apply the imported `outliers` helper (src.features.preprocessing), keyed on the case ID column
df_clean = outliers(df_clean, "case:concept:name")

In [53]:
#To get dataframe long
# NOTE(review): the "short" cell in this notebook assigns df_clean as well; whichever of the
# two cells ran last determines the df_clean used by all following steps.

# Remove missing values
df_clean = missing_values(df, "case:concept:name")

#Delete Outliers (IQR)
# Uses the notebook-local outliers2, which keeps cases with more than one event and at most
# Q3 + 1.5 * IQR events.
df_clean = outliers2(df_clean, "case:concept:name")

In [76]:
# Preview the first rows of the cleaned event log
df_clean.head()

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource
136988,2015-04-28 00:00:00,0002505cb62792e4,mail-income,0;n/a
136989,2015-05-04 00:00:00,0002505cb62792e4,mail-valid,0;n/a
136990,2015-05-04 00:00:00,0002505cb62792e4,mail-valid,0;n/a
136991,2015-05-04 00:00:00,0002505cb62792e4,mail-valid,0;n/a
136992,2015-06-12 11:50:18,0002505cb62792e4,initialize,Document processing automaton


In [77]:
import numpy as np

#Displays new quantile ranges for checking
# Trace length = number of events per case. groupby(...).size() counts the rows of each case
# directly, which avoids building a Python list of activities per case just to measure it.
trace_length_df_array = (
    df_clean.groupby('case:concept:name')['concept:name'].size().to_numpy()
)

# (label, quantile level) pairs, printed from the highest to the lowest quantile
quantile_levels = [
    ("99.99%", 0.9999),
    ("99%", 0.99),
    ("95%", 0.95),
    ("75%", 0.75),
    ("50%", 0.5),
    ("25%", 0.25),
]

# Leading blank line, as in the original output
print()
for label, level in quantile_levels:
    print("{} quantile: {}".format(label, np.quantile(trace_length_df_array, level)))


99.99% quantile: 79.0
99% quantile: 77.0
95% quantile: 71.0
75% quantile: 55.0
50% quantile: 49.0
25% quantile: 43.0


**Create next activity column**

In [78]:
#Create next activity for each event
# The returned frame carries an additional "next activity" column (visible in the cell output);
# the result is bound to df_clean again.
df_clean = next_activity(df_clean)
# Display the augmented event log
df_clean

Unnamed: 0,case:concept:name,time:timestamp,concept:name,org:resource,next activity
136988,0002505cb62792e4,2015-04-28 00:00:00,mail-income,0;n/a,mail-valid
136989,0002505cb62792e4,2015-05-04 00:00:00,mail-valid,0;n/a,mail-valid
136990,0002505cb62792e4,2015-05-04 00:00:00,mail-valid,0;n/a,mail-valid
136991,0002505cb62792e4,2015-05-04 00:00:00,mail-valid,0;n/a,initialize
136992,0002505cb62792e4,2015-06-12 11:50:18,initialize,Document processing automaton,begin-editing
...,...,...,...,...,...
317369,fffadf8d083df3d5,2017-04-25 14:06:50,begin-editing,f9fe07,calculate
317370,fffadf8d083df3d5,2017-04-25 14:06:57,calculate,f9fe07,remove-document
317371,fffadf8d083df3d5,2017-04-25 14:07:08,remove-document,f9fe07,insert-document
317372,fffadf8d083df3d5,2017-04-25 14:07:14,insert-document,f9fe07,save


**Train/Test/Val split**
See:

- *Next-Activity Prediction for Non-stationary Processes with Unseen Data Variability*
- *Outcome-Oriented Predictive Process Monitoring: Review and Benchmark*

In [16]:
# Split the cleaned log into train / validation / test sets. This call must run before the
# cells that use train_df, val_df and test_df; with it commented out those names are undefined
# on a fresh kernel (Restart & Run All).
train_df, val_df, test_df = split_data(df_clean)

Train set shape: (1409180, 5)
Validation set shape: (303658, 5)
Test set shape: (302235, 5)


In [17]:
# Turn every entry into a lowercase string, one split after the other
train_df, val_df, test_df = [
    normalize_and_lowercase(split) for split in (train_df, val_df, test_df)
]

# Preview the normalised validation split
val_df.head()

Unnamed: 0,case:concept:name,time:timestamp,concept:name,org:resource,next activity
128850,b30fbe81ce3c33c4,2015-04-28 00:00:00,mail-income,0;n/a,mail-valid
128851,b30fbe81ce3c33c4,2015-05-04 00:00:00,mail-valid,0;n/a,mail-valid
128852,b30fbe81ce3c33c4,2015-05-04 00:00:00,mail-valid,0;n/a,mail-valid
128853,b30fbe81ce3c33c4,2015-05-04 00:00:00,mail-valid,0;n/a,initialize
128854,b30fbe81ce3c33c4,2015-06-12 13:20:59,initialize,document processing automaton,begin-editing


In [79]:
# Apply the same normalisation to the whole cleaned log (no split), kept as df_complete
df_complete = normalize_and_lowercase(df_clean)

**Generate prefix traces**

In [18]:
# Re-index every split with create_multiindex (the prefix output shown later in this notebook
# has a caseID / time index)
train_df, val_df, test_df = [
    create_multiindex(split) for split in (train_df, val_df, test_df)
]

In [80]:
# Re-index the complete (unsplit) log with create_multiindex; rebinds df_complete
df_complete = create_multiindex(df_complete)

In [19]:
# Expand every split into its prefix traces; each name now holds the generated prefixes
# instead of the event-level frame
train_df, val_df, test_df = [
    generate_prefix_traces(split) for split in (train_df, val_df, test_df)
]


In [81]:
# Expand the complete (unsplit) log into its prefix traces; rebinds df_complete
df_complete = generate_prefix_traces(df_complete)

**Create sequentialized prefix traces**

In [20]:
#Create dataframe that contains sequentialized prefix traces and the corresponding next activity
train_sequenzes = early_fusion(train_df)
val_sequenzes = early_fusion(val_df)
test_sequenzes = early_fusion(test_df)

In [82]:
# Fuse the prefix traces of the complete log; df_complete now holds the
# Prefix_Trace / Next_Activity table shown further down
df_complete = early_fusion(df_complete)

In [21]:
# Inspect the entry at index 2 of the generated training prefixes: a
# (prefix DataFrame, next activity) tuple, as the output shows
train_df[2]

(                                     concept:name
 caseID           time                            
 0002505cb62792e4 2015-04-28 00:00:00  mail-income
                  2015-05-04 00:00:00   mail-valid
                  2015-05-04 00:00:00   mail-valid,
 'mail-valid')

In [22]:
# Display the fused validation sequences (columns Prefix_Trace and Next_Activity)
val_sequenzes

Unnamed: 0,Prefix_Trace,Next_Activity
0,mail-income,mail-valid
1,mail-income mail-valid,mail-valid
2,mail-income mail-valid mail-valid,mail-valid
3,mail-income mail-valid mail-valid mail-valid,initialize
4,mail-income mail-valid mail-valid mail-valid i...,begin-editing
...,...,...
303653,mail-income mail-valid initialize begin-editin...,abort-payment
303654,mail-income mail-valid initialize begin-editin...,begin-payment
303655,mail-income mail-valid initialize begin-editin...,insert-document
303656,mail-income mail-valid initialize begin-editin...,finish-payment


In [83]:
# Display the fused sequences of the complete log (columns Prefix_Trace and Next_Activity)
df_complete

Unnamed: 0,Prefix_Trace,Next_Activity
0,mail-income,mail-valid
1,mail-income mail-valid,mail-valid
2,mail-income mail-valid mail-valid,mail-valid
3,mail-income mail-valid mail-valid mail-valid,initialize
4,mail-income mail-valid mail-valid mail-valid i...,begin-editing
...,...,...
1991479,mail-income mail-valid mail-valid mail-valid i...,calculate
1991480,mail-income mail-valid mail-valid mail-valid i...,remove-document
1991481,mail-income mail-valid mail-valid mail-valid i...,insert-document
1991482,mail-income mail-valid mail-valid mail-valid i...,save


In [23]:
# Pair every split with the name it is stored under
named_splits = (
    (train_sequenzes, "Long_BPIC2018_train"),
    (val_sequenzes, "Long_BPIC2018_val"),
    (test_sequenzes, "Long_BPIC2018_test"),
)

# Report the number of distinct next-activity labels in each split
for sequences, _ in named_splits:
    print(sequences['Next_Activity'].nunique())

# Store each split in the interim folder
for sequences, split_name in named_splits:
    save_event_log(directory, path_interim, sequences, split_name)

42
41
41
File saved as 2024-06-02_Long_BPIC2018_train_next_activity.pkl
File saved as 2024-06-02_Long_BPIC2018_val_next_activity.pkl
File saved as 2024-06-02_Long_BPIC2018_test_next_activity.pkl


In [84]:
# Report the number of distinct next-activity labels in the complete set
print(df_complete['Next_Activity'].nunique())

# Store the complete set in the interim folder.
# NOTE(review): the "Short_" prefix matches the "short" cleaning cell; confirm that cell
# produced df_clean before saving under this name.
save_event_log(directory, path_interim, df_complete, "Short_BPIC2018_complete")

42
File saved as 2024-06-07_Short_BPIC2018_complete_next_activity.pkl


In [None]:
# NOTE(review): presumably the number of distinct Next_Activity labels per split for the
# Helpdesk log, recorded by hand from a separate run — confirm.
Long_Helpdesk_train = 10
Long_Helpdesk_val = 8
Long_Helpdesk_test = 8

In [None]:
# NOTE(review): presumably the number of distinct Next_Activity labels per split for the
# BPIC 2012 log, recorded by hand from a separate run — confirm.
Long_BPIC12_train = 24
Long_BPIC12_val = 23
Long_BPIC12_test = 23

In [None]:
# Recorded values for the BPIC 2018 splits; they equal the Next_Activity nunique() counts
# printed for train / val / test earlier in this notebook (42 / 41 / 41).
Long_BPIC18_train = 42
Long_BPIC18_val = 41
Long_BPIC18_test = 41