# **Preprocessing of Datasets**

#### Set path variables and import functions

In [5]:
# Make the directory that contains the `src` folder importable from this notebook.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
import sys

project_root = '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift'

# Guard the append so that re-running this cell does not add duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [6]:
# Path variables for datasets
# NOTE(review): `directory` is a machine-specific absolute path; adjust it when running elsewhere.
directory = "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift"
# Data sub-folders; each is passed together with `directory` to data_loader / save_event_log below.
path_raw = "/data/raw/"
path_interim = "/data/interim/"
path_processed = "/data/processed/"

In [7]:
#Load package and functions to notebook
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
import transformers
print("Transformers version:", transformers.__version__)
from src.data.data_manager import data_loader, save_event_log, delete_dataframes
from src.data.data_generator import next_activity
from src.features.preprocessing import outliers, missing_values, normalize_and_lowercase, generate_prefix_traces, early_fusion, create_multiindex, encoding_and_tokenizing, split_data

TensorFlow version: 2.16.1
Transformers version: 4.40.1


### ----------------------------------------------------------
### Start of preprocessing

**Steps**

1. Order events by timestamp
2. Missing values
3. Outlier detection (IQR)
4. Convert numerical data to str
5. Early fusion approach
6. Tokenization
7. Padding
8. BERT feature encoder
9. Train/Val/Test split

**Load specific dataset into environment**

In [57]:
# Load the raw Helpdesk CSV, restricted to the timestamp, case, activity and resource columns.
# According to the original author's note, data_loader also orders the events by time and case.
# The info() output below lists the columns under the names
# time:timestamp, case:concept:name, concept:name and org:resource.
df = data_loader(directory, path_raw, "Helpdesk_finale.csv" , columns=["Complete Timestamp", "Case ID", "Activity", "Resource"])
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 21348 entries, 0 to 4615
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   time:timestamp     21348 non-null  object
 1   case:concept:name  21348 non-null  object
 2   concept:name       21348 non-null  object
 3   org:resource       21348 non-null  object
dtypes: object(4)
memory usage: 833.9+ KB


**Clean dataset**

In [58]:
# Inspect the freshly loaded event log
df

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource
0,2012-10-09 14:50:17,Case 1,Assign seriousness,Value 1
1,2012-10-09 14:51:01,Case 1,Take in charge ticket,Value 1
2,2012-10-12 15:02:56,Case 1,Take in charge ticket,Value 2
3,2012-10-25 11:54:26,Case 1,Resolve ticket,Value 1
4,2012-11-09 12:54:39,Case 1,Closed,Value 3
...,...,...,...,...
4611,2013-01-04 16:51:50,Case 998,Closed,Value 3
4612,2013-02-12 16:06:37,Case 999,Assign seriousness,Value 1
4613,2013-02-25 11:37:20,Case 999,Take in charge ticket,Value 12
4614,2013-03-14 16:24:30,Case 999,Resolve ticket,Value 12


In [59]:
# Save dataset as pkl file for faster processing.
# The file is written under the interim data folder; the helper prints the resulting file name.

save_event_log(directory, path_interim, df, "Helpdesk")

File saved as 2024-05-13_Helpdesk_next_activity.pkl


In [60]:
# Reload the event log from the interim folder.
# NOTE(review): presumably this picks up the pickle written by the previous cell;
# how data_loader resolves the bare name "Helpdesk" to a file is not visible here (TODO confirm).
df = data_loader(directory, path_interim, "Helpdesk")

In [61]:
# Sanity check: first rows of the reloaded event log
df.head()

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource
0,2012-10-09 14:50:17,Case 1,Assign seriousness,Value 1
1,2012-10-09 14:51:01,Case 1,Take in charge ticket,Value 1
2,2012-10-12 15:02:56,Case 1,Take in charge ticket,Value 2
3,2012-10-25 11:54:26,Case 1,Resolve ticket,Value 1
4,2012-11-09 12:54:39,Case 1,Closed,Value 3


In [62]:
# Remove missing values (second argument is the case identifier column)
df_clean = missing_values(df, "case:concept:name")

# Delete outliers (IQR), again keyed on the case identifier column.
# NOTE(review): in the output of the next cell the activity names are hyphenated
# (e.g. "Assign-seriousness"), so one of these two helpers also rewrites `concept:name`.
df_clean = outliers(df_clean, "case:concept:name")

In [63]:
# Sanity check: first rows of the cleaned event log
df_clean.head()

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource
0,2012-10-09 14:50:17,Case 1,Assign-seriousness,Value 1
1,2012-10-09 14:51:01,Case 1,Take-in-charge-ticket,Value 1
2,2012-10-12 15:02:56,Case 1,Take-in-charge-ticket,Value 2
3,2012-10-25 11:54:26,Case 1,Resolve-ticket,Value 1
4,2012-11-09 12:54:39,Case 1,Closed,Value 3


In [64]:
import numpy as np

# Display the trace-length quantiles after cleaning, as a check on the outlier removal.
# One list of activities per case; the length of each list is the trace length.
grouped_activities_df = df_clean.groupby('case:concept:name')['concept:name'].apply(list)

trace_length_df = [len(activities) for activities in grouped_activities_df]
trace_length_df_array = np.array(trace_length_df)

# Compute all requested quantiles in a single call, then report them from highest to lowest.
q9999, q99, q95, q75, q50, q25 = np.quantile(
    trace_length_df_array, [0.9999, 0.99, 0.95, 0.75, 0.5, 0.25]
)

print("\n99.99% quantile: {}".format(q9999))
print("99% quantile: {}".format(q99))
print("95% quantile: {}".format(q95))
print("75% quantile: {}".format(q75))
print("50% quantile: {}".format(q50))
print("25% quantile: {}".format(q25))


99.99% quantile: 6.0
99% quantile: 6.0
95% quantile: 6.0
75% quantile: 5.0
50% quantile: 4.0
25% quantile: 4.0


**Create next activity column**

In [65]:
# Create next activity for each event.
# In the output below the result has an additional "next activity" column,
# and the last event of a case carries the label "end".
df_clean = next_activity(df_clean)
df_clean

Unnamed: 0,case:concept:name,time:timestamp,concept:name,org:resource,next activity
0,Case 1,2012-10-09 14:50:17,Assign-seriousness,Value 1,Take-in-charge-ticket
1,Case 1,2012-10-09 14:51:01,Take-in-charge-ticket,Value 1,Take-in-charge-ticket
2,Case 1,2012-10-12 15:02:56,Take-in-charge-ticket,Value 2,Resolve-ticket
3,Case 1,2012-10-25 11:54:26,Resolve-ticket,Value 1,Closed
4,Case 1,2012-11-09 12:54:39,Closed,Value 3,end
...,...,...,...,...,...
4611,Case 998,2013-01-04 16:51:50,Closed,Value 3,end
4612,Case 999,2013-02-12 16:06:37,Assign-seriousness,Value 1,Take-in-charge-ticket
4613,Case 999,2013-02-25 11:37:20,Take-in-charge-ticket,Value 12,Resolve-ticket
4614,Case 999,2013-03-14 16:24:30,Resolve-ticket,Value 12,Closed


**Train/Test/Val split**
See: 

Next-Activity Prediction for Non-stationary
Processes with Unseen Data Variability

and

Outcome-Oriented Predictive Process Monitoring:
Review and Benchmark

In [66]:
# Split the event log into train / validation / test sets; the helper prints each set's shape.
# NOTE(review): the split criterion and ratios live in src.features.preprocessing.split_data
# and are not visible here (TODO confirm they follow the papers cited above).
train_df, val_df, test_df = split_data(df_clean)

Train set shape: (13175, 5)
Validation set shape: (2805, 5)
Test set shape: (2829, 5)


In [67]:
# Convert the entries of every split to lowercase string values (train, val, test in that order)
train_df, val_df, test_df = map(normalize_and_lowercase, (train_df, val_df, test_df))

# Spot-check the validation split
val_df.head()

Unnamed: 0,case:concept:name,time:timestamp,concept:name,org:resource,next activity
18179,case 3896,2013-09-11 13:04:49,assign-seriousness,value 1,take-in-charge-ticket
18180,case 3896,2013-09-17 10:30:20,take-in-charge-ticket,value 9,wait
18181,case 3896,2013-09-17 13:56:27,wait,value 9,resolve-ticket
18182,case 3896,2013-09-27 13:19:31,resolve-ticket,value 9,closed
18183,case 3896,2013-10-12 13:19:52,closed,value 3,end


**Generate prefix traces**

In [68]:
# Re-index every split with create_multiindex (train, val, test in that order).
# NOTE(review): the names are rebound in place, so this cell should be run once per kernel session.
train_df, val_df, test_df = map(create_multiindex, (train_df, val_df, test_df))

In [69]:
# Inspect the re-indexed training split (index levels shown below: caseID, time)
train_df

Unnamed: 0_level_0,Unnamed: 1_level_0,concept:name,next activity
caseID,time,Unnamed: 2_level_1,Unnamed: 3_level_1
case 1,2012-10-09 14:50:17,assign-seriousness,take-in-charge-ticket
case 1,2012-10-09 14:51:01,take-in-charge-ticket,take-in-charge-ticket
case 1,2012-10-12 15:02:56,take-in-charge-ticket,resolve-ticket
case 1,2012-10-25 11:54:26,resolve-ticket,closed
case 1,2012-11-09 12:54:39,closed,end
...,...,...,...
case 3895,2011-09-16 14:58:43,assign-seriousness,take-in-charge-ticket
case 3895,2011-09-23 13:27:02,take-in-charge-ticket,wait
case 3895,2011-09-23 14:02:48,wait,resolve-ticket
case 3895,2011-09-27 15:47:03,resolve-ticket,closed


In [70]:
# Build the prefix traces for every split (train, val, test in that order).
# NOTE(review): the *_df names are rebound to the helper's result, which is no longer
# a plain DataFrame (see the indexed access two cells below).
train_df, val_df, test_df = map(generate_prefix_traces, (train_df, val_df, test_df))


**Create sequentialized prefix traces**

In [71]:
# Create, per split, a dataframe that contains the sequentialized prefix traces
# and the corresponding next activity
train_sequenzes, val_sequenzes, test_sequenzes = map(early_fusion, (train_df, val_df, test_df))

In [72]:
# Example prefix trace: per the output below, element 2 is a
# (prefix DataFrame, next-activity label) tuple.
train_df[2]

(                                     concept:name
 caseID time                                      
 case 1 2012-10-09 14:50:17     assign-seriousness
        2012-10-09 14:51:01  take-in-charge-ticket
        2012-10-12 15:02:56  take-in-charge-ticket,
 'resolve-ticket')

In [73]:
# Inspect the sequentialized training data (columns shown below: Prefix_Trace, Next_Activity)
train_sequenzes

Unnamed: 0,Prefix_Trace,Next_Activity
0,assign-seriousness,take-in-charge-ticket
1,assign-seriousness take-in-charge-ticket,take-in-charge-ticket
2,assign-seriousness take-in-charge-ticket take-...,resolve-ticket
3,assign-seriousness take-in-charge-ticket take-...,closed
4,assign-seriousness take-in-charge-ticket take-...,end
...,...,...
13170,assign-seriousness,take-in-charge-ticket
13171,assign-seriousness take-in-charge-ticket,wait
13172,assign-seriousness take-in-charge-ticket wait,resolve-ticket
13173,assign-seriousness take-in-charge-ticket wait ...,closed


**Tokenize and encode prefix traces and next activities**

**Creates final dataset made of tensors**

In [74]:
# Tokenize the prefix traces and encode the next activities of every split.
# Afterwards a dataset consisting of tensors is created for each of them.
# NOTE(review): each split is encoded by a separate call; confirm in
# src.features.preprocessing that the label encoding is shared across splits.
train, val, test = (
    encoding_and_tokenizing(sequences, "Prefix_Trace", "Next_Activity")
    for sequences in (train_sequenzes, val_sequenzes, test_sequenzes)
)



In [75]:
# Show the element spec of each tensor dataset, one per line
print(train, val, test, sep="\n")

<_TensorSliceDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>
<_TensorSliceDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>
<_TensorSliceDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>


In [76]:
# Persist the three tensor datasets under the processed data folder, tagged by split name
for split_name, split_dataset in (("train", train), ("val", val), ("test", test)):
    save_event_log(directory, path_processed, split_dataset, "Helpdesk", split_name)



Saved new folder '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-13_ Helpdesk_train_tensor'
File saved as 2024-05-13_ Helpdesk_train_tensor
Saved new folder '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-13_ Helpdesk_val_tensor'
File saved as 2024-05-13_ Helpdesk_val_tensor
Saved new folder '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-13_ Helpdesk_test_tensor'
File saved as 2024-05-13_ Helpdesk_test_tensor


In [77]:
# Load the saved tensor datasets back from the processed data folder as a round-trip check
train_tensor, val_tensor, test_tensor = (
    data_loader(directory, path_processed, dataset_name)
    for dataset_name in ("Helpdesk_train", "Helpdesk_val", "Helpdesk_test")
)

Loading dataset from folder: '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-13_ Helpdesk_train_tensor'
Loading dataset from folder: '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-13_ Helpdesk_val_tensor'
Loading dataset from folder: '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-13_ Helpdesk_test_tensor'


In [78]:
# Inspect the reloaded training dataset; its element spec should match the one printed before saving
train_tensor

<_LoadDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>

## Testing

Train/Val/Test split

In [30]:
def decode_tokenized_sequences(tokenized_sequences, tokenizer):
    """Decode token-id sequences back into their textual form.

    Args:
        tokenized_sequences: Iterable of tokenized sequences; each item is passed
            unchanged to ``tokenizer.decode``.
        tokenizer: Object exposing ``decode(seq, skip_special_tokens=...)``.

    Returns:
        list: One decoded result per input sequence, in input order.
    """
    decoded_sequences = []
    for token_ids in tokenized_sequences:
        # skip_special_tokens=True asks the tokenizer to leave special tokens out of the result
        decoded_sequences.append(tokenizer.decode(token_ids, skip_special_tokens=True))
    return decoded_sequences