**Preprocessing of Datasets**

In [1]:
# Make the project root importable so that the `src.*` packages resolve.
# The location defaults to the original development checkout and can be
# overridden per machine through the OPCD_PROJECT_ROOT environment variable.
import os
import sys

PROJECT_ROOT = os.environ.get(
    "OPCD_PROJECT_ROOT",
    "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift",
)

# Re-running the cell must not pile up duplicate entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
#Load package and functions to notebook
import numpy as np
import pandas as pd
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
import transformers
print("Transformers version:", transformers.__version__)
from src.data.data_manager import data_loader, save_event_log, delete_dataframes, load_event_log
from src.data.data_generator import next_activity
from src.features.preprocessing import outliers, missing_values, normalize_and_lowercase, generate_prefix_traces, early_fusion

TensorFlow version: 2.16.1
Transformers version: 4.40.1


**Steps**

1. Order events by timestamp
2. Missing values
3. Outlier detection (IQR)
4. Convert numerical data to str
5. Early fusion approach
6. Tokenization
7. Padding
8. BERT feature encoder
9. Train/Val/Test split

**Helpdesk**

In [3]:
# Read the raw Helpdesk event log, restricted to the four columns used downstream.
# Per the original note, the loader also orders the events by time and case.
HELPDESK_CSV_PATH = "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/raw/Helpdesk_finale.csv"
HELPDESK_COLUMNS = ["Complete Timestamp", "Case ID", "Activity", "Resource"]

df_helpdesk = data_loader(HELPDESK_CSV_PATH, columns=HELPDESK_COLUMNS)

# Summarise column dtypes and non-null counts of the loaded frame.
df_helpdesk.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21348 entries, 0 to 4615
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   time:timestamp     21348 non-null  datetime64[ns]
 1   case:concept:name  21348 non-null  object        
 2   concept:name       21348 non-null  object        
 3   org:resource       21348 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 833.9+ KB


In [4]:
# Clean the raw log in two passes, both keyed on the case identifier column:
# the project's missing-value handling first, then its outlier removal
# (IQR-based, per the original notes).
CASE_ID_COLUMN = "case:concept:name"

df_helpdesk_clean = outliers(
    missing_values(df_helpdesk, CASE_ID_COLUMN),
    CASE_ID_COLUMN,
)

In [5]:
# Preview the first five rows of the cleaned event log.
df_helpdesk_clean.head(n=5)

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource
0,2012-10-09 14:50:17,Case 1,Assign-seriousness,Value 1
1,2012-10-09 14:51:01,Case 1,Take-in-charge-ticket,Value 1
2,2012-10-12 15:02:56,Case 1,Take-in-charge-ticket,Value 2
3,2012-10-25 11:54:26,Case 1,Resolve-ticket,Value 1
4,2012-11-09 12:54:39,Case 1,Closed,Value 3


In [6]:
# Create the next-activity label for each event.
# The result is bound back to `df_helpdesk_clean`, so re-running this cell passes
# an already-labelled frame into next_activity again.
# NOTE(review): confirm the helper tolerates that, or bind the result to a new name.
df_helpdesk_clean = next_activity(df_helpdesk_clean)
# The displayed frame shows the added "next activity" column; the last event of a case is labelled "end".
df_helpdesk_clean

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource,next activity
0,2012-10-09 14:50:17,Case 1,Assign-seriousness,Value 1,Take-in-charge-ticket
1,2012-10-09 14:51:01,Case 1,Take-in-charge-ticket,Value 1,Take-in-charge-ticket
2,2012-10-12 15:02:56,Case 1,Take-in-charge-ticket,Value 2,Resolve-ticket
3,2012-10-25 11:54:26,Case 1,Resolve-ticket,Value 1,Closed
4,2012-11-09 12:54:39,Case 1,Closed,Value 3,end
...,...,...,...,...,...
4611,2013-01-04 16:51:50,Case 998,Closed,Value 3,end
4612,2013-02-12 16:06:37,Case 999,Assign-seriousness,Value 1,Take-in-charge-ticket
4613,2013-02-25 11:37:20,Case 999,Take-in-charge-ticket,Value 12,Resolve-ticket
4614,2013-03-14 16:24:30,Case 999,Resolve-ticket,Value 12,Closed


In [7]:
# Lowercase the string entries of the cleaned log via the project helper.
# NOTE(review): the original note also mentions converting entries to strings;
# confirm which columns the helper actually converts.
# NOTE(review): the cell output echoes a `df.applymap(...)` line from the helper, which
# looks like a warning; pandas 2.1+ deprecates DataFrame.applymap in favour of DataFrame.map.

df_helpdesk_converted = normalize_and_lowercase(df_helpdesk_clean)
df_helpdesk_converted.head()

  new_df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource,next activity
0,2012-10-09 14:50:17,case 1,assign-seriousness,value 1,take-in-charge-ticket
1,2012-10-09 14:51:01,case 1,take-in-charge-ticket,value 1,take-in-charge-ticket
2,2012-10-12 15:02:56,case 1,take-in-charge-ticket,value 2,resolve-ticket
3,2012-10-25 11:54:26,case 1,resolve-ticket,value 1,closed
4,2012-11-09 12:54:39,case 1,closed,value 3,end


In [8]:
# Generate prefix traces for each event, grouped by case and ordered by timestamp
# (column names are passed explicitly to the project helper).
# The normalised frame is used as input so that the lowercasing/conversion step
# actually feeds the pipeline instead of being computed and discarded.
prefix_traces = generate_prefix_traces(df_helpdesk_converted, "time:timestamp", "case:concept:name")

In [10]:
# Build a frame with one fused text sequence per prefix trace ("Prefix_Trace")
# and the corresponding label ("Next_Activity"), as shown in the displayed output.
df_helpdesk_sequenzes = early_fusion(prefix_traces)
# NOTE(review): some rows in the displayed output have an empty Next_Activity —
# confirm whether those should carry the "end" label instead.
df_helpdesk_sequenzes

Unnamed: 0,Prefix_Trace,Next_Activity
0,2012-10-09 14:50:17 case 1 assign-seriousness ...,take-in-charge-ticket
1,2012-10-09 14:50:17 2012-10-09 14:51:01 case 1...,take-in-charge-ticket
2,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,resolve-ticket
3,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,closed
4,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,
...,...,...
18804,2012-11-30 11:57:13 2012-11-30 11:57:38 2012-1...,
18805,2013-02-12 16:06:37 case 999 assign-seriousnes...,take-in-charge-ticket
18806,2013-02-12 16:06:37 2013-02-25 11:37:20 case 9...,resolve-ticket
18807,2013-02-12 16:06:37 2013-02-25 11:37:20 2013-0...,closed


In [10]:
# Persist the prefix-trace frame under the dataset name "Helpdesk".
# The recorded output reports a date-stamped "..._Helpdesk_next_activity.pkl" file.
save_event_log(df_helpdesk_sequenzes, "Helpdesk")

2024-05-03_Helpdesk_next_activity.pkl has been saved.


In [3]:
# Drop the in-memory dataset variables through the project helper.
delete_dataframes()

# Load the most current saved "Helpdesk" dataset (per the original note).
# NOTE(review): `df_helpdesk` held the raw event log earlier in the notebook; from here
# on it holds the Prefix_Trace / Next_Activity frame shown in the output.
# NOTE(review): the save step reports a .pkl file, so this presumably unpickles it —
# only load files from a trusted location.
df_helpdesk=load_event_log("Helpdesk")
df_helpdesk.head()

All dataset varibales are deleted
Helpdesk loaded


Unnamed: 0,Prefix_Trace,Next_Activity
0,2012-10-09 14:50:17 case 1 assign-seriousness ...,take-in-charge-ticket
1,2012-10-09 14:50:17 2012-10-09 14:51:01 case 1...,take-in-charge-ticket
2,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,resolve-ticket
3,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,closed
4,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,


In [4]:
# Inspect the fused text of the prefix trace stored under index label 2.
# NOTE(review): the recorded output ends with the next-activity values of every
# event in the prefix, including the current one — confirm that feeding the
# label column into the model input is intended.
df_helpdesk.loc[2, 'Prefix_Trace']

'2012-10-09 14:50:17 2012-10-09 14:51:01 2012-10-12 15:02:56 case 1 case 1 case 1 assign-seriousness take-in-charge-ticket take-in-charge-ticket value 1 value 1 value 2 take-in-charge-ticket take-in-charge-ticket resolve-ticket'

## Testing

In [5]:
from transformers import BertTokenizer

In [6]:
# WordPiece tokenizer matching the uncased BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def _encode_prefix_trace(trace_text):
    """Return the token ids for one fused prefix trace, special tokens included."""
    return tokenizer.encode(trace_text, add_special_tokens=True)


# One list of token ids per row; lengths vary until the padding step.
# NOTE(review): no truncation is applied here — check the longest sequence
# against the encoder's maximum input length before feeding BERT.
tokenized_text = df_helpdesk["Prefix_Trace"].apply(_encode_prefix_trace)



In [7]:
# Distinct target values of the 'Next_Activity' column, in order of first appearance.
unique_labels = df_helpdesk['Next_Activity'].unique()
# Assign each distinct label a consecutive integer id starting at 0.
# NOTE(review): the displayed mapping contains a `None` key next to 'end', i.e.
# missing labels become their own class — confirm that this is intended.
label_map = dict(zip(unique_labels, range(len(unique_labels))))
# Replace every label by its integer id.
encoded_labels = df_helpdesk['Next_Activity'].map(label_map)


In [10]:
# Display the label -> integer id mapping built from 'Next_Activity'.
label_map

{'take-in-charge-ticket': 0,
 'resolve-ticket': 1,
 'closed': 2,
 None: 3,
 'require-upgrade': 4,
 'assign-seriousness': 5,
 'wait': 6,
 'create-sw-anomaly': 7,
 'end': 8,
 'schedule-intervention': 9,
 'resolve-sw-anomaly': 10}

In [11]:
# Length of the longest tokenised trace; every sequence is padded up to it.
max_length = max(map(len, tokenized_text))

# Right-pad (padding='post') the token-id lists into one rectangular integer array;
# the displayed result shows 0 as the fill value.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_text,
    maxlen=max_length,
    padding='post',
)

In [12]:
# Display the padded token-id matrix (one row per prefix trace).
padded_sequences

array([[ 101, 2262, 1011, ...,    0,    0,    0],
       [ 101, 2262, 1011, ...,    0,    0,    0],
       [ 101, 2262, 1011, ...,    0,    0,    0],
       ...,
       [ 101, 2286, 1011, ...,    0,    0,    0],
       [ 101, 2286, 1011, ...,    0,    0,    0],
       [ 101, 2286, 1011, ...,    0,    0,    0]], dtype=int32)