# **Preprocessing of Datasets**

#### Set path variables and import functions

In [1]:
# Append the directory containing the src folder to sys.path
import sys

sys.path.append('/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift')

In [2]:
#Path variables for datasets
directory = "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift"
path_raw = "/data/raw/"
path_interim = "/data/interim/"
path_processed = "/data/processed/"

In [3]:
#Load package and functions to notebook
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
import transformers
print("Transformers version:", transformers.__version__)
from src.data.data_manager import data_loader, save_event_log, delete_dataframes
from src.data.data_generator import next_activity
from src.features.preprocessing import outliers, missing_values, normalize_and_lowercase, generate_prefix_traces, early_fusion, encoding_and_tokenizing

TensorFlow version: 2.16.1
Transformers version: 4.40.1


### ----------------------------------------------------------
### Start of preprocessing

**Steps**

1. Order events by timestamp
2. Missing values
3. Outlier detection (IQR)
4. Convert numerical data to str
5. Earliy fusion approach
6. Tokenization
7. Padding
8. BERT feature encoder
9. Train/Val/Test split

**Load specific dataset into enviroment**

In [4]:
#Loads dataset and orders it by time and case
df_helpdesk = data_loader(directory, path_raw, "Helpdesk_finale.csv" , columns=["Complete Timestamp", "Case ID", "Activity", "Resource"])
df_helpdesk.info()


<class 'pandas.core.frame.DataFrame'>
Index: 21348 entries, 0 to 4615
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   time:timestamp     21348 non-null  datetime64[ns]
 1   case:concept:name  21348 non-null  object        
 2   concept:name       21348 non-null  object        
 3   org:resource       21348 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 833.9+ KB


**Clean dataset**

In [5]:
# Remove missing values
df_helpdesk_clean = missing_values(df_helpdesk, "case:concept:name")

#Delete Outliers (IQR)
df_helpdesk_clean = outliers(df_helpdesk_clean, "case:concept:name")

In [6]:
df_helpdesk_clean.head()

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource
0,2012-10-09 14:50:17,Case 1,Assign-seriousness,Value 1
1,2012-10-09 14:51:01,Case 1,Take-in-charge-ticket,Value 1
2,2012-10-12 15:02:56,Case 1,Take-in-charge-ticket,Value 2
3,2012-10-25 11:54:26,Case 1,Resolve-ticket,Value 1
4,2012-11-09 12:54:39,Case 1,Closed,Value 3


**Create next activity column**

In [7]:
#Create next activity for each event
df_helpdesk_clean = next_activity(df_helpdesk_clean)
df_helpdesk_clean

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource,next activity
0,2012-10-09 14:50:17,Case 1,Assign-seriousness,Value 1,Take-in-charge-ticket
1,2012-10-09 14:51:01,Case 1,Take-in-charge-ticket,Value 1,Take-in-charge-ticket
2,2012-10-12 15:02:56,Case 1,Take-in-charge-ticket,Value 2,Resolve-ticket
3,2012-10-25 11:54:26,Case 1,Resolve-ticket,Value 1,Closed
4,2012-11-09 12:54:39,Case 1,Closed,Value 3,end
...,...,...,...,...,...
4611,2013-01-04 16:51:50,Case 998,Closed,Value 3,end
4612,2013-02-12 16:06:37,Case 999,Assign-seriousness,Value 1,Take-in-charge-ticket
4613,2013-02-25 11:37:20,Case 999,Take-in-charge-ticket,Value 12,Resolve-ticket
4614,2013-03-14 16:24:30,Case 999,Resolve-ticket,Value 12,Closed


In [8]:
#Convert df entries to string values and lowercase df

df_helpdesk_converted = normalize_and_lowercase(df_helpdesk_clean)
df_helpdesk_converted.head()

  new_df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource,next activity
0,2012-10-09 14:50:17,case 1,assign-seriousness,value 1,take-in-charge-ticket
1,2012-10-09 14:51:01,case 1,take-in-charge-ticket,value 1,take-in-charge-ticket
2,2012-10-12 15:02:56,case 1,take-in-charge-ticket,value 2,resolve-ticket
3,2012-10-25 11:54:26,case 1,resolve-ticket,value 1,closed
4,2012-11-09 12:54:39,case 1,closed,value 3,end


**Generate prefix traces**

In [9]:
#Generate prefix traces for each event
prefix_traces = generate_prefix_traces(df_helpdesk_clean, "time:timestamp", "case:concept:name")
prefix_traces

[(       time:timestamp case:concept:name        concept:name org:resource
  0 2012-10-09 14:50:17            Case 1  Assign-seriousness      Value 1,
  'Take-in-charge-ticket'),
 (       time:timestamp case:concept:name           concept:name org:resource
  0 2012-10-09 14:50:17            Case 1     Assign-seriousness      Value 1
  1 2012-10-09 14:51:01            Case 1  Take-in-charge-ticket      Value 1,
  'Take-in-charge-ticket'),
 (       time:timestamp case:concept:name           concept:name org:resource
  0 2012-10-09 14:50:17            Case 1     Assign-seriousness      Value 1
  1 2012-10-09 14:51:01            Case 1  Take-in-charge-ticket      Value 1
  2 2012-10-12 15:02:56            Case 1  Take-in-charge-ticket      Value 2,
  'Resolve-ticket'),
 (       time:timestamp case:concept:name           concept:name org:resource
  0 2012-10-09 14:50:17            Case 1     Assign-seriousness      Value 1
  1 2012-10-09 14:51:01            Case 1  Take-in-charge-ticket    

**Create sequencalized prefix traces**

In [10]:
#Create dataframe that contains sequentialized prefix traces and the corresponding next activity
df_helpdesk_sequenzes = early_fusion(prefix_traces)
df_helpdesk_sequenzes

Unnamed: 0,Prefix_Trace,Next_Activity
0,2012-10-09 14:50:17 case 1 assign-seriousness ...,take-in-charge-ticket
1,2012-10-09 14:50:17 2012-10-09 14:51:01 case 1...,take-in-charge-ticket
2,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,resolve-ticket
3,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,closed
4,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,end
...,...,...
18804,2012-11-30 11:57:13 2012-11-30 11:57:38 2012-1...,end
18805,2013-02-12 16:06:37 case 999 assign-seriousnes...,take-in-charge-ticket
18806,2013-02-12 16:06:37 2013-02-25 11:37:20 case 9...,resolve-ticket
18807,2013-02-12 16:06:37 2013-02-25 11:37:20 2013-0...,closed


In [11]:
df_helpdesk_sequenzes['Prefix_Trace'][4]
df_helpdesk_sequenzes['Next_Activity'][4]

'end'

In [12]:
#Save cleaned dataset with next activity
save_event_log(directory, path_interim, df_helpdesk_sequenzes, "Helpdesk")

File saved as 2024-05-08_Helpdesk_next_activity.pkl


In [13]:
#Delete all dataset variables
delete_dataframes()

#Load most current dataset
df_helpdesk=data_loader(directory, path_interim, "Helpdesk")
df_helpdesk.head()

All dataset varibales are deleted


Unnamed: 0,Prefix_Trace,Next_Activity
0,2012-10-09 14:50:17 case 1 assign-seriousness ...,take-in-charge-ticket
1,2012-10-09 14:50:17 2012-10-09 14:51:01 case 1...,take-in-charge-ticket
2,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,resolve-ticket
3,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,closed
4,2012-10-09 14:50:17 2012-10-09 14:51:01 2012-1...,end


In [14]:
df_helpdesk['Prefix_Trace'][4]
df_helpdesk['Next_Activity'][4]

'end'

**Tokenize and encode prefix traces and next activities**

**Creates final dataset made of tensors**

In [15]:
#Tokenizes prefix traces and encodes next activities
#Afterwars a dataset consisting of tensors is created
test, inputs, labels = encoding_and_tokenizing(df_helpdesk, "Prefix_Trace", "Next_Activity")



In [16]:
test

<_TensorSliceDataset element_spec=(TensorSpec(shape=(122,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [17]:
save_event_log(directory, path_processed, test, "Helpdesk")

Removed existing folder '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-08_ Helpdesk_tensor'
Saved new folder '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-08_ Helpdesk_tensor'
File saved as 2024-05-08_ Helpdesk_tensor


In [18]:
helpdesk_tensor = data_loader(directory, path_processed, "Helpdesk")

Loading dataset from folder: '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-08_ Helpdesk_tensor'


In [19]:
helpdesk_tensor

<_LoadDataset element_spec=(TensorSpec(shape=(122,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [20]:
test

<_TensorSliceDataset element_spec=(TensorSpec(shape=(122,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

## Testing

In [21]:
print(len(helpdesk_tensor))

18809
