# **Preprocessing of Datasets**

#### Set path variables and import functions

In [1]:
# Append the directory containing the src folder to sys.path so that the
# `src.*` imports used later in this notebook can be resolved.
# NOTE(review): this is an absolute, machine-specific path and repeats the
# value assigned to `directory` in the path-variables cell - keep them in sync.
import sys

sys.path.append('/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift')

In [2]:
# Path variables for datasets.
# `directory` is the project root; the `path_*` values are data sub-folders
# that are passed together with `directory` to data_loader / save_event_log.
# NOTE(review): `directory` is an absolute, machine-specific path - adjust it
# before running the notebook on another machine.
directory = "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift"
path_raw = "/data/raw/"
path_interim = "/data/interim/"
path_processed = "/data/processed/"

In [3]:
# Load packages and project functions into the notebook.
# The version prints record which library versions this notebook was run with.
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
import transformers
print("Transformers version:", transformers.__version__)
from src.data.data_manager import data_loader, save_event_log, delete_dataframes
from src.data.data_generator import next_activity
from src.features.preprocessing import outliers, missing_values, normalize_and_lowercase, generate_prefix_traces, early_fusion, encoding_and_tokenizing, split_data

TensorFlow version: 2.16.1
Transformers version: 4.40.1


### ----------------------------------------------------------
### Start of preprocessing

**Steps**

1. Order events by timestamp
2. Missing values
3. Outlier detection (IQR)
4. Convert numerical data to str
5. Early fusion approach
6. Tokenization
7. Padding
8. BERT feature encoder
9. Train/Val/Test split

**Load specific dataset into environment**

In [4]:
# Load the raw Helpdesk event log (ordered by time and case, per the original
# note on data_loader) and print a summary of the resulting frame.
df_helpdesk = data_loader(
    directory,
    path_raw,
    "Helpdesk_finale.csv",
    columns=["Complete Timestamp", "Case ID", "Activity", "Resource"],
)
df_helpdesk.info()


<class 'pandas.core.frame.DataFrame'>
Index: 21348 entries, 0 to 4615
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   time:timestamp     21348 non-null  datetime64[ns]
 1   case:concept:name  21348 non-null  object        
 2   concept:name       21348 non-null  object        
 3   org:resource       21348 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 833.9+ KB


**Clean dataset**

In [5]:
# Clean the event log in two chained steps keyed on the case-id column:
# missing-value removal first, then the IQR-based outlier filter on its result.
df_helpdesk_clean = outliers(
    missing_values(df_helpdesk, "case:concept:name"),
    "case:concept:name",
)

In [6]:
# Preview the first rows of the cleaned event log.
df_helpdesk_clean.head()

Unnamed: 0,time:timestamp,case:concept:name,concept:name,org:resource
0,2012-10-09 14:50:17,Case 1,Assign-seriousness,Value 1
1,2012-10-09 14:51:01,Case 1,Take-in-charge-ticket,Value 1
2,2012-10-12 15:02:56,Case 1,Take-in-charge-ticket,Value 2
3,2012-10-25 11:54:26,Case 1,Resolve-ticket,Value 1
4,2012-11-09 12:54:39,Case 1,Closed,Value 3


**Create next activity column**

In [7]:
# Create next activity for each event (the result carries a "next activity" column).
# NOTE(review): the result is assigned back to the same name, so re-running
# this cell feeds the already-extended frame into next_activity again.
df_helpdesk_clean = next_activity(df_helpdesk_clean)
df_helpdesk_clean

Unnamed: 0,case:concept:name,time:timestamp,concept:name,org:resource,next activity
0,Case 1,2012-10-09 14:50:17,Assign-seriousness,Value 1,Take-in-charge-ticket
1,Case 1,2012-10-09 14:51:01,Take-in-charge-ticket,Value 1,Take-in-charge-ticket
2,Case 1,2012-10-12 15:02:56,Take-in-charge-ticket,Value 2,Resolve-ticket
3,Case 1,2012-10-25 11:54:26,Resolve-ticket,Value 1,Closed
4,Case 1,2012-11-09 12:54:39,Closed,Value 3,end
...,...,...,...,...,...
4611,Case 998,2013-01-04 16:51:50,Closed,Value 3,end
4612,Case 999,2013-02-12 16:06:37,Assign-seriousness,Value 1,Take-in-charge-ticket
4613,Case 999,2013-02-25 11:37:20,Take-in-charge-ticket,Value 12,Resolve-ticket
4614,Case 999,2013-03-14 16:24:30,Resolve-ticket,Value 12,Closed


**Train/Val/Test split**

See:

- *Next-Activity Prediction for Non-stationary Processes with Unseen Data Variability*
- *Outcome-Oriented Predictive Process Monitoring: Review and Benchmark*

In [8]:
# Split the cleaned event log into train, validation and test frames.
train_df, val_df, test_df = split_data(df_helpdesk_clean)

Train set shape: (13175, 5)
Validation set shape: (2805, 5)
Test set shape: (2829, 5)


In [9]:
# Convert the entries of every split to lowercase string values.
# The three splits are processed in train -> val -> test order.
train_df, val_df, test_df = (
    normalize_and_lowercase(split_df) for split_df in (train_df, val_df, test_df)
)

val_df.head()

Unnamed: 0,case:concept:name,time:timestamp,concept:name,org:resource,next activity
18179,case 3896,2013-09-11 13:04:49,assign-seriousness,value 1,take-in-charge-ticket
18180,case 3896,2013-09-17 10:30:20,take-in-charge-ticket,value 9,wait
18181,case 3896,2013-09-17 13:56:27,wait,value 9,resolve-ticket
18182,case 3896,2013-09-27 13:19:31,resolve-ticket,value 9,closed
18183,case 3896,2013-10-12 13:19:52,closed,value 3,end


**Generate prefix traces**

In [10]:
# Build the prefix traces for each split, using the timestamp and case-id
# columns; splits are processed in train -> val -> test order.
train_prefix_traces, val_prefix_traces, test_prefix_traces = (
    generate_prefix_traces(split_df, "time:timestamp", "case:concept:name")
    for split_df in (train_df, val_df, test_df)
)

In [11]:
# Inspect only the first few (prefix DataFrame, next activity) pairs;
# displaying the whole list floods the cell output.
test_prefix_traces[:3]

[(      case:concept:name       time:timestamp        concept:name org:resource
  21015         case 4508  2012-04-04 09:39:57  assign-seriousness      value 9,
  'take-in-charge-ticket'),
 (      case:concept:name       time:timestamp           concept:name  \
  21015         case 4508  2012-04-04 09:39:57     assign-seriousness   
  21016         case 4508  2012-04-04 12:39:31  take-in-charge-ticket   
  
        org:resource  
  21015      value 9  
  21016      value 6  ,
  'resolve-ticket'),
 (      case:concept:name       time:timestamp           concept:name  \
  21015         case 4508  2012-04-04 09:39:57     assign-seriousness   
  21016         case 4508  2012-04-04 12:39:31  take-in-charge-ticket   
  21017         case 4508  2012-04-04 13:06:48         resolve-ticket   
  
        org:resource  
  21015      value 9  
  21016      value 6  
  21017      value 6  ,
  'closed'),
 (      case:concept:name       time:timestamp           concept:name  \
  21015         case 450

**Create sequentialized prefix traces**

In [12]:
# Turn each split's prefix traces into a dataframe of sequentialized prefix
# traces with the corresponding next activity (train -> val -> test order).
train_sequenzes, val_sequenzes, test_sequenzes = (
    early_fusion(prefix_traces)
    for prefix_traces in (train_prefix_traces, val_prefix_traces, test_prefix_traces)
)

In [13]:
# Display the sequentialized training prefix traces and their next-activity labels.
train_sequenzes

Unnamed: 0,Prefix_Trace,Next_Activity
0,case 1 2012-10-09 14:50:17 assign-seriousness ...,take-in-charge-ticket
1,case 1 case 1 2012-10-09 14:50:17 2012-10-09 1...,take-in-charge-ticket
2,case 1 case 1 case 1 2012-10-09 14:50:17 2012-...,resolve-ticket
3,case 1 case 1 case 1 case 1 2012-10-09 14:50:1...,closed
4,case 1 case 1 case 1 case 1 case 1 2012-10-09 ...,end
...,...,...
13170,case 3895 2011-09-16 14:58:43 assign-seriousne...,take-in-charge-ticket
13171,case 3895 case 3895 2011-09-16 14:58:43 2011-0...,wait
13172,case 3895 case 3895 case 3895 2011-09-16 14:58...,resolve-ticket
13173,case 3895 case 3895 case 3895 case 3895 2011-0...,closed


In [14]:
# Save cleaned dataset with next activity (currently disabled).
# NOTE(review): `df_helpdesk_sequenzes` is not assigned in any visible cell.
#save_event_log(directory, path_interim, df_helpdesk_sequenzes, "Helpdesk")

In [15]:
# Delete all dataset variables (currently disabled)
#delete_dataframes()

# Load most current dataset (currently disabled)
#df_helpdesk=data_loader(directory, path_interim, "Helpdesk")
#df_helpdesk.head()

**Tokenize and encode prefix traces and next activities**

**Creates final dataset made of tensors**

In [16]:
# Tokenize the prefix traces and encode the next activities of every split.
# Afterwards each split is available as a dataset consisting of tensors
# (splits are processed in train -> val -> test order).
train_encoded, val_encoded, test_encoded = (
    encoding_and_tokenizing(sequences, "Prefix_Trace", "Next_Activity")
    for sequences in (train_sequenzes, val_sequenzes, test_sequenzes)
)



In [17]:
# Display the element spec of the encoded training dataset.
train_encoded

<_TensorSliceDataset element_spec=(TensorSpec(shape=(122,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [20]:
# Show the mapping from next-activity name to integer class id.
# NOTE(review): `map` is not assigned in any visible cell and the name shadows
# the Python builtin; on a fresh kernel this displays the builtin instead.
map

{'take-in-charge-ticket': 0,
 'resolve-ticket': 1,
 'closed': 2,
 'end': 3,
 'require-upgrade': 4,
 'assign-seriousness': 5,
 'wait': 6,
 'create-sw-anomaly': 7,
 'schedule-intervention': 8,
 'resolve-sw-anomaly': 9}

In [22]:
# Save the processed dataset under the processed-data folder.
# NOTE(review): `df_helpdesk_sequenzes` is not assigned in any visible cell
# (only train/val/test_sequenzes are) - confirm which object should be saved.
save_event_log(directory, path_processed, df_helpdesk_sequenzes, "Helpdesk")

Removed existing folder '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-10_ Helpdesk_tensor'
Saved new folder '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-10_ Helpdesk_tensor'
File saved as 2024-05-10_ Helpdesk_tensor


In [23]:
# Reload the saved "Helpdesk" dataset from the processed-data folder.
helpdesk_tensor = data_loader(directory, path_processed, "Helpdesk")

Loading dataset from folder: '/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift/data/processed/2024-05-10_ Helpdesk_tensor'


## Testing

Train/Val/Test split

In [39]:
def decode_tokenized_sequences(tokenized_sequences, tokenizer):
    """Decode every token-id sequence back into its text form.

    Args:
        tokenized_sequences: Iterable of token-id sequences.
        tokenizer: Object exposing ``decode(seq, skip_special_tokens=...)``.

    Returns:
        list: One decoded value per input sequence, in input order.
    """
    texts = []
    for token_ids in tokenized_sequences:
        # Special tokens are skipped so only the actual content is returned.
        texts.append(tokenizer.decode(token_ids, skip_special_tokens=True))
    return texts

In [59]:
# Decode the token-id sequences of the test dataset back into text.
# Each dataset element is unpacked as (sequence, label); the label is ignored.
# The tokenizer class is accessed through the already-imported `transformers`
# module, because `BertTokenizer` itself is never imported in this notebook.
# NOTE(review): `test_dataset` is not assigned in any visible cell - confirm
# whether `test_encoded` was meant.
decoded_sequences = decode_tokenized_sequences(
    [seq for seq, _ in test_dataset],
    transformers.BertTokenizer.from_pretrained("bert-base-uncased"),
)




In [60]:
# Show the first five decoded sequences as a sanity check.
decoded_sequences[:5]

['case 4509 case 4509 case 4509 case 4509 2011 - 09 - 16 07 : 35 : 57 2011 - 09 - 16 09 : 03 : 22 2011 - 09 - 16 09 : 20 : 11 2011 - 11 - 05 08 : 40 : 22 assign - seriousness take - in - charge - ticket resolve - ticket closed value 6 value 6 value 6 value 5']