# Hyperparameter-Optimized Dynamic BERT Model

In [2]:
import sys

# Make the directory that contains the src folder importable.
# The membership check keeps this cell idempotent: re-running it does not
# add the same entry to sys.path again.
src_parent_dir = '/Users/lars/Documents/test/'
if src_parent_dir not in sys.path:
    sys.path.append(src_parent_dir)

In [3]:
import tensorflow as tf
from transformers import AutoTokenizer, DataCollatorWithPadding, set_seed
from tensorflow.keras.callbacks import Callback # type: ignore
from datasets import Dataset
import os
import pickle
import time
from src.data.data_manager import data_loader
from src.models.baseline_models import MetricsCallbackDynamic, BERTModelBuilderDynamic, sort_by_length, create_buckets_and_batches, create_buckets_and_batches_bert
from src.visualization.evaluation import plot_loss_accuracy_comparison

In [4]:
# Dataset locations: the project root and one sub-folder per processing stage.
# NOTE(review): the sub-folder paths start with "/" and are presumably
# concatenated onto `directory` by the loader - confirm against data_loader.
directory = "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift"
path_raw, path_interim, path_processed = "/data/raw/", "/data/interim/", "/data/processed/"

In [7]:
# Fix the random seed so that results are reproducible across runs
seed = 1234
set_seed(seed=seed)

In [8]:
def load_interim_split(split_name):
    """Load one named split from the interim data folder via data_loader."""
    return data_loader(directory, path_interim, split_name)


# NOTE(review): train/val use the "Long_Helpdesk_" prefix while the test split
# is loaded from "Helpdesk_test" - confirm this difference is intentional.
train_tensor = load_interim_split("Long_Helpdesk_train")
val_tensor = load_interim_split("Long_Helpdesk_val")
test_tensor = load_interim_split("Helpdesk_test")

In [9]:
# Display the training split as loaded, to inspect its columns and size
train_tensor

Unnamed: 0,Prefix_Trace,Next_Activity
0,assign-seriousness,take-in-charge-ticket
1,assign-seriousness take-in-charge-ticket,take-in-charge-ticket
2,assign-seriousness take-in-charge-ticket take-...,resolve-ticket
3,assign-seriousness take-in-charge-ticket take-...,closed
4,assign-seriousness take-in-charge-ticket take-...,end
...,...,...
13170,assign-seriousness,take-in-charge-ticket
13171,assign-seriousness take-in-charge-ticket,wait
13172,assign-seriousness take-in-charge-ticket wait,resolve-ticket
13173,assign-seriousness take-in-charge-ticket wait ...,closed


---
## Checking for Overfitting

### No Hyperparameter Optimization

In [None]:
# Global TensorFlow/Keras runtime configuration for the experiments below.

# Set the environment variable for GPU memory management
# NOTE(review): tensorflow has already been imported when this runs - confirm
# that TF_GPU_ALLOCATOR is still picked up when it is set this late.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

# Enable mixed precision for better performance and reduced memory usage
# NOTE(review): presumably this has to run before the model-building cell so
# that the policy applies to the layers created there - verify.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Clear any existing GPU memory state
tf.keras.backend.clear_session()

# Reduce TensorFlow logging verbosity (only messages at ERROR level and above)
tf.get_logger().setLevel('ERROR')

In [10]:
# Model and batching configuration:
#   num_classes - number of target classes handed to the model builder
#   batch_size  - number of samples per batch
#   max_length  - maximum sequence length
num_classes, batch_size, max_length = 10, 4, 36

In [11]:
# Create the BERT-based classifier through the project's model builder
# and print an overview of its layers
builder_kwargs = {'model_name': 'bert-base-uncased', 'num_classes': num_classes}
model_builder = BERTModelBuilderDynamic(**builder_kwargs)
model = model_builder.create_model()
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, None)]               0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 )                           ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, None,                                        

In [12]:
# Load the tokenizer that belongs to the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Dynamic padding and uniform length batching

# train_tensor and val_tensor are assumed to be pandas DataFrames;
# make sure the text column holds plain strings
train_tensor['Prefix_Trace'] = train_tensor['Prefix_Trace'].astype(str)
val_tensor['Prefix_Trace'] = val_tensor['Prefix_Trace'].astype(str)


def _encode_labels(frame, mapping, split_name):
    """Map the 'Next_Activity' column of `frame` to integer class ids.

    Args:
        frame: DataFrame with a 'Next_Activity' column.
        mapping: dict from label to integer id.
        split_name: name of the split, used only in the error message.

    Returns:
        Series of integer ids aligned with `frame`.

    Raises:
        ValueError: if `frame` contains a label without an entry in `mapping`.
    """
    encoded = frame['Next_Activity'].map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail with the offending labels instead of the opaque NaN-to-int
        # cast error that .astype(int) would raise.
        unknown = sorted(map(str, frame.loc[unmapped, 'Next_Activity'].unique()))
        raise ValueError(
            f"{split_name} data contains labels that do not occur in the training data: {unknown}"
        )
    return encoded.astype(int)


# Convert labels to integers. Ids are assigned in order of first appearance in
# the training data, so every split has to be encoded with this label_map.
# NOTE: the label column is overwritten in place below; re-running this cell
# rebuilds label_map from the already-encoded integers (an identity mapping).
label_map = {label: idx for idx, label in enumerate(train_tensor['Next_Activity'].unique())}

# Label ids run from 0 to len(label_map) - 1 and must fit into num_classes
if len(label_map) > num_classes:
    raise ValueError(
        f"Found {len(label_map)} distinct labels in the training data, "
        f"but num_classes is {num_classes}"
    )

train_tensor['Next_Activity'] = _encode_labels(train_tensor, label_map, "Training")
val_tensor['Next_Activity'] = _encode_labels(val_tensor, label_map, "Validation")

# Convert to Hugging Face datasets
train_data = Dataset.from_pandas(train_tensor)
val_data = Dataset.from_pandas(val_tensor)

# Sort the data by length
sorted_train_data = sort_by_length(train_data, tokenizer, max_length)
sorted_val_data = sort_by_length(val_data, tokenizer, max_length)

# Initialize data collator (pads each batch and returns TensorFlow tensors)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Create TensorFlow datasets and ensure they repeat
tf_train_dataset = create_buckets_and_batches_bert(sorted_train_data, batch_size, data_collator).repeat()
tf_val_dataset = create_buckets_and_batches_bert(sorted_val_data, batch_size, data_collator).repeat()

# Prefetch datasets
tf_train_dataset = tf_train_dataset.prefetch(tf.data.AUTOTUNE)
tf_val_dataset = tf_val_dataset.prefetch(tf.data.AUTOTUNE)

# Calculate steps per epoch based on the length of the dataset.
# Floor division: a trailing partial batch is not counted.
# NOTE(review): assumes the bucketing helper yields batches of `batch_size`
# samples - confirm against create_buckets_and_batches_bert.
train_steps_per_epoch = len(sorted_train_data) // batch_size
val_steps_per_epoch = len(sorted_val_data) // batch_size

# Debugging statements to check the sizes and steps
print(f"Number of training samples: {len(sorted_train_data)}")
print(f"Number of validation samples: {len(sorted_val_data)}")
print(f"Steps per epoch (train): {train_steps_per_epoch}")
print(f"Steps per epoch (val): {val_steps_per_epoch}")

Number of training samples: 13175
Number of validation samples: 2805
Steps per epoch (train): 3293
Steps per epoch (val): 701


In [None]:
# Set callbacks
# NOTE(review): early stopping monitors the training loss ('loss'), not
# 'val_loss', so restore_best_weights restores the weights with the lowest
# training loss - confirm this is intended for the overfitting check.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
metrics_callback = MetricsCallbackDynamic(validation_data=tf_val_dataset, steps_per_epoch=val_steps_per_epoch)

# Measure the elapsed training time with a monotonic clock, which is not
# affected by system clock adjustments during a long run
start_time = time.perf_counter()

# Train the model. Both datasets were built with .repeat(), so the explicit
# step counts determine how many batches each epoch / validation pass uses.
history_helpdesk = model.fit(
    tf_train_dataset,
    epochs=50,  # Increase the number of epochs if necessary
    validation_data=tf_val_dataset,
    steps_per_epoch=train_steps_per_epoch,
    validation_steps=val_steps_per_epoch,
    callbacks=[metrics_callback, early_stopping_callback]
)

end_time = time.perf_counter()

print(f"BERT (base) training time: {end_time - start_time} seconds")

In [None]:
# List of all datasets the models were trained on
datasets = ['helpdesk']

# Plot loss and accuracy for all datasets
# NOTE(review): the meaning of the trailing argument 5 is not visible here -
# check plot_loss_accuracy_comparison and consider naming it.
training_history = history_helpdesk.history
plot_loss_accuracy_comparison(training_history, datasets, 5)