In [1]:
import sys

# Extend the import search path (presumably so the `src` package imported below resolves).
# NOTE(review): machine-specific absolute path — consider deriving it from the notebook location.
PROJECT_ROOT = '/Users/lars/Documents/test/'
if PROJECT_ROOT not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(PROJECT_ROOT)

In [2]:
# Dataset locations: one base directory plus the raw / interim / processed
# sub-folder fragments (each fragment keeps its leading and trailing slash).
directory = "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift"
path_raw, path_interim, path_processed = (
    "/data/raw/",
    "/data/interim/",
    "/data/processed/",
)

In [3]:
import tensorflow as tf
from transformers import AutoTokenizer, DataCollatorWithPadding
from tensorflow.keras.callbacks import Callback # type: ignore
from datasets import Dataset
import os
import pickle
import time
from src.data.data_manager import data_loader
from src.models.baseline_models import LongformerModelBuilderDynamic, MetricsCallbackDynamic, BERTModelBuilderDynamic, sort_by_length, create_buckets_and_batches, create_buckets_and_batches_bert

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Report the TensorFlow release the kernel is running against
print(f"TensorFlow version: {tf.__version__}")


TensorFlow version: 2.15.0


In [5]:
# Load the three splits from the interim data folder, in train / val / test order.
# NOTE(review): the test split is named "Helpdesk_test" without the "Long_" prefix
# used by the other two splits — confirm this is the intended file.
train_tensor, val_tensor, test_tensor = (
    data_loader(directory, path_interim, split_name)
    for split_name in ("Long_Helpdesk_train", "Long_Helpdesk_val", "Helpdesk_test")
)

In [6]:
# Show the loaded training split as the cell output
train_tensor

Unnamed: 0,Prefix_Trace,Next_Activity
0,assign-seriousness,take-in-charge-ticket
1,assign-seriousness take-in-charge-ticket,take-in-charge-ticket
2,assign-seriousness take-in-charge-ticket take-...,resolve-ticket
3,assign-seriousness take-in-charge-ticket take-...,closed
4,assign-seriousness take-in-charge-ticket take-...,end
...,...,...
13170,assign-seriousness,take-in-charge-ticket
13171,assign-seriousness take-in-charge-ticket,wait
13172,assign-seriousness take-in-charge-ticket wait,resolve-ticket
13173,assign-seriousness take-in-charge-ticket wait ...,closed


---

## Longformer with dynamic padding and uniform-length batching

In [None]:
# Set the environment variable for GPU memory management
# NOTE(review): TensorFlow is already imported when this cell runs — confirm the
# allocator setting is still picked up at this point.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

# Mixed precision is not enabled in this cell (no dtype policy is set here).
# NOTE(review): the BERT section sets the global 'mixed_float16' policy; if that
# cell ran earlier in the same kernel the policy presumably still applies — confirm.

# Clear any existing GPU memory state
tf.keras.backend.clear_session()

# Reduce TensorFlow logging verbosity
tf.get_logger().setLevel('ERROR')

In [7]:
# Hyperparameters used by the Longformer cells below
batch_size = 4     # batch size for the bucketed datasets and the steps-per-epoch calculation
num_classes = 10   # passed to the model builder
max_length = 36    # passed to create_model() and sort_by_length()

In [8]:
# Instantiate the Longformer model through the project's builder, then print its layer summary
longformer_builder_args = {
    'model_name': 'allenai/longformer-base-4096',
    'num_classes': num_classes,
}
model_builder = LongformerModelBuilderDynamic(**longformer_builder_args)
model = model_builder.create_model(max_length=max_length)
model.summary()

Some layers from the model checkpoint at allenai/longformer-base-4096 were not used when initializing TFLongformerModel: ['lm_head']
- This IS expected if you are initializing TFLongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFLongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFLongformerModel were initialized from the model checkpoint at allenai/longformer-base-4096.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, None)]               0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 tf_longformer_model (TFLon  TFLongformerBaseModelOutpu   1486594   ['input_ids[0][0]',           
 gformerModel)               tWithPooling(last_hidden_s   56         'attention_mask[0][0]']      
                             tate=(None, None, 768),                                          

In [9]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')

# Assuming train_tensor and val_tensor are pandas dataframes
train_tensor['Prefix_Trace'] = train_tensor['Prefix_Trace'].astype(str)
val_tensor['Prefix_Trace'] = val_tensor['Prefix_Trace'].astype(str)

# Convert labels to integers; ids follow the order of first appearance in the training split
label_map = {label: idx for idx, label in enumerate(train_tensor['Next_Activity'].unique())}

# Map both splits BEFORE writing anything back: a validation label that never occurs
# in the training split maps to NaN, and failing only after train_tensor has been
# overwritten would leave the two frames in an inconsistent, half-converted state.
train_labels = train_tensor['Next_Activity'].map(label_map)
val_labels = val_tensor['Next_Activity'].map(label_map)
unseen_val_mask = val_labels.isna()
if unseen_val_mask.any():
    unseen_labels = sorted(val_tensor.loc[unseen_val_mask, 'Next_Activity'].astype(str).unique())
    raise ValueError(f"Validation labels missing from the training split: {unseen_labels}")
train_tensor['Next_Activity'] = train_labels.astype(int)
val_tensor['Next_Activity'] = val_labels.astype(int)

# Convert to Hugging Face datasets
train_data = Dataset.from_pandas(train_tensor)
val_data = Dataset.from_pandas(val_tensor)

# Sort the data by length
sorted_train_data = sort_by_length(train_data, tokenizer, max_length)
sorted_val_data = sort_by_length(val_data, tokenizer, max_length)

# Initialize data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Create TensorFlow datasets and ensure they repeat
tf_train_dataset = create_buckets_and_batches(sorted_train_data, batch_size, data_collator).repeat()
tf_val_dataset = create_buckets_and_batches(sorted_val_data, batch_size, data_collator).repeat()

# Prefetch datasets
tf_train_dataset = tf_train_dataset.prefetch(tf.data.AUTOTUNE)
tf_val_dataset = tf_val_dataset.prefetch(tf.data.AUTOTUNE)

# Calculate steps per epoch (integer division: a trailing partial batch is not counted)
train_steps_per_epoch = len(sorted_train_data) // batch_size
val_steps_per_epoch = len(sorted_val_data) // batch_size

# Debugging statements to check the sizes and steps
print(f"Number of training samples: {len(sorted_train_data)}")
print(f"Number of validation samples: {len(sorted_val_data)}")
print(f"Steps per epoch (train): {train_steps_per_epoch}")
print(f"Steps per epoch (val): {val_steps_per_epoch}")

Number of training samples: 13175
Number of validation samples: 2805
Steps per epoch (train): 3293
Steps per epoch (val): 701


In [None]:
# Function to inspect batches
def inspect_batches(dataset, num_batches=5):
    """Print shape diagnostics for up to ``num_batches`` batches of ``dataset``.

    Args:
        dataset: Iterable of batches. Each batch is indexed as ``batch[0]`` for a
            mapping holding ``'input_ids'`` and ``'attention_mask'`` and as
            ``batch[1]`` for the labels.
        num_batches: Maximum number of batches to report (default 5).

    Returns:
        None. The report is written with ``print``.
    """
    # zip() ends as soon as either side is exhausted, so a dataset holding fewer
    # than ``num_batches`` batches is reported in full instead of raising
    # StopIteration; range() comes first so no extra batch is pulled.
    for batch_number, batch in zip(range(1, num_batches + 1), dataset):
        features, labels = batch[0], batch[1]
        input_ids = features['input_ids']
        attention_mask = features['attention_mask']
        print(f"Batch {batch_number}:")
        print(f"Input IDs shape: {input_ids.shape}")
        print(f"Attention mask shape: {attention_mask.shape}")
        print(f"Labels shape: {labels.shape}")
        # Sum the mask along axis 1, then take the maximum over the batch
        longest = tf.reduce_max(tf.reduce_sum(attention_mask, axis=1)).numpy()
        print(f"Max sequence length in batch: {longest}")
        print("-" * 40)

# Inspect a few batches from the training dataset
# (tf_train_dataset was built with .repeat() in the data-preparation cell)
inspect_batches(tf_train_dataset)

In [None]:
# Callbacks: early stopping first, then the project's validation-metrics callback.
# NOTE(review): early stopping monitors the training 'loss' although validation data
# is passed to fit() — confirm 'val_loss' is not what was intended.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
)
metrics_callback = MetricsCallbackDynamic(
    validation_data=tf_val_dataset,
    steps_per_epoch=val_steps_per_epoch,
)
training_callbacks = [metrics_callback, early_stopping_callback]

# Keyword arguments for fit(); raise `epochs` if more training is needed
fit_options = dict(
    epochs=1,
    validation_data=tf_val_dataset,
    steps_per_epoch=train_steps_per_epoch,
    validation_steps=val_steps_per_epoch,
    callbacks=training_callbacks,
)

# Timestamps taken immediately around fit() give the training duration
start_time = time.time()
history_helpdesk = model.fit(tf_train_dataset, **fit_options)
end_time = time.time()

training_seconds = end_time - start_time
print(f"Longformer training time: {training_seconds} seconds")

In [None]:
# Directories to save the model and history
model_save_dir = '/home/lars.gsaenger/test/models/models_pretrained'
history_save_dir = '/home/lars.gsaenger/test/models/models_pretrained/histories'

# Define the model and history file paths
# NOTE(review): the names say "bpic2012" while the data loaded above is the Helpdesk
# split — confirm the naming before relying on these artefacts.
model_save_path = os.path.join(model_save_dir, 'dynamic_longformer_bpic2012')
history_save_path = os.path.join(history_save_dir, 'dynamic_longformer_bpic2012_history.pkl')

# Create the history folder (and its parents) up front so that open(..., 'wb')
# below cannot fail on a missing directory after the expensive model save.
os.makedirs(history_save_dir, exist_ok=True)

# Save the model using TensorFlow/Keras method
model.save(model_save_path, save_format='tf')

# Actually persist the tokenizer next to the model; previously the message below
# was printed although nothing had been written.
tokenizer.save_pretrained(model_save_path)

# Save the history object returned by model.fit()
with open(history_save_path, 'wb') as f:
    pickle.dump(history_helpdesk.history, f)

print(f"Model saved to {model_save_path}")
print(f"Tokenizer saved to {model_save_path}")
print(f"History saved to {history_save_path}")

In [None]:
# Define path to history object: the Longformer history written by the save cell
# above (this cell previously pointed at the BERT history file).
history_save_path = '/home/lars.gsaenger/test/models/models_pretrained/histories/dynamic_longformer_bpic2012_history.pkl'

# Load the history object
# NOTE: pickle.load can execute arbitrary code from the file — only load
# history files that were written by this notebook.
with open(history_save_path, 'rb') as f:
    loaded_history = pickle.load(f)

# Show the loaded metrics as the cell output
loaded_history

---

## BERT base with dynamic padding

In [None]:
# Set the environment variable for GPU memory management
# NOTE(review): in a top-to-bottom run the Longformer model has already been built
# before this cell — confirm the allocator setting still takes effect this late.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

# Enable mixed precision for better performance and reduced memory usage
# (this sets the global Keras dtype policy, not a per-model option)
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Clear any existing GPU memory state
tf.keras.backend.clear_session()

# Reduce TensorFlow logging verbosity
tf.get_logger().setLevel('ERROR')

In [10]:
# Hyperparameters used by the BERT cells below
max_length = 36    # maximum sequence length handed to sort_by_length()
batch_size = 4     # batch size for the bucketed datasets and the steps-per-epoch calculation
num_classes = 10   # passed to the model builder

In [11]:
# Instantiate the BERT model through the project's builder, then print its layer summary
bert_builder_args = {
    'model_name': 'bert-base-uncased',
    'num_classes': num_classes,
}
model_builder = BERTModelBuilderDynamic(**bert_builder_args)
model = model_builder.create_model()
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, None)]               0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 )                           ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, None,                                      

In [12]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Dynamic padding and uniform length batching

# Assuming train_tensor and val_tensor are pandas dataframes
train_tensor['Prefix_Trace'] = train_tensor['Prefix_Trace'].astype(str)
val_tensor['Prefix_Trace'] = val_tensor['Prefix_Trace'].astype(str)

# Convert labels to integers; ids follow the order of first appearance in the training split
label_map = {label: idx for idx, label in enumerate(train_tensor['Next_Activity'].unique())}

# Map both splits BEFORE writing anything back: a validation label that never occurs
# in the training split maps to NaN, and failing only after train_tensor has been
# overwritten would leave the two frames in an inconsistent, half-converted state.
train_labels = train_tensor['Next_Activity'].map(label_map)
val_labels = val_tensor['Next_Activity'].map(label_map)
unseen_val_mask = val_labels.isna()
if unseen_val_mask.any():
    unseen_labels = sorted(val_tensor.loc[unseen_val_mask, 'Next_Activity'].astype(str).unique())
    raise ValueError(f"Validation labels missing from the training split: {unseen_labels}")
train_tensor['Next_Activity'] = train_labels.astype(int)
val_tensor['Next_Activity'] = val_labels.astype(int)

# Convert to Hugging Face datasets
train_data = Dataset.from_pandas(train_tensor)
val_data = Dataset.from_pandas(val_tensor)

# Sort the data by length
sorted_train_data = sort_by_length(train_data, tokenizer, max_length)
sorted_val_data = sort_by_length(val_data, tokenizer, max_length)

# Initialize data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Create TensorFlow datasets and ensure they repeat
tf_train_dataset = create_buckets_and_batches_bert(sorted_train_data, batch_size, data_collator).repeat()
tf_val_dataset = create_buckets_and_batches_bert(sorted_val_data, batch_size, data_collator).repeat()

# Prefetch datasets
tf_train_dataset = tf_train_dataset.prefetch(tf.data.AUTOTUNE)
tf_val_dataset = tf_val_dataset.prefetch(tf.data.AUTOTUNE)

# Calculate steps per epoch based on the length of the dataset
# (integer division: a trailing partial batch is not counted)
train_steps_per_epoch = len(sorted_train_data) // batch_size
val_steps_per_epoch = len(sorted_val_data) // batch_size

# Debugging statements to check the sizes and steps
print(f"Number of training samples: {len(sorted_train_data)}")
print(f"Number of validation samples: {len(sorted_val_data)}")
print(f"Steps per epoch (train): {train_steps_per_epoch}")
print(f"Steps per epoch (val): {val_steps_per_epoch}")

Number of training samples: 13175
Number of validation samples: 2805
Steps per epoch (train): 3293
Steps per epoch (val): 701


In [None]:
# Inspect a few batches of the BERT training dataset built in the previous cell
# (inspect_batches is defined in the Longformer section of this notebook)
inspect_batches(tf_train_dataset)

In [13]:
# Callbacks: early stopping first, then the project's validation-metrics callback.
# NOTE(review): early stopping monitors the training 'loss' although validation data
# is passed to fit() — confirm 'val_loss' is not what was intended.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
)
metrics_callback = MetricsCallbackDynamic(
    validation_data=tf_val_dataset,
    steps_per_epoch=val_steps_per_epoch,
)
training_callbacks = [metrics_callback, early_stopping_callback]

# Keyword arguments for fit(); raise `epochs` if more training is needed
fit_options = dict(
    epochs=1,
    validation_data=tf_val_dataset,
    steps_per_epoch=train_steps_per_epoch,
    validation_steps=val_steps_per_epoch,
    callbacks=training_callbacks,
)

# Timestamps taken immediately around fit() give the training duration
start_time = time.time()
history_helpdesk = model.fit(tf_train_dataset, **fit_options)
end_time = time.time()

training_seconds = end_time - start_time
print(f"BERT (base) training time: {training_seconds} seconds")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


 119/3293 [>.............................] - ETA: 16:13 - loss: 1.3851 - accuracy: 0.6429 - precision: 0.9189 - recall: 0.9341 - f1_score: 0.9264

KeyboardInterrupt: 

In [4]:
# Directories to save the model and history
model_save_dir = '/home/lars.gsaenger/test/models/models_pretrained'
history_save_dir = '/home/lars.gsaenger/test/models/models_pretrained/histories'

# Define the model and history file paths
# NOTE(review): the names say "bpic2018" while the data loaded above is the Helpdesk
# split — confirm the naming before relying on these artefacts.
model_save_path = os.path.join(model_save_dir, 'dynamic_bert_bpic2018')
history_save_path = os.path.join(history_save_dir, 'dynamic_bert_bpic2018_history.pkl')

# Create the history folder (and its parents) up front so that open(..., 'wb')
# below cannot fail on a missing directory after the expensive model save.
os.makedirs(history_save_dir, exist_ok=True)

# Save the model using TensorFlow/Keras method
model.save(model_save_path, save_format='tf')

# Actually persist the tokenizer next to the model; previously the message below
# was printed although nothing had been written.
tokenizer.save_pretrained(model_save_path)

# Save the history object returned by model.fit()
with open(history_save_path, 'wb') as f:
    pickle.dump(history_helpdesk.history, f)

print(f"Model saved to {model_save_path}")
print(f"Tokenizer saved to {model_save_path}")
print(f"History saved to {history_save_path}")

In [5]:
# Path of the pickled BERT training history written by the save cell above
history_save_path = '/home/lars.gsaenger/test/models/models_pretrained/histories/dynamic_bert_bpic2018_history.pkl'
# Load the history object
# NOTE: pickle.load can execute arbitrary code from the file — only load
# history files that were written by this notebook.
with open(history_save_path, 'rb') as f:
    loaded_history = pickle.load(f)

# Display the metrics so that re-running the cell reproduces the recorded output
loaded_history

{'loss': [0.6476355195045471,
  0.4462307393550873,
  0.4185558259487152,
  0.4058007597923279,
  0.3974268436431885],
 'accuracy': [0.8352489471435547,
  0.8573390245437622,
  0.8622629642486572,
  0.8650448322296143,
  0.8673900961875916],
 'precision': [0.9940122961997986,
  0.9936304092407227,
  0.9934799075126648,
  0.9934649467468262,
  0.9934191703796387],
 'recall': [0.9982346296310425,
  0.9992821216583252,
  0.9995473623275757,
  0.9996140599250793,
  0.9996532797813416],
 'f1_score': [0.9961189031600952,
  0.9964482188224792,
  0.9965043663978577,
  0.9965299367904663,
  0.9965264201164246],
 'val_loss': [0.4629610776901245,
  0.42274001240730286,
  0.41032665967941284,
  0.4029856324195862,
  0.4012047052383423],
 'val_accuracy': [0.8571833372116089,
  0.8610885143280029,
  0.864477276802063,
  0.8661866188049316,
  0.8661799430847168],
 'val_precision': [0.9933332204818726,
  0.9933264851570129,
  0.9933333396911621,
  0.9933298230171204,
  0.9933298826217651],
 'val_recal