# Concept Drift and Online Learning

In [None]:
'''
Sources:
    - https://medium.com/analytics-vidhya/tf-gradienttape-explained-for-keras-users-cc3f06276f22
    - https://www.tensorflow.org/guide/keras/writing_a_training_loop_from_scratch
    - https://www.kaggle.com/code/fabriciojoc/drebin-experiment-4-adwin-retrain
'''

Kein dynamic padding benötigt, da jede Sequenz einzeln preprocessed und für prediction verwendet wird

In [1]:
import sys

# Append the directory containing the src folder to sys.path
sys.path.append('/Users/lars/Documents/test/')

In [2]:
#Path variables for datasets
directory = "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift"
path_raw = "/data/raw/"
path_interim = "/data/interim/"
path_processed = "/data/processed/"

In [3]:
import math
import random
import tensorflow as tf
import numpy as np
import tensorflow_models as tfm
from river import drift
from transformers import TFAutoModel, AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset
from src.data.data_manager import data_loader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tf.random.set_seed(1234)
np.random.seed(1234)

In [5]:
# Set parameters

max_length = 36
batch_size = 1
num_classes = 10

optimizer = tf.keras.optimizers.AdamW(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

2024-07-12 12:26:33.876010: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-07-12 12:26:33.876028: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-07-12 12:26:33.876031: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-07-12 12:26:33.876062: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-07-12 12:26:33.876078: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [6]:
train_tensor = data_loader(directory, path_interim, "Long_Helpdesk_train")
val_tensor = data_loader(directory, path_interim, "Long_Helpdesk_val")
test_tensor = data_loader(directory, path_interim, "Long_Helpdesk_test")

In [7]:
label_map = {label: idx for idx, label in enumerate(train_tensor['Next_Activity'].unique())}

In [8]:
train_tensor = train_tensor[:10]
test_tensor = test_tensor[:10]

---
### Needed Functions

In [9]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing function for samples
def preprocessing_single(X_test, y_test):
    X_test_encoded = tokenizer(X_test, return_tensors='tf', padding=True, truncation=True)
    input_ids = X_test_encoded['input_ids']
    attention_mask = X_test_encoded['attention_mask']
    label = tf.convert_to_tensor([y_test], dtype= tf.float32)

    return {'input_ids': input_ids, 'attention_mask': attention_mask}, label



In [9]:
def preprocess_function(tokenizer, example, max_length=512):    
    return tokenizer(example['Prefix_Trace'], padding=False, truncation=True, max_length=max_length)

def sort_by_length(dataset, tokenizer, max_length=1024):

    # Tokenizes the dataset and calculates the length for all in input_ids
    tokenized = [preprocess_function(tokenizer, example, max_length) for example in dataset]
    lengths = [len(tok['input_ids']) for tok in tokenized]

    # Combine tokenized inputs, lengths, and labels and sort them
    combined = list(zip(tokenized, lengths, dataset['Next_Activity']))
    combined.sort(key=lambda x: x[1])

    return combined

def create_buckets_and_batches_bert(sorted_data, batch_size, data_collator):
 
    def gen():
        while True:

            # Shuffle data at the start of each epoch
            random.shuffle(sorted_data)  

            # Iterate over the dataset and select batch
            for i in range(0, len(sorted_data), batch_size):
                batch = sorted_data[i:i + batch_size]
                
                # Skip empty batches
                if len(batch) == 0:
                    continue  
                
                # Extract tokenized inputs and labels from the batch
                tokenized_batch = [item[0] for item in batch]
                labels = [item[2] for item in batch]
                
                # Create input dictionaries
                batch_inputs = {'input_ids': [tok['input_ids'] for tok in tokenized_batch],
                                'attention_mask': [tok['attention_mask'] for tok in tokenized_batch]}
                
                # Batch the inputs and yiel the batches and labels as tensors
                batch_inputs = data_collator(batch_inputs)
                yield dict(batch_inputs), tf.convert_to_tensor(labels)
    
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {'input_ids': tf.TensorSpec(shape=(None, None), dtype=tf.int32),
             'attention_mask': tf.TensorSpec(shape=(None, None), dtype=tf.int32)},
            tf.TensorSpec(shape=(None,), dtype=tf.int32)
        )
    )

---
# Test

In [10]:
class BERTOnline:
    def __init__(self, model_name, num_classes):

        self.model_name = model_name
        self.num_classes = num_classes

    def create_model(self):

        # Load the pretrained BERT model
        encoder = TFAutoModel.from_pretrained(self.model_name)

        # Input layer for input_ids and attention_masks
        input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
        attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

        # Get encoder outputs
        encoder_outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)

        # Get the pooled output and make sure it is of type tf.float32
        pooled_output = tf.keras.layers.Lambda(lambda x: tf.cast(x.pooler_output, tf.float32))(encoder_outputs)

        # Apply dropout
        dropout = tf.keras.layers.Dropout(rate=0.1)(pooled_output)

        # Final dense layer for classification with softmax activation function and L2 regularization
        output = tf.keras.layers.Dense(self.num_classes, activation='softmax', dtype=tf.float32)(dropout)
        
        # Create model
        model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
        
        return model

In [11]:
test = BERTOnline(model_name='bert-base-uncased', num_classes=10)
test = test.create_model()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [12]:
weights_load_path = '/Users/lars/Documents/test/models/Weights_Helpdesk_Tuned/Weights_Helpdesk_Tuned'
test.load_weights(weights_load_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x3b1a8d990>

In [13]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing function for samples
def preprocessing_single(X_test, y_test):
    X_test_encoded = tokenizer(X_test, return_tensors='tf', padding=True, truncation=True)
    input_ids = X_test_encoded['input_ids']
    attention_mask = X_test_encoded['attention_mask']
    label = tf.convert_to_tensor([y_test], dtype=tf.int32)  # Change to int32 for SparseCategoricalCrossentropy

    return {'input_ids': input_ids, 'attention_mask': attention_mask}, label


In [14]:
def train_step(model, optimizer, loss_fn, x, y):
    with tf.GradientTape() as tape:
        # Predict
        predictions = model(x, training=True)
        # Calculate Loss
        loss = loss_fn(y, predictions)
    
    # Calculate Gradients
    gradients = tape.gradient(loss, model.trainable_variables)
    # Update model
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    return loss

In [15]:
# Encoding labels and preparing samples to be tokenized later on
#label_map = {label: idx for idx, label in enumerate(train_tensor['Next_Activity'].unique())}
y_test = test_tensor['Next_Activity'].map(label_map).astype(int).to_numpy()
y_train = train_tensor['Next_Activity'].map(label_map).astype(int).to_numpy()
X_test = test_tensor['Prefix_Trace'].astype(str).values.tolist()
X_train = train_tensor['Next_Activity'].astype(str).values.tolist()

In [16]:
# Initialize variables for tracking
drifts = []
warnings = []
DRIFT = False
WARNING = False
warning_data = []
y_warning = []
acc = []
pred = []
true = []
hits = 0
p = []
s = []
n = 1.0
p.append(1.0)

X_window = X_train
y_window = y_train

adwin = drift.ADWIN()

In [21]:
# Process the training data stream
for sample in range(len(X_train)):
    sample_X = X_train[sample]
    sample_y = y_train[sample]
    
    sample_X, label = preprocessing_single(sample_X, sample_y)
    print("Start")
    # Train on the sample
    loss = train_step(test, optimizer, loss_fn, sample_X, label)
    print(f"Sample {sample}, Loss: {loss.numpy()}")

Start
Sample 0, Loss: 0.23538294434547424
Start
Sample 1, Loss: 0.18390701711177826
Start
Sample 2, Loss: 1.0428751707077026
Start
Sample 3, Loss: 1.0469597578048706
Start
Sample 4, Loss: 3.329148530960083
Start
Sample 5, Loss: 0.043817877769470215
Start
Sample 6, Loss: 0.8007882833480835
Start
Sample 7, Loss: 1.0832617282867432
Start
Sample 8, Loss: 3.8526039123535156
Start
Sample 9, Loss: 0.17193879187107086


In [18]:
# Compile the model for evaluation
test.compile(optimizer=optimizer, loss=loss_fn, metrics=['acc'])

In [19]:
# Evaluate the model
window_data = {
    'Prefix_Trace': X_test,
    'Next_Activity': y_test.tolist()
}

# Convert to Hugging Face datasets
window_data = Dataset.from_dict(window_data)

# Sort the data by length
sorted_window_data = sort_by_length(window_data, tokenizer, max_length)

# Initialize data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Create TensorFlow datasets and ensure they repeat
tf_window_dataset = create_buckets_and_batches_bert(sorted_window_data, batch_size, data_collator).repeat()

# Prefetch datasets
tf_window_dataset = tf_window_dataset.prefetch(tf.data.AUTOTUNE)

# Calculate steps per epoch based on the length of the dataset
window_steps_per_epoch = len(sorted_window_data) // batch_size

# Debugging statements to check the sizes and steps
print(f"Number of training samples: {len(sorted_window_data)}")
print(f"Steps per epoch (train): {window_steps_per_epoch}")

Number of training samples: 10
Steps per epoch (train): 10


In [20]:
# Test if everything worked
evaluation = test.evaluate(tf_window_dataset, steps=window_steps_per_epoch)

print(f"Validation loss: {evaluation[0]}")
print(f"Validation accuracy: {evaluation[1]}")

2024-07-12 12:27:41.579837: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Validation loss: 0.7614604234695435
Validation accuracy: 0.699999988079071


---
## Start formating data for streaming

In [11]:
# Encoding labels and preparing samples to be tokenized later on
#label_map = {label: idx for idx, label in enumerate(train_tensor['Next_Activity'].unique())}
y_test = test_tensor['Next_Activity'].map(label_map).astype(int).to_numpy()
y_train = train_tensor['Next_Activity'].map(label_map).astype(int).to_numpy()
X_test = test_tensor['Prefix_Trace'].astype(str).values.tolist()
X_train = train_tensor['Next_Activity'].astype(str).values.tolist()

In [12]:
# Set up lists for information storage
# drift points
drifts = []

# warning points
warnings = []

# flags for drift and warning
DRIFT = False
WARNING = False
warning_data = []
y_warning = []

# accuracy
acc = []

# predictions
pred = []
true = []

# hits
hits = 0

# prequential error
p = [] 

# stdev
s = [] 

# counte
n = 1.0 

p.append(1.0)

X_window = X_train
y_window = y_train

adwin = drift.ADWIN()

In [19]:
class BERTModelBuilderDynamic:
    def __init__(self, model_name, num_classes):

        self.model_name = model_name
        self.num_classes = num_classes

    def create_model(self):

        # Load the pretrained BERT model
        encoder = TFAutoModel.from_pretrained(self.model_name)

        # Input layer for input_ids and attention_masks
        input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
        attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

        # Get encoder outputs
        encoder_outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)

        # Get the pooled output and make sure it is of type tf.float32
        pooled_output = tf.keras.layers.Lambda(lambda x: tf.cast(x.pooler_output, tf.float32))(encoder_outputs)

        # Apply dropout
        dropout = tf.keras.layers.Dropout(rate=0.1)(pooled_output)

        # Final dense layer for classification with softmax activation function and L2 regularization
        output = tf.keras.layers.Dense(self.num_classes, activation='softmax', dtype=tf.float32)(dropout)
        
        # Create model
        model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

        # Compile model with AdamW as optimzer
        model.compile(optimizer=tf.keras.optimizers.AdamW(learning_rate=5e-5),
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])
        
        return model

---
# BERT for online learning

In [13]:
class BERTOnline:
    def __init__(self, model_name, num_classes):

        self.model_name = model_name
        self.num_classes = num_classes

    def create_model(self):

        # Load the pretrained BERT model
        encoder = TFAutoModel.from_pretrained(self.model_name)

        # Input layer for input_ids and attention_masks
        input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
        attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

        # Get encoder outputs
        encoder_outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)

        # Get the pooled output and make sure it is of type tf.float32
        pooled_output = tf.keras.layers.Lambda(lambda x: tf.cast(x.pooler_output, tf.float32))(encoder_outputs)

        # Apply dropout
        dropout = tf.keras.layers.Dropout(rate=0.1)(pooled_output)

        # Final dense layer for classification with softmax activation function and L2 regularization
        output = tf.keras.layers.Dense(self.num_classes, activation='softmax', dtype=tf.float32)(dropout)
        
        # Create model
        model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
        
        return model

In [14]:
test = BERTOnline(model_name='bert-base-uncased', num_classes=10)
test = test.create_model()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [15]:
weights_load_path = '/Users/lars/Documents/test/models/Weights_Helpdesk_Tuned/Weights_Helpdesk_Tuned'
test.load_weights(weights_load_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x3904e9810>

In [16]:
def online(sample, label):
    with tf.GradientTape() as tape:
        # Predict
        pred_y = test(sample)
        # Calculate Loss
        model_loss = loss(label, pred_y)
    
    # Calculate Gradients
    model_gradients = tape.gradient(model_loss, test.trainable_variables)

    # Update model
    optimizer.apply_gradients(zip(model_gradients, test.trainable_variables))


In [17]:
for sample in range(len(X_train)):
    sample_X = X_train[sample]
    sample_y = y_train[sample]
    print(sample_y)
    sample_X, label = preprocessing_single(sample_X, sample_y)
    print(label)
    online(sample_X, label)

0
tf.Tensor([0.], shape=(1,), dtype=float32)


2024-07-12 12:09:54.827992: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


KeyboardInterrupt: 

In [18]:
test.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), metrics=['acc']) # Compile just for evaluation

In [27]:
window_data = {
    'Prefix_Trace' : X_test,
    'Next_Activity': y_test.tolist()
}

# Convert to Hugging Face datasets
window_data = Dataset.from_dict(window_data)

# Sort the data by length
sorted_window_data = sort_by_length(window_data, tokenizer, max_length)

# Initialize data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Create TensorFlow datasets and ensure they repeat
tf_window_dataset = create_buckets_and_batches_bert(sorted_window_data, batch_size, data_collator).repeat()

# Prefetch datasets
tf_window_dataset = tf_window_dataset.prefetch(tf.data.AUTOTUNE)

# Calculate steps per epoch based on the length of the dataset
window_steps_per_epoch = len(sorted_window_data) // batch_size

# Debugging statements to check the sizes and steps
print(f"Number of training samples: {len(sorted_window_data)}")
print(f"Steps per epoch (train): {window_steps_per_epoch}")

Number of training samples: 10
Steps per epoch (train): 10


In [29]:
# Test if everything worked
evaluation = test.evaluate(tf_window_dataset, steps=window_steps_per_epoch)

print(f"Validation loss: {evaluation[0]}")
print(f"Validation accuracy: {evaluation[1]}")


2024-07-12 12:07:12.073091: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: Creating variables on a non-first call to a function decorated with tf.function.

---

In [20]:
# Build and compile the model
model = BERTModelBuilderDynamic(model_name='bert-base-uncased', num_classes=10)
model = model.create_model()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [20]:
# Load the weights into model

weights_load_path = '/Users/lars/Documents/test/models/Weights_Helpdesk_Tuned/Weights_Helpdesk_Tuned'
model.load_weights(weights_load_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x3ab67a020>

In [None]:
for sample in range(len(X_test)):
    sample_X = X_test[sample]
    sample_y = y_test[sample]

    # add unprocessed samples to window
    X_window = np.append(X_window, sample_X)
    y_window = np.append(y_window, sample_y)

    # preprocess data to be used as input in BERT
    sample_X, label = preprocessing_single(sample_X, sample_y)
    y_pred = model.predict(sample_X)
    y_pred = np.argmax(y_pred, axis=1)
    print(y_pred)

    # Save prediction and true value
    pred.append(y_pred)
    true.append(sample_y)

    # check if prediction is a hit
    if y_pred == sample_y:
        hits += 1
        p.append(p[-1]-p[-1]/n)
    else:
        p.append(p[-1]+(1-p[-1])/n)
    
    # calculate stdv
    s.append(math.sqrt(p[-1]*(1-p[-1])/n))
    n += 1

    # update drift detector
    adwin.update(int(sample_y ==  y_pred))

    # save accuracy
    acc.append(float(hits)/float(sample+1))

    if drift.drift_detected:
        print(f"Change has been detected in {sample}")
        print(f"Window size: {adwin.width}")
        print(f"Total sum of stored elements: {adwin.total}")
        print(f"Mean: {adwin.estimation}")
        print(f"Variance: {adwin.variance}")
        print(f"Total number of drifts: {adwin.n_detections}")

        p.append(1.0)
        s.append(0.0)
        n = 1.0
        
        drifts.append(sample)

        # update data and label window
        X_window = X_window[-len(X_window) - adwin.width:]
        y_window = y_window[-len(y_window) - adwin.width:]

        # preprocess X_window with dynamic padding
        # Create dictionary for tf dataset creation
        window_data = {
            'Prefix_Trace' : X_window,
            'Next_Activity': y_window.tolist()
        }

        # Convert to Hugging Face datasets
        window_data = Dataset.from_dict(window_data)

        # Sort the data by length
        sorted_window_data = sort_by_length(window_data, tokenizer, max_length)

        # Initialize data collator
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

        # Create TensorFlow datasets and ensure they repeat
        tf_window_dataset = create_buckets_and_batches_bert(sorted_window_data, batch_size, data_collator).repeat()

        # Prefetch datasets
        tf_window_dataset = tf_window_dataset.prefetch(tf.data.AUTOTUNE)

        # Calculate steps per epoch based on the length of the dataset
        window_steps_per_epoch = len(sorted_window_data) // batch_size

        # Debugging statements to check the sizes and steps
        print(f"Number of training samples: {len(sorted_window_data)}")
        print(f"Steps per epoch (train): {window_steps_per_epoch}")

        # BERT with GradientTape
        
        
        
        
        
        
        
        
        
        adwin = drift.ADWIN()

In [None]:
# Make predictions on a test dataset (optional)
predictions = test.predict(tf_test_dataset, steps=test_steps_per_epoch)