# Concept Drift and Online Learning

In [None]:
'''
Sources:
    - https://medium.com/analytics-vidhya/tf-gradienttape-explained-for-keras-users-cc3f06276f22
    - https://www.kaggle.com/code/fabriciojoc/drebin-experiment-4-adwin-retrain
    - https://www.tensorflow.org/guide/data#consuming_python_generators (pyhton generator add to preprocessing)
'''

Kein dynamic padding benötigt, da jede Sequenz einzeln preprocessed und für prediction verwendet wird

In [1]:
import sys

# Append the directory containing the src folder to sys.path
sys.path.append('/Users/lars/Documents/test/')

In [2]:
#Path variables for datasets
directory = "/Users/lars/Documents/Uni/Masterarbeit/Online_Process_Concept_Drift"
path_raw = "/data/raw/"
path_interim = "/data/interim/"
path_processed = "/data/processed/"

In [3]:
import math
import random
import tensorflow as tf
import numpy as np
import tensorflow_models as tfm
from river import drift
from transformers import TFAutoModel, AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset
from src.data.data_manager import data_loader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tf.random.set_seed(1234)
np.random.seed(1234)

In [5]:
# Set parameters

max_length = 36
batch_size = 1
num_classes = 10

optimizer = tf.keras.optimizers.AdamW(learning_rate=5e-5)
loss_online = tf.keras.losses.SparseCategoricalCrossentropy()

2024-07-12 15:02:28.406793: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-07-12 15:02:28.406816: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-07-12 15:02:28.406819: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-07-12 15:02:28.406847: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-07-12 15:02:28.406861: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [6]:
train_tensor = data_loader(directory, path_interim, "Long_Helpdesk_train")
val_tensor = data_loader(directory, path_interim, "Long_Helpdesk_val")
test_tensor = data_loader(directory, path_interim, "Long_Helpdesk_test")

In [7]:
label_map = {label: idx for idx, label in enumerate(train_tensor['Next_Activity'].unique())}

In [8]:
train_tensor = train_tensor[:20]
test_tensor = test_tensor[:20]

---
### Needed Functions

In [9]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing function for samples
def preprocessing_single(X_test, y_test):
    X_test_encoded = tokenizer(X_test, return_tensors='tf', padding=True, truncation=True)
    input_ids = X_test_encoded['input_ids']
    attention_mask = X_test_encoded['attention_mask']
    label = tf.convert_to_tensor([y_test], dtype=tf.int32)  # Change to int32 for SparseCategoricalCrossentropy

    return {'input_ids': input_ids, 'attention_mask': attention_mask}, label



In [10]:
# Function to preprocess window data to be used for online learning
def drifted_dataset_generator(X_window, y_window, batch_size):
    def process():
        for sample, label in zip(X_window, y_window):
            inputs, label = preprocessing_single(sample, label)
            yield inputs, label
    
    dataset = tf.data.Dataset.from_generator(
        process,
        output_signature=(
            {
                'input_ids': tf.TensorSpec(shape=(None, None), dtype=tf.int32),
                'attention_mask': tf.TensorSpec(shape=(None, None), dtype=tf.int32),
            },
            tf.TensorSpec(shape=(None,), dtype=tf.int32)
        )
    )
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    
    return dataset

In [11]:
def train_step(model, optimizer, loss_online, x_sample, y_label):
    with tf.GradientTape() as tape:
        # Predict
        predictions = model(x_sample, training=True)
        # Calculate Loss
        loss = loss_online(y_label, predictions)
    
    # Calculate Gradients
    gradients = tape.gradient(loss, model.trainable_variables)
    # Update model
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    return loss

In [12]:
class BERTOnline:
    def __init__(self, model_name, num_classes):

        self.model_name = model_name
        self.num_classes = num_classes

    def create_model(self):

        # Load the pretrained BERT model
        encoder = TFAutoModel.from_pretrained(self.model_name)

        # Input layer for input_ids and attention_masks
        input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
        attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

        # Get encoder outputs
        encoder_outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)

        # Get the pooled output and make sure it is of type tf.float32
        pooled_output = tf.keras.layers.Lambda(lambda x: tf.cast(x.pooler_output, tf.float32))(encoder_outputs)

        # Apply dropout
        dropout = tf.keras.layers.Dropout(rate=0.1)(pooled_output)

        # Final dense layer for classification with softmax activation function and L2 regularization
        output = tf.keras.layers.Dense(self.num_classes, activation='softmax', dtype=tf.float32)(dropout)
        
        # Create model
        model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
        
        return model

In [13]:
def preprocess_function(tokenizer, example, max_length=512):    
    return tokenizer(example['Prefix_Trace'], padding=False, truncation=True, max_length=max_length)

def sort_by_length(dataset, tokenizer, max_length=1024):

    # Tokenizes the dataset and calculates the length for all in input_ids
    tokenized = [preprocess_function(tokenizer, example, max_length) for example in dataset]
    lengths = [len(tok['input_ids']) for tok in tokenized]

    # Combine tokenized inputs, lengths, and labels and sort them
    combined = list(zip(tokenized, lengths, dataset['Next_Activity']))
    combined.sort(key=lambda x: x[1])

    return combined

def create_buckets_and_batches_bert(sorted_data, batch_size, data_collator):
 
    def gen():
        while True:

            # Shuffle data at the start of each epoch
            random.shuffle(sorted_data)  

            # Iterate over the dataset and select batch
            for i in range(0, len(sorted_data), batch_size):
                batch = sorted_data[i:i + batch_size]
                
                # Skip empty batches
                if len(batch) == 0:
                    continue  
                
                # Extract tokenized inputs and labels from the batch
                tokenized_batch = [item[0] for item in batch]
                labels = [item[2] for item in batch]
                
                # Create input dictionaries
                batch_inputs = {'input_ids': [tok['input_ids'] for tok in tokenized_batch],
                                'attention_mask': [tok['attention_mask'] for tok in tokenized_batch]}
                
                # Batch the inputs and yiel the batches and labels as tensors
                batch_inputs = data_collator(batch_inputs)
                yield dict(batch_inputs), tf.convert_to_tensor(labels)
    
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {'input_ids': tf.TensorSpec(shape=(None, None), dtype=tf.int32),
             'attention_mask': tf.TensorSpec(shape=(None, None), dtype=tf.int32)},
            tf.TensorSpec(shape=(None,), dtype=tf.int32)
        )
    )

---
# Test

In [13]:
test = BERTOnline(model_name='bert-base-uncased', num_classes=10)
test = test.create_model()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [14]:
weights_load_path = '/Users/lars/Documents/test/models/Weights_Helpdesk_Tuned/Weights_Helpdesk_Tuned'
test.load_weights(weights_load_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x3a96c7970>

In [13]:
# Encoding labels and preparing samples to be tokenized later on
#label_map = {label: idx for idx, label in enumerate(train_tensor['Next_Activity'].unique())}
y_test = test_tensor['Next_Activity'].map(label_map).astype(int).to_numpy()
y_train = train_tensor['Next_Activity'].map(label_map).astype(int).to_numpy()
X_test = test_tensor['Prefix_Trace'].astype(str).values.tolist()
X_train = train_tensor['Next_Activity'].astype(str).values.tolist()

In [16]:
# Initialize variables for tracking
drifts = []
warnings = []
DRIFT = False
WARNING = False
warning_data = []
y_warning = []
acc = []
pred = []
true = []
hits = 0
p = []
s = []
n = 1.0
p.append(1.0)

X_window = X_train
y_window = y_train

adwin = drift.ADWIN()

In [21]:
# Process the training data stream
for sample in range(len(X_train)):
    sample_X = X_train[sample]
    sample_y = y_train[sample]
    
    sample_X, label = preprocessing_single(sample_X, sample_y)
    print(sample_X)
    print(label)
    # Train on the sample
    loss = train_step(test, optimizer, loss_online, sample_X, label)
    print(f"Sample {sample}, Loss: {loss.numpy()}")

{'input_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=
array([[ 101, 2202, 1011, 1999, 1011, 3715, 1011, 7281,  102]],
      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}
tf.Tensor([0], shape=(1,), dtype=int32)
Sample 0, Loss: 0.23538294434547424
{'input_ids': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=
array([[ 101, 2202, 1011, 1999, 1011, 3715, 1011, 7281,  102]],
      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}
tf.Tensor([0], shape=(1,), dtype=int32)
Sample 1, Loss: 0.18390701711177826
{'input_ids': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[  101, 10663,  1011,  7281,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[1, 1, 1, 1, 1]], dtype=int32)>}
tf.Tensor([1], shape=(1,), dtype=int32)
Sample 2, Loss: 1.042875051498413
{'input_ids': <tf.Tensor: shape=

In [18]:
# Compile the model for evaluation
test.compile(optimizer=optimizer, loss=loss_online, metrics=['acc'])

In [16]:
# Evaluate the model
window_data = {
    'Prefix_Trace': X_test,
    'Next_Activity': y_test.tolist()
}

# Convert to Hugging Face datasets
window_data = Dataset.from_dict(window_data)

# Sort the data by length
sorted_window_data = sort_by_length(window_data, tokenizer, max_length)

# Initialize data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Create TensorFlow datasets and ensure they repeat
tf_window_dataset = create_buckets_and_batches_bert(sorted_window_data, batch_size, data_collator)



In [17]:
tf_window_dataset

<_FlatMapDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [None]:
# Prefetch datasets
tf_window_dataset = tf_window_dataset.prefetch(tf.data.AUTOTUNE)

# Calculate steps per epoch based on the length of the dataset
window_steps_per_epoch = len(sorted_window_data) // batch_size

# Debugging statements to check the sizes and steps
print(f"Number of training samples: {len(sorted_window_data)}")
print(f"Steps per epoch (train): {window_steps_per_epoch}")

In [22]:
tf_window_dataset

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [20]:
# Test if everything worked
evaluation = test.evaluate(tf_window_dataset, steps=window_steps_per_epoch)

print(f"Validation loss: {evaluation[0]}")
print(f"Validation accuracy: {evaluation[1]}")

2024-07-12 13:44:08.137682: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Validation loss: 0.7614604234695435
Validation accuracy: 0.699999988079071


---
# BERT for Online Learning

In [13]:
# Encoding labels and preparing samples to be tokenized later on
#label_map = {label: idx for idx, label in enumerate(train_tensor['Next_Activity'].unique())}
y_test = test_tensor['Next_Activity'].map(label_map).astype(int).to_numpy()
y_train = train_tensor['Next_Activity'].map(label_map).astype(int).to_numpy()
X_test = test_tensor['Prefix_Trace'].astype(str).values.tolist()
X_train = train_tensor['Next_Activity'].astype(str).values.tolist()

In [14]:
# Set up lists for information storage
# drift points
drifts = []

# warning points
warnings = []

# flags for drift and warning
DRIFT = False
WARNING = False
warning_data = []
y_warning = []

# accuracy
acc = []

# predictions
pred = []
true = []

# hits
hits = 0

# prequential error
perror = [] 

# stdev
stdv = [] 

# count
n = 1.0 

perror.append(1.0)

X_window = X_train
y_window = y_train

adwin = drift.ADWIN()

In [15]:
model = BERTOnline(model_name='bert-base-uncased', num_classes=10)
model = model.create_model()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [16]:
weights_load_path = '/Users/lars/Documents/test/models/Weights_Helpdesk_Tuned/Weights_Helpdesk_Tuned'
model.load_weights(weights_load_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x39e92fd00>

In [17]:
for sample in range(len(X_test)):
    print(sample)
    sample_X = X_test[sample]
    sample_y = y_test[sample]

    # add unprocessed samples to window
    X_window = np.append(X_window, sample_X)
    y_window = np.append(y_window, sample_y)
    print('X_Window')
    print(X_window)
    print('y_window')
    print(y_window)

    # preprocess data to be used as input in BERT
    sample_X, label = preprocessing_single(sample_X, sample_y)
    y_pred = model.predict(sample_X)
    y_pred = np.argmax(y_pred, axis=1)
    print(f'Prediction: {y_pred}')
    print(f'True Label: {sample_y}')

    # Save prediction and true value
    pred.append(y_pred)
    true.append(sample_y)

    # check if prediction is a hit
    if y_pred == sample_y:
        hits += 1
        perror.append(perror[-1]-perror[-1]/n)
    else:
        perror.append(perror[-1]+(1-perror[-1])/n)
    
    # calculate stdv
    stdv.append(math.sqrt(perror[-1]*(1-perror[-1])/n))
    n += 1

    # update drift detector
    adwin.update(int(sample_y ==  y_pred))

    # save accuracy
    acc.append(float(hits)/float(sample+1))

    if adwin.drift_detected:
        print(f"Change has been detected in {sample}")
        print(f"Window size: {adwin.width}")
        print(f"Total sum of stored elements: {adwin.total}")
        print(f"Mean: {adwin.estimation}")
        print(f"Variance: {adwin.variance}")
        print(f"Total number of drifts: {adwin.n_detections}")

        p.append(1.0)
        stdv.append(0.0)
        n = 1.0
        
        drifts.append(sample)

        # update data and label window
        X_window = X_window[-len(X_window) - adwin.width:]
        y_window = y_window[-len(y_window) - adwin.width:]

        # Create dataset to incrementally update BERT 
        drifted_data = drifted_dataset_generator(X_window=X_window, y_window=y_window, batch_size=batch_size)

        #Loop for drifted data stream und to update model
        for batch in drifted_data:
            drifted_sample, drifted_label = batch
            loss = train_step(model=model, 
                              optimizer=optimizer, 
                              loss_online=loss_online, 
                              x_sample=drifted_sample, 
                              y_label=drifted_label
                              )
        print(f"Drifted Loss: {loss.numpy()}")
        
        # Reset ADWIN
        adwin = drift.ADWIN()
        WARNING = False
    else:
        print('No drift')
        WARNING = False
        # Preprocess sample/label pair
        print(sample_X)
        #sample_X, label = preprocessing_single(sample_X, sample_y)
        print(label)
        # Update on the sample
        loss = train_step(model, optimizer, loss_online, sample_X, label)
        print(f"Sample {sample}, Loss: {loss.numpy()}")
            

print(f"Drifts: {drifts}")
print(f"AVG Accuracy: {np.mean(acc)}")


0
X_Window
['take-in-charge-ticket' 'take-in-charge-ticket' 'resolve-ticket' 'closed'
 'end' 'take-in-charge-ticket' 'resolve-ticket' 'closed' 'end'
 'take-in-charge-ticket' 'require-upgrade' 'resolve-ticket' 'closed' 'end'
 'assign-seriousness' 'take-in-charge-ticket' 'resolve-ticket' 'closed'
 'end' 'take-in-charge-ticket' 'assign-seriousness']
y_window
[0 0 1 2 3 0 1 2 3 0 4 1 2 3 5 0 1 2 3 0 0]


2024-07-12 15:03:01.221637: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Prediction: [0]
True Label: 0
No drift
{'input_ids': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[  101, 23911,  1011, 27994,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[1, 1, 1, 1, 1]], dtype=int32)>}
tf.Tensor([0], shape=(1,), dtype=int32)


  adwin.update(int(sample_y ==  y_pred))


Sample 0, Loss: 0.1852436065673828
1
X_Window
['take-in-charge-ticket' 'take-in-charge-ticket' 'resolve-ticket' 'closed'
 'end' 'take-in-charge-ticket' 'resolve-ticket' 'closed' 'end'
 'take-in-charge-ticket' 'require-upgrade' 'resolve-ticket' 'closed' 'end'
 'assign-seriousness' 'take-in-charge-ticket' 'resolve-ticket' 'closed'
 'end' 'take-in-charge-ticket' 'assign-seriousness'
 'assign-seriousness take-in-charge-ticket']
y_window
[0 0 1 2 3 0 1 2 3 0 4 1 2 3 5 0 1 2 3 0 0 1]
Prediction: [1]
True Label: 1
No drift
{'input_ids': <tf.Tensor: shape=(1, 12), dtype=int32, numpy=
array([[  101, 23911,  1011, 27994,  2202,  1011,  1999,  1011,  3715,
         1011,  7281,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 12), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}
tf.Tensor([1], shape=(1,), dtype=int32)
Sample 1, Loss: 0.31168049573898315
2
X_Window
['take-in-charge-ticket' 'take-in-charge-ticket' 'resolve-ticket' 'closed'
 'end' 'take-