# Model Training (TEST)

In [None]:
# This is for adding features to the auto model trainer

In [None]:
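# Example CLI arguments (kept as comments so that running this cell does not raise a SyntaxError).
# Note: the argument parsers in this notebook only accept --learning_rate, --epochs,
# --batch_size, --model and --exp_desc; the remaining flags are ideas for features to add.
# --learning_rate 3e-5 --epochs 10 --batch_size 32 \
# --hidden_dropout_prob 0.3 --attention_probs_dropout_prob 0.15 \
# --classifier_dropout 0.3 --extra_dropout 0.2 --l2_strength 1e-5 \
# --weight_decay 1e-5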

    # Model Names: 
    # 'bert-base-uncased' (bert)
    # 'bert-base-multilingual-cased' (mBERT)
    # 'xlm-roberta-base' or "distilroberta-base" (XLM-RoBERTa, Distil Roberta)
    # "google-bert/bert-base-cased" (BERT base, cased; note that MobileBERT is "google/mobilebert-uncased")

In [1]:
import sys
    
# Default hyperparameters for an interactive (notebook) run. Every entry is
# forwarded to the argparse-based training cells as a "--<key> <value>" pair.
default_args = {
    'learning_rate': 5e-5,
    'epochs': 125,
    'batch_size': 16,
    'model': 'bert-base-multilingual-cased',
    'exp_desc': 'test_run',
}

# Emulate a command line so that `parser.parse_args()` in the training cells
# works inside the notebook. The argv list is built from the dict itself so a
# key can no longer be defined above and then forgotten here (previously
# 'exp_desc' was never passed and the parser default was used instead).
sys.argv = ["script_name"] + [
    token
    for key, value in default_args.items()
    for token in (f"--{key}", str(value))
]


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

import os
import tensorflow as tf
import itertools
import gc
import mlflow

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import random

import argparse

# Sequence length passed as `max_length` to the tokenizer in tokenize_data
# (inputs are padded/truncated to this many tokens).
MAX_LENGTH = 150
# Batch size applied by prepare_datasets; run_training later re-batches the
# train/validation datasets with hp["batch_size"].
BATCH_SIZE = 64
# NOTE(review): EPOCHS is not referenced by the code visible in this cell;
# run_training takes the epoch count from hp["epochs"].
EPOCHS = 100

# Seed used for the train/val/test splits and for the Python, NumPy and
# TensorFlow random generators (42 is the commented-out alternative).
randnum = 10  # 42

# Turn on MLflow autologging for scikit-learn.
mlflow.sklearn.autolog()

# Load dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` and return its cleaned text/label columns.

    Only the ``text`` and ``label`` columns are kept, rows with a missing
    value in either of them are dropped, and ``label`` is cast to ``int``.
    The original row index of the surviving rows is preserved.
    """
    frame = pd.read_csv(file_path)
    cleaned = frame.loc[:, ['text', 'label']].dropna()
    return cleaned.assign(label=cleaned['label'].astype(int))

# Tokenization function
def tokenize_data(texts, tokenizer):
    """Encode ``texts`` in one call to ``tokenizer.batch_encode_plus``.

    Every sequence is padded to, and truncated at, ``MAX_LENGTH`` tokens and
    the encodings are requested as TensorFlow tensors. The tokenizer's return
    value is passed back unchanged.
    """
    encode_options = {
        'max_length': MAX_LENGTH,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
    }
    return tokenizer.batch_encode_plus(texts, **encode_options)

# Prepare datasets
def prepare_datasets(train_df, val_df, test_df, tokenizer, batch_size=None):
    """Tokenize the three splits and wrap them as batched ``tf.data`` datasets.

    Args:
        train_df, val_df, test_df: DataFrames with a ``text`` and a ``label``
            column.
        tokenizer: Tokenizer forwarded to ``tokenize_data``.
        batch_size: Optional batch size. Defaults to the module-level
            ``BATCH_SIZE`` when omitted, which is the previous behaviour.

    Returns:
        A ``(train_dataset, val_dataset, test_dataset)`` tuple. Each element
        yields ``({'input_ids', 'attention_mask'}, label)`` batches; only the
        training dataset is shuffled (buffer of 1000 examples).
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    def _to_dataset(df, shuffle=False):
        # One shared pipeline for all splits instead of three copies.
        encodings = tokenize_data(df['text'].tolist(), tokenizer)
        dataset = tf.data.Dataset.from_tensor_slices((
            {
                'input_ids': encodings['input_ids'],
                'attention_mask': encodings['attention_mask']
            },
            df['label'].values
        ))
        if shuffle:
            dataset = dataset.shuffle(1000)
        return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    train_dataset = _to_dataset(train_df, shuffle=True)
    val_dataset = _to_dataset(val_df)
    test_dataset = _to_dataset(test_df)

    return train_dataset, val_dataset, test_dataset

def model_abbre(model_name):
    """Return the short label used in run names for a model identifier.

    Identifiers without a registered abbreviation yield "Model Unavailable".
    """
    abbreviations = dict([
        ("bert-base-uncased", "bert-base"),
        ("bert-base-multilingual-cased", "mBERT"),
        ('xlm-roberta-base', 'XLM-RoBERTa'),
        # ('google-bert/bert-base-cased', ...) is not registered yet
    ])
    try:
        return abbreviations[model_name]
    except KeyError:
        return "Model Unavailable"
    
def run_training(hp, model_name):
    """Train the classifier head of a pretrained model and log the run to MLflow.

    The function loads 'dataset/finaldataset_6k_shuffled_v2.csv', splits it
    into train/validation/test parts, freezes every layer except the last
    one, trains with early stopping and best-weights checkpointing, and then
    prints/logs validation metrics and shows a confusion-matrix plot.

    Args:
        hp: Mapping with the keys "learning_rate", "epochs", "batch_size" and
            "exp_desc"; all entries are logged as MLflow parameters.
        model_name: Hugging Face model identifier used for both the tokenizer
            and the model weights.

    Returns:
        None. Results are printed, plotted and logged to the MLflow run.
    """
    mlflow.set_experiment("Second Evaluation")
    run_name = f"{hp['exp_desc']}_{model_abbre(model_name)}__lr{hp['learning_rate']}_ep{hp['epochs']}_bs{hp['batch_size']}"
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(hp)
        mlflow.set_tag("model_name", model_name)
        mlflow.log_param("model", model_name)
        print("||--------------------------------------||")
        print(f"||===>> Starting run: {run_name} with hyperparameters: {hp}")
        print("||--------------------------------------||")

        # Red Info Logs Killer
        # NOTE(review): TensorFlow is already imported when this runs, so the
        # setting may come too late to silence its native logs -- confirm.
        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

        # Print the TensorFlow version
        print(f"TensorFlow version: {tf.__version__}")

        # List available GPU devices and enable memory growth on each of them.
        # Memory growth has to be configured identically on all GPUs and before
        # they are initialised; in a notebook the GPU may already be in use
        # from an earlier cell, in which case TensorFlow raises RuntimeError.
        # That must not abort the run, so it is reported and skipped.
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            print("GPUs detected:")
            for gpu in gpus:
                print(gpu)
                try:
                    tf.config.experimental.set_memory_growth(gpu, True)
                except RuntimeError as err:
                    print(f"Could not enable memory growth for {gpu}: {err}")
        else:
            print("No GPUs detected.")

        # Configuration
        MODEL_NAME = model_name  # also used for the tokenizer
        # Model names that have been used with this script:
        #   'bert-base-uncased' (BERT)
        #   'bert-base-multilingual-cased' (mBERT)
        #   'xlm-roberta-base' or "distilroberta-base" (XLM-RoBERTa, DistilRoBERTa)
        #   "google-bert/bert-base-cased" (BERT base, cased)

        random.seed(randnum)
        tf.random.set_seed(randnum)
        np.random.seed(randnum)

        # Dataset split: 10% held out as test, then 20% of the rest as validation
        df = load_data('dataset/finaldataset_6k_shuffled_v2.csv')
        train_df, test_df = train_test_split(df, test_size=0.1, random_state=randnum)
        train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=randnum)

        # Initialize tokenizer
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        # test_dataset is built but not evaluated in this function.
        train_dataset, val_dataset, test_dataset = prepare_datasets(train_df, val_df, test_df, tokenizer)

        # Model initialization
        model = TFAutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=2,
            # hidden_dropout_prob=0.3,
            # attention_probs_dropout_prob=0.15
        )

        # Freeze all layers
        for layer in model.layers:
            layer.trainable = False
        # Unfreeze classifier layer (the last layer of the model)
        model.layers[-1].trainable = True

        # Compile the model
        optimizer = tf.keras.optimizers.Adam(learning_rate=hp["learning_rate"])
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

        # Re-batch with the batch size requested for this run
        train_ds = train_dataset.unbatch().batch(hp["batch_size"])
        val_ds = val_dataset.unbatch().batch(hp["batch_size"])

        # Prepare callbacks: EarlyStopping, TensorBoard, ModelCheckpoint and
        # per-epoch MLflow metric logging
        checkpoint_filepath = f"./checkpoints/{run_name}.h5"
        os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)
        callbacks = [
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
            tf.keras.callbacks.TensorBoard(log_dir='logs'),
            tf.keras.callbacks.ModelCheckpoint(
                filepath=checkpoint_filepath,
                monitor='val_loss',
                save_best_only=True,
                save_weights_only=True,
                verbose=1
            ),
            tf.keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: [
                mlflow.log_metric("train_loss", logs["loss"], step=epoch),
                mlflow.log_metric("train_accuracy", logs["accuracy"], step=epoch),
                mlflow.log_metric("val_loss", logs["val_loss"], step=epoch),
                mlflow.log_metric("val_accuracy", logs["val_accuracy"], step=epoch),
            ])
        ]

        # Train the model (the returned History object is not needed here)
        model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=hp["epochs"],
            callbacks=callbacks,
            verbose=1
        )

        # Optionally load best checkpoint
        if os.path.exists(checkpoint_filepath):
            model.load_weights(checkpoint_filepath)

        # Evaluate the model on the validation split
        val_preds = model.predict(val_ds).logits
        y_pred = np.argmax(val_preds, axis=1)
        y_true = np.concatenate([y for x, y in val_ds], axis=0)

        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        rec = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        print("||-------------------------------------------------------||")
        print(f"||--> Run {run_name} evaluation metrics output:")
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}")
        print("||-------------------------------------------------------||")
        # Log metrics to MLflow
        mlflow.log_metrics({
            "val_accuracy": acc,
            "val_precision": prec,
            "val_recall": rec,
            "val_f1_score": f1
        })

        # Compute confusion matrix
        conf_matrix = confusion_matrix(y_true, y_pred)
        print("Confusion Matrix:")
        print(conf_matrix)

        # Plot confusion matrix
        plt.figure(figsize=(6, 6))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=[0, 1], yticklabels=[0, 1])
        plt.xlabel("Predicted Labels")
        plt.ylabel("True Labels")
        plt.title("Confusion Matrix")
        plt.show()

        # Clean up GPU memory
        del model
        gc.collect()
        tf.keras.backend.clear_session()

    

if __name__ == "__main__":
    # Command-line interface: four required options plus an optional
    # experiment description that defaults to "oo".
    parser = argparse.ArgumentParser()
    for flag, caster in (
        ("--learning_rate", float),
        ("--epochs", int),
        ("--batch_size", int),
        ("--model", str),
    ):
        parser.add_argument(flag, type=caster, required=True)
    parser.add_argument("--exp_desc", type=str, required=False, default="oo")
    args = parser.parse_args()

    print(f"Learning Rate: {args.learning_rate}, Epochs: {args.epochs}, Batch Size: {args.batch_size} Model: {args.model} Exp Des: {args.exp_desc}")

    # Model names that have been used with --model:
    #   'bert-base-uncased' (BERT)
    #   'bert-base-multilingual-cased' (mBERT)
    #   'xlm-roberta-base' or "distilroberta-base" (XLM-RoBERTa, DistilRoBERTa)
    #   "google-bert/bert-base-cased" (BERT base, cased)

    # Hyperparameter dictionary handed to run_training (the model name is
    # passed separately).
    hyperparams = {
        key: getattr(args, key)
        for key in ("learning_rate", "epochs", "batch_size", "exp_desc")
    }

    run_training(hyperparams, args.model)

In [3]:
import os
import gc
import random
import argparse
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import tensorflow as tf

from transformers import AutoConfig, AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
import mlflow

# Suppress oneDNN warnings (optional)
# NOTE(review): both variables are assigned after `import tensorflow as tf`
# in the import block of this cell; confirm they still take effect there.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"  # reduce TF logs

# Constants
# Sequence length passed as `max_length` to the tokenizer in tokenize_data.
MAX_LENGTH = 150
# Seed for the data splits, the dataset shuffle and the Python/NumPy/TF RNGs.
SEED = 10

# Turn on MLflow autologging for scikit-learn.
mlflow.sklearn.autolog()

def load_data(fp):
    """Load the CSV at ``fp`` and return its ``text``/``label`` columns.

    Rows with a missing value in either column are dropped and ``label`` is
    converted to ``int``.
    """
    wanted_columns = ['text', 'label']
    complete_rows = pd.read_csv(fp)[wanted_columns].dropna()
    return complete_rows.astype({'label': int})

def tokenize_data(texts, tokenizer):
    """Batch-encode ``texts`` as TensorFlow tensors of length ``MAX_LENGTH``.

    Sequences are padded to ``MAX_LENGTH`` and truncated when longer; the
    value returned by ``tokenizer.batch_encode_plus`` is handed back as is.
    """
    encode = tokenizer.batch_encode_plus
    return encode(
        texts,
        return_tensors='tf',
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
    )

def prepare_datasets(train_df, val_df, test_df, tokenizer, batch_size):
    """Build batched, prefetched ``tf.data`` datasets for the three splits.

    Each split is tokenized with ``tokenize_data`` and yields
    ``({'input_ids', 'attention_mask'}, label)`` batches of ``batch_size``.
    Only the training split is shuffled (buffer of 1000, seeded with
    ``SEED``). Returns ``(train_ds, val_ds, test_ds)``.
    """
    def build(frame, shuffle):
        encoded = tokenize_data(frame['text'].tolist(), tokenizer)
        features = {key: encoded[key] for key in ('input_ids', 'attention_mask')}
        dataset = tf.data.Dataset.from_tensor_slices((features, frame['label'].values))
        if shuffle:
            dataset = dataset.shuffle(1000, seed=SEED)
        return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return build(train_df, True), build(val_df, False), build(test_df, False)

def model_abbre(name):
    """Return the short run-name label for a model identifier.

    Unknown identifiers fall back to the generic label "Model".
    """
    short_names = dict([
        ("bert-base-uncased", "BERT"),
        ("bert-base-multilingual-cased", "mBERT"),
        ("xlm-roberta-base", "XLM-R"),
    ])
    if name in short_names:
        return short_names[name]
    return "Model"

def run_training(hp, model_name):
    """Train the classifier head of a pretrained model and log the run to MLflow.

    The function loads 'dataset/finaldataset_6k_shuffled_v2.csv', splits it
    into train/validation/test parts, loads the model with custom dropout
    settings, freezes everything except ``model.classifier``, trains with
    early stopping and best-weights checkpointing, and prints validation
    metrics plus the confusion matrix.

    Args:
        hp: Mapping with the keys "learning_rate", "epochs", "batch_size" and
            "exp_desc"; all entries are logged as MLflow parameters.
        model_name: Hugging Face model identifier used for the tokenizer, the
            config and the model weights.

    Returns:
        None. Results are printed and per-epoch metrics are logged to MLflow.
    """
    mlflow.set_experiment("Second Evaluation")
    run_name = f"{hp['exp_desc']}_{model_abbre(model_name)}_lr{hp['learning_rate']}_ep{hp['epochs']}_bs{hp['batch_size']}"
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(hp)
        mlflow.set_tag("model_name", model_name)

        # Seeds
        random.seed(SEED)
        np.random.seed(SEED)
        tf.random.set_seed(SEED)

        # Data split: 10% held out as test, then 20% of the rest as validation
        df = load_data('dataset/finaldataset_6k_shuffled_v2.csv')
        train_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
        train_df, val_df  = train_test_split(train_df, test_size=0.2, random_state=SEED)

        # Tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Config + dropout
        config = AutoConfig.from_pretrained(
            model_name,
            hidden_dropout_prob=0.3,
            attention_probs_dropout_prob=0.2,
            classifier_dropout=0.3
        )
        config.num_labels = 2  # binary classification head

        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_name,
            config=config
        )

        # Freeze base, unfreeze classifier
        for layer in model.layers:
            layer.trainable = False
        model.classifier.trainable = True

        # L2 on classifier head
        # NOTE(review): these attributes are assigned after the model has been
        # loaded; Keras normally registers regularization losses when a
        # layer's weights are created, so this may have no effect -- confirm
        # by inspecting `model.losses`.
        l2_reg = tf.keras.regularizers.L2(0.005)
        model.classifier.kernel_regularizer = l2_reg
        model.classifier.bias_regularizer   = l2_reg

        # Datasets (test_ds is built but not evaluated in this function)
        train_ds, val_ds, test_ds = prepare_datasets(
            train_df, val_df, test_df, tokenizer, hp['batch_size']
        )

        # Optimizer & loss
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=hp['learning_rate'],
            weight_decay=0.01,
            clipnorm=1.0
        )
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True,
        )

        # summary() prints the table itself and returns None, so wrapping it
        # in print() only added a stray "None" line to the output.
        model.summary()

        model.compile(
            optimizer=optimizer,
            loss=loss,
            metrics=["accuracy"]
        )

        # Callbacks
        ckpt_fp = f"./checkpoints/{run_name}.h5"
        os.makedirs(os.path.dirname(ckpt_fp), exist_ok=True)
        callbacks = [
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
            tf.keras.callbacks.ModelCheckpoint(
                filepath=ckpt_fp,
                monitor='val_loss',
                save_best_only=True,
                save_weights_only=True,  # the comma was missing here (SyntaxError)
                verbose=1
            ),
            tf.keras.callbacks.TensorBoard(log_dir='logs'),
            tf.keras.callbacks.LambdaCallback(on_epoch_end=lambda ep, logs: [
                mlflow.log_metric(k, logs[k], step=ep) for k in ["loss","accuracy","val_loss","val_accuracy"]
            ])
        ]

        # Train
        model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=hp['epochs'],
            callbacks=callbacks,
            verbose=1
        )

        # Load best weights & evaluate on the validation split
        if os.path.exists(ckpt_fp):
            model.load_weights(ckpt_fp)

        y_true = np.concatenate([y for _, y in val_ds], axis=0)
        y_pred = np.argmax(model.predict(val_ds).logits, axis=1)

        acc, prec, rec, f1 = (
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred)
        )
        print(f"Val → Acc: {acc:.4f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}")
        print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))

        # Cleanup
        del model
        gc.collect()
        tf.keras.backend.clear_session()

if __name__ == "__main__":
    # Parse the run configuration: four required options and an optional
    # experiment description (default "oo"), then start one training run.
    parser = argparse.ArgumentParser()
    required_options = {
        "--learning_rate": float,
        "--epochs": int,
        "--batch_size": int,
        "--model": str,
    }
    for option, option_type in required_options.items():
        parser.add_argument(option, type=option_type, required=True)
    parser.add_argument("--exp_desc", type=str, default="oo")
    args = parser.parse_args()

    hp = dict(
        learning_rate=args.learning_rate,
        epochs=args.epochs,
        batch_size=args.batch_size,
        exp_desc=args.exp_desc,
    )
    run_training(hp, args.model)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  177853440 
                                                                 
 dropout_75 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 177854978 (678.46 MB)
Trainable params: 1538 (6.01 KB)
Non-trainable params: 177853440 (678.46 MB)
_________________________________________________________________
None
Epoch 1/125


2025-04-20 18:04:34.494007: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f3ac6256220 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-04-20 18:04:34.494084: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Ti Laptop GPU, Compute Capability 8.6
2025-04-20 18:04:34.503592: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-04-20 18:04:34.524359: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1745143474.594137    2842 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Val → Acc: 0.8595, Prec: 0.8249, Rec: 0.9208, F1: 0.8702
Confusion matrix:
 [[431 111]
 [ 45 523]]
