# Model Training and Evaluation

This notebook covers the definition, training, and evaluation of a transformer-based multi-label text classification model.

## 1. Load Prepared Datasets and Libraries

Load the PyTorch datasets and import required libraries for model training and evaluation.

In [None]:
%pip install mlflow

In [None]:
# Databricks SDK; presumably needed by MLflow's "databricks" tracking URI used later — TODO confirm.
%pip install databricks-sdk

In [10]:
# Colab only: mount Google Drive so the dataset files under /content/drive/MyDrive
# (loaded further down) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Import libraries and load datasets
import torch
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import numpy as np
from mlflow.models import infer_signature

from transformers import (
    BertForSequenceClassification, RobertaForSequenceClassification, get_linear_schedule_with_warmup
)
from torch.utils.data import DataLoader
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

import os
import mlflow
from mlflow.models import infer_signature

In [4]:
# Load environment variables from .env file
# (local-run path; the Colab-secrets cell below rebinds the same two names when it is executed).
from dotenv import load_dotenv
load_dotenv()

# os.getenv returns None for variables that are not set; both values are validated
# in a later cell before MLflow is configured.
DATABRICKS_HOST = os.getenv("DATABRICKS_HOST")
DATABRICKS_TOKEN = os.getenv("DATABRICKS_TOKEN")


False

In [16]:
# Load environment variables from colab secrets
# (Colab-run path; this overwrites whatever the .env cell above assigned to these names).
from google.colab import userdata

# Secrets are looked up by name in the Colab secrets store — they are never written into the notebook.
DATABRICKS_HOST = userdata.get("DATABRICKS_HOST")
DATABRICKS_TOKEN = userdata.get("DATABRICKS_TOKEN")

In [17]:
# Sanity-check that the credentials were loaded WITHOUT echoing the secret:
# cell outputs are saved inside the .ipynb, so a printed token ends up in version control.
# NOTE: any token that was ever printed into a saved output must be revoked and rotated.
print(DATABRICKS_HOST)
print("DATABRICKS_TOKEN:", "set" if DATABRICKS_TOKEN else "MISSING")

https://dbc-cfeb31c8-2841.cloud.databricks.com
[REDACTED — Databricks access token removed from saved output; revoke and rotate this token]


In [None]:
import os

# Set environment variables for MLflow authentication.
# os.environ only accepts strings: assigning a missing (None) credential would raise an
# opaque TypeError, so fail early with a message that names what is missing.
_missing_credentials = [
    name
    for name, value in (
        ("DATABRICKS_HOST", DATABRICKS_HOST),
        ("DATABRICKS_TOKEN", DATABRICKS_TOKEN),
    )
    if not value
]
if _missing_credentials:
    raise ValueError(
        "Missing Databricks credentials: " + ", ".join(_missing_credentials)
    )

os.environ["DATABRICKS_HOST"] = DATABRICKS_HOST
os.environ["DATABRICKS_TOKEN"] = DATABRICKS_TOKEN

In [19]:

# Stop here with setup instructions when either credential is missing or empty,
# rather than letting MLflow fail later with an authentication error.
if not (DATABRICKS_HOST and DATABRICKS_TOKEN):
    raise ValueError(
        "DATABRICKS_HOST and DATABRICKS_TOKEN must be set in your environment.\n\n"
        "In Colab, use:\n"
        "%env DATABRICKS_HOST=https://<your-databricks-instance>\n"
        "%env DATABRICKS_TOKEN=<your-token>\n\n"
        "Locally, set them in your shell or .env file."
    )

# Route tracking to the Databricks workspace identified by the environment variables,
# then select the experiment that every run in this notebook is recorded under.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/j2damax@gmail.com/serendip-travel-review-classifier-experiments")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/3866477269740015', creation_time=1758424394648, experiment_id='3866477269740015', last_update_time=1758427431013, lifecycle_stage='active', name='/Users/j2damax@gmail.com/serendip-travel-review-classifier-experiments', tags={'mlflow.experiment.sourceName': '/Users/j2damax@gmail.com/serendip-travel-review-classifier-experiments',
 'mlflow.experimentKind': 'genai_development',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'j2damax@gmail.com',
 'mlflow.ownerId': '5804221812504751'}>

In [20]:
# Define ReviewsDataset class (must match the one used in 03_modeling.ipynb)
from torch.utils.data import Dataset


class ReviewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-label targets.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an indexable
            container holding one entry per example.
        labels: Per-example label vectors (nested list, NumPy array or tensor).
            Always stored as an independent ``float32`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        if isinstance(labels, torch.Tensor):
            # torch.tensor(<Tensor>) emits a copy-construct UserWarning; detach/clone
            # yields the same independent float32 copy without the warning.
            self.labels = labels.detach().clone().to(torch.float32)
        else:
            self.labels = torch.tensor(labels, dtype=torch.float32)

    def __getitem__(self, idx):
        """Return one example as a dict: every encoding field at ``idx`` plus ``'labels'``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of examples, taken from the label tensor's first dimension."""
        return len(self.labels)

In [21]:
# Load datasets from Google Drive.
# For a local run, point DATA_DIR at '../data/processed' instead.
DATA_DIR = '/content/drive/MyDrive/SerendipTravel/data/processed'

# NOTE(security): weights_only=False makes torch.load unpickle arbitrary Python objects,
# which can execute code embedded in the file. Only load files you produced yourself.
train_data = torch.load(f'{DATA_DIR}/train_dataset.pt', weights_only=False)
test_data = torch.load(f'{DATA_DIR}/test_dataset.pt', weights_only=False)

# Each loaded object is indexed by 'encodings' and 'labels'; wrap both splits in ReviewsDataset.
train_dataset = ReviewsDataset(train_data['encodings'], train_data['labels'])
test_dataset = ReviewsDataset(test_data['encodings'], test_data['labels'])

In [24]:
# Model definition and configuration
# The classifier needs one output per label column, i.e. the size of the labels' second dimension.
num_labels = train_data['labels'].shape[1]
print(f"Number of labels: {num_labels}")


Number of labels: 4


## 2. Model Definition and Configuration

Define the transformer model for multi-label classification and set up optimizer, loss, and training parameters.

In [None]:

# --- Hyperparameters -------------------------------------------------------
batch_size = 16
epochs = 3
learning_rate = 2e-5

# --- Model -----------------------------------------------------------------
# Pretrained BERT encoder with a fresh classification head sized to `num_labels`,
# configured for the multi-label problem type.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    problem_type='multi_label_classification',
    num_labels=num_labels,
)

# --- Data loaders ----------------------------------------------------------
# Only the training split is shuffled; evaluation keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# --- Optimizer and learning-rate schedule ------------------------------------
# The schedule is sized for one optimizer step per training batch per epoch, with no warmup.
total_steps = len(train_loader) * epochs
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

# --- Loss function for multi-label -------------------------------------------
# NOTE: the training loop below reads `outputs.loss` from the model, so this object
# is not referenced by the cells visible in this notebook.
loss_fn = torch.nn.BCEWithLogitsLoss()

## 3. Model Training and Evaluation

Train the model and evaluate its performance on the test set.

In [None]:
# Training and evaluation loop with MLflow logging
#
# NOTE: this cell trains the `model`, `optimizer` and `scheduler` objects created in the
# configuration cell IN PLACE. Re-run that cell before re-running this one; otherwise
# training resumes from already fine-tuned weights with a learning-rate schedule that has
# already decayed to its end (which shows up as a flat training loss across epochs).
import transformers  # imported only to pin its version in the logged model requirements

with mlflow.start_run(run_name="bert-multilabel-baseline"):
    # Log hyperparameters
    mlflow.log_param("epochs", epochs)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("model_name", "bert-base-uncased")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Passing `labels` makes the model compute its own loss for the configured problem type.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()  # the schedule advances once per batch, matching `total_steps`
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs} - Training loss: {avg_train_loss:.4f}")
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)

    # Evaluation
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Threshold the per-label sigmoid probabilities at 0.5 directly on the CPU tensor
            # (no tensor -> numpy -> tensor round trip needed).
            logits = outputs.logits.cpu()
            preds = (torch.sigmoid(logits) > 0.5).int().numpy()
            all_preds.append(preds)
            all_labels.append(labels)

    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    # Compute metrics
    # For multi-label indicator arrays accuracy_score is exact-match (subset) accuracy.
    test_accuracy = accuracy_score(all_labels, all_preds)
    test_macro_f1 = f1_score(all_labels, all_preds, average='macro')
    print("Test accuracy:", test_accuracy)
    print("Test macro F1:", test_macro_f1)
    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.log_metric("test_macro_f1", test_macro_f1)

    # Prepare a sample input and output so MLflow can record the model signature
    example_batch = next(iter(test_loader))
    inputs = {
        "input_ids": example_batch["input_ids"][:1].cpu().numpy(),
        "attention_mask": example_batch["attention_mask"][:1].cpu().numpy()
    }
    with torch.no_grad():  # inference only: no autograd graph needed for the sample
        outputs = model(
            input_ids=example_batch["input_ids"][:1].to(device),
            attention_mask=example_batch["attention_mask"][:1].to(device)
        ).logits.cpu().numpy()

    signature = infer_signature(inputs, outputs)

    mlflow.pytorch.log_model(
        model,
        name="model",
        signature=signature,
        # An explicit pip_requirements list replaces MLflow's inferred one, so it must name
        # everything needed to load the model: the saved module is a transformers class.
        # The torch/torchvision pins are hardcoded to the runtime this notebook was run on.
        pip_requirements=[
            "torch==2.8.0+cu126",
            "torchvision==0.23.0+cu126",
            f"transformers=={transformers.__version__}",
        ]
    )

    np.save("predictions.npy", all_preds)
    mlflow.log_artifact("predictions.npy")

Epoch 1/3 - Training loss: 0.0573
Epoch 2/3 - Training loss: 0.0573
Epoch 3/3 - Training loss: 0.0578
Test accuracy: 0.9245049504950495
Test macro F1: 0.9298934093407507
🏃 View run bert-multilabel-baseline at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015/runs/5728587524314f69b56434d8d58e9490
🧪 View experiment at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015


## 4. Model Training and Evaluation - Optimized, reusable function

Train the model and evaluate its performance on the test set by passing different arguments.

In [29]:
def train_and_evaluate(
    model_name="bert-base-uncased",
    epochs=3,
    batch_size=16,
    learning_rate=2e-5,
    train_dataset=None,
    test_dataset=None,
    num_labels=None,
    experiment_name="bert-multilabel-baseline",
    seed=None
):
    """
    Fine-tune and evaluate a Hugging Face sequence classifier for multi-label classification.

    The whole train/evaluate cycle runs inside a single MLflow run: hyperparameters,
    per-epoch training loss, test metrics (macro and per-label), the trained model,
    the test predictions and a JSON classification report are all logged to it.

    Args:
        model_name (str): Hugging Face checkpoint name (e.g., 'bert-base-uncased', 'roberta-base').
            The architecture is resolved from the checkpoint's own config.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training and evaluation.
        learning_rate (float): Learning rate for the AdamW optimizer.
        train_dataset (Dataset): PyTorch Dataset for training (required). Batches must
            provide 'input_ids', 'attention_mask' and 'labels'.
        test_dataset (Dataset): PyTorch Dataset for evaluation (required), same batch layout.
        num_labels (int): Number of output labels (required).
        experiment_name (str): Name given to the MLflow run.
        seed (int, optional): Random seed for reproducibility (torch, NumPy, CUDA).

    Returns:
        all_preds (np.ndarray): Predicted 0/1 labels for the test set, one row per example.
        all_labels (np.ndarray): True labels for the test set.
        model (nn.Module): Trained model (left on the device it was trained on).

    Raises:
        ValueError: If train_dataset, test_dataset or num_labels is not provided.
    """
    # Validate up front: the None defaults would otherwise surface as obscure errors
    # deep inside DataLoader / from_pretrained, after a potentially large model download.
    missing_args = [
        arg_name
        for arg_name, arg_value in (
            ("train_dataset", train_dataset),
            ("test_dataset", test_dataset),
            ("num_labels", num_labels),
        )
        if arg_value is None
    ]
    if missing_args:
        raise ValueError(
            "train_and_evaluate() requires: " + ", ".join(missing_args)
        )

    # Function-scope imports keep this helper self-contained when copied between notebooks.
    import json

    import mlflow
    import mlflow.pytorch
    import numpy as np
    import torch
    import transformers
    from mlflow.models import infer_signature
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        f1_score,
        precision_score,
        recall_score,
    )
    from torch.utils.data import DataLoader
    from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

    # Seed before the model is built so the new classifier head and the shuffle order
    # are both covered.
    if seed is not None:
        torch.manual_seed(seed)
        np.random.seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    # Model selection: the Auto class picks the architecture that matches the checkpoint
    # (BERT, RoBERTa, DistilBERT, ...) instead of guessing from the name prefix.
    # NOTE(review): the datasets arrive pre-tokenized; confirm that the tokenizer which
    # produced `input_ids` matches `model_name` — TODO confirm against the data-prep notebook.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        problem_type='multi_label_classification'
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    # One scheduler step per training batch per epoch, no warmup.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    with mlflow.start_run(run_name=experiment_name):
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("model_name", model_name)
        if seed is not None:
            mlflow.log_param("seed", seed)

        for epoch in range(epochs):
            model.train()
            total_loss = 0.0
            for batch in train_loader:
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                # Passing `labels` makes the model compute the loss for its problem type.
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                scheduler.step()
                total_loss += loss.item()
            avg_train_loss = total_loss / len(train_loader)
            print(f"Epoch {epoch+1}/{epochs} - Training loss: {avg_train_loss:.4f}")
            mlflow.log_metric("train_loss", avg_train_loss, step=epoch)

        # Evaluation
        model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                # A label is predicted when its sigmoid probability exceeds 0.5.
                probabilities = torch.sigmoid(outputs.logits.cpu())
                all_preds.append((probabilities > 0.5).int().numpy())
                all_labels.append(batch['labels'].cpu().numpy())

        all_preds = np.vstack(all_preds)
        all_labels = np.vstack(all_labels)

        # Metrics
        # For multi-label indicator arrays accuracy_score is exact-match (subset) accuracy.
        # zero_division=0 is applied to every metric so labels with no positive predictions
        # score 0 without emitting warnings.
        test_accuracy = accuracy_score(all_labels, all_preds)
        test_macro_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
        test_macro_precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
        test_macro_recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
        print("Test accuracy:", test_accuracy)
        print("Test macro F1:", test_macro_f1)
        print("Test macro precision:", test_macro_precision)
        print("Test macro recall:", test_macro_recall)
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.log_metric("test_macro_f1", test_macro_f1)
        mlflow.log_metric("test_macro_precision", test_macro_precision)
        mlflow.log_metric("test_macro_recall", test_macro_recall)

        # Per-label F1, precision, recall (metric names are suffixed with the label index)
        per_label_f1 = f1_score(all_labels, all_preds, average=None, zero_division=0)
        per_label_precision = precision_score(all_labels, all_preds, average=None, zero_division=0)
        per_label_recall = recall_score(all_labels, all_preds, average=None, zero_division=0)
        for i, (f1, p, r) in enumerate(zip(per_label_f1, per_label_precision, per_label_recall)):
            mlflow.log_metric(f"f1_label_{i}", f1)
            mlflow.log_metric(f"precision_label_{i}", p)
            mlflow.log_metric(f"recall_label_{i}", r)

        # Save model and predictions.
        # A single test example (input and resulting logits) is used to infer the signature.
        example_batch = next(iter(test_loader))
        inputs = {
            "input_ids": example_batch["input_ids"][:1].cpu().numpy(),
            "attention_mask": example_batch["attention_mask"][:1].cpu().numpy()
        }
        with torch.no_grad():  # inference only: no autograd graph needed for the sample
            outputs = model(
                input_ids=example_batch["input_ids"][:1].to(device),
                attention_mask=example_batch["attention_mask"][:1].to(device)
            ).logits.cpu().numpy()

        signature = infer_signature(inputs, outputs)

        mlflow.pytorch.log_model(
            model,
            name="model",
            signature=signature,
            # An explicit pip_requirements list replaces MLflow's inferred one, so it must
            # name everything needed to load the model: the saved module is a transformers
            # class. The torch/torchvision pins are hardcoded to the original runtime.
            pip_requirements=[
                "torch==2.8.0+cu126",
                "torchvision==0.23.0+cu126",
                f"transformers=={transformers.__version__}",
            ]
        )

        np.save("predictions.npy", all_preds)
        mlflow.log_artifact("predictions.npy")

        # Log the full classification report as a JSON artifact
        report = classification_report(all_labels, all_preds, output_dict=True, zero_division=0)
        with open("classification_report.json", "w") as f:
            json.dump(report, f)
        mlflow.log_artifact("classification_report.json")

    return all_preds, all_labels, model


In [None]:
# bert with different hyperparameters
# `bert_results` is a second name for the same list, so this sweep stays addressable
# after a later sweep rebinds the shared `results` name.
results = bert_results = []
for lr in [2e-5, 1e-5]:
    for bs in [8, 16]:
        print(f"\nRunning with learning_rate={lr}, batch_size={bs}")
        preds, labels, mdl = train_and_evaluate(
            model_name="bert-base-uncased",
            epochs=3,
            batch_size=bs,
            learning_rate=lr,
            train_dataset=train_dataset,
            test_dataset=test_dataset,
            num_labels=num_labels,
            experiment_name=f"bert-lr{lr}-bs{bs}"
        )
        # Record which checkpoint produced the predictions so sweeps can be compared later.
        results.append({
            "model_name": "bert-base-uncased",
            "lr": lr,
            "batch_size": bs,
            "preds": preds,
            "labels": labels,
        })


Running with learning_rate=2e-05, batch_size=8


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Training loss: 0.1842
Epoch 2/3 - Training loss: 0.0737
Epoch 3/3 - Training loss: 0.0508
Test accuracy: 0.9297648514851485
Test macro F1: 0.9329084780680814
Test macro precision: 0.9714572250948507
Test macro recall: 0.897935589037063
🏃 View run bert-lr2e-05-bs8 at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015/runs/4b99a3a87a704a34a5b51c550bfd27ac
🧪 View experiment at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015

Running with learning_rate=2e-05, batch_size=16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Training loss: 0.2333
Epoch 2/3 - Training loss: 0.0823
Epoch 3/3 - Training loss: 0.0624
Test accuracy: 0.9223391089108911
Test macro F1: 0.9305806411883104
Test macro precision: 0.9806644226775529
Test macro recall: 0.8861337100709494
🏃 View run bert-lr2e-05-bs16 at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015/runs/a136c2f535a44314939a00228111a769
🧪 View experiment at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015

Running with learning_rate=1e-05, batch_size=8


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Training loss: 0.2574
Epoch 2/3 - Training loss: 0.1021
Epoch 3/3 - Training loss: 0.0768
Test accuracy: 0.9176980198019802
Test macro F1: 0.9219528719179253
Test macro precision: 0.9756663310011
Test macro recall: 0.8752207296541721
🏃 View run bert-lr1e-05-bs8 at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015/runs/753e5792304046c0b1d12ed33e6aac9c
🧪 View experiment at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015

Running with learning_rate=1e-05, batch_size=16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Training loss: 0.3064
Epoch 2/3 - Training loss: 0.1222
Epoch 3/3 - Training loss: 0.0937
Test accuracy: 0.9081064356435643
Test macro F1: 0.903569652759835
Test macro precision: 0.9789891670326454
Test macro recall: 0.8430712988206288
🏃 View run bert-lr1e-05-bs16 at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015/runs/0aaaa385cc3d4b58b15cc62ddc03d82f
🧪 View experiment at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015


In [None]:

# RoBERTa with different hyperparameters
# `roberta_results` is a second name for the same list, so this sweep stays addressable
# even if `results` is rebound by another sweep cell.
# NOTE(review): the saved output under this cell (5 epochs, run name "roberta-sweep") does
# not correspond to this code (3 epochs, per-configuration run names); re-run to refresh it.
# NOTE(review): train_dataset/test_dataset are the same pre-tokenized encodings used for the
# BERT sweep; confirm they were produced with a RoBERTa tokenizer, otherwise the token ids
# will not line up with this checkpoint's vocabulary — TODO confirm.
results = roberta_results = []
for lr in [2e-5, 1e-5]:
    for bs in [8, 16]:
        print(f"\nRunning with learning_rate={lr}, batch_size={bs}")
        preds, labels, mdl = train_and_evaluate(
            model_name="roberta-base",
            epochs=3,
            batch_size=bs,
            learning_rate=lr,
            train_dataset=train_dataset,
            test_dataset=test_dataset,
            num_labels=num_labels,
            experiment_name=f"roberta-lr{lr}-bs{bs}"
        )
        # Record which checkpoint produced the predictions so sweeps can be compared later.
        results.append({
            "model_name": "roberta-base",
            "lr": lr,
            "batch_size": bs,
            "preds": preds,
            "labels": labels,
        })

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Training loss: 0.4077
Epoch 2/5 - Training loss: 0.2551
Epoch 3/5 - Training loss: 0.1801
Epoch 4/5 - Training loss: 0.1403
Epoch 5/5 - Training loss: 0.1249
Test accuracy: 0.8586014851485149
Test macro F1: 0.8573617337922008
Test macro precision: 0.9634831779778774
Test macro recall: 0.7762212454179399
🏃 View run roberta-sweep at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015/runs/ff02925d34274539aa88c073181f2110
🧪 View experiment at: https://dbc-cfeb31c8-2841.cloud.databricks.com/ml/experiments/3866477269740015
