# **Data Preparation and Imports**

In [None]:
!pip install transformers datasets scikit-learn pandas numpy
!pip install --upgrade transformers

# Standard libraries
import os
import random

# Data manipulation libraries
import numpy as np
import pandas as pd

# Machine learning metrics
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score
)

# PyTorch related
import torch
from torch.optim.swa_utils import AveragedModel
from safetensors.torch import load_file

# Hugging Face transformers
from transformers import (
    RobertaConfig,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments
)
from datasets import Dataset

# Google Colab
from google.colab import drive

# Set Constant Seed for reproducibility
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed handed, in order, to Python's ``random``, NumPy's
            global generator, PyTorch's CPU generator and all CUDA generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once at notebook start-up so the cells below run from a fixed RNG state.
set_seed(42)

# Mount Google Drive at /content/drive/ (Colab-only; `drive` comes from google.colab)
drive.mount('/content/drive/')
# Project folder on Drive: the Phase 1 and Phase 2 models are saved underneath it.
# NOTE: `save_path` is rebound to the final-model folder in the Phase 3 cell.
save_path = '/content/drive/MyDrive/nlu'
# Create the folder when it is missing; exist_ok=True keeps re-runs from raising
os.makedirs(save_path, exist_ok=True)

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

## Data Loading and Distribution Calculation

In [None]:
# Load the training and development splits from the project folder on Drive
train_df = pd.read_csv('/content/drive/MyDrive/nlu/train.csv')
valid_df = pd.read_csv('/content/drive/MyDrive/nlu/dev.csv')

# Cast the label column to int in both splits
train_df["label"] = train_df["label"].astype(int)
valid_df["label"] = valid_df["label"].astype(int)

# Show a sample of the training data (last expression, so the cell renders it)
train_df.head()


Unnamed: 0,premise,hypothesis,label
0,yeah i don't know cut California in half or so...,Yeah. I'm not sure how to make that fit. Maybe...,1
1,actual names will not be used,"For the sake of privacy, actual names are not ...",1
2,The film was directed by Randall Wallace.,The film was directed by Randall Wallace and s...,1
3,"""How d'you know he'll sign me on?""Anse studie...",Anse looked at himself in a cracked mirror.,1
4,In the light of the candles his cheeks looked ...,Drew regarded his best friend and noted that i...,1


In [None]:
# Calculate Weights for both classes
def calculate_normalized_class_weights(train_data):
    """Print the label distribution and return inverse-frequency class weights.

    Args:
        train_data: DataFrame with an integer ``label`` column.

    Returns:
        dict mapping each class index (0 .. max label) to its weight. The
        weights are inversely proportional to the class counts and are scaled
        so that they sum to 1.
    """
    labels = train_data['label'].values
    total_samples = len(labels)
    class_counts = np.bincount(labels)

    # Report how many samples (and what share of the data) each class has
    for class_idx, count in enumerate(class_counts):
        percentage = count / total_samples * 100
        print(f"Class {class_idx}: {count} samples ({percentage:.2f}%)")

    # epsilon keeps the division finite for a class index that has no samples
    epsilon = 1e-6
    n_classes = len(class_counts)
    inverse_frequency = total_samples / (n_classes * (class_counts + epsilon))

    # Rescale so the weights form a distribution over the classes
    normalized = inverse_frequency / np.sum(inverse_frequency)

    return dict(enumerate(normalized))

# Compute the weights on the training split (the call also prints the class distribution).
# NOTE(review): `class_weights` is not referenced again in the visible notebook - confirm it is still needed.
class_weights = calculate_normalized_class_weights(train_df)

Class 0: 11784 samples (48.23%)
Class 1: 12648 samples (51.77%)


## Tokenization and Utility Functions for ULMFiT Training

In [None]:
# Initialize the RoBERTa tokenizer.
# The matching roberta-large model has 24 encoder layers (printed by the Phase 2 cell).
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
def tokenize(batch):
    """Encode a batch of premise/hypothesis pairs with the notebook-level tokenizer.

    Args:
        batch: Mapping with "premise" and "hypothesis" entries.

    Returns:
        Whatever ``tokenizer`` returns for the pair, truncated and padded to
        a fixed length of 128.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(batch["premise"], batch["hypothesis"], **encode_options)

# Convert the DataFrames to Dataset objects, tokenize them in batches, and rename
# "label" to "labels" (the column name selected for the Trainer below)
train_ds = Dataset.from_pandas(train_df).map(tokenize, batched=True).rename_column("label", "labels")
valid_ds = Dataset.from_pandas(valid_df).map(tokenize, batched=True).rename_column("label", "labels")
# Expose only the model inputs and labels, as torch tensors
train_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
valid_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

# Metrics to view during training
def compute_metrics(eval_pred):
    """Compute the classification metrics reported during training.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is indexed as a 2-D
            array with one column per class (column 1 is treated as the
            positive class); ``labels`` holds the true class ids.

    Returns:
        dict with "accuracy", "f1", "precision", "recall" and "roc_auc".
        "roc_auc" falls back to 0.0 when scikit-learn raises ``ValueError``
        because the score is undefined for the given labels.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)

    # ROC AUC needs probabilities rather than class predictions. Subtracting
    # the row-wise maximum before exponentiating keeps np.exp from overflowing
    # on large logits (which would give inf/inf = NaN) and leaves the softmax
    # value unchanged.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
    try:
        roc_auc = roc_auc_score(labels, probs[:, 1])
    except ValueError:
        # Only the "metric is undefined" error is swallowed; anything else
        # (including KeyboardInterrupt) now propagates instead of being hidden.
        roc_auc = 0.0

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "roc_auc": roc_auc,
    }

# Freeze layers function for ULMFiT
def freeze_until(model, block_idx):
    """Make only the classifier head and one encoder block trainable.

    Every parameter yielded by ``model.named_parameters()`` has its
    ``requires_grad`` flag set: True for parameters whose name contains
    "classifier" or belongs to encoder block ``block_idx``, False for all
    others (so blocks enabled by an earlier call are frozen again).

    Args:
        model: Object exposing ``named_parameters()``.
        block_idx: Index of the encoder block to unfreeze. A negative value
            leaves the whole encoder frozen and trains the head only.
    """
    # The trailing dot is essential: without it "encoder.layer.1" would also
    # match "encoder.layer.10" ... "encoder.layer.19" (and 2 would match 20-23).
    block_marker = f"encoder.layer.{block_idx}."

    for name, param in model.named_parameters():
        # The classifier head always stays trainable
        is_head = "classifier" in name
        # Encoder parameters are trainable only for the requested block
        in_target_block = block_idx >= 0 and block_marker in name
        param.requires_grad = bool(is_head or in_target_block)

Map:   0%|          | 0/24432 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/6736 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

## PHASE 1: Fine-tuning the Classifier Head Only

In [None]:
# Phase 1: Head Only: encoder frozen, classifier unfrozen
# (roberta-large starts with a newly initialised 2-class classification head)
model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=2)
# block_idx=-1 matches no encoder block, so only the classifier parameters stay trainable
freeze_until(model, block_idx=-1)

# Configure the trainer to evaluate at the end of each epoch.
# No intermediate checkpoints are written (save_strategy="no"); the model is saved explicitly below.
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./ulmfit_phase1",
        eval_strategy="epoch",
        save_strategy="no",
        per_device_train_batch_size=8,
        num_train_epochs=1,
        learning_rate=5e-5,
        weight_decay=0.01,
        report_to="none",
    ),
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics,
)
trainer.train()

# Save the Phase 1 model to Drive. Phase 2 below keeps training the in-memory `model`,
# so this copy is only needed when resuming from a fresh session.
model_save_path = os.path.join(save_path, "ulmfit_phase1_final")
trainer.save_model(model_save_path)
print(f"Model saved to {model_save_path}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.6932,0.686666,0.547951,0.668986,0.537843,0.884704,0.575429


Model saved to /content/drive/MyDrive/nlu/ulmfit_phase1_final


# PHASE 2: Gradual Unfreezing of Layers (24 Layers)

In [None]:
# Phase 2: Gradual Unfreeze
# Continues from the in-memory `model` trained in Phase 1.
num_blocks = len(model.roberta.encoder.layer)
print(f"The number of Layers is: ", num_blocks)
# Walk the encoder from the block nearest the classifier head down to block 0,
# training for one epoch per block.
for i in reversed(range(num_blocks)):
    # Unfreeze the current block: the classifier head plus the encoder parameters
    # matching block i become trainable; everything else is frozen again, including
    # blocks that were trainable in earlier rounds.
    freeze_until(model, block_idx=i)

    # Set up training arguments.
    # The learning rate shrinks by a factor of 0.9 for every block further from the
    # head: 5e-5 * 0.9 for the last block, down to 5e-5 * 0.9**num_blocks for block 0.
    phase2_args = TrainingArguments(
        output_dir=f"./ulmfit_phase2_block{i}",
        eval_strategy="epoch",
        save_strategy="no",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        learning_rate=5e-5 * (0.9 ** (num_blocks - i)),
        weight_decay=0.01,
        report_to="none",
        logging_strategy="epoch"
    )

    # Set up trainer: a new Trainer is built each round around the same `model` object
    # NOTE(review): optimizer/scheduler state is presumably not carried between rounds - confirm
    trainer = Trainer(
        model=model,
        args=phase2_args,
        train_dataset=train_ds,
        eval_dataset=valid_ds,
        compute_metrics=compute_metrics
    )

    # Train the model on this block
    trainer.train()

    # Print progress update
    print(f"Completed training with block {i} unfrozen")

# After all blocks are trained, save the final model and tokenizer to Drive;
# Phase 3 reloads the weights from this folder.
final_model_path = os.path.join(save_path, "ulmfit_phase2_final")
model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Final model saved to {final_model_path}")

The number of Layers is:  24


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.5303,0.493621,0.762619,0.782242,0.743079,0.825762,0.844457


Completed training with block 23 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.475,0.475732,0.778058,0.795177,0.759487,0.834388,0.859152


Completed training with block 22 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.4121,0.483676,0.784145,0.799393,0.768435,0.83295,0.865013


Completed training with block 21 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.3466,0.571724,0.783996,0.799449,0.767805,0.833813,0.867041


Completed training with block 20 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2861,0.753509,0.784887,0.800165,0.768884,0.8341,0.866945


Completed training with block 19 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2573,1.177768,0.785778,0.800718,0.770396,0.833525,0.869588


Completed training with block 18 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2625,1.943829,0.792904,0.806277,0.779748,0.834675,0.872967


Completed training with block 17 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2511,2.467696,0.799287,0.810591,0.790437,0.8318,0.875895


Completed training with block 16 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2385,2.778624,0.807898,0.819021,0.797386,0.841863,0.877234


Completed training with block 15 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2209,3.114997,0.811164,0.822594,0.798754,0.847901,0.875806


Completed training with block 14 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2113,3.309191,0.813539,0.82404,0.803552,0.845601,0.878522


Completed training with block 13 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2194,3.418058,0.814133,0.822663,0.81072,0.834963,0.880376


Completed training with block 12 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2042,3.482517,0.817251,0.826937,0.809078,0.845601,0.881143


Completed training with block 11 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.187,3.536362,0.821259,0.830851,0.812363,0.850201,0.885773


Completed training with block 10 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.1743,3.617954,0.82497,0.833592,0.818686,0.849051,0.886329


Completed training with block 9 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.1691,3.654344,0.826455,0.835792,0.817083,0.855377,0.887736


Completed training with block 8 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.1636,3.707278,0.827346,0.835989,0.820371,0.852214,0.887107


Completed training with block 7 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.1587,3.812997,0.824525,0.834035,0.815038,0.853939,0.884797


Completed training with block 6 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.1481,3.840663,0.823337,0.832818,0.814286,0.852214,0.883414


Completed training with block 5 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.1514,3.83433,0.822595,0.831998,0.81403,0.850776,0.884537


Completed training with block 4 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.1464,3.800739,0.82408,0.833122,0.81645,0.850489,0.88583


Completed training with block 3 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.1794,4.090501,0.820962,0.828498,0.81964,0.83755,0.885078


Completed training with block 2 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2112,3.880069,0.828533,0.836795,0.822729,0.851351,0.888913


Completed training with block 1 unfrozen


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.1191,3.887773,0.828385,0.836953,0.821429,0.853076,0.888176


Completed training with block 0 unfrozen
Final model saved to /content/drive/MyDrive/nlu/ulmfit_phase2_final


# PHASE 3: Final Fine-Tuning of Model with SWA

### Hyperparameter Tuning for Phase 3
This will take around 6 hours.

In [None]:
# import optuna
# from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

# # Reload Phase 2 best checkpoint
# checkpoint = "./ulmfit_phase2_block0/checkpoint-3054"
# def model_init():
#     return RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# # Base TrainingArguments (will be overridden per trial)
# base_args = TrainingArguments(
#     output_dir="./ulmfit_phase3_search",
#     eval_strategy="epoch",
#     save_strategy="no",
#     load_best_model_at_end=True,
#     report_to="none",
# )

# search_trainer = Trainer(
#     model_init=model_init,
#     args=base_args,
#     train_dataset=train_ds,
#     eval_dataset=valid_ds,
#     compute_metrics=compute_metrics,
# )

# # Hyperparameter search (20 trials)
# def hp_space(trial):
#     return {
#         "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
#         "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),
#         "warmup_steps": trial.suggest_categorical("warmup_steps", [0, 50, 100]),
#         "num_train_epochs": trial.suggest_categorical("num_train_epochs", [2, 3, 4]),
#         "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8]),
#     }

# best_run = search_trainer.hyperparameter_search(
#     direction="maximize",
#     hp_space=hp_space,
#     backend="optuna",
#     n_trials=20,
# )
# print("Best hyperparameters:", best_run.hyperparameters)

# # Final full fine‑tuning with best hyperparameters
# best = best_run.hyperparameters
# phase3_args = TrainingArguments(
#     output_dir="./ulmfit_phase3_full",
#     eval_strategy="epoch",
#     save_strategy="no",
#     load_best_model_at_end=True,
#     per_device_train_batch_size=int(best["per_device_train_batch_size"]),
#     per_device_eval_batch_size=int(best["per_device_train_batch_size"]),
#     num_train_epochs=int(best["num_train_epochs"]),
#     learning_rate=best["learning_rate"],
#     weight_decay=best["weight_decay"],
#     warmup_steps=int(best["warmup_steps"]),
#     report_to="none",
# )

# trainer = Trainer(
#     model_init=model_init,
#     args=phase3_args,
#     train_dataset=train_ds,
#     eval_dataset=valid_ds,
#     compute_metrics=compute_metrics,
# )
# trainer.train()


[I 2025-03-26 16:20:35,109] A new study created in memory with name: no-name-a08fc8f6-f5c2-4596-8701-f2902c464aaa
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7489,0.645832,0.903207,0.908014
2,0.4335,0.944719,0.906473,0.911839
3,0.3153,1.219983,0.910778,0.915245


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7489,0.645832,0.903207,0.908014
2,0.4335,0.944719,0.906473,0.911839
3,0.3153,1.219983,0.910778,0.915245
4,0.2403,1.36552,0.911223,0.915273


[I 2025-03-26 17:08:06,675] Trial 0 finished with value: 1.8264967264787384 and parameters: {'learning_rate': 1.5662767932032506e-06, 'weight_decay': 0.004537421892494496, 'warmup_steps': 100, 'num_train_epochs': 4, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 1.8264967264787384.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7376,0.692739,0.51633,0.681026
2,0.7199,0.692673,0.51633,0.681026
3,0.7133,0.692614,0.51633,0.681026


[I 2025-03-26 17:43:42,686] Trial 1 finished with value: 1.1973562089572924 and parameters: {'learning_rate': 3.1976525189386006e-05, 'weight_decay': 0.021044082562529605, 'warmup_steps': 50, 'num_train_epochs': 3, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 1.8264967264787384.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6305,0.580683,0.914786,0.917172
2,0.2416,0.788699,0.916271,0.92


[I 2025-03-26 18:07:24,776] Trial 2 finished with value: 1.836270783847981 and parameters: {'learning_rate': 5.562943617659205e-06, 'weight_decay': 0.015085539336367127, 'warmup_steps': 50, 'num_train_epochs': 2, 'per_device_train_batch_size': 4}. Best is trial 2 with value: 1.836270783847981.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4418,0.342756,0.899198,0.905498
2,0.3227,0.548078,0.910629,0.914245
3,0.1467,0.544168,0.917607,0.921952
4,0.0601,0.622725,0.922506,0.925747


[I 2025-03-26 18:43:09,522] Trial 3 finished with value: 1.8482527376732902 and parameters: {'learning_rate': 1.7631695388970417e-05, 'weight_decay': 0.0035403553886129645, 'warmup_steps': 100, 'num_train_epochs': 4, 'per_device_train_batch_size': 8}. Best is trial 3 with value: 1.8482527376732902.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4377,0.370228,0.870249,0.878577
2,0.3221,0.485242,0.896971,0.902309
3,0.1394,0.524348,0.909442,0.912732


[I 2025-03-26 19:10:01,294] Trial 4 finished with value: 1.8221742801898877 and parameters: {'learning_rate': 3.094411434100949e-05, 'weight_decay': 0.04849842366729191, 'warmup_steps': 100, 'num_train_epochs': 3, 'per_device_train_batch_size': 8}. Best is trial 3 with value: 1.8482527376732902.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7072,0.580276,0.912856,0.916298
2,0.2938,0.942322,0.910184,0.915914
3,0.1917,1.144112,0.915529,0.919302


[I 2025-03-26 19:46:22,253] Trial 5 finished with value: 1.8348307301974653 and parameters: {'learning_rate': 3.0928262031206867e-06, 'weight_decay': 0.0028032436063279826, 'warmup_steps': 50, 'num_train_epochs': 3, 'per_device_train_batch_size': 4}. Best is trial 3 with value: 1.8482527376732902.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4897,3.098613,0.846645,0.848645


[I 2025-03-26 19:55:16,384] Trial 6 pruned. 
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5619,1.101933,0.855998,0.871148


[I 2025-03-26 20:04:03,317] Trial 7 pruned. 
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5918,0.895912,0.872625,0.881818


[I 2025-03-26 20:12:50,055] Trial 8 pruned. 
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6162,0.661972,0.877375,0.88022


[I 2025-03-26 20:24:32,348] Trial 9 pruned. 
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4752,0.410049,0.900089,0.906515
2,0.2647,0.641724,0.916419,0.920424


[I 2025-03-26 20:42:25,612] Trial 10 finished with value: 1.8368432681735394 and parameters: {'learning_rate': 1.2357080763591453e-05, 'weight_decay': 0.0010700686431308272, 'warmup_steps': 100, 'num_train_epochs': 2, 'per_device_train_batch_size': 8}. Best is trial 3 with value: 1.8482527376732902.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.463,0.3856,0.904097,0.910003
2,0.2751,0.616948,0.916568,0.920306


[I 2025-03-26 21:00:20,109] Trial 11 finished with value: 1.836873992048212 and parameters: {'learning_rate': 1.3087758011137621e-05, 'weight_decay': 0.0011025519839519505, 'warmup_steps': 100, 'num_train_epochs': 2, 'per_device_train_batch_size': 8}. Best is trial 3 with value: 1.8482527376732902.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4699,0.449776,0.90573,0.912256
2,0.2739,0.626322,0.914489,0.918644


[I 2025-03-26 21:18:13,691] Trial 12 finished with value: 1.8331333789605058 and parameters: {'learning_rate': 1.3221746696390753e-05, 'weight_decay': 0.001013962263679903, 'warmup_steps': 100, 'num_train_epochs': 2, 'per_device_train_batch_size': 8}. Best is trial 3 with value: 1.8482527376732902.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4967,0.492442,0.897268,0.905335


[I 2025-03-26 21:27:03,058] Trial 13 pruned. 
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7332,0.713636,0.48367,0.0


[I 2025-03-26 21:35:54,895] Trial 14 pruned. 
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5031,0.522872,0.890143,0.900027


[I 2025-03-26 21:44:41,999] Trial 15 pruned. 
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4384,0.391485,0.906621,0.911296
2,0.2664,0.515293,0.918795,0.92198


[I 2025-03-26 22:02:33,004] Trial 16 finished with value: 1.8407742829303526 and parameters: {'learning_rate': 1.8765687816571254e-05, 'weight_decay': 0.007876565211980328, 'warmup_steps': 100, 'num_train_epochs': 2, 'per_device_train_batch_size': 8}. Best is trial 3 with value: 1.8482527376732902.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4237,0.366173,0.90291,0.904997


[I 2025-03-26 22:11:19,918] Trial 17 pruned. 
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5537,0.510789,0.896081,0.903315


[I 2025-03-26 22:20:06,580] Trial 18 pruned. 
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 5e-5),
  "weight_decay": trial.suggest_loguniform("weight_decay", 1e-3, 1e-1),


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7382,0.698219,0.48367,0.0


[I 2025-03-26 22:28:53,480] Trial 19 pruned. 


🏆 Best hyperparameters: {'learning_rate': 1.7631695388970417e-05, 'weight_decay': 0.0035403553886129645, 'warmup_steps': 100, 'num_train_epochs': 4, 'per_device_train_batch_size': 8}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4418,0.342756,0.899198,0.905498
2,0.3227,0.548078,0.910629,0.914245
3,0.1467,0.544168,0.917607,0.921952
4,0.0601,0.622725,0.922506,0.925747


Final validation results: {'eval_loss': 0.34275588393211365, 'eval_accuracy': 0.8991983372921615, 'eval_f1': 0.9054975643702158, 'eval_runtime': 39.5423, 'eval_samples_per_second': 170.349, 'eval_steps_per_second': 21.294, 'epoch': 4.0}


In [None]:
# Custom SWA Callback
class SimpleSWA(TrainerCallback):
    """Trainer callback that applies stochastic weight averaging (SWA).

    A running average of the model weights is kept in a
    ``torch.optim.swa_utils.AveragedModel``. From ``start_step`` onwards the
    average is updated every ``freq`` optimizer steps, and when training ends
    the averaged weights are copied back into the trained model.

    Args:
        start_step: First global step at which weights may be averaged.
        freq: Averaging interval in global steps; an update happens whenever
            ``global_step % freq == 0``.
    """

    def __init__(self, start_step, freq):
        self.start_step = start_step
        self.freq = freq
        # Created in on_train_begin, once the Trainer passes in the model
        self.swa_model = None

    def on_train_begin(self, args, state, control, **kwargs):
        """Create a fresh averaged copy of the model for this training run."""
        self.swa_model = AveragedModel(kwargs["model"])

    def on_step_end(self, args, state, control, **kwargs):
        """Fold the current weights into the average on scheduled steps."""
        if state.global_step >= self.start_step and state.global_step % self.freq == 0:
            self.swa_model.update_parameters(kwargs["model"])

    def on_train_end(self, args, state, control, **kwargs):
        """Replace the model's weights with the SWA average, if one exists."""
        # AveragedModel starts as a copy of the initial weights. If no update
        # ever ran (training shorter than start_step, or the hook order was
        # unusual), loading it would throw away the training result, so the
        # trained weights are kept instead.
        if self.swa_model is None or int(self.swa_model.n_averaged) == 0:
            return
        kwargs["model"].load_state_dict(self.swa_model.module.state_dict())

# Folder holding the model saved at the end of Phase 2; model_init() reads the
# config and model.safetensors from here
checkpoint = "/content/drive/MyDrive/nlu/ulmfit_phase2_final"

# Load the model
def model_init():
    """Build a RoBERTa classifier and load the weights stored in ``checkpoint``.

    The architecture comes from the config saved in the checkpoint folder
    (local files only); the weights come from its ``model.safetensors`` file.

    Returns:
        RobertaForSequenceClassification with the checkpoint weights loaded.
    """
    weights_file = os.path.join(checkpoint, "model.safetensors")
    roberta_config = RobertaConfig.from_pretrained(checkpoint, local_files_only=True)

    classifier = RobertaForSequenceClassification(roberta_config)
    classifier.load_state_dict(load_file(weights_file))
    return classifier

# Phase 3 training arguments.
# NOTE(review): these values (2 epochs, lr 2e-5, weight decay 0.01, no warmup) differ from
# the "Best hyperparameters" printed by the search output above (lr ~1.76e-5,
# weight decay ~0.0035, 100 warmup steps, 4 epochs) - confirm which set is intended.
phase3_args = TrainingArguments(
    output_dir="./ulmfit_phase3_swa",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to=[]
)

# Compute total steps for SWA scheduling: full batches per epoch times epochs.
# Assumes one optimizer step per batch on a single device (no gradient accumulation) - TODO confirm.
total_steps = len(train_ds) // phase3_args.per_device_train_batch_size * phase3_args.num_train_epochs
# Start averaging halfway through training, then average every 10 steps
start_step = int(total_steps * 0.5)
swa_callback = SimpleSWA(start_step=start_step, freq=10)

# model_init rebuilds the model from the Phase 2 weights when training starts.
# NOTE(review): SimpleSWA.on_train_end loads the averaged weights into the model, so
# whatever load_best_model_at_end restores is presumably replaced - confirm the hook order.
trainer = Trainer(
    model_init=model_init,
    args=phase3_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics,
    callbacks=[swa_callback]
)
trainer.train()

# Final evaluation on the validation split, run after training has finished
metrics = trainer.evaluate()
print("Final validation results with SWA:", metrics)

# Store Final Model
# NOTE: this rebinds `save_path`, which earlier pointed at the project folder
save_path = "/content/drive/MyDrive/nlu/swa_final_model_last/"

# Save both model weights and tokenizer
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to {save_path}")

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.4145,0.338292,0.909293,0.913786,0.897201,0.930995,0.962633
2,0.2511,0.484854,0.917013,0.921145,0.904182,0.938758,0.969711


Final validation results with SWA: {'eval_loss': 0.47662946581840515, 'eval_accuracy': 0.9201306413301663, 'eval_f1': 0.9233181299885975, 'eval_precision': 0.9154889768230638, 'eval_recall': 0.9312823461759632, 'eval_roc_auc': 0.9705726356425779, 'eval_runtime': 40.075, 'eval_samples_per_second': 168.085, 'eval_steps_per_second': 21.011, 'epoch': 2.0}
Model and tokenizer saved to /content/drive/MyDrive/nlu/swa_final_model_last/


## Generate predictions.csv for Codabench

In [None]:
# Load the final Phase 3 model from Drive: build the architecture from the saved
# config, then load the weights from model.safetensors
save_path = "/content/drive/MyDrive/nlu/swa_final_model_last"
config = RobertaConfig.from_pretrained(save_path)
model = RobertaForSequenceClassification(config)
state_dict = load_file(os.path.join(save_path, "model.safetensors"))
model.load_state_dict(state_dict)
# Trainer built with default arguments; it is only used for predict() below
eval_trainer = Trainer(model=model)
print("Model loaded from safetensors successfully")


# Load the tokenizer saved alongside the final model (rebinds the notebook-level `tokenizer`)
tokenizer = RobertaTokenizer.from_pretrained(save_path)
def tokenize(batch):
    """Encode premise/hypothesis pairs for inference with the reloaded tokenizer.

    Args:
        batch: Mapping with "premise" and "hypothesis" entries.

    Returns:
        The tokenizer output for the pair, truncated and padded to length 128.
    """
    premises = batch["premise"]
    hypotheses = batch["hypothesis"]
    return tokenizer(
        premises,
        hypotheses,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Load the CSV to predict on (here the development split) and cast its labels to int.
# The file must contain a "label" column, since it is cast and renamed below.
dataset_df = pd.read_csv('/content/drive/MyDrive/nlu/dev.csv')
dataset_df["label"] = dataset_df["label"].astype(int)

# Same preprocessing as for training: tokenize, rename "label" to "labels", expose torch tensors
dataset_ds = Dataset.from_pandas(dataset_df).map(tokenize, batched=True).rename_column("label", "labels")
dataset_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

# Run inference and take the highest-scoring class for each row
pred_output = eval_trainer.predict(dataset_ds)
pred_logits = pred_output.predictions
pred_labels = np.argmax(pred_logits, axis=1)


# Write a single "prediction" column, one row per input example, without the index
pd.DataFrame({"prediction": pred_labels}).to_csv("predictions.csv", index=False)

print("Saved predictions.csv with", len(pred_labels), "rows.")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Model loaded from safetensors successfully


Map:   0%|          | 0/6736 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Saved predictions.csv with 6736 rows.
