In [None]:
!pip install transformers tokenizers sentencepiece tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [3]:
import pandas as pd
import torch
import optuna
import wandb

from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.nn.functional import softmax as torch_softmax
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_linear_schedule_with_warmup
import numpy as np

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# PRETRAIN_MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
PRETRAIN_MODEL = "microsoft/deberta-v3-base"

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Data

The dataset used in this project is sourced from [Kaggle - Coronavirus tweets NLP](https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification/data).

In [5]:
# Load the dataset
df_train = pd.read_csv("data/Corona_NLP_train.csv", encoding='latin1')
df_test = pd.read_csv("data/Corona_NLP_test.csv", encoding='latin1')

# Display first few rows
df_test.head()

Unnamed: 0,ï»¿UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02/03/2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02/03/2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02/03/2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02/03/2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03/03/2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [6]:
# map sentiment to integers
sentiment_map = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}

df_train['Label'] = df_train['Sentiment'].map(sentiment_map)
df_test['Label'] = df_test['Sentiment'].map(sentiment_map)

df_test.head()

Unnamed: 0,ï»¿UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Label
0,1,44953,NYC,02/03/2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,0
1,2,44954,"Seattle, WA",02/03/2020,When I couldn't find hand sanitizer at Fred Me...,Positive,3
2,3,44955,,02/03/2020,Find out how you can protect yourself and love...,Extremely Positive,4
3,4,44956,Chicagoland,02/03/2020,#Panic buying hits #NewYork City as anxious sh...,Negative,1
4,5,44957,"Melbourne, Victoria",03/03/2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,2


#### Split the training data into train and eval

In [None]:
train_df, eval_df = train_test_split(
    df_train,
    test_size=0.3,
    random_state=42,
    stratify=df_train['Label']
)

train_df = train_df[['OriginalTweet', 'Label']].reset_index(drop=True)
eval_df = eval_df[['OriginalTweet', 'Label']].reset_index(drop=True)
test_df = df_test[['OriginalTweet', 'Label']].reset_index(drop=True)

train_df.to_csv('data/train_data.csv', index=False)
eval_df.to_csv('data/eval_data.csv', index=False)

### Look at the model

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(PRETRAIN_MODEL, num_labels=5, ignore_mismatched_sizes=True).to(device)
#model # Lets just look at the structure of the reoerta model from HF

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Check how the data is tokenized

In [17]:
# check num of tokens to choose truncation max_length
#tok = AutoTokenizer.from_pretrained(PRETRAIN_MODEL, use_fast=True)
#lens = df_train['OriginalTweet'].astype(str).map(lambda t: len(tok.encode(t, add_special_tokens=True)))
#lens.describe(), lens.quantile([0.90, 0.95, 0.99]).to_dict()

from transformers import DebertaV2Tokenizer
tok = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base", use_fast=True)
lens = df_train['OriginalTweet'].astype(str).map(
    lambda t: len(tok.encode(t, add_special_tokens=True))
)
print(lens.describe(), lens.quantile([0.90, 0.95, 0.99]).to_dict())

count    41157.000000
mean        55.744078
std         21.730697
min          3.000000
25%         40.000000
50%         55.000000
75%         70.000000
max        249.000000
Name: OriginalTweet, dtype: float64 {0.9: 84.0, 0.95: 92.0, 0.99: 107.0}


In [8]:
## Setup & Train

In [14]:
class TweetDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.texts = dataframe['OriginalTweet'].tolist()
        self.labels = dataframe['Label'].tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [15]:
def early_stop_check(patience, best_f1, best_f1_epoch, current_f1, current_f1_epoch):
    early_stop_flag = False
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_f1_epoch = current_f1_epoch
    else:
        if current_f1_epoch - best_f1_epoch > patience:
            early_stop_flag = True
    return best_f1, best_f1_epoch, early_stop_flag

In [16]:
def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial, scheduler):
    best_val_f1 = 0.0
    best_f1_epoch = 0
    early_stop_flag = False
    best_model_state = None

    # Enable automatic mixed precision on CUDA for stability/speed
    scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))

    for epoch in range(1, epochs + 1):
        model.train() # Enable training mode
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader: #Iterates over the train_loader, which is a DataLoader object containing batches of training data.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad(set_to_none=True) # Reset gradients

            # Forward pass (with AMP); save the logits (the raw output of the model) and calculate loss
            with torch.amp.autocast("cuda", enabled=(device.type == "cuda")):
                outputs = model(input_ids, attention_mask=attention_mask) # Forward pass
                logits = outputs.logits
                loss = criterion(logits, labels) # Calculate loss

            # Backward pass (with AMP) + gradient clipping, then update weights using the optimizer
            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # NEW: prevent exploding gradients
            scaler.step(optimizer)
            scaler.update()

            # NEW: step the LR scheduler once per optimizer step
            if scheduler is not None:
                scheduler.step()

            # Accumulate training loss and predictions
            train_loss += loss.item() * input_ids.size(0)
            total_train_samples += input_ids.size(0)
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        ###  Validation loop  ###
        model.eval() # Enable evaluation mode
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0

        all_val_labels = []
        all_val_preds = []
        all_val_probs = []

        with torch.no_grad(): # Disable gradient computation
            for batch in val_loader: # iterate on the val_loader's batches 
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)

                val_loss += loss.item() * input_ids.size(0)
                total_val_samples += input_ids.size(0)
                correct_val_predictions += (logits.argmax(dim=1) == labels).sum().item()

                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(logits.argmax(dim=1).cpu().numpy())
                all_val_probs.append(torch_softmax(logits, dim=1).cpu().numpy())

        # calculate metrics 
        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples
        val_precision = precision_score(all_val_labels, all_val_preds, average='macro')
        val_recall = recall_score(all_val_labels, all_val_preds, average='macro')
        val_f1 = f1_score(all_val_labels, all_val_preds, average='macro')

        probs = np.concatenate(all_val_probs, axis=0)
        labels_np = np.asarray(all_val_labels)
        try:
            val_auc = roc_auc_score(labels_np, probs, multi_class='ovr', average='macro')
        except ValueError:
            # 
            val_auc = float('nan')

        # Check for early stopping (UNCHANGED: still based on accuracy)
        best_val_f1, best_f1_epoch, early_stop_flag = early_stop_check(
            patience, best_val_f1, best_f1_epoch, val_f1, epoch
        )

        # Save the best model under the best_model_state parameter by f1
        if val_f1 >= best_val_f1:
            best_model_state = model.state_dict()

        print(f"Epoch {epoch}: train_loss={train_loss:.4f} val_f1={val_f1:.4f}, best_val_f1={best_val_f1:.4f}")

        # Log metrics to Weights & Biases - THIS IS WHERE WE TRACK THE RESULTS AND THE PROCESS
        wandb.log({ #log == logging of the training process (e.g. results) - will be done each epoch
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1,
            "Validation AUC": val_auc,

        })

        if early_stop_flag:  # Checks whether the early stopping condition has been met, as indicated by the early_stop_flag
            break # Exits the training loop immediately if the early stopping condition is satisfied

    if best_model_state is not None: # Save the best model as a .pt file
        torch.save(best_model_state, f"best_model_trial_{trial.number}.pt")

    return best_val_f1

In [20]:

from transformers import DebertaV2Tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base", use_fast=True)
# tokenizer = AutoTokenizer.from_pretrained(PRETRAIN_MODEL, use_fast=True)

# Objective Function for Optuna
def objective(trial):
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
    weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])  # use grad accumulation if VRAM is tight
    num_layers = trial.suggest_categorical("num_layers", [2, 3, 4, 5])
    patience = 5

    train_dataset = TweetDataset(train_df, tokenizer) # Create the TweetDataset object
    val_dataset = TweetDataset(eval_df, tokenizer)    # Create the TweetDataset object

    data_collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors="pt", pad_to_multiple_of=8)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                num_workers=6, pin_memory=True, persistent_workers=True, collate_fn=data_collator)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=6,
                                pin_memory=True, persistent_workers=True, collate_fn=data_collator)

    #model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=5, ignore_mismatched_sizes=True).to(device)
    model = AutoModelForSequenceClassification.from_pretrained(PRETRAIN_MODEL, num_labels=5, ignore_mismatched_sizes=True).to(device)

    # model.base_model.<...> when changing the model to something else
    for param in model.base_model.parameters():    # Freeze layers
        param.requires_grad = False
    for param in model.base_model.encoder.layer[-num_layers:].parameters():     # unfreeze the last "num_layers" of the encoder
        param.requires_grad = True
    for param in model.classifier.parameters():    #unfreeze the classifier
        param.requires_grad = True

    # Define optimizer and loss function
    # AdamW optimizer (transformer-friendly) instead of plain Adam
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # class-weighted CrossEntropy to handle label imbalance
    counts = train_df['Label'].value_counts().sort_index().values
    weights = torch.tensor((counts.sum() / (counts + 1e-9)), dtype=torch.float32, device=device)
    criterion = nn.CrossEntropyLoss(weight=weights)

    # LR scheduler with warmup (linear decay)
    epochs = 20  # keep your epoch budget here so we can compute total steps
    num_training_steps = epochs * len(train_loader)
    num_warmup_steps = int(0.06 * num_training_steps)  # ~6% warmup
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    # Initialize Weights & Biases - the values in the config are the properties of each trial.
    model_name = PRETRAIN_MODEL.split("/")[1]
    wandb.init(
        # project="bertweet-covid-sentiment",
        project=f"covid-sentiment-twitter-{model_name}",
               config={ 
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "patience": patience,
        "batch_size": batch_size,
        "num_layers": num_layers,
        # "architecture": "BERTweet",
        # "architecture": "RoBERTa",
        "architecture": "DeBERTa",
        "dataset": "COVID-19 NLP"}, 
        name=f"trial_{trial.number}") # The name that will be saved in the W&B platform

    # Train the model and get the best validation accuracy
    best_val_f1 = train_model_with_hyperparams(
        model, train_loader, val_loader, optimizer, criterion,
        epochs=epochs, patience=patience, trial=trial, scheduler=scheduler
    )

    wandb.finish() # Finish the Weights & Biases run
    
    return best_val_f1 # Return best validation acc as the objective to maximize

In [21]:
# Optuna Study
study = optuna.create_study(direction="maximize")  # Specifies that the goal of the optimization is to maximize the objective function
study.optimize(objective, n_trials=15)

[I 2025-08-18 20:44:38,480] A new study created in memory with name: no-name-33d28cfc-8f3e-46c7-bba6-03d3d7801a49
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33myoyulia[0m ([33myoyulia-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1: train_loss=1.6100 val_f1=0.1352, best_val_f1=0.1352
Epoch 2: train_loss=1.3806 val_f1=0.5088, best_val_f1=0.5088
Epoch 3: train_loss=1.0212 val_f1=0.6071, best_val_f1=0.6071
Epoch 4: train_loss=0.9078 val_f1=0.5771, best_val_f1=0.6071
Epoch 5: train_loss=0.8311 val_f1=0.6415, best_val_f1=0.6415
Epoch 6: train_loss=0.7712 val_f1=0.6603, best_val_f1=0.6603
Epoch 7: train_loss=0.7155 val_f1=0.6754, best_val_f1=0.6754
Epoch 8: train_loss=0.6944 val_f1=0.6713, best_val_f1=0.6754
Epoch 9: train_loss=0.6645 val_f1=0.6818, best_val_f1=0.6818
Epoch 10: train_loss=0.6401 val_f1=0.6767, best_val_f1=0.6818
Epoch 11: train_loss=0.6302 val_f1=0.6814, best_val_f1=0.6818
Epoch 12: train_loss=0.5968 val_f1=0.6877, best_val_f1=0.6877
Epoch 13: train_loss=0.5604 val_f1=0.6855, best_val_f1=0.6877
Epoch 14: train_loss=0.5386 val_f1=0.6927, best_val_f1=0.6927
Epoch 15: train_loss=0.5132 val_f1=0.6898, best_val_f1=0.6927
Epoch 16: train_loss=0.4968 val_f1=0.6843, best_val_f1=0.6927
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▃▅▆▆▆▇▇▇▇▇▇████████
Train Loss,█▇▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁
Validation AUC,▁▆▇▇████████████████
Validation Accuracy,▁▅▆▆▇▇██████████████
Validation F1,▁▆▇▇▇███████████████
Validation Loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂
Validation Precision,▁▅▇▇▇▇██████████████
Validation Recall,▁▆▇▇▇███████████████

0,1
Epoch,20.0
Train Accuracy,0.80614
Train Loss,0.46606
Validation AUC,0.91443
Validation Accuracy,0.68278
Validation F1,0.69437
Validation Loss,0.87482
Validation Precision,0.68796
Validation Recall,0.70782


[I 2025-08-18 23:17:57,649] Trial 0 finished with value: 0.6945423274562295 and parameters: {'learning_rate': 3.3114161071670855e-05, 'weight_decay': 0.02579334100215903, 'batch_size': 64, 'num_layers': 5}. Best is trial 0 with value: 0.6945423274562295.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1: train_loss=1.6114 val_f1=0.0822, best_val_f1=0.0822
Epoch 2: train_loss=1.4283 val_f1=0.4583, best_val_f1=0.4583
Epoch 3: train_loss=1.0806 val_f1=0.5599, best_val_f1=0.5599
Epoch 4: train_loss=0.9642 val_f1=0.5888, best_val_f1=0.5888
Epoch 5: train_loss=0.9092 val_f1=0.6072, best_val_f1=0.6072
Epoch 6: train_loss=0.8681 val_f1=0.6135, best_val_f1=0.6135
Epoch 7: train_loss=0.8343 val_f1=0.6073, best_val_f1=0.6135
Epoch 8: train_loss=0.8248 val_f1=0.6139, best_val_f1=0.6139
Epoch 9: train_loss=0.7947 val_f1=0.6103, best_val_f1=0.6139
Epoch 10: train_loss=0.7711 val_f1=0.6176, best_val_f1=0.6176
Epoch 11: train_loss=0.7494 val_f1=0.6397, best_val_f1=0.6397
Epoch 12: train_loss=0.7306 val_f1=0.6332, best_val_f1=0.6397
Epoch 13: train_loss=0.7103 val_f1=0.6282, best_val_f1=0.6397
Epoch 14: train_loss=0.6964 val_f1=0.6370, best_val_f1=0.6397
Epoch 15: train_loss=0.6773 val_f1=0.6437, best_val_f1=0.6437
Epoch 16: train_loss=0.6636 val_f1=0.6548, best_val_f1=0.6548
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▃▅▆▆▆▇▇▇▇▇▇████████
Train Loss,█▇▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁
Validation AUC,▁▆▇▇████████████████
Validation Accuracy,▁▅▆▇▇▇▇▇▇▇██████████
Validation F1,▁▆▇▇▇▇▇▇▇███████████
Validation Loss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Precision,▁▅▆▇▇▇▇▇▇▇██████████
Validation Recall,▁▆▇▇▇███████████████

0,1
Epoch,20.0
Train Accuracy,0.72738
Train Loss,0.63201
Validation AUC,0.89774
Validation Accuracy,0.63808
Validation F1,0.65008
Validation Loss,0.88665
Validation Precision,0.6417
Validation Recall,0.66865


[I 2025-08-19 00:01:08,534] Trial 1 finished with value: 0.6551492141501705 and parameters: {'learning_rate': 2.0256774853154974e-05, 'weight_decay': 0.01808148312270186, 'batch_size': 32, 'num_layers': 3}. Best is trial 0 with value: 0.6945423274562295.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1: train_loss=1.6097 val_f1=0.1257, best_val_f1=0.1257
Epoch 2: train_loss=1.3902 val_f1=0.4700, best_val_f1=0.4700
Epoch 3: train_loss=1.0887 val_f1=0.5032, best_val_f1=0.5032
Epoch 4: train_loss=1.0124 val_f1=0.5293, best_val_f1=0.5293
Epoch 5: train_loss=0.9538 val_f1=0.5784, best_val_f1=0.5784
Epoch 6: train_loss=0.8895 val_f1=0.5766, best_val_f1=0.5784
Epoch 7: train_loss=0.8593 val_f1=0.6026, best_val_f1=0.6026
Epoch 8: train_loss=0.8237 val_f1=0.6064, best_val_f1=0.6064
Epoch 9: train_loss=0.7999 val_f1=0.6134, best_val_f1=0.6134
Epoch 10: train_loss=0.7755 val_f1=0.6100, best_val_f1=0.6134
Epoch 11: train_loss=0.7561 val_f1=0.6194, best_val_f1=0.6194
Epoch 12: train_loss=0.7181 val_f1=0.6169, best_val_f1=0.6194
Epoch 13: train_loss=0.7080 val_f1=0.6298, best_val_f1=0.6298
Epoch 14: train_loss=0.6890 val_f1=0.6324, best_val_f1=0.6324
Epoch 15: train_loss=0.6770 val_f1=0.6285, best_val_f1=0.6324
Epoch 16: train_loss=0.6624 val_f1=0.6383, best_val_f1=0.6383
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▃▅▅▆▆▇▇▇▇▇▇████████
Train Loss,█▆▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁
Validation AUC,▁▆▇▇████████████████
Validation Accuracy,▁▅▆▆▇▇▇▇████████████
Validation F1,▁▆▆▇▇▇██████████████
Validation Loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Precision,▁▆▆▇▇▇▇█████████████
Validation Recall,▁▆▇▇▇▇██████████████

0,1
Epoch,20.0
Train Accuracy,0.72838
Train Loss,0.63115
Validation AUC,0.88772
Validation Accuracy,0.62107
Validation F1,0.63225
Validation Loss,0.95047
Validation Precision,0.62478
Validation Recall,0.65521


[I 2025-08-19 00:45:32,443] Trial 2 finished with value: 0.6383401218633841 and parameters: {'learning_rate': 1.708270667144254e-05, 'weight_decay': 0.020425638527464964, 'batch_size': 16, 'num_layers': 2}. Best is trial 0 with value: 0.6945423274562295.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1: train_loss=1.6119 val_f1=0.1058, best_val_f1=0.1058
Epoch 2: train_loss=1.3068 val_f1=0.5250, best_val_f1=0.5250
Epoch 3: train_loss=1.0086 val_f1=0.5601, best_val_f1=0.5601
Epoch 4: train_loss=0.9052 val_f1=0.6110, best_val_f1=0.6110
Epoch 5: train_loss=0.8475 val_f1=0.6354, best_val_f1=0.6354
Epoch 6: train_loss=0.8083 val_f1=0.6574, best_val_f1=0.6574
Epoch 7: train_loss=0.7587 val_f1=0.6439, best_val_f1=0.6574
Epoch 8: train_loss=0.7176 val_f1=0.6452, best_val_f1=0.6574
Epoch 9: train_loss=0.6855 val_f1=0.6618, best_val_f1=0.6618
Epoch 10: train_loss=0.6412 val_f1=0.6628, best_val_f1=0.6628
Epoch 11: train_loss=0.6281 val_f1=0.6667, best_val_f1=0.6667
Epoch 12: train_loss=0.5941 val_f1=0.6618, best_val_f1=0.6667
Epoch 13: train_loss=0.5667 val_f1=0.6776, best_val_f1=0.6776
Epoch 14: train_loss=0.5476 val_f1=0.6868, best_val_f1=0.6868
Epoch 15: train_loss=0.5278 val_f1=0.6911, best_val_f1=0.6911
Epoch 16: train_loss=0.5079 val_f1=0.6810, best_val_f1=0.6911
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▃▅▅▆▆▆▇▇▇▇▇▇███████
Train Loss,█▆▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
Validation AUC,▁▆▇▇████████████████
Validation Accuracy,▁▆▆▇▇█▇▇████████████
Validation F1,▁▆▆▇▇█▇▇████████████
Validation Loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂
Validation Precision,▁▆▆▇▇█▇▇████████████
Validation Recall,▁▆▇▇▇███████████████

0,1
Epoch,20.0
Train Accuracy,0.80638
Train Loss,0.46198
Validation AUC,0.91069
Validation Accuracy,0.67355
Validation F1,0.68548
Validation Loss,0.91195
Validation Precision,0.67893
Validation Recall,0.69834


[I 2025-08-19 01:31:52,564] Trial 3 finished with value: 0.6911161852054347 and parameters: {'learning_rate': 2.4765278693100987e-05, 'weight_decay': 0.008414435026754576, 'batch_size': 32, 'num_layers': 4}. Best is trial 0 with value: 0.6945423274562295.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1: train_loss=1.6123 val_f1=0.1752, best_val_f1=0.1752
Epoch 2: train_loss=1.5766 val_f1=0.3418, best_val_f1=0.3418
Epoch 3: train_loss=1.2360 val_f1=0.4675, best_val_f1=0.4675
Epoch 4: train_loss=1.1057 val_f1=0.4808, best_val_f1=0.4808
Epoch 5: train_loss=1.0475 val_f1=0.5025, best_val_f1=0.5025
Epoch 6: train_loss=1.0256 val_f1=0.5375, best_val_f1=0.5375
Epoch 7: train_loss=1.0028 val_f1=0.5324, best_val_f1=0.5375
Epoch 8: train_loss=0.9783 val_f1=0.5506, best_val_f1=0.5506
Epoch 9: train_loss=0.9565 val_f1=0.5639, best_val_f1=0.5639
Epoch 10: train_loss=0.9430 val_f1=0.5741, best_val_f1=0.5741
Epoch 11: train_loss=0.9263 val_f1=0.5744, best_val_f1=0.5744
Epoch 12: train_loss=0.9113 val_f1=0.5730, best_val_f1=0.5744
Epoch 13: train_loss=0.8998 val_f1=0.5818, best_val_f1=0.5818
Epoch 14: train_loss=0.8877 val_f1=0.5820, best_val_f1=0.5820
Epoch 15: train_loss=0.8796 val_f1=0.5892, best_val_f1=0.5892
Epoch 16: train_loss=0.8760 val_f1=0.5894, best_val_f1=0.5894
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▂▅▆▆▇▇▇▇▇▇▇████████
Train Loss,██▅▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁
Validation AUC,▁▄▇▇▇▇▇█████████████
Validation Accuracy,▁▄▆▆▆▇▇▇████████████
Validation F1,▁▄▆▆▇▇▇▇████████████
Validation Loss,█▆▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
Validation Precision,▁▃▅▆▆▇▇▇▇███████████
Validation Recall,▁▄▆▇▇▇▇▇████████████

0,1
Epoch,20.0
Train Accuracy,0.61522
Train Loss,0.8565
Validation AUC,0.87469
Validation Accuracy,0.5792
Validation F1,0.58917
Validation Loss,0.92814
Validation Precision,0.58122
Validation Recall,0.62577


[I 2025-08-19 02:15:55,847] Trial 4 finished with value: 0.5898548849614988 and parameters: {'learning_rate': 1.3975849882027014e-05, 'weight_decay': 0.014060688680822551, 'batch_size': 32, 'num_layers': 2}. Best is trial 0 with value: 0.6945423274562295.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: train_loss=1.6114 val_f1=0.1344, best_val_f1=0.1344
Epoch 2: train_loss=1.4630 val_f1=0.4442, best_val_f1=0.4442
Epoch 3: train_loss=1.0976 val_f1=0.5392, best_val_f1=0.5392
Epoch 4: train_loss=1.0118 val_f1=0.5610, best_val_f1=0.5610
Epoch 5: train_loss=0.9595 val_f1=0.5873, best_val_f1=0.5873
Epoch 6: train_loss=0.9092 val_f1=0.6104, best_val_f1=0.6104
Epoch 7: train_loss=0.8507 val_f1=0.6252, best_val_f1=0.6252
Epoch 8: train_loss=0.8298 val_f1=0.6342, best_val_f1=0.6342
Epoch 9: train_loss=0.7979 val_f1=0.6303, best_val_f1=0.6342
Epoch 10: train_loss=0.7632 val_f1=0.6436, best_val_f1=0.6436
Epoch 11: train_loss=0.7373 val_f1=0.6333, best_val_f1=0.6436
Epoch 12: train_loss=0.7102 val_f1=0.6477, best_val_f1=0.6477
Epoch 13: train_loss=0.6862 val_f1=0.6573, best_val_f1=0.6573
Epoch 14: train_loss=0.6729 val_f1=0.6541, best_val_f1=0.6573
Epoch 15: train_loss=0.6527 val_f1=0.6527, best_val_f1=0.6573
Epoch 16: train_loss=0.6361 val_f1=0.6580, best_val_f1=0.6580
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▂▅▅▆▆▆▇▇▇▇▇▇███████
Train Loss,█▇▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
Validation AUC,▁▆▇▇▇███████████████
Validation Accuracy,▁▄▆▆▇▇▇█▇███████████
Validation F1,▁▅▆▇▇▇██████████████
Validation Loss,█▄▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Precision,▁▄▅▆▆▇▇▇▇█▇█████████
Validation Recall,▁▅▇▇▇▇██████████████

0,1
Epoch,20.0
Train Accuracy,0.74692
Train Loss,0.60152
Validation AUC,0.89937
Validation Accuracy,0.64472
Validation F1,0.65684
Validation Loss,0.90583
Validation Precision,0.64917
Validation Recall,0.67542


[I 2025-08-19 03:11:01,723] Trial 5 finished with value: 0.6611399643232405 and parameters: {'learning_rate': 1.0400043407839611e-05, 'weight_decay': 0.007016634072433377, 'batch_size': 16, 'num_layers': 4}. Best is trial 0 with value: 0.6945423274562295.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1: train_loss=1.6108 val_f1=0.1766, best_val_f1=0.1766
Epoch 2: train_loss=1.2040 val_f1=0.5253, best_val_f1=0.5253
Epoch 3: train_loss=0.9736 val_f1=0.5707, best_val_f1=0.5707
Epoch 4: train_loss=0.8973 val_f1=0.6104, best_val_f1=0.6104
Epoch 5: train_loss=0.8452 val_f1=0.6191, best_val_f1=0.6191
Epoch 6: train_loss=0.7942 val_f1=0.6474, best_val_f1=0.6474
Epoch 7: train_loss=0.7516 val_f1=0.6321, best_val_f1=0.6474
Epoch 8: train_loss=0.7224 val_f1=0.6537, best_val_f1=0.6537
Epoch 9: train_loss=0.6617 val_f1=0.6598, best_val_f1=0.6598
Epoch 10: train_loss=0.6356 val_f1=0.6684, best_val_f1=0.6684
Epoch 11: train_loss=0.6099 val_f1=0.6606, best_val_f1=0.6684
Epoch 12: train_loss=0.5584 val_f1=0.6673, best_val_f1=0.6684
Epoch 13: train_loss=0.5445 val_f1=0.6536, best_val_f1=0.6684
Epoch 14: train_loss=0.5199 val_f1=0.6668, best_val_f1=0.6684
Epoch 15: train_loss=0.4939 val_f1=0.6623, best_val_f1=0.6684
Epoch 16: train_loss=0.4704 val_f1=0.6705, best_val_f1=0.6705
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▄▅▅▆▆▆▆▇▇▇▇▇▇██████
Train Loss,█▆▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
Validation AUC,▁▇▇▇████████████████
Validation Accuracy,▁▆▆▇▇█▇█████████████
Validation F1,▁▆▇▇▇█▇█████████████
Validation Loss,█▂▂▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃
Validation Precision,▁▅▆▇▇█▇█████████████
Validation Recall,▁▆▇▇▇███████████████

0,1
Epoch,20.0
Train Accuracy,0.82825
Train Loss,0.41675
Validation AUC,0.9016
Validation Accuracy,0.65743
Validation F1,0.67059
Validation Loss,1.03347
Validation Precision,0.66615
Validation Recall,0.68113


[I 2025-08-19 03:58:28,779] Trial 6 finished with value: 0.6756210041264392 and parameters: {'learning_rate': 2.195015337071413e-05, 'weight_decay': 0.02897319243303322, 'batch_size': 16, 'num_layers': 3}. Best is trial 0 with value: 0.6945423274562295.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: train_loss=1.5568 val_f1=0.4650, best_val_f1=0.4650
Epoch 2: train_loss=1.0473 val_f1=0.5987, best_val_f1=0.5987
Epoch 3: train_loss=0.8528 val_f1=0.6232, best_val_f1=0.6232
Epoch 4: train_loss=0.7258 val_f1=0.6792, best_val_f1=0.6792
Epoch 5: train_loss=0.6643 val_f1=0.6778, best_val_f1=0.6792
Epoch 6: train_loss=0.5876 val_f1=0.7010, best_val_f1=0.7010
Epoch 7: train_loss=0.5470 val_f1=0.6967, best_val_f1=0.7010
Epoch 8: train_loss=0.5400 val_f1=0.6880, best_val_f1=0.7010
Epoch 9: train_loss=0.4699 val_f1=0.6861, best_val_f1=0.7010
Epoch 10: train_loss=0.4239 val_f1=0.7122, best_val_f1=0.7122
Epoch 11: train_loss=0.3840 val_f1=0.7038, best_val_f1=0.7122
Epoch 12: train_loss=0.3350 val_f1=0.7061, best_val_f1=0.7122
Epoch 13: train_loss=0.3147 val_f1=0.7052, best_val_f1=0.7122
Epoch 14: train_loss=0.2858 val_f1=0.7151, best_val_f1=0.7151
Epoch 15: train_loss=0.2579 val_f1=0.7143, best_val_f1=0.7151
Epoch 16: train_loss=0.2372 val_f1=0.7154, best_val_f1=0.7154
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▄▅▆▆▆▆▆▇▇▇▇▇███████
Train Loss,█▅▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
Validation AUC,▁▆▆▇████████████████
Validation Accuracy,▁▅▅▇▇█▇▇▇███████████
Validation F1,▁▅▅▇▇█▇▇▇███████████
Validation Loss,▇▃▂▁▁▁▂▂▂▂▄▄▄▆▆▇▇███
Validation Precision,▁▅▆▇▇▇▇▇▇█▇█▇███████
Validation Recall,▁▅▆▇▇███████████████

0,1
Epoch,20.0
Train Accuracy,0.92735
Train Loss,0.18345
Validation AUC,0.91666
Validation Accuracy,0.70424
Validation F1,0.71518
Validation Loss,1.30548
Validation Precision,0.71412
Validation Recall,0.71704


[I 2025-08-19 04:43:57,959] Trial 7 finished with value: 0.7170390352498054 and parameters: {'learning_rate': 4.9443785681517195e-05, 'weight_decay': 0.02181298694478672, 'batch_size': 32, 'num_layers': 4}. Best is trial 7 with value: 0.7170390352498054.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1: train_loss=1.6090 val_f1=0.1483, best_val_f1=0.1483
Epoch 2: train_loss=1.3830 val_f1=0.4661, best_val_f1=0.4661
Epoch 3: train_loss=1.0610 val_f1=0.5381, best_val_f1=0.5381
Epoch 4: train_loss=0.9484 val_f1=0.6106, best_val_f1=0.6106
Epoch 5: train_loss=0.8885 val_f1=0.5967, best_val_f1=0.6106
Epoch 6: train_loss=0.8534 val_f1=0.6094, best_val_f1=0.6106
Epoch 7: train_loss=0.8264 val_f1=0.6219, best_val_f1=0.6219
Epoch 8: train_loss=0.7808 val_f1=0.6168, best_val_f1=0.6219
Epoch 9: train_loss=0.7393 val_f1=0.6539, best_val_f1=0.6539
Epoch 10: train_loss=0.7196 val_f1=0.6471, best_val_f1=0.6539
Epoch 11: train_loss=0.6956 val_f1=0.6528, best_val_f1=0.6539
Epoch 12: train_loss=0.6652 val_f1=0.6715, best_val_f1=0.6715
Epoch 13: train_loss=0.6474 val_f1=0.6677, best_val_f1=0.6715
Epoch 14: train_loss=0.6220 val_f1=0.6570, best_val_f1=0.6715
Epoch 15: train_loss=0.6046 val_f1=0.6662, best_val_f1=0.6715
Epoch 16: train_loss=0.5916 val_f1=0.6754, best_val_f1=0.6754
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▃▅▅▆▆▆▇▇▇▇▇▇███████
Train Loss,█▇▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
Validation AUC,▁▆▇▇▇███████████████
Validation Accuracy,▁▄▅▇▇▇▇▇█▇██████████
Validation F1,▁▅▆▇▇▇▇▇████████████
Validation Loss,█▄▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Precision,▁▂▄▆▆▆▆▆▇▇▇██▇██████
Validation Recall,▁▆▆▇▇▇█▇████████████

0,1
Epoch,20.0
Train Accuracy,0.76886
Train Loss,0.54898
Validation AUC,0.90618
Validation Accuracy,0.66391
Validation F1,0.67572
Validation Loss,0.88185
Validation Precision,0.66866
Validation Recall,0.69271


[I 2025-08-19 05:29:24,694] Trial 8 finished with value: 0.6760542455428501 and parameters: {'learning_rate': 2.0859463095012422e-05, 'weight_decay': 0.009329856000113509, 'batch_size': 32, 'num_layers': 4}. Best is trial 7 with value: 0.7170390352498054.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1: train_loss=1.6116 val_f1=0.0978, best_val_f1=0.0978
Epoch 2: train_loss=1.4183 val_f1=0.4737, best_val_f1=0.4737
Epoch 3: train_loss=1.0779 val_f1=0.5745, best_val_f1=0.5745
Epoch 4: train_loss=0.9408 val_f1=0.5795, best_val_f1=0.5795
Epoch 5: train_loss=0.8638 val_f1=0.6187, best_val_f1=0.6187
Epoch 6: train_loss=0.8017 val_f1=0.6395, best_val_f1=0.6395
Epoch 7: train_loss=0.7416 val_f1=0.6403, best_val_f1=0.6403
Epoch 8: train_loss=0.6968 val_f1=0.6536, best_val_f1=0.6536
Epoch 9: train_loss=0.6687 val_f1=0.6532, best_val_f1=0.6536
Epoch 10: train_loss=0.6455 val_f1=0.6471, best_val_f1=0.6536
Epoch 11: train_loss=0.6218 val_f1=0.6640, best_val_f1=0.6640
Epoch 12: train_loss=0.5901 val_f1=0.6470, best_val_f1=0.6640
Epoch 13: train_loss=0.5709 val_f1=0.6578, best_val_f1=0.6640
Epoch 14: train_loss=0.5550 val_f1=0.6516, best_val_f1=0.6640
Epoch 15: train_loss=0.5292 val_f1=0.6525, best_val_f1=0.6640
Epoch 16: train_loss=0.5103 val_f1=0.6610, best_val_f1=0.6640
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
Train Accuracy,▁▃▅▆▆▆▇▇▇▇▇██████
Train Loss,█▇▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁
Validation AUC,▁▆▇▇█████████████
Validation Accuracy,▁▅▆▇▇████████████
Validation F1,▁▆▇▇▇████████████
Validation Loss,█▄▂▂▁▁▁▁▁▁▁▂▂▂▂▂▂
Validation Precision,▁▅▇▇▇████████████
Validation Recall,▁▆▇▇█████████████

0,1
Epoch,17.0
Train Accuracy,0.78333
Train Loss,0.50637
Validation AUC,0.89867
Validation Accuracy,0.6414
Validation F1,0.65287
Validation Loss,0.96029
Validation Precision,0.64581
Validation Recall,0.67437


[I 2025-08-19 06:03:20,475] Trial 9 finished with value: 0.6639700382641764 and parameters: {'learning_rate': 4.376000407045076e-05, 'weight_decay': 0.012348877605247053, 'batch_size': 64, 'num_layers': 3}. Best is trial 7 with value: 0.7170390352498054.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: train_loss=1.5921 val_f1=0.3176, best_val_f1=0.3176
Epoch 2: train_loss=1.0670 val_f1=0.6138, best_val_f1=0.6138
Epoch 3: train_loss=0.8323 val_f1=0.6543, best_val_f1=0.6543
Epoch 4: train_loss=0.7101 val_f1=0.6951, best_val_f1=0.6951
Epoch 5: train_loss=0.6529 val_f1=0.6740, best_val_f1=0.6951
Epoch 6: train_loss=0.5965 val_f1=0.7062, best_val_f1=0.7062
Epoch 7: train_loss=0.5152 val_f1=0.7071, best_val_f1=0.7071
Epoch 8: train_loss=0.4571 val_f1=0.7236, best_val_f1=0.7236
Epoch 9: train_loss=0.3947 val_f1=0.7176, best_val_f1=0.7236
Epoch 10: train_loss=0.3456 val_f1=0.7318, best_val_f1=0.7318
Epoch 11: train_loss=0.2926 val_f1=0.7279, best_val_f1=0.7318
Epoch 12: train_loss=0.2605 val_f1=0.7301, best_val_f1=0.7318
Epoch 13: train_loss=0.2366 val_f1=0.7379, best_val_f1=0.7379
Epoch 14: train_loss=0.2012 val_f1=0.7265, best_val_f1=0.7379
Epoch 15: train_loss=0.1861 val_f1=0.7352, best_val_f1=0.7379
Epoch 16: train_loss=0.1650 val_f1=0.7365, best_val_f1=0.7379
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
Train Accuracy,▁▄▅▆▆▆▆▇▇▇▇▇███████
Train Loss,█▅▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁
Validation AUC,▁▆▇█▇██████████████
Validation Accuracy,▁▆▆▇▇▇▇████████████
Validation F1,▁▆▇▇▇▇▇████████████
Validation Loss,█▂▂▁▁▁▂▁▂▂▃▄▄▅▆▇▇██
Validation Precision,▁▆▆▇▇▇▇█▇██████████
Validation Recall,▁▆▇▇▇██████████████

0,1
Epoch,19.0
Train Accuracy,0.95189
Train Loss,0.12779
Validation AUC,0.92745
Validation Accuracy,0.72425
Validation F1,0.73391
Validation Loss,1.35739
Validation Precision,0.73057
Validation Recall,0.73909


[I 2025-08-19 06:49:20,011] Trial 10 finished with value: 0.7378896406857568 and parameters: {'learning_rate': 4.974146375224901e-05, 'weight_decay': 0.005145577997310733, 'batch_size': 32, 'num_layers': 5}. Best is trial 10 with value: 0.7378896406857568.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: train_loss=1.5555 val_f1=0.4467, best_val_f1=0.4467
Epoch 2: train_loss=1.0125 val_f1=0.6266, best_val_f1=0.6266
Epoch 3: train_loss=0.8020 val_f1=0.6832, best_val_f1=0.6832
Epoch 4: train_loss=0.6708 val_f1=0.7005, best_val_f1=0.7005
Epoch 5: train_loss=0.6022 val_f1=0.7060, best_val_f1=0.7060
Epoch 6: train_loss=0.5238 val_f1=0.7128, best_val_f1=0.7128
Epoch 7: train_loss=0.4787 val_f1=0.7126, best_val_f1=0.7128
Epoch 8: train_loss=0.4497 val_f1=0.7196, best_val_f1=0.7196
Epoch 9: train_loss=0.3881 val_f1=0.7179, best_val_f1=0.7196
Epoch 10: train_loss=0.3628 val_f1=0.7254, best_val_f1=0.7254
Epoch 11: train_loss=0.3156 val_f1=0.7281, best_val_f1=0.7281
Epoch 12: train_loss=0.2614 val_f1=0.7301, best_val_f1=0.7301
Epoch 13: train_loss=0.2128 val_f1=0.7272, best_val_f1=0.7301
Epoch 14: train_loss=0.1966 val_f1=0.7339, best_val_f1=0.7339
Epoch 15: train_loss=0.1769 val_f1=0.7355, best_val_f1=0.7355
Epoch 16: train_loss=0.1554 val_f1=0.7285, best_val_f1=0.7355
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▄▅▆▆▆▆▇▇▇▇▇████████
Train Loss,█▅▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁
Validation AUC,▁▆▇▇▇███████████████
Validation Accuracy,▁▅▇▇▇▇▇█████████████
Validation F1,▁▅▇▇▇▇▇█████████████
Validation Loss,▅▂▁▁▁▁▁▂▃▃▃▄▅▅▅▆▇▇██
Validation Precision,▁▅▇▇▇▇▇▇████████████
Validation Recall,▁▅▇▇▇█▇█████████████

0,1
Epoch,20.0
Train Accuracy,0.95606
Train Loss,0.11496
Validation AUC,0.92816
Validation Accuracy,0.72522
Validation F1,0.73518
Validation Loss,1.44857
Validation Precision,0.73211
Validation Recall,0.73902


[I 2025-08-19 07:37:44,705] Trial 11 finished with value: 0.7355360923348476 and parameters: {'learning_rate': 4.945934716001909e-05, 'weight_decay': 0.005424771713017307, 'batch_size': 32, 'num_layers': 5}. Best is trial 10 with value: 0.7378896406857568.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1: train_loss=1.6094 val_f1=0.1227, best_val_f1=0.1227
Epoch 2: train_loss=1.1713 val_f1=0.6064, best_val_f1=0.6064
Epoch 3: train_loss=0.8903 val_f1=0.6502, best_val_f1=0.6502
Epoch 4: train_loss=0.7588 val_f1=0.6664, best_val_f1=0.6664
Epoch 5: train_loss=0.6876 val_f1=0.6849, best_val_f1=0.6849
Epoch 6: train_loss=0.6329 val_f1=0.6828, best_val_f1=0.6849
Epoch 7: train_loss=0.5914 val_f1=0.7023, best_val_f1=0.7023
Epoch 8: train_loss=0.5365 val_f1=0.7096, best_val_f1=0.7096
Epoch 9: train_loss=0.4860 val_f1=0.7084, best_val_f1=0.7096
Epoch 10: train_loss=0.4542 val_f1=0.6886, best_val_f1=0.7096
Epoch 11: train_loss=0.4158 val_f1=0.7091, best_val_f1=0.7096
Epoch 12: train_loss=0.3815 val_f1=0.7077, best_val_f1=0.7096
Epoch 13: train_loss=0.3478 val_f1=0.7074, best_val_f1=0.7096
Epoch 14: train_loss=0.3233 val_f1=0.7137, best_val_f1=0.7137
Epoch 15: train_loss=0.2981 val_f1=0.7123, best_val_f1=0.7137
Epoch 16: train_loss=0.2804 val_f1=0.7176, best_val_f1=0.7176
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▄▅▆▆▆▆▇▇▇▇▇▇███████
Train Loss,█▆▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
Validation AUC,▁▇▇█████████████████
Validation Accuracy,▁▆▇▇█▇██████████████
Validation F1,▁▇▇▇████████████████
Validation Loss,█▂▁▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄
Validation Precision,▁▇▇▇████████████████
Validation Recall,▁▆▇█████████████████

0,1
Epoch,20.0
Train Accuracy,0.90979
Train Loss,0.22411
Validation AUC,0.92236
Validation Accuracy,0.70667
Validation F1,0.71715
Validation Loss,1.13448
Validation Precision,0.71335
Validation Recall,0.72197


[I 2025-08-19 08:26:07,490] Trial 12 finished with value: 0.7176063724548684 and parameters: {'learning_rate': 3.368383014427381e-05, 'weight_decay': 0.0053975720531280105, 'batch_size': 32, 'num_layers': 5}. Best is trial 10 with value: 0.7378896406857568.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: train_loss=1.6102 val_f1=0.1531, best_val_f1=0.1531
Epoch 2: train_loss=1.1870 val_f1=0.5639, best_val_f1=0.5639
Epoch 3: train_loss=0.9173 val_f1=0.6325, best_val_f1=0.6325
Epoch 4: train_loss=0.8110 val_f1=0.6625, best_val_f1=0.6625
Epoch 5: train_loss=0.7382 val_f1=0.6657, best_val_f1=0.6657
Epoch 6: train_loss=0.6647 val_f1=0.6834, best_val_f1=0.6834
Epoch 7: train_loss=0.6064 val_f1=0.6636, best_val_f1=0.6834
Epoch 8: train_loss=0.5783 val_f1=0.6702, best_val_f1=0.6834
Epoch 9: train_loss=0.5235 val_f1=0.7000, best_val_f1=0.7000
Epoch 10: train_loss=0.4721 val_f1=0.6863, best_val_f1=0.7000
Epoch 11: train_loss=0.4349 val_f1=0.6970, best_val_f1=0.7000
Epoch 12: train_loss=0.4021 val_f1=0.7027, best_val_f1=0.7027
Epoch 13: train_loss=0.3865 val_f1=0.7113, best_val_f1=0.7113
Epoch 14: train_loss=0.3438 val_f1=0.7080, best_val_f1=0.7113
Epoch 15: train_loss=0.2931 val_f1=0.7015, best_val_f1=0.7113
Epoch 16: train_loss=0.2811 val_f1=0.7076, best_val_f1=0.7113
Epoch 17: train_l

0,1
Epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
Train Accuracy,▁▄▅▅▆▆▆▇▇▇▇▇▇▇█████
Train Loss,█▆▅▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁
Validation AUC,▁▆▇▇███████████████
Validation Accuracy,▁▆▇▇▇█▇▇███████████
Validation F1,▁▆▇▇▇█▇▇███████████
Validation Loss,█▃▂▁▁▁▁▂▁▂▂▂▂▃▄▃▄▄▄
Validation Precision,▁▅▆▇▇▇▇▇█▇█████████
Validation Recall,▁▆▇▇▇██████████████

0,1
Epoch,19.0
Train Accuracy,0.9101
Train Loss,0.22365
Validation AUC,0.91838
Validation Accuracy,0.69866
Validation F1,0.71016
Validation Loss,1.17351
Validation Precision,0.70681
Validation Recall,0.71528


[I 2025-08-19 09:13:01,582] Trial 13 finished with value: 0.711288999794248 and parameters: {'learning_rate': 3.75008962541288e-05, 'weight_decay': 0.005042121877492635, 'batch_size': 32, 'num_layers': 5}. Best is trial 10 with value: 0.7378896406857568.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 3e-2)
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1: train_loss=1.6070 val_f1=0.1938, best_val_f1=0.1938
Epoch 2: train_loss=1.1848 val_f1=0.5972, best_val_f1=0.5972
Epoch 3: train_loss=0.9125 val_f1=0.6139, best_val_f1=0.6139
Epoch 4: train_loss=0.7977 val_f1=0.6488, best_val_f1=0.6488
Epoch 5: train_loss=0.7276 val_f1=0.6818, best_val_f1=0.6818
Epoch 6: train_loss=0.6773 val_f1=0.6842, best_val_f1=0.6842
Epoch 7: train_loss=0.6404 val_f1=0.6968, best_val_f1=0.6968
Epoch 8: train_loss=0.5982 val_f1=0.6924, best_val_f1=0.6968
Epoch 9: train_loss=0.5572 val_f1=0.6936, best_val_f1=0.6968
Epoch 10: train_loss=0.5295 val_f1=0.7075, best_val_f1=0.7075
Epoch 11: train_loss=0.4894 val_f1=0.7015, best_val_f1=0.7075
Epoch 12: train_loss=0.4621 val_f1=0.6915, best_val_f1=0.7075
Epoch 13: train_loss=0.4342 val_f1=0.7026, best_val_f1=0.7075
Epoch 14: train_loss=0.4038 val_f1=0.7024, best_val_f1=0.7075
Epoch 15: train_loss=0.3856 val_f1=0.7039, best_val_f1=0.7075
Epoch 16: train_loss=0.3640 val_f1=0.7059, best_val_f1=0.7075


0,1
Epoch,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
Train Accuracy,▁▄▅▆▆▆▇▇▇▇▇█████
Train Loss,█▆▄▃▃▃▃▂▂▂▂▂▁▁▁▁
Validation AUC,▁▇▇█████████████
Validation Accuracy,▁▆▇▇████████████
Validation F1,▁▆▇▇████████████
Validation Loss,█▃▂▁▁▁▁▁▁▁▁▂▂▂▂▃
Validation Precision,▁▇▇▇████████████
Validation Recall,▁▆▇▇████████████

0,1
Epoch,16.0
Train Accuracy,0.84932
Train Loss,0.36403
Validation AUC,0.91697
Validation Accuracy,0.69404
Validation F1,0.70586
Validation Loss,0.97215
Validation Precision,0.70063
Validation Recall,0.71811


[I 2025-08-19 09:52:27,481] Trial 14 finished with value: 0.7075298468661098 and parameters: {'learning_rate': 2.8145246238112575e-05, 'weight_decay': 0.0065469647902776165, 'batch_size': 32, 'num_layers': 5}. Best is trial 10 with value: 0.7378896406857568.


# Testing - note didn't run yet

In [None]:
# Function to evaluate the model
def evaluate_model(model_path, test_loader):
    # Load the model
    model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=5, ignore_mismatched_sizes=True).to(device)
    model.load_state_dict(torch.load(model_path)) # loading the trained model
    model = model.to(device)
    model.eval() # eval mode

    all_labels = []
    all_preds = []

    #same idea... just testing and getting resuts...
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = logits.argmax(dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')

    return {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1}

In [None]:
# Load the test data set
test_dataset = DataLoader(test_df, AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", use_fast=True))
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Test multiple models
model_paths = ["best_model_trial_0.pt"]  # Replace with actual model paths
for model_path in model_paths:
    metrics = evaluate_model(model_path, test_loader)
    print(f"Metrics for {model_path}:")
    for key, value in metrics.items():
        print(f"{key}: {value:.4f}")