In [None]:
import pandas as pd
import torch
import optuna
from torch.utils.data import DataLoader
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

#from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch import nn, optim
from torch.utils.data import DataLoader
import wandb
from torch.utils.data import Dataset

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Data

The dataset used in this project is sourced from [Kaggle - Coronavirus tweets NLP](https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification/data).

In [None]:
# Load the dataset
df_train = pd.read_csv("Corona_NLP_train.csv", encoding='latin1')
df_test = pd.read_csv("Corona_NLP_test.csv", encoding='latin1')

# Display first few rows
df_test.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [None]:
# map sentiment to integers
sentiment_map = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}

df_train['Label'] = df_train['Sentiment'].map(sentiment_map)
df_test['Label'] = df_test['Sentiment'].map(sentiment_map)

df_test.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Unnamed: 6,1,Label
0,1,44953,NYC,02/03/2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,,23,0
1,2,44954,"Seattle, WA",02/03/2020,When I couldn't find hand sanitizer at Fred Me...,Positive,,30,3
2,3,44955,,02/03/2020,Find out how you can protect yourself and love...,Extremely Positive,,13,4
3,4,44956,Chicagoland,02/03/2020,#Panic buying hits #NewYork City as anxious sh...,Negative,,35,1
4,5,44957,"Melbourne, Victoria",03/03/2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,,26,2


#### Split the training data into train and eval

In [9]:
train_df, eval_df = train_test_split(
    df_train,
    test_size=0.2,
    random_state=42,
    stratify=df_train['Label']
)

train_df = train_df[['OriginalTweet', 'Label']].reset_index(drop=True)
eval_df = eval_df[['OriginalTweet', 'Label']].reset_index(drop=True)
test_df = df_test[['OriginalTweet', 'Label']].reset_index(drop=True)

### Look at the model

In [None]:
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=5).to(device)
model # Lets just look at the structure of the reoerta model from HF

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

### Check how the data is tokenized

In [None]:
# check num of tokens to choose truncation max_length
tok = RobertaTokenizer.from_pretrained("roberta-large", use_fast=True)
lens = df_train['OriginalTweet'].astype(str).map(lambda t: len(tok.encode(t, add_special_tokens=True)))
lens.describe(), lens.quantile([0.90, 0.95, 0.99]).to_dict()

(count    41157.000000
 mean        58.824307
 std         23.142010
 min          5.000000
 25%         42.000000
 50%         58.000000
 75%         74.000000
 max        184.000000
 Name: OriginalTweet, dtype: float64,
 {0.9: 89.0, 0.95: 98.0, 0.99: 118.0})

In [None]:
## Setup & Train

In [12]:
class TweetDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.texts = dataframe['OriginalTweet'].tolist()
        self.labels = dataframe['Label'].tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [13]:
def early_stop_check(patience, best_val_accuracy, best_val_accuracy_epoch, current_val_accuracy, current_val_accuracy_epoch):
    early_stop_flag = False
    if current_val_accuracy > best_val_accuracy:
        best_val_accuracy = current_val_accuracy
        best_val_accuracy_epoch = current_val_accuracy_epoch
    else:
        if current_val_accuracy_epoch - best_val_accuracy_epoch > patience:
            early_stop_flag = True
    return best_val_accuracy, best_val_accuracy_epoch, early_stop_flag

In [14]:
def train_model_with_hyperparams(model, train_loader, val_loader, optimizer, criterion, epochs, patience, trial, scheduler):
    best_val_accuracy = 0.0
    best_val_accuracy_epoch = 0
    early_stop_flag = False
    best_model_state = None

    # Enable automatic mixed precision on CUDA for stability/speed
    scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))

    for epoch in range(1, epochs + 1):
        model.train() # Enable training mode
        train_loss = 0.0
        total_train_samples = 0
        correct_train_predictions = 0

        for batch in train_loader: #Iterates over the train_loader, which is a DataLoader object containing batches of training data.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad(set_to_none=True) # Reset gradients

            # Forward pass (with AMP); save the logits (the raw output of the model) and calculate loss
            with torch.amp.autocast("cuda", enabled=(device.type == "cuda")):
                outputs = model(input_ids, attention_mask=attention_mask) # Forward pass
                logits = outputs.logits
                loss = criterion(logits, labels) # Calculate loss

            # Backward pass (with AMP) + gradient clipping, then update weights using the optimizer
            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # NEW: prevent exploding gradients
            scaler.step(optimizer)
            scaler.update()

            # NEW: step the LR scheduler once per optimizer step
            if scheduler is not None:
                scheduler.step()

            # Accumulate training loss and predictions
            train_loss += loss.item() * input_ids.size(0)
            total_train_samples += input_ids.size(0)
            correct_train_predictions += (logits.argmax(dim=1) == labels).sum().item()

        train_loss /= total_train_samples
        train_accuracy = correct_train_predictions / total_train_samples

        ###  Validation loop  ###
        model.eval() # Enable evaluation mode
        val_loss = 0.0
        total_val_samples = 0
        correct_val_predictions = 0

        all_val_labels = []
        all_val_preds = []

        with torch.no_grad(): # Disable gradient computation
            for batch in val_loader: # iterate on the val_loader's batches 
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                loss = criterion(logits, labels)

                val_loss += loss.item() * input_ids.size(0)
                total_val_samples += input_ids.size(0)
                correct_val_predictions += (logits.argmax(dim=1) == labels).sum().item()

                all_val_labels.extend(labels.cpu().numpy())
                all_val_preds.extend(logits.argmax(dim=1).cpu().numpy())

        # calculate metrics 
        val_loss /= total_val_samples
        val_accuracy = correct_val_predictions / total_val_samples
        val_precision = precision_score(all_val_labels, all_val_preds, average='weighted')
        val_recall = recall_score(all_val_labels, all_val_preds, average='weighted')
        val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted')

        # Check for early stopping (UNCHANGED: still based on accuracy)
        best_val_accuracy, best_val_accuracy_epoch, early_stop_flag = early_stop_check(
            patience, best_val_accuracy, best_val_accuracy_epoch, val_accuracy, epoch
        )

        # Save the best model under the best_model_state parameter (UNCHANGED: still by accuracy)
        if val_accuracy == best_val_accuracy:
            best_model_state = model.state_dict()

        # Log metrics to Weights & Biases - THIS IS WHERE WE TRACK THE RESULTS AND THE PROCESS
        wandb.log({ #log == logging of the training process (e.g. results) - will be done each epoch
            "Epoch": epoch,
            "Train Loss": train_loss,
            "Train Accuracy": train_accuracy,
            "Validation Loss": val_loss,
            "Validation Accuracy": val_accuracy,
            "Validation Precision": val_precision,
            "Validation Recall": val_recall,
            "Validation F1": val_f1
        })

        if early_stop_flag:  # Checks whether the early stopping condition has been met, as indicated by the early_stop_flag
            break # Exits the training loop immediately if the early stopping condition is satisfied

    if best_model_state is not None: # Save the best model as a .pt file
        torch.save(best_model_state, f"best_model_trial_{trial.number}.pt")

    return best_val_accuracy

In [15]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-large", use_fast=True)

# Objective Function for Optuna
def objective(trial):
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 3e-5)
    weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 2e-2)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])  # use grad accumulation if VRAM is tight
    patience   = trial.suggest_int("patience", 4, 8)
    num_layers = trial.suggest_categorical("num_layers", [4, 6, 8, 12])

    train_dataset = TweetDataset(train_df, tokenizer) # Create the TweetDataset object
    val_dataset = TweetDataset(eval_df, tokenizer)    # Create the TweetDataset object

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=6, pin_memory=True, persistent_workers=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,   # NEW: no shuffle for validation
                            num_workers=6, pin_memory=True, persistent_workers=True)

    model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=5).to(device) # initialize RoBerta large from HF, num_labels=5 -> 5 classes.

    for param in model.roberta.parameters():    # Freeze layers
        param.requires_grad = False
    for param in model.roberta.encoder.layer[-num_layers:].parameters():     # unfreeze the last "num_layers" of the encoder
        param.requires_grad = True
    for param in model.classifier.parameters():    #unfreeze the classifier
        param.requires_grad = True

    # Define optimizer and loss function
    # NEW: AdamW optimizer (transformer-friendly) instead of plain Adam
    from torch.optim import AdamW
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # NEW: class-weighted CrossEntropy to handle label imbalance
    counts = train_df['Label'].value_counts().sort_index().values
    weights = torch.tensor((counts.sum() / (counts + 1e-9)), dtype=torch.float32, device=device)
    criterion = nn.CrossEntropyLoss(weight=weights)

    # NEW: LR scheduler with warmup (linear decay)
    from transformers import get_linear_schedule_with_warmup
    epochs = 20  # keep your epoch budget here so we can compute total steps
    num_training_steps = epochs * len(train_loader)
    num_warmup_steps = int(0.06 * num_training_steps)  # ~6% warmup
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    # Initialize Weights & Biases - the values in the config are the properties of each trial.
    wandb.init(project="roberta-covid-sentiment-maxlen128",
               config={ 
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "patience": patience,
        "batch_size": batch_size,
        "num_layers": num_layers,
        "architecture": "RoBERTa",
        "dataset": "COVID-19 NLP"}, 
        name=f"trial_{trial.number}") # The name that will be saved in the W&B platform

    # Train the model and get the best validation accuracy
    best_val_accuracy = train_model_with_hyperparams(
        model, train_loader, val_loader, optimizer, criterion,
        epochs=epochs, patience=patience, trial=trial, scheduler=scheduler
    )

    wandb.finish() # Finish the Weights & Biases run
    
    return best_val_accuracy # Return best validation acc as the objective to maximize

In [None]:
# Optuna Study
study = optuna.create_study(direction="maximize")  # Specifies that the goal of the optimization is to maximize the objective function
study.optimize(objective, n_trials=10)

[I 2025-08-12 20:45:57,049] A new study created in memory with name: no-name-592a5201-be08-42bd-b454-70dd2e16a828
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 3e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 2e-2)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33myoyulia[0m ([33myoyulia-tel-aviv-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▄▅▅▆▆▆▇▇▇▇▇▇███████
Train Loss,█▆▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
Validation Accuracy,▁▄▅▆▆▇▇▇▇▇▇█▇█▇█████
Validation F1,▁▄▅▇▆▇▇▇█▇██▇███████
Validation Loss,█▄▃▂▂▁▁▁▁▁▁▁▁▁▂▁▁▁▁▂
Validation Precision,▁▄▅▆▆▇▇▇▇▇██████████
Validation Recall,▁▄▅▆▆▇▇▇▇▇▇█▇█▇█████

0,1
Epoch,20.0
Train Accuracy,0.83204
Train Loss,0.40595
Validation Accuracy,0.71016
Validation F1,0.70813
Validation Loss,0.79373
Validation Precision,0.71337
Validation Recall,0.71016


[I 2025-08-12 22:10:39,622] Trial 0 finished with value: 0.7175655976676385 and parameters: {'learning_rate': 1.8084585310083114e-05, 'weight_decay': 0.009508208736324634, 'batch_size': 32, 'patience': 5, 'num_layers': 6}. Best is trial 0 with value: 0.7175655976676385.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 3e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 2e-2)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
Epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
Train Accuracy,▁▄▅▅▆▆▇▇▇▇▇█████████
Train Loss,█▅▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁
Validation Accuracy,▁▄▅▆▇▇▇▇▇▇▇█▇██▇████
Validation F1,▁▄▅▆▇▇▇▇▇▇▇█▇██▇████
Validation Loss,▃▂▂▁▁▁▁▁▁▂▂▂▃▄▅▇▇███
Validation Precision,▁▄▅▆▇▇▇▇▇█▇█▇███████
Validation Recall,▁▄▅▆▇▇▇▇▇▇▇█▇██▇████

0,1
Epoch,20.0
Train Accuracy,0.98147
Train Loss,0.14128
Validation Accuracy,0.8122
Validation F1,0.8118
Validation Loss,2.17918
Validation Precision,0.81256
Validation Recall,0.8122


[I 2025-08-13 00:19:05,109] Trial 1 finished with value: 0.8134110787172012 and parameters: {'learning_rate': 1.6189611749289203e-05, 'weight_decay': 0.008651522228555737, 'batch_size': 16, 'patience': 6, 'num_layers': 12}. Best is trial 1 with value: 0.8134110787172012.
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 3e-5)
  weight_decay = trial.suggest_loguniform("weight_decay", 5e-3, 2e-2)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#

# Testing - note didn't run yet

In [None]:
# Function to evaluate the model
def evaluate_model(model_path, test_loader):
    # Load the model
    model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=5)
    model.load_state_dict(torch.load(model_path)) # loading the trained model
    model = model.to(device)
    model.eval() # eval mode

    all_labels = []
    all_preds = []

    #same idea... just testing and getting resuts...
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = logits.argmax(dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1}

In [None]:
# Load the test data set
test_dataset = DataLoader(test_df, RobertaTokenizer.from_pretrained('roberta-large'))
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Test multiple models
model_paths = ["best_model_trial_0.pt"]  # Replace with actual model paths
for model_path in model_paths:
    metrics = evaluate_model(model_path, test_loader)
    print(f"Metrics for {model_path}:")
    for key, value in metrics.items():
        print(f"{key}: {value:.4f}")