In [2]:
!pip install -q transformers[torch] datasets accelerate optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/413.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    get_linear_schedule_with_warmup
)
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from sklearn.metrics import confusion_matrix, classification_report
import os
import warnings
warnings.filterwarnings('ignore')

print("✓ All imports successful!")

✓ All imports successful!


In [None]:
class Config:
    MODEL_NAME = 'roberta-base'
    NUM_LABELS = 2
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    NUM_WORKERS = 0
    SEED = 42
    DATA_DIR = r'C:\Users\JASWA\Documents\Projects_AI\Twitter_sentiment_Analysis_NLP\data\processed'
    MODEL_SAVE_DIR = r'C:\Users\JASWA\Documents\Projects_AI\Twitter_sentiment_Analysis_NLP\models\checkpoints'
    RESULTS_DIR = r'C:\Users\JASWA\Documents\Projects_AI\Twitter_sentiment_Analysis_NLP\results'

config = Config()

In [7]:
print(f"Using device: {config.DEVICE}")
print(f"CUDA available: {torch.cuda.is_available()}")

Using device: cuda
CUDA available: True


In [5]:
class TweetSentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=64):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get text and label
        text = str(self.data.loc[idx, 'text_clean'])
        label = int(self.data.loc[idx, 'sentiment'])

        # Tokenize
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

print("✓ Dataset class created!")

✓ Dataset class created!


In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
print("Loading datasets...")
train_df = pd.read_csv(f'{config.DATA_DIR}/train.csv')
val_df = pd.read_csv(f'{config.DATA_DIR}/val.csv')

print(f"Train: {len(train_df):,}")
print(f"Val: {len(val_df):,}\n")


Loading datasets...
Train: 79,817
Val: 9,977



In [13]:
def objective(trial):
    """
    Optuna will call this function many times
    with different hyperparameters to find the best combo
    """

    # Suggest hyperparameters to try
    max_length = trial.suggest_categorical('max_length', [32, 64, 128])
    batch_size = trial.suggest_categorical('batch_size', [8, 16,32])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    warmup_steps = trial.suggest_int('warmup_steps', 0, 1000)
    weight_decay = trial.suggest_float('weight_decay', 0.0, 0.1)

    print(f"\n{'='*60}")
    print(f"Trial {trial.number}")
    print(f"{'='*60}")
    print(f"Parameters:")
    print(f"  - max_length: {max_length}")
    print(f"  - batch_size: {batch_size}")
    print(f"  - learning_rate: {learning_rate:.2e}")
    print(f"  - warmup_steps: {warmup_steps}")
    print(f"  - weight_decay: {weight_decay:.4f}")
    print("="*60 + "\n")

    # Load tokenizer and model
    tokenizer = RobertaTokenizer.from_pretrained(config.MODEL_NAME)
    model = RobertaForSequenceClassification.from_pretrained(
        config.MODEL_NAME,
        num_labels=config.NUM_LABELS
    )
    model.to(config.DEVICE)

    # Create datasets
    train_dataset = TweetSentimentDataset(train_df, tokenizer, max_length)
    val_dataset = TweetSentimentDataset(val_df, tokenizer, max_length)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=config.NUM_WORKERS
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=config.NUM_WORKERS
    )

    # Setup optimizer
    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay
    )

    total_steps = len(train_loader) * 2  # Only 2 epochs for speed
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Train for 2 epochs only (for speed)
    best_val_acc = 0

    for epoch in range(2):
        # Train
        model.train()
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/2'):
            input_ids = batch['input_ids'].to(config.DEVICE)
            attention_mask = batch['attention_mask'].to(config.DEVICE)
            labels = batch['labels'].to(config.DEVICE)

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        # Validate
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(config.DEVICE)
                attention_mask = batch['attention_mask'].to(config.DEVICE)
                labels = batch['labels'].to(config.DEVICE)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                preds = torch.argmax(outputs.logits, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        val_acc = correct / total

        if val_acc > best_val_acc:
            best_val_acc = val_acc

        print(f"Epoch {epoch+1}: Val Acc = {val_acc:.4f}")

        # Report intermediate value for pruning
        trial.report(val_acc, epoch)

        # Handle pruning
        if trial.should_prune():
            raise optuna.TrialPruned()

    print(f"\nBest Val Acc: {best_val_acc:.4f}\n")

    # Clean up
    del model
    del optimizer
    del scheduler
    torch.cuda.empty_cache()

    return best_val_acc


In [14]:
study = optuna.create_study(
    direction='maximize',
    pruner=optuna.pruners.MedianPruner()
)

study.optimize(
    objective,
    n_trials=20,
    timeout=7200
)

[I 2026-01-26 12:52:02,655] A new study created in memory with name: no-name-ede534c7-99a0-49f3-a339-acb988920fba



Trial 0
Parameters:
  - max_length: 32
  - batch_size: 8
  - learning_rate: 2.57e-05
  - warmup_steps: 458
  - weight_decay: 0.0027



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2:   0%|          | 0/9978 [00:00<?, ?it/s]

Epoch 1: Val Acc = 0.8445


Epoch 2/2:   0%|          | 0/9978 [00:00<?, ?it/s]

[I 2026-01-26 13:21:51,476] Trial 0 finished with value: 0.8581737997394007 and parameters: {'max_length': 32, 'batch_size': 8, 'learning_rate': 2.567767497201475e-05, 'warmup_steps': 458, 'weight_decay': 0.0026813592755050354}. Best is trial 0 with value: 0.8581737997394007.


Epoch 2: Val Acc = 0.8582

Best Val Acc: 0.8582


Trial 1
Parameters:
  - max_length: 128
  - batch_size: 32
  - learning_rate: 1.98e-05
  - warmup_steps: 226
  - weight_decay: 0.0872



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2:   0%|          | 0/2495 [00:00<?, ?it/s]

Epoch 1: Val Acc = 0.8568


Epoch 2/2:   0%|          | 0/2495 [00:00<?, ?it/s]

[I 2026-01-26 14:15:11,121] Trial 1 finished with value: 0.8678961611706926 and parameters: {'max_length': 128, 'batch_size': 32, 'learning_rate': 1.9755575225039977e-05, 'warmup_steps': 226, 'weight_decay': 0.08717405559930652}. Best is trial 1 with value: 0.8678961611706926.


Epoch 2: Val Acc = 0.8679

Best Val Acc: 0.8679


Trial 2
Parameters:
  - max_length: 128
  - batch_size: 32
  - learning_rate: 1.53e-05
  - warmup_steps: 716
  - weight_decay: 0.0027



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2:   0%|          | 0/2495 [00:00<?, ?it/s]

Epoch 1: Val Acc = 0.8595


Epoch 2/2:   0%|          | 0/2495 [00:00<?, ?it/s]

[I 2026-01-26 15:08:30,704] Trial 2 finished with value: 0.8616818682970833 and parameters: {'max_length': 128, 'batch_size': 32, 'learning_rate': 1.52623825694476e-05, 'warmup_steps': 716, 'weight_decay': 0.0026914012959015302}. Best is trial 1 with value: 0.8678961611706926.


Epoch 2: Val Acc = 0.8617

Best Val Acc: 0.8617


Trial 3
Parameters:
  - max_length: 32
  - batch_size: 32
  - learning_rate: 1.37e-05
  - warmup_steps: 375
  - weight_decay: 0.0142



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2:   0%|          | 0/2495 [00:00<?, ?it/s]

Epoch 1: Val Acc = 0.8501


Epoch 2/2:   0%|          | 0/2495 [00:00<?, ?it/s]

[I 2026-01-26 15:25:36,081] Trial 3 finished with value: 0.8582740302696201 and parameters: {'max_length': 32, 'batch_size': 32, 'learning_rate': 1.3703525555159499e-05, 'warmup_steps': 375, 'weight_decay': 0.014183013483337248}. Best is trial 1 with value: 0.8678961611706926.


Epoch 2: Val Acc = 0.8583

Best Val Acc: 0.8583


Trial 4
Parameters:
  - max_length: 32
  - batch_size: 32
  - learning_rate: 2.47e-05
  - warmup_steps: 246
  - weight_decay: 0.0310



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2:   0%|          | 0/2495 [00:00<?, ?it/s]

Epoch 1: Val Acc = 0.8543


Epoch 2/2:   0%|          | 0/2495 [00:00<?, ?it/s]

[I 2026-01-26 15:42:42,095] Trial 4 finished with value: 0.8632855567805954 and parameters: {'max_length': 32, 'batch_size': 32, 'learning_rate': 2.4734886712604022e-05, 'warmup_steps': 246, 'weight_decay': 0.030985097846734713}. Best is trial 1 with value: 0.8678961611706926.


Epoch 2: Val Acc = 0.8633

Best Val Acc: 0.8633


Trial 5
Parameters:
  - max_length: 128
  - batch_size: 16
  - learning_rate: 3.66e-05
  - warmup_steps: 139
  - weight_decay: 0.0655



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2:   0%|          | 0/4989 [00:00<?, ?it/s]

[I 2026-01-26 16:12:11,490] Trial 5 pruned. 


Epoch 1: Val Acc = 0.8518

Trial 6
Parameters:
  - max_length: 128
  - batch_size: 32
  - learning_rate: 2.14e-05
  - warmup_steps: 54
  - weight_decay: 0.0298



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2:   0%|          | 0/2495 [00:00<?, ?it/s]

Epoch 1: Val Acc = 0.8599


Epoch 2/2:   0%|          | 0/2495 [00:00<?, ?it/s]

[I 2026-01-26 17:05:30,957] Trial 6 finished with value: 0.8640874010223514 and parameters: {'max_length': 128, 'batch_size': 32, 'learning_rate': 2.143361718450062e-05, 'warmup_steps': 54, 'weight_decay': 0.029794758622933404}. Best is trial 1 with value: 0.8678961611706926.


Epoch 2: Val Acc = 0.8641

Best Val Acc: 0.8641



In [19]:

print(f"\nBest trial:")
trial = study.best_trial

print(f"  Value (Val Acc): {trial.value:.4f}")
print(f"\nBest hyperparameters:")
for key, value in trial.params.items():
    print(f"  - {key}: {value}")

# Save results
results_df = study.trials_dataframe()
results_df.to_csv(f'{config.RESULTS_DIR}/optuna_results.csv', index=False)

print(f"\n✓ Results saved to: {config.RESULTS_DIR}/optuna_results.csv")


Best trial:
  Value (Val Acc): 0.8679

Best hyperparameters:
  - max_length: 128
  - batch_size: 32
  - learning_rate: 1.9755575225039977e-05
  - warmup_steps: 226
  - weight_decay: 0.08717405559930652

✓ Results saved to: /content/drive/MyDrive/nlp/optuna_results.csv


In [20]:
fig1 = plot_optimization_history(study)
fig1.write_html(f'{config.RESULTS_DIR}/optuna_history.html')

# Parameter importance
fig2 = plot_param_importances(study)
fig2.write_html(f'{config.RESULTS_DIR}/optuna_importance.html')

print(f"✓ Visualizations saved!")

✓ Visualizations saved!
