In [1]:
# Trying RoBERTa-base instead of BERT with enhanced metrics printing

# RoBERTa Fine-tuning for GoEmotions
# This notebook adapts the BERT fine-tuning pipeline for RoBERTa-base.
# Enhanced: compute_metrics now includes per-label F1, precision, recall from classification_report.
# After training, we print a detailed classification report for validation set.

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install transformers datasets accelerate tokenizers
!pip install scikit-learn pandas jupyterlab notebook ipywidgets
!pip install nbconvert



Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting jupyterlab
  Downloading jupyterlab-4.4.7-py3-none-any.whl.metadata (16 kB)
Collecting async-lru>=1.0.0 (from jupyterlab)
  Downloading async_lru-2.0.5-py3-none-any.whl.metadata (4.5 kB)
Collecting jupyter-lsp>=2.0.0 (from jupyterlab)
  Downloading jupyter_lsp-2.3.0-py3-none-any.whl.metadata (1.8 kB)
Collecting jupyterlab-server<3,>=2.27.1 (from jupyterlab)
  Downloading jupyterlab_server-2.27.3-py3-none-any.whl.metadata (5.9 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting json5>=0.9.0 (from jupyterlab-server<3,>=2.27.1->jupyterlab)
  Downloading json5-0.12.1-py3-none-any.whl.metadata (36 kB)
Downloading jupyterlab-4.4.7-py3-none-any.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading async_lru-2.0.5-py3-none-any.whl (6.1 kB)
Downloadin

In [2]:
# 1. Imports and basic setup
import os
import glob
import random
from dataclasses import dataclass
from typing import List, Dict

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)



In [3]:
# Reproducibility: seed the transformers helper, Python's `random` and NumPy with one value
SEED = 42
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Device: prefer the GPU whenever torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using device:', device)



Using device: cuda


In [4]:
# 2. Locate the preprocessed CSV shards (label columns are inferred once the data is loaded)
DATA_PATTERN = 'Preprocessed_goemotions_*.csv'
data_files = sorted(glob.glob(DATA_PATTERN))
if not data_files:
    # A hard-coded fallback name would itself match DATA_PATTERN, so it could never point
    # at an existing file; fail here with a clear message instead of later in pd.read_csv.
    raise FileNotFoundError(
        f"No files matching '{DATA_PATTERN}' found in {os.getcwd()!r}"
    )
print('Found data files:', data_files)



Found data files: ['Preprocessed_goemotions_1.csv', 'Preprocessed_goemotions_2.csv', 'Preprocessed_goemotions_3.csv']


In [5]:
# Load every shard, then stack them into a single frame with a fresh 0..n-1 index.
# Note: all shards are held in memory at once, so watch memory on large datasets.
dfs = list(map(pd.read_csv, data_files))
df = pd.concat(dfs, ignore_index=True)
print('Total examples:', len(df))



Total examples: 183356


In [6]:
# The CSVs must provide a `text` column; every other column is treated as a label column.
assert 'text' in df.columns, 'Expected a text column in CSV'
label_cols = df.columns.drop('text').tolist()
num_labels = len(label_cols)
print(f'Detected label columns ({num_labels}):', label_cols)



Detected label columns (9): ['Joyful', 'Affectionate', 'Positive_Outlook', 'Anger_Frustration', 'Sadness_Disappointment', 'Fear_Anxiety', 'Surprise_Confusion', 'Desire', 'Neutral']


In [7]:
## Tokenizer and Model Initialization
# Using roberta-base
MODEL_NAME = 'roberta-base'
MAX_LENGTH = 128  # Maximum sequence length passed to the tokenizer; increase if needed

# Tokenizer (RoBERTa uses its own tokenizer)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Config -> ensure problem_type is set for multi-label targets
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=num_labels)
config.problem_type = 'multi_label_classification'
# Store the label names in the config so the saved model reports the emotion names
# (in `label_cols` order) instead of generic LABEL_<i> placeholders.
config.id2label = dict(enumerate(label_cols))
config.label2id = {name: idx for idx, name in config.id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model.to(device)
print('Model loaded. num_labels =', model.config.num_labels, 'problem_type =', model.config.problem_type)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded. num_labels = 9 problem_type = multi_label_classification


In [8]:
## Dataset Wrapper
class GoEmotionsDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and pairs it with its label vector.

    Args:
        texts: Input texts; each entry is converted with ``str()`` before tokenization.
        labels: One label vector per text, returned as a float tensor.
        tokenizer: Callable invoked as ``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors=None)`` whose result provides ``input_ids``
            and ``attention_mask``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts: List[str], labels: List[List[float]], tokenizer, max_length: int = 128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` (long) and ``labels`` (float) for ``idx``."""
        # Tokenize lazily, per example, padding/truncating to the configured length
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors=None,
        )
        target = np.array(self.labels[idx], dtype=np.float32)

        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.float)
        return sample



In [9]:
# Prepare train/val split: hold out 10% of the shuffled examples for validation
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.1, random_state=SEED, shuffle=True)
print('Train / Val sizes:', len(train_df), len(val_df))

# Missing texts become empty strings; label rows follow the order of `label_cols`
train_texts, val_texts = (
    split['text'].fillna('').tolist() for split in (train_df, val_df)
)
train_labels, val_labels = (
    split[label_cols].values.tolist() for split in (train_df, val_df)
)

# Wrap both splits so that tokenization happens per example at lookup time
train_dataset = GoEmotionsDataset(train_texts, train_labels, tokenizer, max_length=MAX_LENGTH)
val_dataset = GoEmotionsDataset(val_texts, val_labels, tokenizer, max_length=MAX_LENGTH)

first_item = train_dataset[0]
print('Datasets ready. Example item keys:', list(first_item.keys()))



Train / Val sizes: 165020 18336
Datasets ready. Example item keys: ['input_ids', 'attention_mask', 'labels']


In [10]:
## Training Configuration (Hugging Face Trainer)
BATCH_SIZE = 16
LEARNING_RATE = 3e-5
NUM_EPOCHS = 3
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 100
OUTPUT_DIR = './roberta_finetuned'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Mixed precision is only requested when running on a CUDA device
fp16 = device == 'cuda'

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy='epoch',   # evaluate on the validation set after every epoch
    save_strategy='epoch',   # write a checkpoint after every epoch
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    logging_steps=LOGGING_STEPS,
    load_best_model_at_end=False,
    fp16=fp16,
    report_to='none',
    # TrainingArguments carries its own seed; tie it to the notebook-wide SEED so that
    # changing SEED in one place also changes the seed used for training.
    seed=SEED,
)

print('TrainingArguments configured. fp16=', fp16)



TrainingArguments configured. fp16= True


In [11]:
## Metrics & compute_metrics function
# Computes overall accuracy, macro/micro averages, and per-label metrics from classification_report
def compute_metrics(eval_pred, threshold: float = 0.5):
    """Turn raw logits into multi-label predictions and score them against the labels.

    Args:
        eval_pred: Object exposing ``predictions`` (logits as a NumPy array, or a tuple
            whose first element is the logits) and ``label_ids`` (the reference labels).
        threshold: Sigmoid probability at or above which a label counts as predicted.
            Defaults to 0.5.

    Returns:
        dict: ``accuracy``, ``{precision,recall,f1}_{macro,micro}`` and, for every name in
        the module-level ``label_cols``, ``precision_<label>``, ``recall_<label>``,
        ``f1_<label>`` and ``support_<label>``.

    Raises:
        Whatever scikit-learn raises for invalid targets. Errors are deliberately not
        swallowed: a silently substituted accuracy of 0.0 would hide real problems, and
        the remaining metric calls below receive the same inputs anyway.
    """
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(logits, tuple):
        logits = logits[0]

    # Run the sigmoid in float32 regardless of the dtype the logits arrive in
    probs = torch.sigmoid(torch.from_numpy(np.asarray(logits)).float()).numpy()
    preds = (probs >= threshold).astype(int)

    # Overall metrics
    acc = accuracy_score(labels, preds)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        labels, preds, average='micro', zero_division=0
    )

    # Per-label metrics (using classification_report)
    report = classification_report(
        labels, preds, target_names=label_cols, output_dict=True, zero_division=0
    )

    # Flatten everything into a single dict for Trainer to display
    metrics = {
        'accuracy': acc,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_micro': f1_micro,
    }
    for label in label_cols:
        label_scores = report[label]
        metrics[f'precision_{label}'] = label_scores['precision']
        metrics[f'recall_{label}'] = label_scores['recall']
        metrics[f'f1_{label}'] = label_scores['f1-score']
        metrics[f'support_{label}'] = label_scores['support']

    return metrics



In [12]:
# Prepare Trainer
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword of
# Trainer is deprecated and emits a warning when used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
print('Trainer ready')



Trainer ready


  trainer = Trainer(


In [13]:
## Training Execution
# Fine-tune the model; the returned training summary is displayed as the cell output
train_output = trainer.train()
train_output



Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro,Precision Joyful,Recall Joyful,F1 Joyful,Support Joyful,Precision Affectionate,Recall Affectionate,F1 Affectionate,Support Affectionate,Precision Positive Outlook,Recall Positive Outlook,F1 Positive Outlook,Support Positive Outlook,Precision Anger Frustration,Recall Anger Frustration,F1 Anger Frustration,Support Anger Frustration,Precision Sadness Disappointment,Recall Sadness Disappointment,F1 Sadness Disappointment,Support Sadness Disappointment,Precision Fear Anxiety,Recall Fear Anxiety,F1 Fear Anxiety,Support Fear Anxiety,Precision Surprise Confusion,Recall Surprise Confusion,F1 Surprise Confusion,Support Surprise Confusion,Precision Desire,Recall Desire,F1 Desire,Support Desire,Precision Neutral,Recall Neutral,F1 Neutral,Support Neutral
1,0.2247,0.221672,0.389343,0.614758,0.327547,0.418269,0.652043,0.389889,0.487986,0.622584,0.427972,0.507252,1430.0,0.737043,0.569533,0.64255,4070.0,0.564246,0.187384,0.281337,539.0,0.72738,0.220128,0.337974,2812.0,0.651685,0.344945,0.451111,1177.0,0.52349,0.194514,0.283636,401.0,0.606932,0.344695,0.439682,2083.0,0.5,0.258929,0.341176,224.0,0.599465,0.399821,0.4797,5600.0
2,0.2132,0.219839,0.403305,0.614655,0.353289,0.437033,0.647889,0.403305,0.497143,0.635827,0.451748,0.528209,1430.0,0.717699,0.597789,0.652279,4070.0,0.554217,0.170686,0.260993,539.0,0.577277,0.446302,0.50341,2812.0,0.642202,0.356839,0.458766,1177.0,0.564246,0.25187,0.348276,401.0,0.586718,0.43687,0.500826,2083.0,0.576923,0.200893,0.298013,224.0,0.676791,0.266607,0.382526,5600.0
3,0.1973,0.222498,0.4375,0.584612,0.380935,0.453673,0.626894,0.437827,0.515574,0.611311,0.476224,0.535377,1430.0,0.688586,0.632924,0.659583,4070.0,0.507538,0.187384,0.273713,539.0,0.576832,0.433855,0.49523,2812.0,0.599222,0.392523,0.474333,1177.0,0.513636,0.281796,0.363929,401.0,0.584703,0.444071,0.504775,2083.0,0.54,0.241071,0.333333,224.0,0.639676,0.338571,0.442784,5600.0


TrainOutput(global_step=30942, training_loss=0.21402744032217672, metrics={'train_runtime': 3966.4941, 'train_samples_per_second': 124.81, 'train_steps_per_second': 7.801, 'total_flos': 3.256598641595904e+16, 'train_loss': 0.21402744032217672, 'epoch': 3.0})

In [14]:
# Evaluate the trained model on the validation set and dump the full metrics dict,
# which includes the per-label entries produced by compute_metrics
print('\nDetailed Validation Metrics:')
eval_results = trainer.evaluate()
print(eval_results)




Detailed Validation Metrics:


{'eval_loss': 0.22249843180179596, 'eval_accuracy': 0.4375, 'eval_precision_macro': 0.5846115051535679, 'eval_recall_macro': 0.38093548282584794, 'eval_f1_macro': 0.453672985680526, 'eval_precision_micro': 0.6268936436045603, 'eval_recall_micro': 0.43782722513089006, 'eval_f1_micro': 0.5155738231327468, 'eval_precision_Joyful': 0.6113105924596051, 'eval_recall_Joyful': 0.4762237762237762, 'eval_f1_Joyful': 0.535377358490566, 'eval_support_Joyful': 1430.0, 'eval_precision_Affectionate': 0.6885859395883454, 'eval_recall_Affectionate': 0.6329238329238329, 'eval_f1_Affectionate': 0.6595826398668544, 'eval_support_Affectionate': 4070.0, 'eval_precision_Positive_Outlook': 0.507537688442211, 'eval_recall_Positive_Outlook': 0.18738404452690166, 'eval_f1_Positive_Outlook': 0.27371273712737126, 'eval_support_Positive_Outlook': 539.0, 'eval_precision_Anger_Frustration': 0.5768321513002365, 'eval_recall_Anger_Frustration': 0.43385490753911804, 'eval_f1_Anger_Frustration': 0.4952303633042419, 'eval

In [15]:
# Run inference on the validation set and render the per-label report as text
predictions = trainer.predict(val_dataset)
labels = predictions.label_ids

# The predictions may come back as a tuple whose first element holds the logits
logits = predictions.predictions
logits = logits[0] if isinstance(logits, tuple) else logits

# Sigmoid -> probabilities, then a 0.5 cut-off -> binary predictions per label
probs = torch.from_numpy(logits).sigmoid().numpy()
preds = (probs >= 0.5).astype(int)

detailed_report = classification_report(labels, preds, target_names=label_cols, zero_division=0)
print("\nPer-Label Classification Report:\n", detailed_report, sep="\n")





Per-Label Classification Report:

                        precision    recall  f1-score   support

                Joyful       0.61      0.48      0.54      1430
          Affectionate       0.69      0.63      0.66      4070
      Positive_Outlook       0.51      0.19      0.27       539
     Anger_Frustration       0.58      0.43      0.50      2812
Sadness_Disappointment       0.60      0.39      0.47      1177
          Fear_Anxiety       0.51      0.28      0.36       401
    Surprise_Confusion       0.58      0.44      0.50      2083
                Desire       0.54      0.24      0.33       224
               Neutral       0.64      0.34      0.44      5600

             micro avg       0.63      0.44      0.52     18336
             macro avg       0.58      0.38      0.45     18336
          weighted avg       0.62      0.44      0.51     18336
           samples avg       0.44      0.44      0.44     18336



In [16]:
# Optional: persist the fine-tuned model into OUTPUT_DIR
trainer.save_model(OUTPUT_DIR)
print('Model saved to', OUTPUT_DIR)

Model saved to ./roberta_finetuned
