In [1]:
# Install all the required libraries for PyTorch, Transformers, and scikit-learn
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install transformers datasets accelerate tokenizers scikit-learn pandas

Looking in indexes: https://download.pytorch.org/whl/cpu


In [2]:
# ==============================================================================
# 1. IMPORTS AND SETUP
# ==============================================================================
import glob
import inspect
import os
import random
from dataclasses import dataclass
from typing import List, Dict

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    f1_score, # Added for threshold optimization
)

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)



In [3]:
# ==============================================================================
# 2. CONFIGURATION AND REPRODUCIBILITY
# ==============================================================================

# One fixed seed, applied to the transformers helper and to the Python and
# NumPy global generators, so the split and training are repeatable.
SEED = 42
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')



Using device: cuda


In [4]:
# ==============================================================================
# 3. DATA LOADING AND PREPARATION
# ==============================================================================

# Locate the preprocessed CSV shards in the working directory.
# IMPORTANT: the 'Preprocessed_goemotions_*.csv' files must already be uploaded
# to the Colab / Jupyter environment before this cell runs.
data_files = glob.glob('Preprocessed_goemotions_*.csv')
data_files.sort()
if len(data_files) == 0:
    raise FileNotFoundError("Could not find the 'Preprocessed_goemotions_*.csv' files. Please ensure they are in the correct directory.")
print(f'Found data files: {data_files}')

# Load every shard and stack them into a single frame with a fresh index.
dfs = list(map(pd.read_csv, data_files))
df = pd.concat(dfs, ignore_index=True)
print(f'Total examples: {len(df)}')

# Every column other than 'text' is treated as one label.
assert 'text' in df.columns, 'Expected a text column in CSV'
label_cols = df.columns[df.columns != 'text'].tolist()
num_labels = len(label_cols)
print(f'Detected label columns ({num_labels}): {label_cols}')



Found data files: ['Preprocessed_goemotions_1.csv', 'Preprocessed_goemotions_2.csv', 'Preprocessed_goemotions_3.csv']
Total examples: 183356
Detected label columns (9): ['Joyful', 'Affectionate', 'Positive_Outlook', 'Anger_Frustration', 'Sadness_Disappointment', 'Fear_Anxiety', 'Surprise_Confusion', 'Desire', 'Neutral']


In [5]:
# ==============================================================================
# 4. MODEL AND TOKENIZER INITIALIZATION
# ==============================================================================

# Checkpoint name and the token length used for padding/truncation.
MODEL_NAME = 'roberta-base'
MAX_LENGTH = 128

# Fast tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Start from the checkpoint's config, sized to the detected label count and
# flagged as multi-label.
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)
config.problem_type = 'multi_label_classification'

# Load the sequence-classification model with that config and move it to the
# selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    config=config,
)
model.to(device)
print(f'Model loaded. num_labels = {model.config.num_labels}, problem_type = {model.config.problem_type}')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded. num_labels = 9, problem_type = multi_label_classification


In [6]:
# ==============================================================================
# 5. DATASET AND DATALOADER
# ==============================================================================

# Dataset Wrapper
class GoEmotionsDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label vector.

    Args:
        texts: Input texts, one per example; each is passed through ``str()``
            before tokenization.
        labels: One sequence of per-label targets per example, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors=None)``; its result
            must provide ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length handed to the tokenizer for padding and truncation.
    """

    def __init__(self, texts: List[str], labels: List[List[float]], tokenizer, max_length: int = 128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of examples (the number of texts)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for one example."""
        raw_text = str(self.texts[idx])
        # Targets go through float32 NumPy first, then become a float tensor.
        target = np.array(self.labels[idx], dtype=np.float32)
        encoding = self.tokenizer(
            raw_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors=None,
        )
        return {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(target, dtype=torch.float),
        }

# Hold out 10% of the rows for validation with a seeded shuffle.
# NOTE(review): this is a row-level split; if the CSVs contain the same text on
# more than one row (not verifiable from this cell), copies could land on both
# sides of the split — confirm.
train_df, val_df = train_test_split(df, shuffle=True, random_state=SEED, test_size=0.1)
print(f'Train / Val sizes: {len(train_df)} / {len(val_df)}')

# Texts: missing values become empty strings before conversion to plain lists.
train_texts, val_texts = (
    part['text'].fillna('').tolist() for part in (train_df, val_df)
)
# Labels: one list of per-label values per row, in label_cols order.
train_labels, val_labels = (
    part[label_cols].values.tolist() for part in (train_df, val_df)
)

# Wrap each split in the tokenizing dataset.
train_dataset, val_dataset = (
    GoEmotionsDataset(split_texts, split_labels, tokenizer, max_length=MAX_LENGTH)
    for split_texts, split_labels in ((train_texts, train_labels), (val_texts, val_labels))
)

print('Datasets ready.')



Train / Val sizes: 165020 / 18336
Datasets ready.


In [7]:
# ==============================================================================
# 6. TRAINING CONFIGURATION
# ==============================================================================

# Tunable training settings, gathered in one place.
OUTPUT_DIR = './roberta_finetuned'
NUM_EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 100

# Make sure the checkpoint directory exists before the Trainer writes to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Mixed precision is enabled only when running on CUDA.
fp16 = device == 'cuda'

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Schedule and optimisation
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=fp16,
    # Evaluate and checkpoint once per epoch
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=False,  # Consider setting to True with early stopping
    # Logging
    logging_steps=LOGGING_STEPS,
    report_to='none',
)

# Metrics function
def compute_metrics(eval_pred, threshold: float = 0.28):
    """Score multi-label predictions at a fixed probability threshold.

    Args:
        eval_pred: Object exposing ``predictions`` (the logits array, or a tuple
            whose first element is the logits array) and ``label_ids``.
        threshold: Probability cut-off; a label counts as predicted when its
            sigmoid probability is ``>= threshold``. The default keeps the value
            this notebook uses for evaluation during training; the final
            evaluation cell searches for a better one.

    Returns:
        Dict with ``f1_macro``, ``f1_micro``, ``precision_macro`` and
        ``recall_macro``.
    """
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(logits, tuple):
        logits = logits[0]
    probs = torch.sigmoid(torch.from_numpy(logits)).numpy()
    preds = (probs >= threshold).astype(int)

    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    # Only the micro-averaged F1 is reported; micro precision/recall are discarded.
    _, _, f1_micro, _ = precision_recall_fscore_support(
        labels, preds, average='micro', zero_division=0
    )

    return {
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
    }

# Initialize Trainer
# Recent transformers releases accept the tokenizer under `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever keyword the installed Trainer actually declares.
_tokenizer_kwarg = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)
print('Trainer ready.')



Trainer ready.


  trainer = Trainer(


In [8]:
# ==============================================================================
# 7. TRAINING EXECUTION
# ==============================================================================

# Run fine-tuning with the Trainer built in the training-configuration cell.
# That cell sets eval_strategy='epoch' and save_strategy='epoch' and passes
# compute_metrics to the Trainer.
print("Starting model training...")
trainer.train()
print("Training complete.")



Starting model training...


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Precision Macro,Recall Macro
1,0.2278,0.225735,0.480294,0.554759,0.488379,0.502789
2,0.2162,0.220993,0.490229,0.56123,0.476867,0.52086
3,0.1993,0.222584,0.495926,0.562502,0.468122,0.535262


Training complete.


In [9]:
# ==============================================================================
# 8. FINAL EVALUATION AND THRESHOLD OPTIMIZATION
# ==============================================================================

print("\nGenerating predictions on the validation set...")
predictions = trainer.predict(val_dataset)

# Turn the raw logits into per-label probabilities; keep the gold labels.
logits = predictions.predictions
if isinstance(logits, tuple):
    logits = logits[0]
probs = torch.sigmoid(torch.from_numpy(logits)).numpy()
labels = predictions.label_ids

# --- THRESHOLD OPTIMIZATION ---
# A single global threshold is chosen to maximise macro F1.
# NOTE: the threshold is tuned on the same validation split the report below is
# computed on, so the reported scores are optimistic for unseen data.
print("\nOptimizing prediction threshold...")
best_f1 = 0
best_threshold = 0.5

# Build the 0.10 .. 0.89 grid from integer steps. A float-stepped arange
# accumulates rounding error, and rounding the winner afterwards could make the
# final report use a slightly different cut-off than the one that was scored.
candidate_thresholds = np.arange(10, 90) / 100

for threshold in candidate_thresholds:
    preds = (probs >= threshold).astype(int)
    current_f1 = f1_score(labels, preds, average='macro', zero_division=0)

    # Strict '>' keeps the lowest threshold when several tie.
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = float(threshold)

print("\nOptimization complete!")
print(f"Best threshold found: {best_threshold}")
print(f"Macro F1 score at this threshold: {best_f1:.4f}")

# Final per-label report at exactly the threshold that produced best_f1.
print(f"\nPer-Label Classification Report (using threshold = {best_threshold}):\n")
final_preds = (probs >= best_threshold).astype(int)
detailed_report = classification_report(labels, final_preds, target_names=label_cols, zero_division=0)
print(detailed_report)





Generating predictions on the validation set...



Optimizing prediction threshold...

Optimization complete!
Best threshold found: 0.26
Macro F1 score at this threshold: 0.4984

Per-Label Classification Report (using threshold = 0.26):

                        precision    recall  f1-score   support

                Joyful       0.49      0.62      0.54      1430
          Affectionate       0.56      0.77      0.65      4070
      Positive_Outlook       0.39      0.29      0.33       539
     Anger_Frustration       0.46      0.65      0.54      2812
Sadness_Disappointment       0.47      0.52      0.50      1177
          Fear_Anxiety       0.38      0.42      0.40       401
    Surprise_Confusion       0.48      0.62      0.54      2083
                Desire       0.40      0.42      0.41       224
               Neutral       0.49      0.68      0.57      5600

             micro avg       0.50      0.65      0.56     18336
             macro avg       0.46      0.56      0.50     18336
          weighted avg       0.49      0.6

In [10]:
# ==============================================================================
# 9. SAVE MODEL
# ==============================================================================

# Write the trained model to OUTPUT_DIR through the Trainer.
# NOTE(review): the threshold selected in the evaluation cell is not written
# anywhere by this cell; code that reloads this directory for inference will
# need that value supplied separately.
trainer.save_model(OUTPUT_DIR)
print(f'\nFinal model saved to {OUTPUT_DIR}')


Final model saved to ./roberta_finetuned
