# Trying again by changing the learning rate from 2e-5 to 3e-5

# BERT Fine-tuning for GoEmotions (continued)

This notebook continues the fine-tuning pipeline. It initializes the tokenizer and model, wraps the tokenized data in a PyTorch Dataset, configures Trainer-based training, computes multi-label metrics, saves the model, and runs a few sample inferences.

Assumptions: the preprocessed CSV files are available in the workspace and contain a `text` column and one-hot label columns (one column per emotion).

In [10]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


In [11]:
!pip install transformers datasets accelerate tokenizers
!pip install scikit-learn pandas jupyterlab notebook ipywidgets
!pip install nbconvert



In [12]:
# 1. Imports and basic setup
import os
import glob
import random
from dataclasses import dataclass
from typing import List, Dict

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)

# Seed every RNG used in this notebook with one shared value so reruns are comparable
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
set_seed(SEED)

# Pick the compute device: GPU when CUDA is visible, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using device:', device)

Using device: cuda


In [13]:
# 2. Locate the preprocessed CSV shards, merge them, and derive the label columns
# If the glob matches nothing, fall back to the single expected shard name.
data_files = sorted(glob.glob('Preprocessed_goemotions_*.csv')) or ['Preprocessed_goemotions_1.csv']
print('Found data files:', data_files)

# Every shard is read fully into memory before concatenation (mind RAM on large datasets)
dfs = list(map(pd.read_csv, data_files))
df = pd.concat(dfs, ignore_index=True)
print('Total examples:', len(df))

# The frame must carry a `text` column; every other column is treated as a binary label.
assert 'text' in df.columns, 'Expected a text column in CSV'
label_cols = list(filter(lambda name: name != 'text', df.columns))
num_labels = len(label_cols)
print('Detected label columns (%d):' % num_labels, label_cols)

Found data files: ['Preprocessed_goemotions_1.csv', 'Preprocessed_goemotions_2.csv', 'Preprocessed_goemotions_3.csv']
Total examples: 183356
Detected label columns (9): ['Joyful', 'Affectionate', 'Positive_Outlook', 'Anger_Frustration', 'Sadness_Disappointment', 'Fear_Anxiety', 'Surprise_Confusion', 'Desire', 'Neutral']


## Tokenizer and Model Initialization
We'll use `bert-base-uncased` and configure the model for multi-label classification.

In [14]:
MODEL_NAME = 'bert-base-uncased'
MAX_LENGTH = 128

# Fast tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Build the config with the label count and the multi-label problem type in one call.
# With problem_type set to multi-label, HF uses BCEWithLogitsLoss for the one-hot targets.
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    problem_type='multi_label_classification',
)

# Load the pretrained encoder with a freshly initialised classification head, then move it to the device
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model.to(device)
print('Model loaded. num_labels =', model.config.num_labels, 'problem_type =', model.config.problem_type)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded. num_labels = 9 problem_type = multi_label_classification


## Dataset Wrapper
Create a PyTorch Dataset that returns tokenized inputs and float labels (for BCE loss).

In [15]:
class GoEmotionsDataset(Dataset):
    """Map-style dataset pairing raw texts with multi-label float targets.

    Nothing is tokenized up front: each ``__getitem__`` call tokenizes a single
    text, padded/truncated to ``max_length``, and returns tensors for that one
    example.

    Args:
        texts: Input texts; each entry is coerced with ``str()`` on access.
        labels: One label row per text, converted to float32 on access.
        tokenizer: Callable invoked as ``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors=None)`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Sequence length forwarded to the tokenizer.
    """

    def __init__(self, texts: List[str], labels: List[List[float]], tokenizer, max_length: int = 128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``/``attention_mask`` (long) and ``labels`` (float) for one example."""
        raw_text = str(self.texts[idx])
        target = np.array(self.labels[idx], dtype=np.float32)
        encoded = self.tokenizer(
            raw_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors=None,
        )
        # Token fields become long tensors; the targets stay float, as BCE loss expects.
        sample = {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.float)
        return sample

# Hold out 10% of the rows for validation; the shuffle is seeded so the split is repeatable
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.1, random_state=SEED, shuffle=True)
print('Train / Val sizes:', len(train_df), len(val_df))

# Missing texts become empty strings; label rows become plain Python lists
train_texts, val_texts = (part['text'].fillna('').tolist() for part in (train_df, val_df))
train_labels, val_labels = (part[label_cols].values.tolist() for part in (train_df, val_df))

# Wrap each split in the lazily-tokenizing dataset
train_dataset, val_dataset = (
    GoEmotionsDataset(split_texts, split_labels, tokenizer, max_length=MAX_LENGTH)
    for split_texts, split_labels in ((train_texts, train_labels), (val_texts, val_labels))
)

print('Datasets ready. Example item keys:', list(train_dataset[0].keys()))

Train / Val sizes: 165020 18336
Datasets ready. Example item keys: ['input_ids', 'attention_mask', 'labels']


## Training Configuration (Hugging Face Trainer)
Set hyperparameters and enable device-aware training.

In [16]:
# Tunable hyperparameters - edit here, nothing below hard-codes them
BATCH_SIZE = 16
LEARNING_RATE = 3e-5
NUM_EPOCHS = 3
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 100

# Checkpoints and the final model are written here
OUTPUT_DIR = './bert_finetuned'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Mixed precision is only requested when training on CUDA
fp16 = device == 'cuda'

training_args = TrainingArguments(
    # where to write, and how often to evaluate / checkpoint
    output_dir=OUTPUT_DIR,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=False,
    # optimisation schedule
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=fp16,
    # logging
    logging_steps=LOGGING_STEPS,
    report_to='none',
)

print('TrainingArguments configured. fp16=', fp16)

TrainingArguments configured. fp16= True


## Metrics & compute_metrics function
Define metrics for multi-label evaluation. Predictions are logits; apply sigmoid + thresholding.

In [17]:
def compute_metrics(eval_pred):
    """Compute multi-label metrics from raw logits.

    Logits are passed through a sigmoid and thresholded at 0.5 to obtain binary
    predictions, which are then scored against the true labels.

    Args:
        eval_pred: Object exposing ``predictions`` (logits as a NumPy array, or a
            tuple whose first element is that array) and ``label_ids`` (true labels).

    Returns:
        dict with ``accuracy`` (exact-match / subset accuracy) plus macro- and
        micro-averaged ``precision_*``, ``recall_*`` and ``f1_*`` values.
    """
    import warnings  # local import keeps this cell self-contained

    logits, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(logits, tuple):
        logits = logits[0]
    probs = torch.sigmoid(torch.from_numpy(logits)).numpy()
    preds = (probs >= 0.5).astype(int)
    # Subset accuracy (exact match). The 0.0 fallback is kept so evaluation does not
    # abort, but the failure is surfaced instead of being silently reported as 0.
    try:
        acc = accuracy_score(labels, preds)
    except Exception as exc:
        warnings.warn(
            f'accuracy_score failed ({exc!r}); reporting accuracy=0.0',
            RuntimeWarning,
            stacklevel=2,
        )
        acc = 0.0
    # Macro and micro averages for precision/recall/f1 (zero_division=0 avoids
    # warnings for labels that have no predicted or true positives)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds, average='micro', zero_division=0)
    return {
        'accuracy': acc,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        'f1_micro': f1_micro,
    }

# Prepare Trainer - note: our dataset returns dictionaries -> Trainer will collate them properly.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of Trainer.__init__
# is deprecated (it triggers a FutureWarning and is slated for removal in transformers 5.0).
# NOTE(review): `processing_class` needs a transformers release that already supports it - confirm
# the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
print('Trainer ready')

Trainer ready


  trainer = Trainer(


## Training Execution
Run training and monitor progress. This cell will execute the training loop managed by the Trainer.

In [18]:
# Run the fine-tuning loop, then persist the model and its training summary
import json

train_result = trainer.train()
trainer.save_model(OUTPUT_DIR)  # saves the tokenizer too via Trainer.save_model
metrics = train_result.metrics
print('Training finished. Summary metrics:')
print(metrics)

# Write the summary metrics next to the model as pretty-printed JSON
metrics_path = os.path.join(OUTPUT_DIR, 'train_metrics.json')
with open(metrics_path, 'w') as f:
    f.write(json.dumps(metrics, indent=2))

Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro
1,0.2225,0.219841,0.379254,0.625465,0.318221,0.412403,0.65944,0.379854,0.48204
2,0.2079,0.218432,0.406959,0.612923,0.349069,0.433196,0.645184,0.407286,0.499348


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,Precision Micro,Recall Micro,F1 Micro
1,0.2225,0.219841,0.379254,0.625465,0.318221,0.412403,0.65944,0.379854,0.48204
2,0.2079,0.218432,0.406959,0.612923,0.349069,0.433196,0.645184,0.407286,0.499348
3,0.1868,0.226912,0.44219,0.562027,0.387128,0.452888,0.613168,0.444426,0.515335


Training finished. Summary metrics:
{'train_runtime': 3838.693, 'train_samples_per_second': 128.966, 'train_steps_per_second': 8.061, 'total_flos': 3.256598641595904e+16, 'train_loss': 0.20758804471450695, 'epoch': 3.0}


## Evaluation & Detailed Reports
Run prediction on the validation set, compute a classification report and confusion matrices per label.

In [19]:
# Run the trained model over the validation split
pred_output = trainer.predict(val_dataset)
raw_predictions = pred_output.predictions
# Some models return a tuple of outputs; the logits are its first element
logits = raw_predictions[0] if isinstance(raw_predictions, tuple) else raw_predictions
probs = torch.sigmoid(torch.from_numpy(logits)).numpy()
preds = (probs >= 0.5).astype(int)
true = pred_output.label_ids

# Aggregate metrics, computed by the same function used during training
overall = compute_metrics(pred_output)
print('Overall evaluation metrics:')
print(overall)

# One binary classification report per label column
for i, col in enumerate(label_cols):
    print('Label:', col)
    try:
        print(classification_report(true[:, i], preds[:, i], zero_division=0))
    except Exception as e:
        print('Failed to compute report for', col, e)

# One confusion matrix per label column, printed inline
print('Per-label confusion matrices:')
for i, col in enumerate(label_cols):
    print(col, 'confusion_matrix:', confusion_matrix(true[:, i], preds[:, i]))

Overall evaluation metrics:
{'accuracy': 0.4421902268760908, 'precision_macro': 0.5620269337904241, 'recall_macro': 0.38712825041496646, 'f1_macro': 0.4528883609153653, 'precision_micro': 0.6131677953348382, 'recall_micro': 0.4444262652705061, 'f1_micro': 0.5153354834629735}
Label: Joyful
              precision    recall  f1-score   support

         0.0       0.96      0.97      0.96     16906
         1.0       0.58      0.48      0.53      1430

    accuracy                           0.93     18336
   macro avg       0.77      0.73      0.75     18336
weighted avg       0.93      0.93      0.93     18336

Label: Affectionate
              precision    recall  f1-score   support

         0.0       0.89      0.92      0.91     14266
         1.0       0.69      0.62      0.65      4070

    accuracy                           0.85     18336
   macro avg       0.79      0.77      0.78     18336
weighted avg       0.85      0.85      0.85     18336

Label: Positive_Outlook
            

## Save Model & Tokenizer
The trained model and tokenizer are saved to `./bert_finetuned`.

In [None]:
# Confirm saved files
print('Saved files in output dir:')
# One entry per line: joining with '' would fuse all the names into a single unreadable string.
print('\n'.join(sorted(os.listdir(OUTPUT_DIR))))

## Sample Inference
Run the fine-tuned model on a few validation samples to print predicted labels and probabilities.

In [21]:
# Reload tokenizer and model from the output dir so inference runs on exactly what was saved
inference_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR, use_fast=True)
# Move to the active device and switch to eval mode in one chain (both calls return the model)
inference_model = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR).to(device).eval()

def predict_texts(texts: List[str], threshold: float = 0.5) -> List[Dict]:
    """Score a batch of texts with the reloaded inference model.

    Args:
        texts: Texts to classify; tokenized together as one padded batch,
            truncated to ``MAX_LENGTH`` tokens.
        threshold: A label is reported as predicted when its sigmoid
            probability is at least this value.

    Returns:
        One dict per input text with ``'predicted_labels'`` (names from
        ``label_cols`` that passed the threshold) and ``'scores'`` (label name
        mapped to its probability as a Python float).
    """
    batch = inference_tokenizer(texts, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors='pt')
    # Every tensor in the batch must live on the same device as the model
    model_inputs = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        probs = torch.sigmoid(inference_model(**model_inputs).logits).cpu().numpy()
    flags = (probs >= threshold).astype(int)

    results = []
    for row_flags, row_probs in zip(flags, probs):
        results.append({
            'predicted_labels': [label_cols[j] for j, hit in enumerate(row_flags) if hit == 1],
            'scores': {label_cols[j]: float(row_probs[j]) for j in range(len(label_cols))},
        })
    return results

# Score the first five validation texts and show each one's labels and strongest scores
sample_texts = val_texts[:5]
preds = predict_texts(sample_texts)
for text, result in zip(sample_texts, preds):
    # Rank labels by probability, highest first, and keep the top three
    ranked = sorted(result['scores'].items(), key=lambda pair: pair[1], reverse=True)
    top3 = ranked[:3]
    print('TEXT:', text)
    print('PREDICTED:', result['predicted_labels'])
    print('TOP SCORES:', top3)
    print('-' * 60)

TEXT: She needs a raise.
PREDICTED: ['Neutral']
TOP SCORES: [('Neutral', 0.8167619109153748), ('Affectionate', 0.10089360922574997), ('Desire', 0.07465910166501999)]
------------------------------------------------------------
TEXT: I just want tk be a random scumbag
PREDICTED: []
TOP SCORES: [('Neutral', 0.4942588210105896), ('Desire', 0.4461451768875122), ('Affectionate', 0.05526706948876381)]
------------------------------------------------------------
TEXT: Well, this is the first time I've seen that in over three years.
PREDICTED: ['Surprise_Confusion']
TOP SCORES: [('Surprise_Confusion', 0.5251181721687317), ('Neutral', 0.30961695313453674), ('Joyful', 0.18194273114204407)]
------------------------------------------------------------
TEXT: Welcome to racial integration, where your civility standards are racist and you better shut up or else.
PREDICTED: []
TOP SCORES: [('Anger_Frustration', 0.49606430530548096), ('Affectionate', 0.37718385457992554), ('Neutral', 0.1251859962940216