# GoEmotions → 3-Class Model Training (neutral / positive / help)

This notebook trains a DistilBERT model on the GoEmotions dataset, after mapping
the original 27 emotion labels into three classes:

- `neutral`
- `positive`
- `help` (distress / negative emotions)

The final model is saved to `goemotions_3class_model/` and can be loaded in a
Streamlit app or a separate inference notebook.


## 1. Imports

In [35]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import torch
import numpy as np
import re

from sklearn.metrics import accuracy_score, f1_score

import warnings

warnings.filterwarnings("ignore")

## 2. Load GoEmotions (simplified config)

In [36]:
ds = load_dataset("google-research-datasets/go_emotions", "simplified")
train_ds = ds["train"]
val_ds = ds["validation"]
test_ds = ds["test"]

label_names = train_ds.features["labels"].feature.names
len(label_names), label_names

(28,
 ['admiration',
  'amusement',
  'anger',
  'annoyance',
  'approval',
  'caring',
  'confusion',
  'curiosity',
  'desire',
  'disappointment',
  'disapproval',
  'disgust',
  'embarrassment',
  'excitement',
  'fear',
  'gratitude',
  'grief',
  'joy',
  'love',
  'nervousness',
  'optimism',
  'pride',
  'realization',
  'relief',
  'remorse',
  'sadness',
  'surprise',
  'neutral'])

## 3. Text Cleaning

In [37]:
def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"@\w+", "@user", text)  # usernames
    text = re.sub(r"http\S+|www\.\S+", "http", text)  # urls
    text = re.sub(r"\s+", " ", text).strip()
    return text


def add_clean(batch):
    batch["clean_text"] = [clean_text(t) for t in batch["text"]]
    return batch


train_ds = train_ds.map(add_clean, batched=True)
val_ds = val_ds.map(add_clean, batched=True)
test_ds = test_ds.map(add_clean, batched=True)

## 4. Map 27 Emotions → 3 Classes

In [38]:
TARGET_LABELS = ["neutral", "positive", "help"]
label2id = {"neutral": 0, "positive": 1, "help": 2}

help_emotions = {
    "sadness",
    "fear",
    "remorse",
    "grief",
    "disappointment",
    "nervousness",
    "anger",
    "disgust",
    "annoyance",
    "embarrassment",
    "confusion",
}

positive_emotions = {
    "joy",
    "gratitude",
    "love",
    "caring",
    "excitement",
    "admiration",
    "relief",
    "optimism",
    "pride",
}

neutral_emotions = {
    "neutral",
    "curiosity",
    "desire",
    "surprise",
    "approval",
    "disapproval",
    "realization",
}


def map_goemotions_labels(ids):
    emos = {label_names[i] for i in ids}
    if emos & help_emotions:
        return "help"
    if emos & positive_emotions:
        return "positive"
    return "neutral"


def add_target(batch):
    batch["target"] = [map_goemotions_labels(lbls) for lbls in batch["labels"]]
    return batch


train_ds = train_ds.map(add_target, batched=True)
val_ds = val_ds.map(add_target, batched=True)
test_ds = test_ds.map(add_target, batched=True)

## 5. Add Numeric Label ID

In [39]:
def add_label_id(batch):
    batch["label_id"] = [label2id[t] for t in batch["target"]]
    return batch


train_ds = train_ds.map(add_label_id, batched=True)
val_ds = val_ds.map(add_label_id, batched=True)
test_ds = test_ds.map(add_label_id, batched=True)

## 6. Tokenize Text and Attach Labels

In [40]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    enc = tokenizer(
        batch["clean_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    enc["labels"] = batch["label_id"]
    return enc


train_tok = train_ds.map(tokenize, batched=True)
val_tok = val_ds.map(tokenize, batched=True)
test_tok = test_ds.map(tokenize, batched=True)

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

## 7. Remove Extra Columns

In [41]:
remove_cols = ["text", "clean_text", "target", "label_id", "id"]

train_tok = train_tok.remove_columns(remove_cols)
val_tok = val_tok.remove_columns(remove_cols)
test_tok = test_tok.remove_columns(remove_cols)

## 8. Convert to PyTorch Dataset

In [42]:
train_tok = train_tok.with_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
val_tok = val_tok.with_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
test_tok = test_tok.with_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)

train_tok[0]

{'labels': tensor(0),
 'input_ids': tensor([ 101, 2026, 8837, 2833, 2003, 2505, 1045, 2134, 1005, 1056, 2031, 2000,
         5660, 2870, 1012,  102,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 

## 9. Define Model + Trainer

In [43]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
    id2label={0: "neutral", 1: "positive", 2: "help"},
    label2id={"neutral": 0, "positive": 1, "help": 2},
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }


training_args = TrainingArguments(
    output_dir="goemotions_3class_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=32,
    logging_steps=100,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 10. Train and Save Model

In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model("goemotions_3class_model")
tokenizer.save_pretrained("goemotions_3class_model")

  0%|          | 0/7236 [00:00<?, ?it/s]

{'loss': 0.913, 'grad_norm': 5.033580780029297, 'learning_rate': 1.9723604201216142e-05, 'epoch': 0.03}
{'loss': 0.6999, 'grad_norm': 6.683670997619629, 'learning_rate': 1.9447208402432286e-05, 'epoch': 0.06}
{'loss': 0.6808, 'grad_norm': 6.319267272949219, 'learning_rate': 1.9170812603648427e-05, 'epoch': 0.08}
{'loss': 0.6623, 'grad_norm': 5.333232402801514, 'learning_rate': 1.8894416804864568e-05, 'epoch': 0.11}
{'loss': 0.6743, 'grad_norm': 8.517215728759766, 'learning_rate': 1.861802100608071e-05, 'epoch': 0.14}
{'loss': 0.6268, 'grad_norm': 4.32692813873291, 'learning_rate': 1.8341625207296852e-05, 'epoch': 0.17}
{'loss': 0.5757, 'grad_norm': 10.112089157104492, 'learning_rate': 1.8065229408512993e-05, 'epoch': 0.19}
{'loss': 0.6181, 'grad_norm': 9.577991485595703, 'learning_rate': 1.7788833609729134e-05, 'epoch': 0.22}
{'loss': 0.5998, 'grad_norm': 6.249564170837402, 'learning_rate': 1.7512437810945274e-05, 'epoch': 0.25}
{'loss': 0.5975, 'grad_norm': 6.320524215698242, 'learnin

  0%|          | 0/170 [00:00<?, ?it/s]

{'eval_loss': 0.5314818620681763, 'eval_accuracy': 0.7876889052709178, 'eval_f1_macro': 0.7679581912657029, 'eval_runtime': 28.3619, 'eval_samples_per_second': 191.313, 'eval_steps_per_second': 5.994, 'epoch': 1.0}
{'loss': 0.4934, 'grad_norm': 4.408002853393555, 'learning_rate': 9.773355444997236e-06, 'epoch': 1.02}
{'loss': 0.4716, 'grad_norm': 9.094277381896973, 'learning_rate': 9.496959646213379e-06, 'epoch': 1.05}
{'loss': 0.47, 'grad_norm': 5.802071571350098, 'learning_rate': 9.22056384742952e-06, 'epoch': 1.08}
{'loss': 0.4891, 'grad_norm': 6.369538307189941, 'learning_rate': 8.944168048645662e-06, 'epoch': 1.11}
{'loss': 0.49, 'grad_norm': 7.970430850982666, 'learning_rate': 8.667772249861803e-06, 'epoch': 1.13}
{'loss': 0.4376, 'grad_norm': 8.5353422164917, 'learning_rate': 8.391376451077943e-06, 'epoch': 1.16}
{'loss': 0.4514, 'grad_norm': 7.632839679718018, 'learning_rate': 8.114980652294086e-06, 'epoch': 1.19}
{'loss': 0.4658, 'grad_norm': 3.3244190216064453, 'learning_rate

  0%|          | 0/170 [00:00<?, ?it/s]

{'eval_loss': 0.5431628227233887, 'eval_accuracy': 0.7867674161444895, 'eval_f1_macro': 0.7682379656953774, 'eval_runtime': 27.8859, 'eval_samples_per_second': 194.579, 'eval_steps_per_second': 6.096, 'epoch': 2.0}
{'train_runtime': 1592.9886, 'train_samples_per_second': 54.501, 'train_steps_per_second': 4.542, 'train_loss': 0.5334854618892807, 'epoch': 2.0}


('goemotions_3class_model/tokenizer_config.json',
 'goemotions_3class_model/special_tokens_map.json',
 'goemotions_3class_model/vocab.txt',
 'goemotions_3class_model/added_tokens.json',
 'goemotions_3class_model/tokenizer.json')