🧠 STEP 1: Load the Dataset

In [1]:
!pip install datasets



In [2]:
!pip install --upgrade fsspec datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are i

In [3]:
from datasets import load_dataset

dataset = load_dataset("go_emotions")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/350k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

🪄 STEP 2: Preprocess the Text (Tokenize)

In [4]:
from transformers import BertTokenizer, AutoTokenizer, AutoModelForSequenceClassification

model_checkpoint = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenization function
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=64)

# Apply tokenization
encoded_dataset = dataset.map(tokenize, batched=True)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

🎯 STEP 3: Prepare Labels (Multi-label encoding)

In [5]:
encoded_dataset.reset_format()



In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit label binarizer on training labels
mlb = MultiLabelBinarizer()
mlb.fit(dataset['train']['labels'])

def encode_labels(example):
    example['labels'] = mlb.transform([example['labels']])[0].astype(float).tolist()
    return example

encoded_dataset = encoded_dataset.map(encode_labels)



Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

🏋️ STEP 4: Format for PyTorch

In [7]:
import torch
import datasets

encoded_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
    output_all_columns=False,
    device='cpu'
)

# Explicitly set label dtype to float
encoded_dataset = encoded_dataset.cast_column("labels", datasets.features.Sequence(datasets.Value("float32")))


Casting the dataset:   0%|          | 0/43410 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5426 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5427 [00:00<?, ? examples/s]

🧠 STEP 5: Load Pretrained BERT + Setup Trainer

In [8]:
from transformers import BertForSequenceClassification

# RoBERTa model for multi-label classification (we use BCEWithLogitsLoss under the hood)


tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🧪 STEP 6: Training Setup

In [9]:
!pip install --upgrade transformers



In [10]:
import torch
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def compute_metrics(p):
    preds = (p.predictions > 0.5).astype(int)
    labels = p.label_ids
    return {
        'micro_f1': f1_score(labels, preds, average='micro'),
        'macro_f1': f1_score(labels, preds, average='macro'),
        'accuracy': accuracy_score(labels, preds)
    }

# Set the device here in TrainingArguments
device = "cuda" if torch.cuda.is_available() else "cpu"
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir=None,
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_micro_f1",
    greater_is_better=True,
    disable_tqdm=True,
    report_to=None      # Disable logging to WandB explicitly
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics
)


🚀 STEP 7: Train the Model

In [11]:
import torch
import os

# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the selected device
model.to(device)

# **Import wandb and disable it explicitly**
import wandb
wandb.init(mode="disabled")

# Train the model
trainer.train()




{'loss': 0.1823, 'grad_norm': 0.6423712968826294, 'learning_rate': 1.9540261654689515e-05, 'epoch': 0.09213193292795283}
{'loss': 0.1247, 'grad_norm': 0.8257363438606262, 'learning_rate': 1.9079601990049753e-05, 'epoch': 0.18426386585590565}
{'loss': 0.1102, 'grad_norm': 0.47904959321022034, 'learning_rate': 1.8618942325409988e-05, 'epoch': 0.2763957987838585}
{'loss': 0.1033, 'grad_norm': 0.7590142488479614, 'learning_rate': 1.8158282660770222e-05, 'epoch': 0.3685277317118113}
{'loss': 0.0987, 'grad_norm': 1.3979295492172241, 'learning_rate': 1.769762299613046e-05, 'epoch': 0.46065966463976415}
{'loss': 0.0983, 'grad_norm': 0.9217475056648254, 'learning_rate': 1.7236963331490698e-05, 'epoch': 0.552791597567717}
{'loss': 0.0957, 'grad_norm': 1.197446346282959, 'learning_rate': 1.6776303666850932e-05, 'epoch': 0.6449235304956698}
{'loss': 0.0912, 'grad_norm': 0.6612133383750916, 'learning_rate': 1.631564400221117e-05, 'epoch': 0.7370554634236226}
{'loss': 0.0914, 'grad_norm': 0.58944994

TrainOutput(global_step=21708, training_loss=0.08045158202595006, metrics={'train_runtime': 2794.3029, 'train_samples_per_second': 62.141, 'train_steps_per_second': 7.769, 'train_loss': 0.08045158202595006, 'epoch': 4.0})

🎉 STEP 8: Evaluate the Model

In [12]:
# Evaluate on the test set using the trainer
results = trainer.evaluate()

# Print the results
print("Evaluation results:", results)

from sklearn.metrics import precision_score, recall_score

# Get predictions for the test set
predictions = trainer.predict(encoded_dataset["test"])

# Extract predicted labels (assuming binary classification)
pred_labels = (predictions.predictions > 0.5).astype(int)

# Extract true labels
true_labels = predictions.label_ids

precision = precision_score(true_labels, pred_labels, average='micro')
recall = recall_score(true_labels, pred_labels, average='micro')

print(f"Precision: {precision}")
print(f"Recall: {recall}")




{'eval_loss': 0.08506891131401062, 'eval_micro_f1': 0.563970658283319, 'eval_macro_f1': 0.4301003412436445, 'eval_accuracy': 0.4399189089568743, 'eval_runtime': 18.6125, 'eval_samples_per_second': 291.525, 'eval_steps_per_second': 36.481, 'epoch': 4.0}
Evaluation results: {'eval_loss': 0.08506891131401062, 'eval_micro_f1': 0.563970658283319, 'eval_macro_f1': 0.4301003412436445, 'eval_accuracy': 0.4399189089568743, 'eval_runtime': 18.6125, 'eval_samples_per_second': 291.525, 'eval_steps_per_second': 36.481, 'epoch': 4.0}
Precision: 0.722716049382716
Recall: 0.4624743245378417


Save the Model

In [13]:
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [14]:
model.save_pretrained("/content/drive/MyDrive/emotion_model")
tokenizer.save_pretrained("/content/drive/MyDrive/emotion_model")


('/content/drive/MyDrive/emotion_model/tokenizer_config.json',
 '/content/drive/MyDrive/emotion_model/special_tokens_map.json',
 '/content/drive/MyDrive/emotion_model/vocab.json',
 '/content/drive/MyDrive/emotion_model/merges.txt',
 '/content/drive/MyDrive/emotion_model/added_tokens.json',
 '/content/drive/MyDrive/emotion_model/tokenizer.json')

In [15]:
!ls -1 /content/drive/MyDrive/emotion_model


config.json
merges.txt
model.safetensors
special_tokens_map.json
tokenizer_config.json
tokenizer.json
vocab.json
