In [2]:
!pip install transformers datasets scikit-learn




In [1]:
!pip install --upgrade fsspec datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are ins

🧠 STEP 1: Load the Dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset("go_emotions")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/350k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

🪄 STEP 2: Preprocess the Text (Tokenize)

In [4]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

# Apply tokenization
encoded_dataset = dataset.map(tokenize, batched=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

🎯 STEP 3: Prepare Labels (Multi-label encoding)

In [5]:
encoded_dataset.reset_format()



In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit label binarizer on training labels
mlb = MultiLabelBinarizer()
mlb.fit(dataset['train']['labels'])

def encode_labels(example):
    example['labels'] = mlb.transform([example['labels']])[0].astype(float).tolist()
    return example

encoded_dataset = encoded_dataset.map(encode_labels)



Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

🏋️ STEP 4: Format for PyTorch

In [7]:
import torch
import datasets

encoded_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
    output_all_columns=False,
    device='cpu'
)

# Explicitly set label dtype to float
encoded_dataset = encoded_dataset.cast_column("labels", datasets.features.Sequence(datasets.Value("float32")))


Casting the dataset:   0%|          | 0/43410 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5426 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5427 [00:00<?, ? examples/s]

🧠 STEP 5: Load Pretrained BERT + Setup Trainer

In [8]:
from transformers import BertForSequenceClassification

# BERT model for multi-label classification (we use BCEWithLogitsLoss under the hood)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🧪 STEP 6: Training Setup

In [9]:
import torch
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def compute_metrics(p):
    preds = (p.predictions > 0.5).astype(int)
    labels = p.label_ids
    return {
        'micro_f1': f1_score(labels, preds, average='micro'),
        'macro_f1': f1_score(labels, preds, average='macro'),
        'accuracy': accuracy_score(labels, preds)
    }

# Set the device here in TrainingArguments
device = "cuda" if torch.cuda.is_available() else "cpu"
from transformers import TrainingArguments

import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir='./results',  # output directory for model checkpoints
    num_train_epochs=3,      # number of training epochs
    per_device_train_batch_size=8,  # batch size for training
    per_device_eval_batch_size=8,   # batch size for evaluation
    logging_dir=None,  # No logging directory for WandB
    logging_steps=500,  # How often to log the metrics
    disable_tqdm=True,  # Disable tqdm progress bar
    report_to="none"    # Disable logging to WandB
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics
)


🚀 STEP 7: Train the Model

In [10]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [11]:
# move model to device
model.to(device)

# Train the model
trainer.train()

{'loss': 0.1689, 'grad_norm': 0.6769249439239502, 'learning_rate': 4.846753884896506e-05, 'epoch': 0.09213193292795283}
{'loss': 0.1209, 'grad_norm': 0.34559500217437744, 'learning_rate': 4.693200663349917e-05, 'epoch': 0.18426386585590565}
{'loss': 0.108, 'grad_norm': 0.3314470648765564, 'learning_rate': 4.539647441803329e-05, 'epoch': 0.2763957987838585}
{'loss': 0.1025, 'grad_norm': 0.5194425582885742, 'learning_rate': 4.3860942202567415e-05, 'epoch': 0.3685277317118113}
{'loss': 0.0978, 'grad_norm': 0.46612316370010376, 'learning_rate': 4.232540998710153e-05, 'epoch': 0.46065966463976415}
{'loss': 0.0988, 'grad_norm': 0.3528313636779785, 'learning_rate': 4.078987777163565e-05, 'epoch': 0.552791597567717}
{'loss': 0.0959, 'grad_norm': 0.4226226508617401, 'learning_rate': 3.925434555616977e-05, 'epoch': 0.6449235304956698}
{'loss': 0.0903, 'grad_norm': 0.7070581316947937, 'learning_rate': 3.771881334070389e-05, 'epoch': 0.7370554634236226}
{'loss': 0.0919, 'grad_norm': 0.549559354782

TrainOutput(global_step=16281, training_loss=0.08017076666045705, metrics={'train_runtime': 3573.806, 'train_samples_per_second': 36.44, 'train_steps_per_second': 4.556, 'train_loss': 0.08017076666045705, 'epoch': 3.0})

🎉 STEP 8: Evaluate the Model

In [12]:
# Evaluate on the test set using the trainer
results = trainer.evaluate()

# Print the results
print("Evaluation results:", results)

from sklearn.metrics import precision_score, recall_score

# Get predictions for the test set
predictions = trainer.predict(encoded_dataset["test"])

# Extract predicted labels (assuming binary classification)
pred_labels = (predictions.predictions > 0.5).astype(int)

# Extract true labels
true_labels = predictions.label_ids

precision = precision_score(true_labels, pred_labels, average='micro')
recall = recall_score(true_labels, pred_labels, average='micro')

print(f"Precision: {precision}")
print(f"Recall: {recall}")




{'eval_loss': 0.0870317816734314, 'eval_micro_f1': 0.5668801808590807, 'eval_macro_f1': 0.43860606780848554, 'eval_accuracy': 0.44692222631772943, 'eval_runtime': 41.2094, 'eval_samples_per_second': 131.669, 'eval_steps_per_second': 16.477, 'epoch': 3.0}
Evaluation results: {'eval_loss': 0.0870317816734314, 'eval_micro_f1': 0.5668801808590807, 'eval_macro_f1': 0.43860606780848554, 'eval_accuracy': 0.44692222631772943, 'eval_runtime': 41.2094, 'eval_samples_per_second': 131.669, 'eval_steps_per_second': 16.477, 'epoch': 3.0}
Precision: 0.7128640776699029
Recall: 0.4640543529783536


Save the Model

In [14]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [15]:
model.save_pretrained("/content/drive/MyDrive/BERT_model")
tokenizer.save_pretrained("/content/drive/MyDrive/BERT_model")

('/content/drive/MyDrive/BERT_model/tokenizer_config.json',
 '/content/drive/MyDrive/BERT_model/special_tokens_map.json',
 '/content/drive/MyDrive/BERT_model/vocab.txt',
 '/content/drive/MyDrive/BERT_model/added_tokens.json')