<a href="https://colab.research.google.com/github/justwhyyy/snakes-ladders/blob/main/distilBERT_go_emotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [50]:
import os
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import torch.nn as nn

In [51]:
data_files = {
    "train": "train.tsv",
    "validation": "dev.tsv",
    "test": "test.tsv"
}

# The dataset has no header, so we specify column names
column_names = ["text", "labels", "id"]
dataset = load_dataset("csv", data_files=data_files, delimiter="\t", column_names=column_names)


In [52]:
print(dataset["train"][0])

{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': '27', 'id': 'eebbqej'}


In [53]:
emotions = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral"
]
num_labels = len(emotions)  # should be 28
print(num_labels)

28


In [54]:
def encode_labels(example):
    label_vec = np.zeros(num_labels, dtype=np.float32)  # Changed to float32
    if isinstance(example["labels"], str) and example["labels"]:
        label_ids = [int(x) for x in example["labels"].split(",")]
        for lid in label_ids:
            label_vec[lid] = 1.0
    example["labels"] = label_vec.tolist()  # Convert numpy array to list
    return example

# Apply the mapping once
dataset = dataset.map(encode_labels)
print("Example after encoding:")
print(dataset["train"][0])


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Example after encoding:
{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], 'id': 'eebbqej'}


In [55]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    # The dataset’s max length is around 30 tokens, but we’ll pad to 64 to be safe.
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=64, return_tensors="pt")

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text", "id"])

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [56]:
# Convert labels to float32 tensors
def prepare_labels(example):
    example["labels"] = torch.tensor(example["labels"], dtype=torch.float32)
    return example

tokenized_dataset = tokenized_dataset.map(prepare_labels)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [57]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification"
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
training_args = TrainingArguments(
    output_dir="./results",
    run_name="my-goemotions-run",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    save_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",  # We’ll define this metric below
    greater_is_better=True,
    fp16=True  # use mixed precision if your GPU supports it, speeds up training
)

In [59]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = (torch.sigmoid(torch.tensor(logits)) > 0.5).numpy().astype(float)

    f1_micro = f1_score(labels, predictions, average='micro')
    f1_macro = f1_score(labels, predictions, average='macro')

    return {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro
    }

In [60]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro
1,0.0847,0.084362,0.564418,0.390693
2,0.0737,0.08174,0.581428,0.453209
3,0.0593,0.08602,0.586513,0.469927


TrainOutput(global_step=8142, training_loss=0.0805601318179463, metrics={'train_runtime': 494.3918, 'train_samples_per_second': 263.415, 'train_steps_per_second': 16.469, 'total_flos': 2157403532221440.0, 'train_loss': 0.0805601318179463, 'epoch': 3.0})

In [61]:
from sklearn.metrics import classification_report

def detailed_metrics(model, eval_dataset):
    trainer.model.eval()
    predictions = trainer.predict(eval_dataset)
    probs = 1 / (1 + np.exp(-predictions.predictions))
    preds = (probs > 0.5).astype(int)

    report = classification_report(
        predictions.label_ids,
        preds,
        target_names=emotions,
        output_dict=True
    )

    # Sort emotions by F1-score
    performance = [(emotion, report[emotion]['f1-score'])
                  for emotion in emotions]
    return sorted(performance, key=lambda x: x[1])

performance = detailed_metrics(model, tokenized_dataset["validation"])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [63]:
def calculate_class_weights(dataset):
    label_counts = np.sum(dataset["train"]["labels"], axis=0)
    total = len(dataset["train"])
    weights = total / (len(emotions) * label_counts)
    return torch.tensor(weights, dtype=torch.float32)

class_weights = calculate_class_weights(tokenized_dataset)
print(class_weights)

tensor([ 0.3754,  0.6660,  0.9894,  0.6277,  0.5275,  1.4263,  1.1333,  0.7076,
         2.4187,  1.2217,  0.7667,  1.9551,  5.1167,  1.8175,  2.6013,  0.5824,
        20.1345,  1.0677,  0.7432,  9.4534,  0.9806, 13.9672,  1.3967, 10.1331,
         2.8447,  1.1692,  1.4626,  0.1090])


In [64]:
# Evaluate on test set
test_results = trainer.evaluate(tokenized_dataset["test"])
print("Test results:", test_results)

Test results: {'eval_loss': 0.08615915477275848, 'eval_f1_micro': 0.5875756496974012, 'eval_f1_macro': 0.47573617448751654, 'eval_runtime': 9.3697, 'eval_samples_per_second': 579.206, 'eval_steps_per_second': 36.287, 'epoch': 3.0}
