# Load Data Set

In [None]:
from datasets import load_dataset
import torch

# Pick the fastest available backend: CUDA GPU first, then Apple-silicon MPS,
# and fall back to the CPU when neither accelerator is present.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Fetch GoEmotions and keep a handle on each raw split.
dataset = load_dataset("go_emotions")

train_dataset_raw, val_dataset_raw, test_dataset_raw = (
    dataset[split_key] for split_key in ("train", "validation", "test")
)

# Report how many examples each split contains.
for split_title, split_data in (
    ("Train", train_dataset_raw),
    ("Validation", val_dataset_raw),
    ("Test", test_dataset_raw),
):
    print(f"{split_title} size: {len(split_data)}")

Using device: mps
Train size: 43410
Validation size: 5426
Test size: 5427


Label Preparation

In [2]:
# Inspect the raw schema and one raw example before any label encoding is applied.
print(train_dataset_raw.column_names)
print(train_dataset_raw[1])

['text', 'labels', 'id']
{'text': 'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead', 'labels': [27], 'id': 'ed00q6i'}


In [3]:
import numpy as np
import torch
from torch.nn import BCEWithLogitsLoss
from collections import Counter

# Width of the target vector: the 27 emotion classes (ids 0-26).
# The 'neutral' class (id 27) is deliberately left out.
num_labels = 27

def encode_labels(example):
    """Replace the example's list of label ids with a multi-hot vector.

    Ids greater than or equal to ``num_labels`` (i.e. 'neutral') are dropped,
    so an example tagged only as neutral ends up with an all-zero vector.
    The example dict is updated in place and also returned.
    """
    multi_hot = np.zeros(num_labels)
    kept_ids = [class_id for class_id in example['labels'] if class_id < num_labels]
    multi_hot[np.asarray(kept_ids, dtype=np.intp)] = 1
    example['labels'] = multi_hot.tolist()
    return example

# Shuffle only the training split (fixed seed for reproducibility), then turn
# every split's label-id lists into multi-hot vectors.
train_dataset = train_dataset_raw.shuffle(seed=42).map(encode_labels)
val_dataset = val_dataset_raw.map(encode_labels)
test_dataset = test_dataset_raw.map(encode_labels)


print("Head of train dataset:")
print(train_dataset[1])
print("\nHead of validation dataset:")
print(val_dataset[0])
print("\nHead of test dataset:")
print(test_dataset[0])


print("\nNumber of labels in train dataset:", len(train_dataset[0]['labels']))


# Materialise the label column once and reuse it for every statistic below,
# instead of re-reading the dataset for each one.
train_label_matrix = np.array(train_dataset["labels"])

all_labels = [tuple(row) for row in train_label_matrix.tolist()]
print("All labels in train dataset:", all_labels[:5])  # Display first 5 for brevity
flat_counts = train_label_matrix.sum(axis=0)
print("Label counts per class:", flat_counts)


# Optional: uncomment to train on a small subset for a quick smoke test.
# train_dataset = train_dataset.select(range(5000))
# val_dataset = val_dataset.select(range(500))


# A sample is "empty" when none of the kept emotion classes is set
# (its only original label was 'neutral').
labels_per_sample = train_label_matrix.sum(axis=1)
empty_count = int((labels_per_sample == 0).sum())
non_empty_count = int(len(labels_per_sample) - empty_count)

print(f"\n\nEmpty label samples: {empty_count}")
print(f"Non-empty label samples: {non_empty_count}")

Head of train dataset:
{'text': 'Done, good luck with your thing.', 'labels': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'id': 'ef6ysvb'}

Head of validation dataset:
{'text': 'Is this in New Orleans?? I really feel like this is New Orleans.', 'labels': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'id': 'edgurhb'}

Head of test dataset:
{'text': 'I’m really sorry about your situation :( Although I love the names Sapphira, Cirilla, and Scarlett!', 'labels': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], 'id': 'eecwqtt'}

Number of labels in train dataset: 27
All lables in train dataset: [(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0), (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 

In [4]:
# Samples whose only label was 'neutral' now carry an all-zero target vector.
# Drop them from the train and validation splits: empty targets add noise and
# teach the model that predicting nothing is normal.
# (test_dataset is not filtered in this cell.)
def _has_any_label(example):
    """Return True when at least one emotion is set in the multi-hot vector."""
    return sum(example['labels']) > 0

train_dataset = train_dataset.filter(_has_any_label)
val_dataset = val_dataset.filter(_has_any_label)

print(f"Filtered train size: {len(train_dataset)}")
print(f"Filtered validation size: {len(val_dataset)}")

Filtered train size: 30587
Filtered validation size: 3834


In [5]:
# Stack the filtered training targets into one (samples x classes) array and
# derive the per-sample label counts from it.
labels_array = np.array([row['labels'] for row in train_dataset])
label_counts = labels_array.sum(axis=1).tolist()

print("Average labels/sample:", np.mean(label_counts))
print("Unique label values:", np.unique(train_dataset[0]['labels']))


# Number of distinct label combinations that occur in the training split.
distinct_vectors = np.unique(labels_array, axis=0)
print("\n\nUnique label vectors in training set:", distinct_vectors.shape[0])

# Eyeball a few texts from each split.
for i in range(5):
    train_text = train_dataset[i]['text']
    print(f"Train sample {i}: {train_text}")
    val_text = val_dataset[i]['text']
    print(f"Val sample {i}:   {val_text}")

Average labels/sample: 1.2058717755909374
Unique label values: [0 1]


Unique label vectors in training set: 642
Train sample 0: I would say they do because it was a horrible accident but they were also in the wrong. The military tried to get them to disperse they didnt.
Val sample 0:   You know the answer man, you are programmed to capture those codes they send you, don’t avoid them!
Train sample 1: Done, good luck with your thing.
Val sample 1:   I've never been this sad in my life!
Train sample 2: Eat better.
Val sample 2:   The economy is heavily controlled and subsidized by the government. In any case, I was poking at the lack of nuance in US politics today
Train sample 3: So you stay on your theft from auto waiting for them to dispatch the next shift to the backlog of calls.
Val sample 3:   He could have easily taken a real camera from a legitimate source and change the price in Word/Photoshop and then print it out.
Train sample 4: Glad to hear it's ubiquitous and not an OS thing

# Tokenize the Text with DistilBertTokenizerFast

In [6]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    """Tokenize a batch of examples to a fixed length of 128 tokens.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded examples have the same length.
    """
    texts = batch["text"]
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Tokenize each split in batches.
train_dataset_tk = train_dataset.map(tokenize, batched=True)
val_dataset_tk = val_dataset.map(tokenize, batched=True)
test_dataset_tk = test_dataset.map(tokenize, batched=True)

# Expose only the model inputs and targets, as torch tensors, so the datasets
# are ready for PyTorch training.
torch_columns = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (train_dataset_tk, val_dataset_tk, test_dataset_tk):
    tokenized_split.set_format("torch", columns=torch_columns)

print("Head of tokenized train dataset:")
print(train_dataset_tk[1])


Head of tokenized train dataset:
{'labels': tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]), 'input_ids': tensor([ 101, 2589, 1010, 2204, 6735, 2007, 2115, 2518, 1012,  102,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0, 

In [7]:
from transformers import Trainer
from sklearn.metrics import f1_score, accuracy_score

# Per-class positive counts over the (filtered) training split.
labels_matrix = np.array(train_dataset["labels"])
num_samples = labels_matrix.shape[0]
label_freq = labels_matrix.sum(axis=0)

# pos_weight = negatives / positives for each class. The small epsilon avoids
# a divide-by-zero for a class with no positives. Weights are floored at 1.0;
# there is no upper bound, so rare classes can still get very large weights.
negatives_per_class = num_samples - label_freq
pos_weights = negatives_per_class / (label_freq + 1e-5)
pos_weights = np.maximum(pos_weights, 1.0)

class_weights_tensor = torch.tensor(pos_weights, dtype=torch.float).to(device)


class MultiLabelTrainer(Trainer):
    """Trainer whose loss is class-weighted binary cross-entropy on the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute BCE-with-logits loss weighted by ``class_weights_tensor``.

        The "labels" entry is removed from ``inputs`` before the forward pass,
        so the model does not compute its own loss. Returns ``(loss, outputs)``
        when ``return_outputs`` is true, otherwise just the loss.
        """
        targets = inputs.pop("labels").float()
        model_outputs = model(**inputs)

        weighted_bce = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights_tensor)
        loss = weighted_bce(model_outputs.logits, targets)

        if return_outputs:
            return (loss, model_outputs)
        return loss


def compute_metrics(pred, threshold=0.7):
    """Compute subset accuracy and micro-F1 for multi-label predictions.

    Args:
        pred: Object exposing ``predictions`` (raw logits, or a tuple whose
            first element is the logits) and ``label_ids`` (multi-hot targets).
        threshold: Probability at or above which a class counts as predicted.
            Defaults to 0.7, the value this notebook used previously.

    Returns:
        Dict with keys "accuracy" and "f1".
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    logits = np.asarray(logits)
    labels = pred.label_ids

    # Numerically stable sigmoid: 1 / (1 + exp(-x)) == exp(-log(1 + exp(-x))).
    # The naive form overflows in exp() for large negative logits.
    probs = np.exp(-np.logaddexp(0, -logits))
    y_pred = (probs >= threshold).astype(int)

    # Show one example per evaluation; skipped when the eval set is empty.
    if len(labels) > 0:
        print("Sample true:", labels[0])
        print("Sample pred:", y_pred[0])

    # zero_division=0 keeps the score at 0 (without a warning) when nothing is predicted.
    f1 = f1_score(labels, y_pred, average='micro', zero_division=0)
    acc = accuracy_score(labels, y_pred)

    return {"accuracy": acc, "f1": f1}

In [8]:
from transformers import DistilBertForSequenceClassification
from transformers import TrainerCallback

# Load the pretrained DistilBERT encoder with a fresh multi-label
# classification head (one output per entry in the target vector).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification",
)
# Move the model to the selected device (a single move is sufficient).
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import TrainingArguments, Trainer

# Hugging Face Trainer configuration.
# NOTE: train_model (defined later in this notebook) receives this object but never
# reads it; that loop uses its own epoch count, batch size, and optimizer settings.
training_args = TrainingArguments(
    output_dir="./lotuso",
    # Evaluate and checkpoint on the same (per-epoch) schedule.
    eval_strategy="epoch",   
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    warmup_steps=500,
    lr_scheduler_type="cosine",
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=2,
    # Track the checkpoint with the highest "f1" and restore it after training.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
class BestF1Callback(TrainerCallback):
    """Request a checkpoint only when the evaluation F1 improves."""

    def __init__(self):
        # Best "eval_f1" seen so far; a score must be strictly greater to count.
        self.best_f1 = 0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Set ``control.should_save`` according to whether F1 improved.

        ``metrics`` may be None; it is then treated as an empty dict, which
        gives an F1 of 0 and therefore no save request.
        """
        metrics = metrics or {}
        f1 = metrics.get("eval_f1", 0)
        if f1 > self.best_f1:
            print(f"\nNew best F1: {f1:.4f}")
            self.best_f1 = f1
            control.should_save = True
        else:
            control.should_save = False
        return control

In [10]:
# from transformers import AdamW
from torch.optim import AdamW
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
from tqdm import tqdm

# Loss and optimizer for the manual training loop.
# NOTE: the criterion is built without pos_weight, so this loop trains with
# plain (unweighted) binary cross-entropy on the logits.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)
# DataLoaders over the tokenized splits; only the training loader is shuffled.
train_loader = DataLoader(train_dataset_tk, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset_tk, batch_size=16)

def train_model(train_loader, val_loader, model, training_args, num_epochs=5, threshold=0.5):
    """Fine-tune ``model`` with a manual loop and validate after every epoch.

    Uses the module-level ``criterion``, ``optimizer`` and ``device``.

    Args:
        train_loader: Iterable of training batches (dicts of tensors with a "labels" key).
        val_loader: Iterable of validation batches in the same format.
        model: Model whose forward pass returns an object with ``.logits``.
        training_args: Accepted for interface compatibility; not read by this loop.
        num_epochs: Number of passes over ``train_loader`` (default 5).
        threshold: Probability at or above which a class counts as predicted
            during validation (default 0.5).

    Returns:
        A list with one dict per epoch: {"epoch", "train_loss", "val_f1"}.
    """
    history = []
    for epoch in range(num_epochs):
        # ---- Training ----
        model.train()
        total_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            # BCE-with-logits expects float targets.
            labels = batch["labels"].to(device).float()

            outputs = model(**inputs)
            loss = criterion(outputs.logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Guard against an empty loader so the average never divides by zero.
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch + 1} - Training Loss: {avg_loss:.4f}")

        # ---- Evaluation ----
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
                labels = batch["labels"].cpu().numpy()
                logits = model(**inputs).logits
                probs = torch.sigmoid(logits).cpu().numpy()
                preds = (probs >= threshold).astype(int)

                all_preds.extend(preds)
                all_labels.extend(labels)

        f1 = f1_score(all_labels, all_preds, average="micro")
        print(f"Epoch {epoch + 1} - Validation F1: {f1:.4f}")

        history.append({"epoch": epoch + 1, "train_loss": avg_loss, "val_f1": f1})

    # Note: the model is left with the weights of the final epoch, which is not
    # necessarily the epoch with the best validation F1.
    return history


In [11]:
# Run the manual training loop; `hist` holds whatever train_model returns.
hist = train_model(train_loader, val_loader, model, training_args)


Training Epoch 1: 100%|██████████| 1912/1912 [06:31<00:00,  4.88it/s]


Epoch 1 - Training Loss: 0.1384
Epoch 1 - Validation F1: 0.5534


Training Epoch 2: 100%|██████████| 1912/1912 [08:11<00:00,  3.89it/s]


Epoch 2 - Training Loss: 0.0875
Epoch 2 - Validation F1: 0.6043


Training Epoch 3: 100%|██████████| 1912/1912 [08:16<00:00,  3.85it/s]


Epoch 3 - Training Loss: 0.0744
Epoch 3 - Validation F1: 0.6000


Training Epoch 4: 100%|██████████| 1912/1912 [08:01<00:00,  3.97it/s]


Epoch 4 - Training Loss: 0.0625
Epoch 4 - Validation F1: 0.6089


Training Epoch 5: 100%|██████████| 1912/1912 [07:58<00:00,  4.00it/s]


Epoch 5 - Training Loss: 0.0509
Epoch 5 - Validation F1: 0.5971


In [12]:
from pathlib import Path

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the raw state dict.
weights_path = Path('./saved_final_model/lotus_mul_emotion_classifier_v1.pt')
weights_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), weights_path)

In [13]:

# Emotion names in class-id order, with 'neutral' removed so that position i
# matches output i of the classifier.
label_feature = dataset['train'].features['labels'].feature
label_list = [
    name
    for name in (label_feature.int2str(i) for i in range(label_feature.num_classes))
    if name.lower() != 'neutral'
]

# Index <-> name maps for the kept labels
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

# Use integer keys for id2label: in-memory lookups (e.g. config.id2label[0])
# index by int. Keys are written as strings in the saved JSON either way.
model.config.id2label = dict(id2label)
model.config.label2id = label2id
model.config.num_labels = len(label_list)
model.config.problem_type = "multi_label_classification"

# Save model and tokenizer huggingface format
model.save_pretrained("./saved_final_model/lotus_emotion_model_v1")
tokenizer.save_pretrained("./saved_final_model/lotus_emotion_model_v1")


('./saved_final_model/lotus_emotion_model_v1/tokenizer_config.json',
 './saved_final_model/lotus_emotion_model_v1/special_tokens_map.json',
 './saved_final_model/lotus_emotion_model_v1/vocab.txt',
 './saved_final_model/lotus_emotion_model_v1/added_tokens.json',
 './saved_final_model/lotus_emotion_model_v1/tokenizer.json')

In [14]:

def sigmoid(x):
    """Element-wise logistic function used to turn logits into independent probabilities."""
    return 1 / (1 + np.exp(-x))

# Index -> name lookup taken from the dataset's label feature.
# This is a callable, used as id2label(i).
label_feature_for_lookup = dataset['train'].features['labels'].feature
id2label = label_feature_for_lookup.int2str


def predict_emotions(text, threshold=0.5):
    """Return the emotions predicted for a single text.

    Args:
        text: The input string to classify.
        threshold: Minimum sigmoid probability for an emotion to be reported.

    Returns:
        A list of ``(label_name, probability)`` tuples, in class-id order,
        for every class whose probability is at least ``threshold``.
    """
    # Inference must run in eval mode so dropout is disabled, regardless of
    # what mode the model was left in.
    model.eval()

    # Tokenize and move input to the same device as the model
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        # One text in, so take row 0; convert to plain Python floats on the CPU.
        probs = torch.sigmoid(logits)[0].cpu().tolist()

    # Return all emotions with prob >= threshold
    return [(id2label(i), float(p)) for i, p in enumerate(probs) if p >= threshold]

# Example usage: each call prints the (label, probability) pairs that reach the default threshold.
#print(predict_emotions("I am scared and angry, but also a bit hopeful."))

print(predict_emotions("loved"))
print(predict_emotions("I am happy and sad at the same time."))


[('love', 0.9945964217185974)]
[('sadness', 0.9196999669075012)]
