In [None]:
!pip install -q transformers datasets

In [None]:
!pip install -q evaluate

In [None]:
from google.colab import drive

# Mount Google Drive so the fine-tuned model can be written somewhere that
# outlives the Colab runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import os

# Directory on the mounted Drive that receives the final model and tokenizer.
save_path = "/content/drive/MyDrive/beam_emotion_model/"

# Create it up front; exist_ok=True makes the cell safe to re-run.
os.makedirs(save_path, exist_ok=True)

In [None]:
from datasets import load_dataset

# Load the "simplified" GoEmotions configuration: 27 emotion categories plus
# "neutral", i.e. 28 classes in total (label ids 0-27). Each example carries a
# list of label ids because a comment can express more than one emotion.
dataset = load_dataset("go_emotions", "simplified")

# Sanity-check the label space and one raw example before building the model.
label_feature = dataset["train"].features["labels"].feature
print(f"{label_feature.num_classes} classes: {label_feature.names}")
print(dataset["train"][0])

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

num_labels = 27  # GoEmotions simplified version has 27 labels

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)

In [None]:
NUM_LABELS = 27

def preprocess_function(examples):
    texts = examples["text"]
    # Pick first label
    raw_labels = [label[0] for label in examples["labels"]]
    # Clamp labels to [0, NUM_LABELS-1]
    safe_labels = [label if label < NUM_LABELS else 0 for label in raw_labels]
    encodings = tokenizer(texts, padding="max_length", truncation=True)
    encodings["labels"] = safe_labels
    return encodings

encoded_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
from transformers import TrainingArguments

# Hyper-parameters and bookkeeping for the fine-tuning run, gathered in one
# mapping so they are easy to scan and tweak.
training_config = dict(
    # Where intermediate checkpoints are written.
    output_dir="./beam_goemotions_roberta",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best value of the "accuracy" metric when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

training_args = TrainingArguments(**training_config)

In [None]:
import evaluate

# Metric object shared by every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair as supplied by the Trainer.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        of logits against the reference labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = scores.argmax(axis=-1)
    return accuracy.compute(references=gold_labels, predictions=predicted_ids)

In [None]:
from transformers import Trainer

# Pick out the two splits used for fitting and for per-epoch evaluation.
train_split = encoded_dataset["train"]
validation_split = encoded_dataset["validation"]

# Bring the model, run configuration, data and metric function together.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=validation_split,
    train_dataset=train_split,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

In [None]:
# Persist the trained model to the Drive directory created earlier. Because
# the run is configured with load_best_model_at_end=True, the trainer should be
# holding the best-accuracy checkpoint at this point.
trainer.save_model(save_path)
# Save the tokenizer into the same directory so that both can later be
# reloaded from this one path.
tokenizer.save_pretrained(save_path)