# Multi-label Emotion Detection

SemEval-2025 Task 11, Tracks A and C

Fine-tune a model to generate code.
Use our data to fine-tune and evaluate.
Evaluate fine-tuned vs. zero-shot vs. baseline (generate n times and record the error rate).
Use a dataset for training.

## 1A. Environment Set-up 

In [None]:
# Environment check
import os
import sys

# The notebook is expected to run inside the "cs375" conda environment.
assert os.environ['CONDA_DEFAULT_ENV'] == "cs375"

# The interpreter must be exactly Python 3.11.
assert sys.version_info[:2] == (3, 11)

#### Importing other packages

In [None]:
import pandas as pd
import numpy as np
import gc
import torch
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(69)

import sentencepiece

#possibly remove some metrics later
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

import transformers

#Choose between Bert and XLM Roberta
#from transformers import BertTokenizer, BertForSequenceClassification 
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW
#from transformers import AutoTokenizer, AutoModelForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

## 1B. Data exploration

In [None]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The parsed ``DataFrame``.
    """
    return pd.read_csv(file_path)

def split_data(data, test_size=0.2, random_state=42):
    """Split ``data`` into a training part and a validation part.

    Args:
        data: The dataset to split; forwarded to ``train_test_split``.
        test_size: Share of ``data`` assigned to the validation part.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A ``(train_data, val_data)`` tuple.
    """
    return tuple(
        train_test_split(
            data,
            test_size=test_size,
            random_state=random_state,
        )
    )

# Custom Dataset class
class EmotionDataset(Dataset):
    """Dataset that tokenizes a ``text`` column and exposes multi-hot labels.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        data: DataFrame with a ``"text"`` column and one column per label.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every sequence is padded/truncated to.
        label_columns: Names of the label columns, in output order.
            Defaults to ``DEFAULT_LABEL_COLUMNS``.
    """

    # Label columns used when the caller does not supply their own.
    DEFAULT_LABEL_COLUMNS = ("Anger", "Fear", "Joy", "Sadness", "Surprise")

    def __init__(self, data, tokenizer, max_length, label_columns=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        if label_columns is None:
            label_columns = self.DEFAULT_LABEL_COLUMNS
        # Stored as a list so it can be used directly as a pandas selector.
        self.label_columns = list(label_columns)

    def __len__(self):
        """Return the number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized text and label tensor for row ``index``."""
        # Fetch the row once instead of running a positional lookup per field.
        row = self.data.iloc[index]
        text = row["text"]
        labels = row[self.label_columns].values.astype(float)

        # Tokenize text
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            # return_tensors="pt" adds a leading dimension; drop it so the
            # DataLoader can add its own batch dimension.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(labels, dtype=torch.float),
        }

In [None]:
# Initialize tokenizer and model
def initialize_model_and_tokenizer(model_name='xlm-roberta-base', num_labels=5):
    """Load a pretrained XLM-RoBERTa tokenizer and sequence classifier.

    Args:
        model_name: Checkpoint name or path handed to ``from_pretrained``.
            It must be an XLM-RoBERTa checkpoint because the XLM-RoBERTa
            classes are used for loading.
        num_labels: Number of output labels of the classification head.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # NOTE: alternatives that were considered but are not in use:
    #   * "distilbert-base-multilingual-cased" via AutoTokenizer /
    #     AutoModelForSequenceClassification (needs the Auto* import in 1A)
    #   * "bert-base-multilingual-cased" via BertTokenizer /
    #     BertForSequenceClassification (needs the Bert* import in 1A)
    # When switching, load the tokenizer and the model from the same checkpoint.
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        # State the multi-label objective explicitly instead of relying on it
        # being inferred from the label dtype at the first forward pass.
        problem_type="multi_label_classification",
    )
    return tokenizer, model

def train_model(model, train_loader, val_loader, device, epochs=3, learning_rate=5e-5):
    """Fine-tune ``model`` and report the validation score after each epoch.

    For every epoch the model is trained on all batches of ``train_loader``,
    the mean training loss is printed, and ``evaluate_model`` is run on
    ``val_loader``.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes a ``loss`` attribute.
        train_loader: Iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors) that supports
            ``len()``.
        val_loader: Validation batches, forwarded to ``evaluate_model``.
        device: Device the model and the batches are moved to.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for the AdamW optimizer.

    Raises:
        ValueError: If ``train_loader`` contains no batches.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        # Without this guard the average-loss computation below would fail
        # with an uninformative ZeroDivisionError.
        raise ValueError("train_loader is empty; nothing to train on")

    # transformers.AdamW is deprecated in favour of the PyTorch optimizer.
    # eps and weight_decay are pinned to the values the transformers
    # implementation used by default so the training behaviour stays comparable.
    optimizer = optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        eps=1e-6,
        weight_decay=0.0,
    )
    model.to(device)

    for epoch in range(epochs):
        # evaluate_model() switches to eval mode, so re-enable training mode
        # at the start of every epoch.
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            optimizer.zero_grad()

            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device),
            )

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / num_batches
        print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss:.4f}")

        evaluate_model(model, val_loader, device)

# Evaluation function
def evaluate_model(model, val_loader, device, threshold=0.5):
    """Compute and print the macro-averaged F1 score on ``val_loader``.

    Logits are passed through a sigmoid and a label counts as predicted when
    its probability is strictly greater than ``threshold``.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``logits`` attribute.
        val_loader: Iterable of batches (dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors).
        device: Device the inputs are moved to; the model is assumed to
            already live there.
        threshold: Probability cut-off for a positive prediction.

    Returns:
        The macro-averaged F1 score.
    """
    model.eval()
    true_labels = []
    pred_labels = []

    with torch.no_grad():
        for batch in val_loader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )

            probs = torch.sigmoid(outputs.logits)
            preds = (probs > threshold).float()

            # Labels are only needed as NumPy arrays for the metric, so they
            # are not transferred to ``device`` first.
            true_labels.extend(batch["labels"].cpu().numpy())
            pred_labels.extend(preds.cpu().numpy())

    # Calculate F1 score
    f1 = f1_score(np.array(true_labels), np.array(pred_labels), average="macro")
    print(f"Validation F1 Score: {f1:.4f}")
    return f1

In [None]:
# End-to-end run: load the CSV, split it, build the loaders, then fine-tune.
dataset_file_path = "public_data/train/track_a/eng.csv"  # CSV with the full labelled dataset
batch_size = 16  # batch size used by both data loaders
max_length = 128  # every text is padded/truncated to this many tokens
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load data
data = load_data(dataset_file_path)

# Split the dataset into training and validation sets (80/20 split)
train_data, val_data = split_data(data, test_size=0.2)
# NOTE(review): only 100 training and 20 validation rows are kept below --
# presumably a quick smoke-test setting; confirm before a full training run.
train_data = train_data.sample(n=100, random_state=42)
val_data = val_data.sample(n=20, random_state=42)

# Initialize tokenizer and model
tokenizer, model = initialize_model_and_tokenizer()

# Create datasets and data loaders
train_dataset = EmotionDataset(train_data, tokenizer, max_length)
val_dataset = EmotionDataset(val_data, tokenizer, max_length)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Train and evaluate the model
train_model(model, train_loader, val_loader, device, epochs=3)