## EmoLlama pretrained model

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import LlamaForCausalLM, LlamaForSequenceClassification, LlamaTokenizer

### Read the data

In [None]:
# Location of the English training split (track A).
file_path = '/Users/juliamf/Desktop/CMS-CLS/winter_semester24:25/LLMs/project/public_data_dev/track_a/train/eng.csv'

# Load the CSV into `df`. Only the read itself sits in the `try` so that
# nothing else can be mistaken for a missing file.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print("The specified file path is not found. Please check the path and try again.")
    # Every later cell needs `df`; stop here instead of continuing with an
    # undefined (or stale) DataFrame and failing later with a confusing error.
    raise
else:
    print("Dataset Loaded Successfully!")

In [None]:
# Names of the emotion columns inspected below.
labels = ['anger', 'fear', 'joy', 'sadness', 'surprise']

# Column-wise sum of every emotion column (the number of positive examples
# per emotion if the columns are 0/1 indicators) and the sum over all of them.
label_counts = df.loc[:, labels].sum(axis=0)
total = label_counts.sum()
print(label_counts)
print("total labels:", total)
# Overall sum divided by each emotion's sum: larger values mean rarer emotions.
print(total / label_counts)

# Row-wise sum: how many emotion columns are set for each text, followed by
# how often each of those per-text totals occurs.
label_combinations = df.loc[:, labels].sum(axis=1)
print(label_combinations.value_counts())

### load tokenizer and model

In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset pairing each text with its emotion label vector.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of per-text label vectors, aligned with
            ``texts``.
        tokenizer: Callable that accepts a text plus the keyword arguments
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``
            and returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_length: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and float label tensor for example ``idx``."""
        tokenizer_options = dict(
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_options)

        # The tokenizer returns a leading batch axis of size 1; drop it so the
        # DataLoader can add its own batch axis when collating.
        item = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [None]:
# Pull the raw inputs and their targets out of the DataFrame as plain Python
# lists: one string per row, and one 5-element label row per text in the order
# anger, fear, joy, sadness, surprise.
# NOTE: from here on `labels` holds the label rows, not the column names.
texts = df["text"].to_list()
labels = df[["anger", "fear", "joy", "sadness", "surprise"]].to_numpy().tolist()

print("This shows the texts inputs:", texts)
print("This shows the labels for each text input:", labels)

In [None]:
# Hold out 20% of the examples for validation; the fixed seed keeps the split
# reproducible between runs.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(texts, labels, random_state=42, test_size=0.2)

In [None]:
model_name = 'lzw1008/Emollama-7b'

tokenizer = LlamaTokenizer.from_pretrained(model_name)
# EmotionDataset pads every example with padding="max_length", which needs a
# pad token. Fall back to the EOS token when the checkpoint defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The training loop feeds `outputs.logits` to BCEWithLogitsLoss against
# 5-element label vectors, so the model must return one logit per emotion,
# i.e. (batch, 5). A causal-LM head returns (batch, seq_len, vocab_size)
# instead, so the pretrained weights are loaded under a sequence-classification
# head. That head is newly initialised and is what fine-tuning trains.
model = LlamaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
    problem_type="multi_label_classification",
    device_map='auto',
)
# The classification head pools the hidden state of the last non-padding token
# and needs the pad id to find it when a batch holds more than one example.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Wrap each split so that examples are tokenized lazily, one item at a time.
train_dataset = EmotionDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = EmotionDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Quick sanity check that both dataset objects were created.
print(train_dataset, val_dataset, sep="\n")

In [None]:
# Mini-batches of 8 examples; only the training data is reshuffled each epoch,
# validation keeps its original order.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

### training 

In [None]:
# Pick the device that batches are sent to.
# A model loaded with `device_map=...` has already been placed (and possibly
# split or offloaded) by accelerate. Calling `.to()` on it again warns or
# fails, so in that case just feed inputs to wherever its first parameters live.
# NOTE(review): assumes the first parameter sits on the input device - confirm
# for multi-device or offloaded setups.
if getattr(model, "hf_device_map", None):
    device = next(model.parameters()).device
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

# Compute pos_weight for BCEWithLogitsLoss: negatives / positives per emotion,
# so that rarer emotions contribute more to the loss.
labels_tensor = torch.tensor(labels, dtype=torch.float)
num_positives = labels_tensor.sum(dim=0)
num_negatives = labels_tensor.shape[0] - num_positives
# Clamp the denominator so an emotion without any positive example yields a
# finite weight instead of inf (which would turn the loss into inf/NaN).
pos_weight = num_negatives / num_positives.clamp(min=1)
# `pos_weight` is already a tensor: move/cast it rather than re-wrapping it in
# torch.tensor(), which copies and emits a UserWarning.
pos_weight_tensor = pos_weight.to(device=device, dtype=torch.float)

# Sanity check: `num_positives` should match the per-emotion counts printed earlier.
print(label_counts)
print(num_positives, num_negatives)
print(pos_weight_tensor)

loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
optimizer = AdamW(model.parameters(), lr=2e-5)

def compute_metrics(preds, labels, threshold=0.5):
    """Compute multi-label classification metrics from raw logits.

    Args:
        preds: Tensor of logits with one row per example and one column per
            emotion.
        labels: Tensor of 0/1 targets with the same shape as ``preds``.
        threshold: Probability above which an emotion counts as predicted.

    Returns:
        dict with ``"f1_per_emotion"`` (array with one F1 score per column),
        ``"macro_f1"``, ``"micro_f1"`` and ``"subset_accuracy"`` (fraction of
        examples whose whole label row is predicted exactly).
    """
    preds = (torch.sigmoid(preds) > threshold).int()

    # Convert once instead of once per f1_score call.
    y_true = labels.cpu().numpy()
    y_pred = preds.cpu().numpy()

    # zero_division=0 returns the same values as the default but without an
    # UndefinedMetricWarning whenever an emotion is never predicted or never
    # present in the evaluated split.
    f1_per_emotion = f1_score(y_true, y_pred, average=None, zero_division=0)
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    micro_f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
    # An example counts as correct only if every emotion matches.
    subset_accuracy = (preds == labels).all(dim=1).float().mean().item()

    return {
        "f1_per_emotion": f1_per_emotion,
        "macro_f1": macro_f1,
        "micro_f1": micro_f1,
        "subset_accuracy": subset_accuracy,
    }

In [None]:
# Fine-tune for a fixed number of epochs, evaluating on the validation split
# after each one.
epochs = 2
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Deliberately not named `labels`: that module-level name holds the full
        # label list used to compute pos_weight and must not be overwritten.
        batch_labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        loss = loss_fn(outputs.logits, batch_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Training Loss: {avg_loss:.4f}")

    # ---- validation pass ----
    model.eval()
    logit_batches, label_batches = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logit_batches.append(outputs.logits.cpu())
            # Targets are only needed on the CPU for the metrics, so they are
            # not sent to the device first.
            label_batches.append(batch["labels"])

    # Concatenate along the batch axis: one row per validation example.
    preds = torch.cat(logit_batches)
    true_labels = torch.cat(label_batches)

    metrics = compute_metrics(preds, true_labels)
    print(f"Epoch {epoch + 1}, Validation Macro F1: {metrics['macro_f1']:.4f}")
    print(f"Epoch {epoch + 1}, Validation Micro F1: {metrics['micro_f1']:.4f}")
    print(f"Epoch {epoch + 1}, Validation Subset Accuracy: {metrics['subset_accuracy']:.4f}")
    print("Validation F1 per emotion:", metrics["f1_per_emotion"])