## Roberta pretrained model

In [2]:
import numpy as np
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
from sklearn.metrics import f1_score
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Read the data

In [3]:
file_path = '/Users/juliamf/Desktop/CMS-CLS/winter_semester24:25/LLMs/project/public_data_dev/track_a/train/eng.csv'

try:
    df = pd.read_csv(file_path)
    print("Dataset Loaded Successfully!")
except FileNotFoundError:
    print("The specified file path is not found. Please check the path and try again.")

Dataset Loaded Successfully!


In [4]:
print(df)

                           id  \
0     eng_train_track_a_00001   
1     eng_train_track_a_00002   
2     eng_train_track_a_00003   
3     eng_train_track_a_00004   
4     eng_train_track_a_00005   
...                       ...   
2763  eng_train_track_a_02764   
2764  eng_train_track_a_02765   
2765  eng_train_track_a_02766   
2766  eng_train_track_a_02767   
2767  eng_train_track_a_02768   

                                                   text  anger  fear  joy  \
0                          Colorado, middle of nowhere.      0     1    0   
1     This involved swimming a pretty large lake tha...      0     1    0   
2           It was one of my most shameful experiences.      0     1    0   
3     After all, I had vegetables coming out my ears...      0     0    0   
4                           Then the screaming started.      0     1    0   
...                                                 ...    ...   ...  ...   
2763  She cants her hip against my waist into my sid...      0  

In [5]:
labels = ['anger', 'fear', 'joy', 'sadness', 'surprise']
label_counts = df[labels].sum()
print(label_counts)
total = label_counts.sum()
print("total labels:", total)
print(total/label_counts)

# Count how many labels each text has
label_combinations = df[labels].sum(axis=1)
print(label_combinations.value_counts())

anger        333
fear        1611
joy          674
sadness      878
surprise     839
dtype: int64
total labels: 4335
anger       13.018018
fear         2.690875
joy          6.431751
sadness      4.937358
surprise     5.166865
dtype: float64
1    1141
2    1031
3     298
0     239
4      57
5       2
Name: count, dtype: int64


### Load tokenizer and model 

A custom dataset class needs to be defined in order to convert the dataframe into a readible format compatible with PyTorch's DataLoader 

In [6]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize the text
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Convert labels to tensor
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label,
        }

In [7]:
# Extract text and labels
texts = df["text"].tolist()
labels = df[["anger", "fear", "joy", "sadness", "surprise"]].values.tolist()  

print("This shows the texts inputs:", texts)
print("This shows the labels for each text input:", labels)

This shows the labels for each text input: [[0, 1, 0, 0, 1], [0, 1, 0, 0, 0], [0, 1, 0, 1, 0], [0, 0, 0, 0, 0], [0, 1, 0, 1, 1], [0, 1, 0, 0, 1], [1, 1, 0, 0, 0], [0, 1, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 1, 0, 0], [0, 0, 0, 0, 0], [0, 0, 1, 0, 1], [0, 1, 0, 0, 0], [0, 1, 0, 1, 0], [0, 0, 1, 0, 0], [1, 1, 0, 0, 0], [0, 1, 0, 0, 1], [0, 0, 1, 0, 0], [0, 0, 1, 0, 1], [1, 0, 0, 0, 0], [0, 1, 0, 0, 1], [0, 1, 0, 1, 0], [0, 1, 0, 0, 1], [0, 1, 0, 0, 0], [0, 1, 0, 0, 0], [0, 1, 0, 0, 0], [0, 1, 0, 0, 1], [0, 1, 0, 1, 0], [0, 0, 1, 0, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [0, 1, 1, 0, 1], [0, 1, 0, 1, 0], [1, 1, 0, 1, 1], [0, 1, 0, 0, 0], [1, 1, 0, 1, 0], [0, 1, 0, 0, 1], [0, 0, 1, 1, 0], [0, 1, 0, 1, 0], [0, 0, 1, 0, 1], [0, 0, 1, 0, 0], [0, 0, 1, 0, 1], [0, 1, 0, 0, 0], [0, 1, 0, 1, 0], [0, 1, 0, 0, 1], [0, 1, 0, 0, 0], [0, 1, 1, 0, 0], [0, 0, 0, 0, 0], [0, 1, 0, 0, 1], [0, 1, 0, 0, 1], [0, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 1, 0, 0, 1], [0, 

In [8]:
# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [9]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [10]:
train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)

print(train_dataset)
print(val_dataset)

<__main__.EmotionDataset object at 0x132b45730>
<__main__.EmotionDataset object at 0x132b45c70>


In [11]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

### Loading the pretrained model Roberta

In [12]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=5)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Defining training components and training loop

- **Loss function**: binary cross-entropy loss for multi-label classification 
- **Optimizer**: Adam optimizer with a learning rate of 2e-5
- **Metrics**: for multilabel tasks, the F1-score together with the accuracy

In [16]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Compute pos_weight for BCEWithLogitsLoss
labels_tensor = torch.tensor(labels, dtype=torch.float)
num_positives = labels_tensor.sum(dim=0)
num_negatives = labels_tensor.shape[0] - num_positives
pos_weight = num_negatives / num_positives
pos_weight_tensor = torch.tensor(pos_weight, dtype=torch.float).to(device)

print(label_counts)
print(num_positives, num_negatives)
print(pos_weight_tensor)

# Compute weights based on label frquency
#weights_tensor = torch.tensor([4335/333, 4335/1611, 4335/674, 4335/878, 4335/839])

# Define the loss function
loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Function to compute metrics
def compute_metrics(preds, labels):
    preds = (torch.sigmoid(preds) > 0.5).int()  # Threshold to get binary predictions
    
    # Per-emotion F1 scores
    f1_per_emotion = f1_score(labels.cpu(), preds.cpu(), average=None)  # One F1 for each label
    
    # Macro F1 Score
    macro_f1 = f1_score(labels.cpu(), preds.cpu(), average="macro")
    
    # Micro F1 Score
    micro_f1 = f1_score(labels.cpu(), preds.cpu(), average="micro")
    
    # Subset accuracy
    subset_accuracy = (preds == labels).all(dim=1).float().mean().item()
    
    return {"f1_per_emotion": f1_per_emotion, "macro_f1": macro_f1, "micro_f1": micro_f1, "subset_accuracy": subset_accuracy}

anger        333
fear        1611
joy          674
sadness      878
surprise     839
dtype: int64
tensor([ 333., 1611.,  674.,  878.,  839.]) tensor([2435., 1157., 2094., 1890., 1929.])
tensor([7.3123, 0.7182, 3.1068, 2.1526, 2.2992])


  pos_weight_tensor = torch.tensor(pos_weight, dtype=torch.float).to(device)


In [15]:
# Training Loop
epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0

    # Training phase
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        
        # Move data to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # Compute loss
        loss = loss_fn(outputs.logits, labels)
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Training Loss: {avg_loss:.4f}")
    
    # Validation phase
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds.extend(outputs.logits.cpu())
            true_labels.extend(labels.cpu())
    
    # Convert predictions and labels to tensors
    preds = torch.stack(preds)
    true_labels = torch.stack(true_labels)
    
    # Compute metrics
    metrics = compute_metrics(preds, true_labels)
    print(f"Epoch {epoch + 1}, Validation Macro F1: {metrics['macro_f1']:.4f}")
    print(f"Epoch {epoch + 1}, Validation Micro F1: {metrics['micro_f1']:.4f}")
    print(f"Epoch {epoch + 1}, Validation Subset Accuracy: {metrics['subset_accuracy']:.4f}")
    print("Validation F1 per emotion:", metrics["f1_per_emotion"])

100%|██████████| 277/277 [26:39<00:00,  5.77s/it] 


Epoch 1, Training Loss: 3.0038
Epoch 1, Validation Macro F1: 0.5992
Epoch 1, Validation Micro F1: 0.6824
Epoch 1, Validation Subset Accuracy: 0.3502
Validation F1 per emotion: [0.35416667 0.79676985 0.59668508 0.575      0.67341772]


100%|██████████| 277/277 [26:17<00:00,  5.70s/it]


Epoch 2, Training Loss: 2.1126
Epoch 2, Validation Macro F1: 0.6722
Epoch 2, Validation Micro F1: 0.7114
Epoch 2, Validation Subset Accuracy: 0.4061
Validation F1 per emotion: [0.54545455 0.78733032 0.66942149 0.68111455 0.67763158]


### Save and evaluate model

In [16]:
# Save model
model.save_pretrained("./roberta_emotion_model_frequency_weights")
tokenizer.save_pretrained("./roberta_emotion_model_frequency_weights")

('./roberta_emotion_model_frequency_weights/tokenizer_config.json',
 './roberta_emotion_model_frequency_weights/special_tokens_map.json',
 './roberta_emotion_model_frequency_weights/vocab.json',
 './roberta_emotion_model_frequency_weights/merges.txt',
 './roberta_emotion_model_frequency_weights/added_tokens.json')