In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertModel, Trainer, TrainingArguments
import torch
from torch import nn
from sklearn.metrics import f1_score, accuracy_score
import numpy as np


In [64]:

# Step 1: Load Data
data_path = "../public_data/train/track_a/eng.csv"
data = pd.read_csv(data_path)

# Step 2: Prepare Labels (multi-label)
emotions = ["Anger", "Fear", "Joy", "Sadness", "Surprise"]

# Step 3: Train-Validation-Test Split
train_data, val_data = train_test_split(data, test_size=0.25, random_state=42)
# val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)


In [65]:
test_data = pd.read_csv("../public_data_test/track_a/dev/eng.csv")
# uppercase solumns 'anger', 'fear', 'joy', 'sadness', 'surprise' in test_data
test_data.rename(columns={'anger': 'Anger', 'fear': 'Fear', 'joy': 'Joy', 'sadness': 'Sadness', 'surprise': 'Surprise'}, inplace=True)

In [66]:

# Custom Dataset Class
class EmotionDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.texts = data["text"].tolist()
        self.labels = data[["Anger", "Fear", "Joy", "Sadness", "Surprise"]].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encodings = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encodings["input_ids"].squeeze(0)
        attention_mask = encodings["attention_mask"].squeeze(0)
        return (input_ids, attention_mask), labels

In [68]:

# Step 4: Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

# Convert to Hugging Face Dataset
train_dataset = EmotionDataset(train_data, tokenizer)
val_dataset = EmotionDataset(val_data, tokenizer)
test_dataset = EmotionDataset(test_data, tokenizer)
# test_dataset = Dataset.from_pandas(test_data)

# Step 5: Model Definition
class DistilBertMultiLabel(torch.nn.Module):
    def __init__(self, num_labels):
        super(DistilBertMultiLabel, self).__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased", num_labels=num_labels)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, x):
        input_ids, attention_mask = x
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        return self.fc(pooled_output)
    

# # Model definition
# class BertEmotionClassifier(nn.Module):
#     def __init__(self, num_classes):
#         super(BertEmotionClassifier, self).__init__()
#         self.bert = DistilBertModel.from_pretrained('bert-base-uncased')
#         self.dropout = nn.Dropout(0.3)
#         self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

#     def forward(self, x):
#         input_ids, attention_mask = x
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         pooled_output = self.dropout(outputs.pooler_output)
#         return self.fc(pooled_output)


# Instantiate the model
num_labels = len(emotions)
model = DistilBertMultiLabel(num_labels=num_labels)


In [69]:
train_data

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
1584,eng_train_track_a_01585,"I gasped and my heart began to flutter, it was...",0,1,0,0,1
1800,eng_train_track_a_01801,She is awesome and totally not Bride-zilla-y a...,0,0,1,0,1
2429,eng_train_track_a_02430,I go to New York City about once a year to vis...,0,0,1,0,0
1706,eng_train_track_a_01707,I can see it in my head.,0,0,0,0,0
1027,eng_train_track_a_01028,Awkward lunch time... sometimes I think my mou...,0,1,0,1,0
...,...,...,...,...,...,...,...
1638,eng_train_track_a_01639,"At one point, I got pissed at Bryce for holdin...",1,0,0,0,0
1095,eng_train_track_a_01096,3rd I really don't know... Any of the main cha...,0,1,0,0,0
1130,eng_train_track_a_01131,I blinked a few times and fought with myself t...,0,1,0,1,1
1294,eng_train_track_a_01295,Grandma died of an massive heart attack before...,0,1,0,1,0


In [70]:

# Step 6: Metrics for Multi-label Classification
def compute_metrics(pred):
    logits, labels = pred
    preds = torch.sigmoid(torch.tensor(logits)) > 0.5  # Thresholding at 0.5
    f1 = f1_score(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# Step 7: Training Arguments
training_args = TrainingArguments(
    output_dir="distilbert-multilabel-emotion",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50
)




In [None]:

# FastAI DataLoaders
dls = DataLoaders(train_loader, valid_loader, device=device)
output_path = Path('base_model_2')
output_path.mkdir(exist_ok=True, parents=True)

# Define Learner
learn = Learner(
    dls,
    model,
    loss_func=loss_func,
    opt_func=partial(OptimWrapper, opt=torch.optim.AdamW),
    metrics=[RocAuc()],
    path=output_path
)

# Callbacks
cbs = [
    SaveModelCallback(monitor='valid_loss', fname='best_valid'),
    EarlyStoppingCallback(monitor='valid_loss', patience=9),
    CSVLogger()
]

In [None]:

# # Step 8: Trainer Setup
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )


  trainer = Trainer(


In [72]:

# Step 9: Train the Model
trainer.train()


AttributeError: 'EmotionDataset' object has no attribute '_data'

In [None]:

# Step 10: Save the Model
trainer.save_model("distilbert-multilabel-emotion")


In [None]:

# Step 11: Evaluation
results = trainer.evaluate(test_dataset)
print(results)
