In [None]:
from pathlib import Path

import pandas as pd
import torch
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

In [2]:
# Location of the preprocessed dataset CSV, relative to the notebook's working directory.
DATA_PATH = "../data/processed/full_processed_dataset.csv"

# Seed PyTorch's global RNG early so that weight initialisation, dropout masks
# and DataLoader shuffling start from the same state on every Restart & Run All.
# (Some CUDA kernels may still be non-deterministic.)
SEED = 42
torch.manual_seed(SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Read the preprocessed dataset from DATA_PATH; a `label` column is required here.
df = pd.read_csv(DATA_PATH)

# Keep the fitted encoder instead of discarding it: `label_encoder.classes_`
# and `label_encoder.inverse_transform(...)` are the only way to map predicted
# class indices back to the original label strings later on.
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label"])

In [4]:
class MultimodalEmotionDataset(Dataset):
    """Dataset yielding ``(image, text_feat, label)`` triples from a DataFrame.

    Each row of ``df`` must provide:
      * ``filepath``      - path of an image file that PIL can open,
      * ``label``         - the class name (only its length is used),
      * ``label_encoded`` - the integer class index.

    Args:
        df: DataFrame with the columns above; rows are addressed by position.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            tuple: ``(image, text_feat, label)`` where ``image`` is the RGB
            image after ``transform`` (the raw PIL image if no transform was
            given), ``text_feat`` is a float32 tensor of shape ``(1,)`` and
            ``label`` is a scalar int64 tensor.
        """
        row = self.df.iloc[idx]

        # Image.open is lazy and holds the file handle open. The context
        # manager closes it deterministically once the pixels have been
        # converted, instead of relying on garbage collection.
        with Image.open(row["filepath"]) as img:
            image = img.convert("RGB")
        if self.transform:
            image = self.transform(image)

        label = torch.tensor(row["label_encoded"], dtype=torch.long)

        # NOTE(review): this feature is the character length of the *label*
        # string, i.e. it is derived from the prediction target. Any model fed
        # with it can partly recover the class from the input (label leakage),
        # so reported accuracies are optimistic. Replace with a feature that
        # does not depend on the label once one is available.
        text_feat = torch.tensor([len(row["label"])], dtype=torch.float32)
        return image, text_feat, label


In [5]:
# Preprocessing shared by both splits: resize to 48x48, convert to a tensor,
# then normalise with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize((48, 48)),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)

# Hold out 20% of the rows for validation, stratified on the encoded label,
# with a fixed random_state so the split is repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label_encoded"],
    random_state=42,
)

# Wrap each split in a dataset using the same preprocessing pipeline.
train_dataset, val_dataset = (
    MultimodalEmotionDataset(split_df, transform=transform)
    for split_df in (train_df, val_df)
)

In [6]:
# Number of samples per mini-batch, used by both loaders.
BATCH_SIZE = 32

# Training batches are shuffled; validation batches keep the dataset order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [7]:
class MultimodalEmotionNet(nn.Module):
    """Two-branch classifier that fuses image features with a scalar feature.

    The image branch is a ResNet-18 (no pretrained weights) whose final
    fully-connected layer is replaced by an identity. The side branch is a
    single linear layer mapping 1 -> 32. Both outputs are concatenated
    (512 + 32 features) and passed through a small MLP head.

    Args:
        num_classes: Size of the output layer (number of logits).
    """

    def __init__(self, num_classes):
        super().__init__()

        # Image branch: strip the classification layer of the backbone.
        backbone = models.resnet18(weights=None)
        backbone.fc = nn.Identity()
        self.cnn = backbone

        # Side-feature branch.
        self.text_fc = nn.Linear(in_features=1, out_features=32)

        # Classification head over the concatenated features.
        fused_dim = 512 + 32
        head_layers = [
            nn.Linear(fused_dim, 256),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, image, text_feat):
        """Return the head's output for a batch of images and side features.

        ``image`` goes through the CNN branch, ``text_feat`` through
        ``text_fc``; the results are concatenated along dim 1.
        """
        visual = self.cnn(image)
        side = self.text_fc(text_feat)
        fused = torch.cat((visual, side), dim=1)
        logits = self.classifier(fused)
        return logits

In [8]:
# One output logit per distinct encoded label in the full dataset.
num_classes = df["label_encoded"].nunique()

# Build the network and move its parameters to the selected device.
model = MultimodalEmotionNet(num_classes)
model = model.to(DEVICE)

# Cross-entropy loss over the logits, optimised with Adam at lr = 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [9]:
best_val_acc = 0
EPOCHS = 20

# Where the weights with the best validation accuracy are written.
CHECKPOINT_PATH = Path("../models/emotion_cnn_best.pt")
# torch.save does not create missing directories. Create the target up front so
# a missing folder fails (or is fixed) before any training time is spent.
CHECKPOINT_PATH.parent.mkdir(parents=True, exist_ok=True)

for epoch in range(EPOCHS):
    # ---- training pass: dropout active, gradients enabled ----
    model.train()
    train_loss_sum, train_correct, train_total = 0.0, 0, 0
    for imgs, texts, labels in train_loader:
        imgs, texts, labels = imgs.to(DEVICE), texts.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(imgs, texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        n_samples = labels.size(0)
        # The criterion returns the batch mean; weight by batch size so the
        # epoch average is correct even when the last batch is smaller.
        train_loss_sum += loss.item() * n_samples
        train_correct += (outputs.argmax(dim=1) == labels).sum().item()
        train_total += n_samples
    train_acc = train_correct / train_total
    train_loss = train_loss_sum / train_total

    # ---- validation pass: dropout disabled, no gradient tracking ----
    model.eval()
    val_correct, val_total = 0, 0
    with torch.no_grad():
        for imgs, texts, labels in val_loader:
            imgs, texts, labels = imgs.to(DEVICE), texts.to(DEVICE), labels.to(DEVICE)
            outputs = model(imgs, texts)
            val_correct += (outputs.argmax(dim=1) == labels).sum().item()
            val_total += labels.size(0)
    val_acc = val_correct / val_total

    # Report the metrics that were previously computed and silently discarded.
    print(
        f"Epoch {epoch + 1:02d}/{EPOCHS} | "
        f"train loss {train_loss:.4f} | train acc {train_acc:.4f} | val acc {val_acc:.4f}"
    )

    # Checkpoint only when validation accuracy strictly improves.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), CHECKPOINT_PATH)