In [None]:
kaggle: (https://www.kaggle.com/datasets/ramisashararnidhi/emotion-dataset?select=Untitled+spreadsheet+-+emotion_dataset.csv)

In [351]:
import os
import numpy as np
import cv2
import pandas as pd

import torch
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader

import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
import ast
import librosa
from transformers import AutoTokenizer

from sklearn.preprocessing import LabelEncoder, scale
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

In [352]:
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

BATCH_SIZE = 16
EPOCHS = 500
lr = 0.0001

In [353]:
emotion_encoder = LabelEncoder()
class Emotion_Dataset(Dataset):
    def __init__(self, transform, mode):
        self.transform = transform
        self.audio, self.text, self.emotion = [],[],[]
        
        model_id = "distilbert-base-uncased"
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        
        df = pd.read_csv('spreadsheet-emotion_dataset.csv')
        self.audio = df['audio_file'].tolist()
        self.text = df['transcribed_data'].tolist()
        self.emotion = df['emotion'].tolist()
        train_size = int(len(df)*0.8)
        if mode == 'train':
            self.audio = self.audio[:train_size]
            self.text = self.text[:train_size]
            self.emotion = self.emotion[:train_size]
            self.emotion = emotion_encoder.fit_transform(self.emotion)
        else:
            self.audio = self.audio[train_size:]
            self.text = self.text[train_size:]
            self.emotion = self.emotion[train_size:]
            self.emotion = emotion_encoder.transform(self.emotion)
        print(self.tokenizer.vocab_size)
        
    def __len__(self):
        return len(self.audio)
    
    def __getitem__(self, idx):
        audio, text, emotion = self.audio[idx], self.text[idx], self.emotion[idx]
        
        audio, _ = librosa.load('audio_file/audio_file/'+audio, sr=16000)
        mfcc = librosa.feature.mfcc(y=audio, sr=16000, n_mfcc=100, n_fft=400, hop_length=160)
        mfcc = scale(mfcc, axis=1)
        pad2d = lambda a, i: a[:, 0:i] if a.shape[1] > i else np.hstack((a, np.zeros((a.shape[0], i-a.shape[1]))))
        mfcc = pad2d(mfcc, 120)
        
        text = self.tokenizer(text, padding=True, truncation=True, add_special_tokens=True)['input_ids']
        text = text + [0]*(20-len(text))
        
        mfcc = torch.tensor(mfcc, dtype=torch.float32).unsqueeze(0)
        text = torch.tensor(text, dtype=torch.long)
        
        return (mfcc, text, emotion)

def tokenize(batch):
    return to

In [354]:
train_transforms = transforms.Compose([transforms.ToTensor(),
                                       transforms.Resize((256,256))])
test_transforms = transforms.Compose([transforms.ToTensor(),
                                       transforms.Resize((256,256))])
train_set = Emotion_Dataset(transform=train_transforms, mode='train')
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
test_set = Emotion_Dataset(transform=test_transforms, mode='test')
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

30522
30522


In [355]:
class MFCCBlock(nn.Module):
    def __init__(self, out_dims):
        super(MFCCBlock, self).__init__()

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2)

        self.conv3 = nn.Conv2d(64, 96, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(2)

        self.flatten = nn.Flatten()

        self.fc1 = nn.Linear(in_features=1, out_features=64)
        self.fc2 = nn.Linear(64, out_dims)

    def forward(self, x):
        x = F.relu(self.conv1(x))  # [B, 32, H, W]
        x = self.pool1(x)

        x = F.relu(self.conv2(x))  # [B, 64, H, W]
        x = self.pool2(x)

        x = F.relu(self.conv3(x))  # [B, 96, H, W]
        x = self.pool3(x)

        x = self.flatten(x)

        if isinstance(self.fc1, nn.Linear) and self.fc1.in_features == 1:
            self.fc1 = nn.Linear(x.size(1), 64).to(x.device)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        return x

In [356]:
class TextBlock(nn.Module):
    def __init__(self, out_dims, vocab_size, embedding_dim, hidden_dim):
        super(TextBlock, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.fc2 = nn.Linear(64, out_dims)
        
    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        
        x = F.relu(self.fc1(outputs[:, -1]))
        x = F.relu(self.fc2(x))
        
        return x

In [357]:
class MultiModal(nn.Module):
    def __init__(self, num_classes=6):
        super(MultiModal, self).__init__()
        out_dims = 64
        self.audiomodel = MFCCBlock(out_dims=64)
        self.textmodel = TextBlock(out_dims=64, vocab_size=30522, embedding_dim=100, hidden_dim=1024)

        self.fc1 = nn.Linear(out_dims*2, 32)
        self.fc2 = nn.Linear(32, num_classes)
        self.relu = nn.ReLU()
    
    def forward(self, audio, text):
        audio_fc = self.audiomodel(audio)
        text_fc = self.textmodel(text)
        
        x = torch.cat([audio_fc, text_fc], axis=1)
        
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

In [358]:
model = MultiModal(num_classes=6).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [359]:
for epoch in range(EPOCHS):
    model.train()
    for i, (audios, texts, emotions) in enumerate(train_loader):
        audios = audios.float().to(device)
        texts = texts.long().to(device)
        emotions = emotions.long().to(device)

        model.zero_grad()
        outputs = model(audios, texts)
        loss = criterion(outputs, emotions)
        loss.backward()
        optimizer.step()

    if epoch%10==0:
        model.eval()
        val_loss = []
        preds, true_labels = [], []
        with torch.no_grad():
            for (audios, texts, emotions) in test_loader:
                audios = audios.float().to(device)
                texts = texts.long().to(device)
                emotions = emotions.long().to(device)
                
                pred = model(audios, texts)
                
                loss = criterion(pred, emotions)
                
                preds += pred.argmax(1).detach().cpu().numpy().tolist()
                true_labels += emotions.detach().cpu().numpy().tolist()
                
                val_loss.append(loss.item())
            
            _val_loss = np.mean(val_loss)
            _val_score = f1_score(true_labels, preds, average='macro')
        
        print(f'Epoch [{epoch}], Train Loss: {loss.item():.4f}, Val Loss: {_val_loss:.5f}, Val Macro F1: {_val_score:.5f}')


Epoch [0], Train Loss: 1.7912, Val Loss: 1.79123, Val Macro F1: 0.00000
Epoch [10], Train Loss: 1.7702, Val Loss: 1.77021, Val Macro F1: 0.08000
Epoch [20], Train Loss: 1.7529, Val Loss: 1.75294, Val Macro F1: 0.08000
Epoch [30], Train Loss: 1.7347, Val Loss: 1.73469, Val Macro F1: 0.08000
Epoch [40], Train Loss: 1.6654, Val Loss: 1.66544, Val Macro F1: 0.08571
Epoch [50], Train Loss: 1.6447, Val Loss: 1.64467, Val Macro F1: 0.22564
Epoch [60], Train Loss: 1.7459, Val Loss: 1.74592, Val Macro F1: 0.22564
Epoch [70], Train Loss: 1.7833, Val Loss: 1.78330, Val Macro F1: 0.35778
Epoch [80], Train Loss: 2.0668, Val Loss: 2.06675, Val Macro F1: 0.15741
Epoch [90], Train Loss: 2.1669, Val Loss: 2.16693, Val Macro F1: 0.18095
Epoch [100], Train Loss: 2.4606, Val Loss: 2.46062, Val Macro F1: 0.29206
Epoch [110], Train Loss: 2.5725, Val Loss: 2.57247, Val Macro F1: 0.32778
Epoch [120], Train Loss: 2.8010, Val Loss: 2.80103, Val Macro F1: 0.29206
Epoch [130], Train Loss: 1.8377, Val Loss: 1.8376