In [None]:
import numpy as np
import pandas as pd
import librosa
import os
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from warnings import simplefilter
from sklearn.metrics import f1_score
from sklearn.utils import resample
import random

In [None]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global generator and PyTorch (CPU and all CUDA devices).
    """
    # Python and NumPy must follow ``seed`` as well; with a hard-coded value a
    # call such as ``set_seed(7)`` would only reseed PyTorch.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Give up cuDNN autotuning in exchange for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [None]:
# Load the pre-split train / validation / test annotation tables.
data_train, data_val, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data_Train_modified.csv',
        'Data_Val_modified.csv',
        'Data_Test_original.csv',
    )
)

In [None]:
# Register ``progress_apply`` on pandas objects.
tqdm.pandas()


def _sentiment_label(score):
    """Map a numeric sentiment score to a class name based on its sign."""
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    # The spelling 'neutaral' is matched verbatim by later cells; keep it.
    return 'neutaral'


# Replace the numeric score with its class name in every split.
for _split in (data_train, data_val, data_test):
    _split['sentiment'] = _split['sentiment'].progress_apply(_sentiment_label)

In [None]:
# Show how many test rows fall into each sentiment class (cell output only;
# nothing is assigned).
data_test["sentiment"].value_counts()

In [None]:
# Balance the training set: keep every neutral row and draw, without
# replacement, an equally sized random subset of the other two classes.
_train_labels = data_train["sentiment"]
data_neutral = data_train[_train_labels == "neutaral"]
data_positive = data_train[_train_labels == "positive"]
data_negative = data_train[_train_labels == "negative"]
neutral_count = data_neutral.shape[0]

data_positive_downsampled, data_negative_downsampled = (
    resample(subset, replace=False, n_samples=neutral_count, random_state=42)
    for subset in (data_positive, data_negative)
)

# Rows stay grouped by class here (neutral, positive, negative).
data_train = pd.concat([data_neutral, data_positive_downsampled, data_negative_downsampled])

In [None]:
# Give every split a fresh 0..n-1 index and expose the clip identifier under
# the name the audio pipeline reads ("audio_file").
# ``drop=True`` discards the old index directly instead of first materialising
# it as an "index" column and then dropping that column again.
data_train = data_train.reset_index(drop=True).rename(columns={"video": "audio_file"})
data_val = data_val.reset_index(drop=True).rename(columns={"video": "audio_file"})
data_test = data_test.reset_index(drop=True).rename(columns={"video": "audio_file"})

In [None]:
# --- Audio / MFCC settings ---------------------------------------------------
audio_folder = "./Audio/WAV_16000/"
SAMPLE_RATE = 16000
N_FFT = 2048      # FFT window size handed to librosa
HOP_LENGTH = 512  # hop between successive frames, in samples
N_MFCC = 13       # number of MFCC coefficients per frame

# Mean duration of a training segment, truncated to an integer. extract_mfcc
# turns it into the fixed number of frames every clip is padded / cropped to.
# start_time / end_time appear to be in seconds (they are multiplied by the
# sample rate when slicing the waveform).
_segment_durations = data_train["end_time"] - data_train["start_time"]
MAX_TIME = int(np.array(_segment_durations).mean())

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- Pretrained BERT used for text feature extraction ------------------------
# NOTE: the name `model` is rebound to the multimodal classifier in a later
# cell, so the BERT feature extraction has to run before that cell does.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased").to(DEVICE)

In [None]:
def extract_mfcc(file_path, start_time, end_time):
    """Compute a fixed-length MFCC matrix for one segment of an audio file.

    Args:
        file_path: Path of the audio file to read.
        start_time: Start of the segment, in seconds.
        end_time: End of the segment, in seconds.

    Returns:
        ``np.ndarray`` of shape ``(max_frames, N_MFCC)`` with
        ``max_frames = int(MAX_TIME * SAMPLE_RATE / HOP_LENGTH)``. Segments
        with fewer frames are zero-padded at the end, longer ones are cropped.
        An empty segment yields an all-zero matrix.
    """
    # Load at the configured rate rather than the file's native one: the frame
    # budget below depends on the sample rate, so a file stored at a different
    # rate would otherwise produce a matrix of a different length.
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)

    # Clamp to zero so a negative timestamp cannot wrap around to the end of
    # the waveform through negative slicing.
    start_sample = max(int(start_time * sr), 0)
    end_sample = max(int(end_time * sr), 0)
    y_segment = y[start_sample:end_sample]

    max_frames = int(MAX_TIME * sr / HOP_LENGTH)

    # Nothing to analyse (segment lies past the end of the file or has zero
    # length): return pure padding instead of letting the STFT fail.
    if y_segment.size == 0:
        return np.zeros((max_frames, N_MFCC), dtype=np.float32)

    mfcc = librosa.feature.mfcc(y=y_segment, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mfcc=N_MFCC)
    mfcc = mfcc.T  # -> (frames, coefficients)

    if len(mfcc) < max_frames:
        pad_width = max_frames - len(mfcc)
        mfcc = np.pad(mfcc, ((0, pad_width), (0, 0)), mode='constant')
    else:
        mfcc = mfcc[:max_frames, :]

    return mfcc

def extract_bert_features(text):
    """Embed a text as the mean of BERT's last-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``. ``model`` must still be
    the BERT encoder when this runs; a later cell rebinds that name.

    Args:
        text: The text to embed.

    Returns:
        1-D ``np.ndarray`` holding the mean-pooled last hidden state of the
        first sequence in the tokenized batch.
    """
    # bert-base-uncased has 512 position embeddings, so truncating at anything
    # larger would let over-long texts index past the embedding table.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        outputs = model(**inputs)

    # Average over the token axis, then take the single sequence of the batch.
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()[0]

In [None]:
def get_mfcc(data):
    """Add an ``mfcc`` column holding each row's MFCC matrix.

    The audio path is ``audio_folder`` joined with the row's ``audio_file``
    plus ``.wav``. ``data`` is modified in place and also returned.
    """
    def _row_to_mfcc(row):
        wav_path = os.path.join(audio_folder, row["audio_file"] + ".wav")
        return extract_mfcc(wav_path, row["start_time"], row["end_time"])

    data["mfcc"] = data.progress_apply(_row_to_mfcc, axis=1)
    return data

def get_bert_embedings(data):
    """Add columns ``bert_0`` .. ``bert_767`` with the embedding of ``text``.

    ``data`` is modified in place and also returned. Calling this installs an
    "ignore" filter for pandas' ``PerformanceWarning``.
    """
    simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

    # Temporary column with one embedding vector per row.
    data["bert_features"] = data["text"].progress_apply(extract_bert_features)

    # Spread each vector over 768 scalar columns, aligned on the row index.
    embedding_frame = pd.DataFrame(data["bert_features"].tolist(), index=data.index)
    bert_columns = ["bert_%d" % i for i in range(768)]
    data[bert_columns] = embedding_frame

    del data["bert_features"]
    return data

In [None]:
# Build the per-modality feature tables. Each helper mutates its argument, so
# it is handed a copy and the annotation frames themselves stay untouched.
_splits = (data_train, data_val, data_test)

audio_train, audio_val, audio_test = [get_mfcc(split.copy()) for split in _splits]
text_train, text_val, text_test = [get_bert_embedings(split.copy()) for split in _splits]

In [None]:
def _with_sentiment_dummies(frame):
    """Return ``frame`` with one indicator column per sentiment label appended."""
    one_hot = pd.get_dummies(frame['sentiment'])
    return pd.concat([frame, one_hot], axis=1)


audio_train, audio_val, audio_test = map(
    _with_sentiment_dummies, (audio_train, audio_val, audio_test)
)
text_train, text_val, text_test = map(
    _with_sentiment_dummies, (text_train, text_val, text_test)
)

In [None]:
# Columns that carry metadata, raw text or labels rather than model inputs.
_NON_FEATURE_COLUMNS = [
    'audio_file', 'start_time', 'end_time', 'sentiment',
    'happy', 'sad', 'anger', 'surprise', 'disgust', 'fear',
    'text', 'ASR',
    'negative', 'neutaral', 'positive',
]
# One-hot sentiment columns used as the training target.
_TARGET_COLUMNS = ['negative', 'neutaral', 'positive']


def _split_features_targets(frame):
    """Return ``(features, targets)``: everything left after dropping the
    non-feature columns, and the three one-hot sentiment columns."""
    features = frame.drop(_NON_FEATURE_COLUMNS, axis=1)
    targets = frame[_TARGET_COLUMNS]
    return features, targets


X_audio_train, y_audio_train = _split_features_targets(audio_train)
X_audio_val, y_audio_val = _split_features_targets(audio_val)
X_audio_test, y_audio_test = _split_features_targets(audio_test)

X_text_train, y_text_train = _split_features_targets(text_train)
X_text_val, y_text_val = _split_features_targets(text_val)
X_text_test, y_text_test = _split_features_targets(text_test)

In [None]:
class CustomDataset(Dataset):
    """In-memory dataset of aligned (text, audio, target) samples.

    All three inputs are converted once to ``float32`` tensors stored on
    ``device``; indexing returns the matching slices of each.
    """

    def __init__(self, inputs_text, input_audio, targets, device):
        """
        Args:
            inputs_text: Array-like of text features, one row per sample.
            input_audio: Array-like of audio features, one entry per sample.
                May be a sequence / pandas Series whose elements are
                equally shaped arrays; these are stacked along a new first axis.
            targets: Array-like of targets, one row per sample.
            device: Torch device (or device string) the tensors are moved to.

        Raises:
            ValueError: If the three inputs do not contain the same number of
                samples.
        """
        self.inputs_text = self._as_float_tensor(inputs_text, device)
        self.inputs_audio = self._as_float_tensor(input_audio, device)
        self.targets = self._as_float_tensor(targets, device)

        sizes = (len(self.inputs_text), len(self.inputs_audio), len(self.targets))
        if len(set(sizes)) != 1:
            raise ValueError(
                f"text, audio and target inputs must have the same length, got {sizes}"
            )

    @staticmethod
    def _as_float_tensor(values, device):
        """Convert ``values`` to a float32 tensor on ``device``."""
        if isinstance(values, torch.Tensor):
            return values.detach().clone().to(dtype=torch.float32, device=device)

        array = np.asarray(values)
        if array.dtype == object:
            # A container of per-sample arrays (e.g. a Series of MFCC
            # matrices). Stack positionally so the result does not depend on
            # the container's index labels and is built in one step.
            if array.size == 0:
                array = np.zeros((0,), dtype=np.float32)
            else:
                array = np.stack([np.asarray(item, dtype=np.float32) for item in array])
        return torch.tensor(array, dtype=torch.float32).to(device)

    def __len__(self):
        """Number of samples."""
        return len(self.inputs_text)

    def __getitem__(self, index):
        """Return ``(text_features, audio_features, target)`` for ``index``."""
        return self.inputs_text[index], self.inputs_audio[index], self.targets[index]

class CNNTextClassifier(nn.Module):
    """1-D CNN that turns a single-channel feature sequence into 128 features.

    Only ``extract_features`` is defined; the class has no ``forward`` and
    ``num_classes`` is accepted but not used.
    """

    def __init__(self, input_shape, num_classes):
        """
        Args:
            input_shape: Shape of the feature table; ``input_shape[1]`` is the
                sequence length fed to the first convolution.
            num_classes: Unused.
        """
        super().__init__()
        # Three conv -> max-pool stages; every pool halves the length.
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Length after three halvings, times the 64 output channels.
        pooled_length = input_shape[1] // 8
        self.linear1 = nn.Linear(pooled_length * 64, 128)

    def extract_features(self, x):
        """Map ``x`` of shape (batch, 1, length) to (batch, 128) features."""
        stages = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
        )
        for conv, pool in stages:
            x = pool(F.relu(conv(x)))
        flat = torch.flatten(x, start_dim=1)
        return F.relu(self.linear1(flat))

class TransformerAudioClassifier(nn.Module):
    """Transformer encoder that pools a frame sequence into 128 features.

    Only ``extract_features`` is defined; the class has no ``forward`` and
    ``num_classes`` is accepted but not used.
    """

    def __init__(self, seq_len, input_dim, num_classes, nhead=4, num_layers=2):
        """
        Args:
            seq_len: Number of frames per sequence (size of the learned
                positional table).
            input_dim: Number of features per frame.
            num_classes: Unused.
            nhead: Attention heads per encoder layer.
            num_layers: Number of stacked encoder layers.
        """
        super().__init__()
        d_model = 128
        self.input_projection = nn.Linear(input_dim, d_model)
        # Learned positional term, uniformly initialised; the singleton middle
        # axis broadcasts over the batch.
        self.positional_encoding = nn.Parameter(torch.rand(seq_len, 1, d_model))
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(
                d_model=d_model, nhead=nhead, dim_feedforward=512
            ),
            num_layers=num_layers,
        )

    def extract_features(self, x):
        """Map ``x`` of shape (seq_len, batch, input_dim) to (batch, 128)."""
        embedded = self.input_projection(x) + self.positional_encoding
        encoded = self.transformer_encoder(embedded)
        # Average over the sequence axis.
        return encoded.mean(dim=0)

class MultimodalSentimentClassifier(nn.Module):
    """Late-fusion classifier over a text encoder and an audio encoder.

    Each encoder's ``extract_features`` output (128 features per sample) is
    concatenated and passed through a three-layer MLP that returns one raw
    score per class.
    """

    def __init__(self, text_model, audio_model, num_classes=3):
        """
        Args:
            text_model: Module exposing ``extract_features`` for text input.
            audio_model: Module exposing ``extract_features`` for audio input.
            num_classes: Size of the output layer.
        """
        super().__init__()
        self.text_model = text_model
        self.audio_model = audio_model

        fused_dim = 128 + 128  # text features followed by audio features
        hidden_dim = 512
        head_layers = [
            nn.Linear(fused_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, text_input, audio_input):
        """Return class scores of shape (batch, num_classes)."""
        features = (
            self.text_model.extract_features(text_input),
            self.audio_model.extract_features(audio_input),
        )
        return self.classifier(torch.cat(features, dim=1))

In [None]:
# Re-seed so model initialisation and batch shuffling are reproducible from
# this cell onwards.
set_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Each dataset pairs the text features with the MFCC matrices and the one-hot
# sentiment targets of the same split.
trainset = CustomDataset(X_text_train.to_numpy(), X_audio_train["mfcc"], y_text_train.to_numpy(), device)
trainloader = DataLoader(trainset, batch_size=256, shuffle=True)

valset = CustomDataset(X_text_val.to_numpy(), X_audio_val["mfcc"], y_text_val.to_numpy(), device)
valloader = DataLoader(valset, batch_size=256, shuffle=False)

testset = CustomDataset(X_text_test.to_numpy(), X_audio_test["mfcc"], y_text_test.to_numpy(), device)
testloader = DataLoader(testset, batch_size=256, shuffle=False)

# Read the audio dimensions and the class count from the data instead of
# hard-coding them. `.iloc[0]` is positional, so it does not rely on the frame
# having a row labelled 0.
first_mfcc = X_audio_train["mfcc"].iloc[0]
num_classes = y_text_train.shape[1]

text_model = CNNTextClassifier(input_shape=X_text_train.shape, num_classes=num_classes)
audio_model = TransformerAudioClassifier(
    seq_len=first_mfcc.shape[0], input_dim=first_mfcc.shape[1], num_classes=num_classes
)
# NOTE: from here on `model` is the multimodal classifier.
model = MultimodalSentimentClassifier(text_model, audio_model, num_classes=num_classes).to(device)

optimizer = optim.AdamW(model.parameters(), lr=3e-4)
loss_function = nn.CrossEntropyLoss()

In [None]:
# Per-epoch mean losses, kept so the curves can be inspected after training.
train_losses = []
val_losses = []

for epoch in range(15):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for X_text, X_audio, y in trainloader:
        X_text = X_text.unsqueeze(1)        # add the channel axis the 1-D CNN expects
        X_audio = X_audio.permute(1, 0, 2)  # put the sequence axis first for the transformer

        optimizer.zero_grad()
        output = model(X_text, X_audio)
        loss = loss_function(output, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(trainloader)
    train_losses.append(train_loss)

    # ---- validation pass ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_text, X_audio, y in valloader:
            X_text = X_text.unsqueeze(1)
            X_audio = X_audio.permute(1, 0, 2)
            output = model(X_text, X_audio)
            val_loss += loss_function(output, y).item()

    val_loss /= len(valloader)
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1}, Training Loss: {train_loss:.4f} , Validation Loss: {val_loss:.4f}")

In [None]:
# Evaluate on the test split and report the weighted F1 score.
model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for X_text, X_audio, y in testloader:
        X_text = X_text.unsqueeze(1)
        X_audio = X_audio.permute(1, 0, 2)
        output = model(X_text, X_audio)
        # Take the arg-max of the raw scores. Squashing them through a sigmoid
        # first cannot change the ranking, but it can introduce ties once
        # large scores saturate to 1.0.
        preds = torch.argmax(output, dim=1)

        all_preds.extend(preds.cpu().numpy())
        # Targets are one-hot rows; recover the class index.
        all_targets.extend(torch.argmax(y, dim=1).cpu().numpy())

all_preds = np.array(all_preds).flatten()
all_targets = np.array(all_targets).flatten()
f1 = f1_score(all_targets, all_preds, average='weighted')

print("F1-score:", f1)
