In [6]:
import numpy as np
x = np.load("/project/msoleyma_1026/ecp/data/audio/train-emotion/dia1utt1.npy")
x.shape

(1, 1024)

In [7]:
import os
import numpy as np
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

In [8]:
class ConversationDataset(Dataset):
    def __init__(self, data_dir, embeddings_dir):
        self.data_dir = data_dir
        self.embeddings_dir = embeddings_dir
        self.conversations = self.load_conversations()
        self.emotion_list = sorted(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness','surprise'])
        self.encoder = LabelEncoder()
        self.encoder.fit(self.emotion_list)
        self.class_weights = self.compute_class_weights()

    def compute_class_weights(self):
        all_labels = []
        for conversation in self.conversations:
            for utterance in conversation['conversation']:
                all_labels.append(utterance['emotion'])
        class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(all_labels), y=all_labels)
        return torch.tensor(class_weights, dtype=torch.float32)

    def encode_emotion(self, emotion):
        encoded = self.encoder.transform([emotion])[0]
        return encoded

    def load_conversations(self):
        with open(self.data_dir, 'r') as file:
            conversations = json.load(file)
        # print(f"Loaded {len(conversations)} conversations")
        return conversations

    def load_embeddings(self, video_name):
        video_name = video_name.split('.')[0]

        embedding_file = os.path.join(self.embeddings_dir, f'{video_name}.npy')
        embedding = np.load(embedding_file)
        return embedding

    def positional_encoding(self, embeddings):
        seq_length = embeddings.shape[0]
        embedding_dim = embeddings.shape[1]
        position_enc = torch.zeros(seq_length, embedding_dim)
        position = torch.arange(0, seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-np.log(10000.0) / embedding_dim))
        position_enc[:, 0::2] = torch.sin(position * div_term)
        position_enc[:, 1::2] = torch.cos(position * div_term)

        return position_enc

    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        conversation = self.conversations[idx]
        context_embeddings = []
        emotions = []
        conversation_ID = conversation['conversation_ID']
        utterance_list = []
        labels_list = []

        for utterance in conversation['conversation']:
            video_name = utterance['video_name']
            utterance_ID = utterance['utterance_ID']
            utterance_list.append(utterance_ID)
            emotion = self.encode_emotion(utterance['emotion'])
            labels_list.append(emotion)
            embedding = self.load_embeddings(video_name)
            context_embeddings.append(torch.from_numpy(embedding))
            emotions.append(emotion)

        max_seq_length = 33
        utterance_list = utterance_list + [0] * (33 - len(utterance_list)) if len(utterance_list) < 33 else utterance_list
        labels_list = labels_list + [-1] * (33 - len(labels_list)) if len(labels_list) < 33 else labels_list

        padded_embeddings = []
        num_to_add = 0
        if len(context_embeddings) < max_seq_length:
            num_to_add = max_seq_length - len(context_embeddings)
            zero_tensor = torch.zeros((1,1024), dtype=torch.float32)
            context_embeddings += [zero_tensor] * num_to_add

        context_embeddings_padded = torch.cat(context_embeddings, dim=0)

        positional_encodings = self.positional_encoding(context_embeddings_padded)
        context_embeddings_with_pos = context_embeddings_padded + positional_encodings


        emotions += [-1] * num_to_add
        encoded_emotions_tensor = torch.tensor(emotions, dtype=torch.long)


        return context_embeddings_with_pos, encoded_emotions_tensor, torch.tensor(conversation_ID), utterance_list, labels_list

# class EmotionDetector(nn.Module):
#     def __init__(self, input_dim, hidden_dim, num_emotions, n_layers=2, dropout=0.2):
#         super(EmotionDetector, self).__init__()
#         self.input_dim = input_dim
#         self.hidden_dim = hidden_dim
#         self.num_emotions = num_emotions
#         self.n_layers = n_layers

#         encoder_layers = TransformerEncoderLayer(d_model=input_dim, nhead=4, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True)
#         self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers=n_layers)

#         self.decoder_emotion = nn.Linear(input_dim, num_emotions)
#         print("Initialized EmotionDetector")

#     def forward(self, context_embeddings):
#         encoded_context = self.transformer_encoder(context_embeddings)
#         prediction_emotion = self.decoder_emotion(encoded_context)  # (batch_size, seq_length, num_emotions)
#         return prediction_emotion

In [9]:
class EmotionDetector(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_emotions, n_layers=4, dropout=0.2, class_weights=None):
        super(EmotionDetector, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_emotions = num_emotions
        self.n_layers = n_layers
        self.class_weights = class_weights

        # Forward Transformer encoder
        self.forward_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=4, dim_feedforward=hidden_dim, dropout=dropout),
            num_layers=n_layers)

        # Backward Transformer encoder
        self.backward_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=4, dim_feedforward=hidden_dim, dropout=dropout),
            num_layers=n_layers)

        # Linear layer for emotion prediction
        self.decoder_emotion = nn.Sequential(
            nn.Linear(input_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_emotions))

        print("Initialized EmotionDetector")

    def forward(self, context_embeddings):
        # Forward pass
        encoded_forward = self.forward_encoder(context_embeddings)

        # Reverse the sequence for backward pass
        context_embeddings_reversed = torch.flip(context_embeddings, dims=[1])

        # Backward pass
        encoded_backward = self.backward_encoder(context_embeddings_reversed)

        # Concatenate forward and backward outputs
        encoded_context = torch.cat((encoded_forward, encoded_backward), dim=-1)

        # Predict emotions
        prediction_emotion = self.decoder_emotion(encoded_context)

        return prediction_emotion

In [10]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("Cleared CUDA cache")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


data_dir = "/project/msoleyma_1026/ecp/data/train.json"
embeddings_dir = "/project/msoleyma_1026/ecp/data/audio/train-emotion/"
num_emotions = 7  # Number of emotions for classification
input_dim = 1024  # Dimensionality of your embeddings
hidden_dim = 512  # Hidden dimension for the Transformer
n_layers = 4  # Number of layers in the Transformer
dropout = 0  # Dropout probability
batch_size = 32
num_epochs = 50
learning_rate = 1e-5
print_interval = 5

dataset = ConversationDataset(data_dir, embeddings_dir)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

class_weights = dataset.class_weights.to(device)
print(class_weights)

model = EmotionDetector(input_dim, hidden_dim, num_emotions, n_layers=n_layers, dropout=dropout, class_weights=class_weights).to(device)

criterion_emotion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    model.train()
    epoch_loss_emotion = 0.0
    for batch_idx, (context_embeddings, emotions, convID, utteranceList, labelsList) in enumerate(dataloader):
        optimizer.zero_grad()
        context_embeddings, emotions = context_embeddings.to(device), emotions.to(device)
        prediction_emotion_logits = model(context_embeddings)
        
        outputs_reshaped = prediction_emotion_logits.view(-1, num_emotions)
        emotions_reshaped = emotions.view(-1)

        # Compute the loss only on non-padded data points
        active_outputs = outputs_reshaped[emotions_reshaped != -1]
        active_emotions = emotions_reshaped[emotions_reshaped != -1]
        loss_emotion = criterion_emotion(active_outputs, active_emotions)
        
        loss_emotion.backward()
        optimizer.step()

        epoch_loss_emotion += loss_emotion.item()

        if (batch_idx + 1) % print_interval == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_idx + 1}/{len(dataloader)}], '
                  f'Emotion Loss: {loss_emotion.item():.4f}')

    print(f'Epoch [{epoch + 1}/{num_epochs}], Average Emotion Loss: {epoch_loss_emotion / len(dataloader):.4f}')

torch.save(model.state_dict(), '/project/msoleyma_1026/ecp/models/emotion/late/emotion_detection_model_late_audio.pt')

Cleared CUDA cache
Using device: cuda
tensor([1.2317, 4.7134, 4.9811, 0.8435, 0.3284, 1.7428, 1.0300],
       device='cuda:0')
Initialized EmotionDetector




Epoch [1/50], Batch [5/35], Emotion Loss: 1.9595
Epoch [1/50], Batch [10/35], Emotion Loss: 1.9316
Epoch [1/50], Batch [15/35], Emotion Loss: 1.9466
Epoch [1/50], Batch [20/35], Emotion Loss: 1.9429
Epoch [1/50], Batch [25/35], Emotion Loss: 1.9459
Epoch [1/50], Batch [30/35], Emotion Loss: 1.9530
Epoch [1/50], Batch [35/35], Emotion Loss: 1.9335
Epoch [1/50], Average Emotion Loss: 1.9482
Epoch [2/50], Batch [5/35], Emotion Loss: 1.9169
Epoch [2/50], Batch [10/35], Emotion Loss: 1.9625
Epoch [2/50], Batch [15/35], Emotion Loss: 1.9431
Epoch [2/50], Batch [20/35], Emotion Loss: 1.9689
Epoch [2/50], Batch [25/35], Emotion Loss: 1.9407
Epoch [2/50], Batch [30/35], Emotion Loss: 1.9301
Epoch [2/50], Batch [35/35], Emotion Loss: 1.9199
Epoch [2/50], Average Emotion Loss: 1.9426
Epoch [3/50], Batch [5/35], Emotion Loss: 1.9527
Epoch [3/50], Batch [10/35], Emotion Loss: 1.9617
Epoch [3/50], Batch [15/35], Emotion Loss: 1.9543
Epoch [3/50], Batch [20/35], Emotion Loss: 1.9342
Epoch [3/50], Bat

RuntimeError: Parent directory /project/msoleyma_1026/ecp/models/emotion/late does not exist.

In [11]:
def evaluate_model(model, dataloader, criterion, device, num_emotions):
    """
    Evaluate the model on the given dataloader and compute loss and accuracy, accounting for padding.

    Args:
    - model (torch.nn.Module): The model to evaluate.
    - dataloader (DataLoader): The DataLoader providing the dataset.
    - criterion (loss function): The loss function used to evaluate the model's performance.
    - device (torch.device): The device computations will be performed on.
    - num_emotions (int): Number of emotion categories used in the model output.

    Returns:
    - float: Average loss over the dataset.
    - float: Accuracy, excluding padded data points.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_valid = 0  # Total non-padded data points
    all_predictions = []
    all_true_emotions = []
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            context_embeddings, emotions, conversationList, utteranceList, labelsList = batch
            context_embeddings, emotions = context_embeddings.to(device), emotions.to(device)

            outputs = model(context_embeddings)

            # Reshape for loss calculation
            outputs_reshaped = outputs.view(-1, num_emotions)
            emotions_reshaped = emotions.view(-1)

            # Compute the loss only on non-padded data points
            active_outputs = outputs_reshaped[emotions_reshaped != -1]
            active_emotions = emotions_reshaped[emotions_reshaped != -1]
            loss = criterion(active_outputs, active_emotions)
            total_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs, dim=2)
            valid_data = (emotions != -1)
            correct_predictions = predicted.eq(emotions) & valid_data
            total_correct += correct_predictions.sum().item()
            total_valid += valid_data.sum().item()

            all_predictions.extend(predicted[valid_data].cpu().numpy())
            all_true_emotions.extend(active_emotions.cpu().numpy())

            for i, conv_id in enumerate(conversationList):
                conv_data = {'conversation_id': int(conv_id), 'utterances': []}
                for j, utterance_tensor in enumerate(utteranceList):
                    if int(utterance_tensor[i]) != 0:
                        conv_data['utterances'].append({'utt_id': int(utterance_tensor[i]), 'emotion': int(predicted[i][j])})
                predictions.append(conv_data)

    file_path = "/project/msoleyma_1026/ecp/data/predictions/late_fusion_predictions_audio_emotion.json"

    with open(file_path, 'w') as file:
        json.dump(predictions, file, indent=4)
    
    f1_score_val = f1_score(all_true_emotions, all_predictions, average='weighted')
    classification_report_result = classification_report(all_true_emotions, all_predictions)

    average_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_valid if total_valid > 0 else 0
    
    return average_loss, accuracy, f1_score_val, classification_report_result

In [12]:
data_dir_test = "/project/msoleyma_1026/ecp/data/test.json"
embeddings_dir_test = "/project/msoleyma_1026/ecp/data/audio/test-emotion/"

dataset = ConversationDataset(data_dir_test, embeddings_dir_test)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
validation_dataloader = DataLoader(dataset, batch_size=8, shuffle=False)  # Same dataloader for simplicity
model.to(device)
criterion_emotion = nn.CrossEntropyLoss()

val_loss, val_accuracy, val_f1, val_class_report = evaluate_model(model, validation_dataloader, criterion_emotion, device, num_emotions=7)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}%")
print(f"F1 Score: {val_f1 * 100:.2f}%")
print(val_class_report)

Validation Loss: 1.9511, Validation Accuracy: 0.1247%
F1 Score: 11.00%
              precision    recall  f1-score   support

           0       0.22      0.23      0.22       333
           1       0.05      0.04      0.04        79
           2       0.03      0.52      0.05        56
           3       0.22      0.11      0.15       429
           4       0.57      0.01      0.01      1121
           5       0.16      0.35      0.22       241
           6       0.22      0.23      0.23       307

    accuracy                           0.12      2566
   macro avg       0.21      0.21      0.13      2566
weighted avg       0.36      0.12      0.11      2566

