# Single Modality AutoEncoder ( V + A + T )

## Imports 

In [1]:
import h5py
import torch
from torch.utils.data import Dataset

import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from collections import defaultdict
import json
import torch.optim as optim

# Dataset

In [2]:
class CombinedDataset(Dataset):
    def __init__(self, file_path, bert_feature_size='bert_text_features_128', split='train', dtype=torch.float32):
        self.file_path = file_path
        self.bert_feature_size = bert_feature_size
        self.split = split
        self.dtype = dtype  # Store the desired data type
        self.data_keys = []
        with h5py.File(self.file_path, 'r') as file:
            # Iterate through groups in HDF5 file and store keys for the specified split
            for key in file.keys():
                if file[key].attrs['split'] == self.split:
                    self.data_keys.append(key)

    def __len__(self):
        return len(self.data_keys)

    def __getitem__(self, idx):
        with h5py.File(self.file_path, 'r') as file:
            # Access the group corresponding to the index
            group_key = self.data_keys[idx]
            group = file[group_key]

            # Read datasets from the group and convert them to the specified dtype
            label = group.attrs['label']
            text = group.attrs['text']
            audio_features = torch.from_numpy(group['audio_features_averaged'][()]).type(self.dtype)
            facial_features = torch.from_numpy(group['averaged_facial_features'][()]).type(self.dtype)
            bert_features = torch.from_numpy(group[self.bert_feature_size][()]).type(self.dtype)

            # Convert label to a numeric format, if necessary
            label_to_index = {'Positive': 2, 'Neutral': 1, 'Negative': 0}
            label_index = label_to_index[label]

            # Create a dictionary with the data
            sample = {
                'label': label_index,
                'text': text,
                'audio_features': audio_features,
                'facial_features': facial_features,
                'bert_features': bert_features
            }

            return sample

In [3]:
# Paths to the HDF5 file for each BERT feature size version
file_path = './combined_features.h5'

# Create a CombinedDataset for each BERT feature size and split
datasets = {}
bert_feature_sizes = ['bert_text_features_128', 'bert_text_features_256', 'bert_text_features_512']
splits = ['train', 'validate', 'test']

for feature_size in bert_feature_sizes:
    datasets[feature_size] = {}
    for split in splits:
        dataset_key = f"{split}_{feature_size}"
        datasets[feature_size][split] = CombinedDataset(file_path, bert_feature_size=feature_size, split=split)

### Common Trainer Class 

In [4]:
class ModelTrainer:
    def __init__(self, model, train_dataset, val_dataset, model_name, epochs, save_interval, lr=1e-3, device='cuda'):
        self.model = model.to(device)
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.model_name = model_name
        self.start_epoch = 0
        self.epochs = epochs
        self.save_interval = save_interval
        self.lr = lr
        self.device = device
        self.history = defaultdict(list)
        self.checkpoint_dir = f'modelCheckPoints/{self.model_name}'
        os.makedirs(self.checkpoint_dir, exist_ok=True)
        self.optimizer = None  # Initialized later in initialize_optimizer()
        
    def save_checkpoint(self, epoch):
        # Save model parameters along with the epoch number
        state = {'epoch': epoch, 'state_dict': self.model.state_dict()}
        torch.save(state, f'{self.checkpoint_dir}/{epoch}.pt')

    def load_checkpoint(self):
        # Load the latest model checkpoint
        checkpoints = [ckpt for ckpt in os.listdir(self.checkpoint_dir) if ckpt.endswith('.pt')]
        if checkpoints:
            latest_checkpoint = max(checkpoints, key=lambda x: int(x.split('.')[0]))
            checkpoint = torch.load(f'{self.checkpoint_dir}/{latest_checkpoint}', map_location=self.device)
            self.model.load_state_dict(checkpoint['state_dict'])
            self.start_epoch = checkpoint['epoch'] + 1  # Continue from next epoch
            print(f"Loaded checkpoint: {latest_checkpoint} at epoch {checkpoint['epoch']}")
        else:
            self.start_epoch = 0  # Start from scratch
            print("No checkpoints found, starting from scratch.")

    def save_history(self):
        # Save history of performance
        with open(f'{self.checkpoint_dir}/history.json', 'w') as f:
            json.dump(self.history, f)
            
    def initialize_optimizer(self):
        # Perform a dummy forward pass to initialize model parameters
        sample_batch = next(iter(DataLoader(self.train_dataset, batch_size=1, shuffle=True)))
        facial_features = sample_batch['facial_features'].to(self.device)
        audio_features = sample_batch['audio_features'].to(self.device)
        bert_features = sample_batch['bert_features'].to(self.device)

        # Assuming the model's forward method expects bert_features and audio_features 
        _ = self.model(audio_features, facial_features, bert_features)

        # Initialize the optimizer
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

    def train_one_epoch(self, dataloader, criterion):
        self.model.train()
        total_loss = 0
        correct_predictions = 0

        for batch in dataloader:
            facial_features = batch['facial_features'].to(self.device)
            audio_features = batch['audio_features'].to(self.device)
            bert_features = batch['bert_features'].to(self.device)
            labels = batch['label'].to(self.device)

            self.optimizer.zero_grad()
            outputs = self.model(audio_features, facial_features, bert_features)

            loss = criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()

        avg_loss = total_loss / len(dataloader.dataset)
        accuracy = correct_predictions / len(dataloader.dataset)
        return avg_loss, accuracy

    def validate(self, dataloader, criterion):
        self.model.eval()
        total_loss = 0
        correct_predictions = 0

        with torch.no_grad():
            for batch in dataloader:
                facial_features = batch['facial_features'].to(self.device)
                audio_features = batch['audio_features'].to(self.device)
                bert_features = batch['bert_features'].to(self.device)
                labels = batch['label'].to(self.device)

                outputs = self.model(audio_features, facial_features, bert_features)
                loss = criterion(outputs, labels)
                total_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == labels).sum().item()

        avg_loss = total_loss / len(dataloader.dataset)
        accuracy = correct_predictions / len(dataloader.dataset)
        return avg_loss, accuracy

    def train(self, criterion):
        self.initialize_optimizer()
        self.load_checkpoint()

        train_dataloader = DataLoader(self.train_dataset, batch_size=128, shuffle=True)
        val_dataloader = DataLoader(self.val_dataset, batch_size=128, shuffle=False)

        try:
            for epoch in range(self.start_epoch, self.epochs):
                train_loss, train_acc = self.train_one_epoch(train_dataloader, criterion)
                val_loss, val_acc = self.validate(val_dataloader, criterion)

                self.history['train_loss'].append(train_loss)
                self.history['train_acc'].append(train_acc)
                self.history['val_loss'].append(val_loss)
                self.history['val_acc'].append(val_acc)

                print(f"Epoch {epoch+1}/{self.epochs}, "
                      f"Train Loss: {train_loss:.4f}, "
                      f"Train Accuracy: {train_acc:.4f}, "
                      f"Val Loss: {val_loss:.4f}, "
                      f"Val Accuracy: {val_acc:.4f}")

                if (epoch + 1) % self.save_interval == 0:
                    self.save_checkpoint(epoch + 1)

                self.save_history()

        except KeyboardInterrupt:
            print("\nTraining interrupted by user. Saving last model state...")
            self.save_checkpoint(epoch + 1)
            self.save_history()

# DynamicEncoder (T + V + A all modalities use the same)

In [5]:
class DynamicEncoder(nn.Module):
    def __init__(self, encoded_size=128, dropout_rate=0.5):
        super(DynamicEncoder, self).__init__()
        self.encoded_size = encoded_size
        self.dropout_rate = dropout_rate
        self.encoder = None

    def forward(self, x):
        if self.encoder is None:
            input_size = x.size(1)
            self.encoder = nn.Sequential(
                nn.Linear(input_size, input_size // 2),  # Example of dynamic layer sizing
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(input_size // 2, self.encoded_size),
                nn.ReLU()
            ).to(x.device)
        return self.encoder(x)

In [6]:
class CombinedClassifier(nn.Module):
    def __init__(self, num_classes, encoded_audio_size=128, encoded_video_size=128, encoded_text_size=128, dropout_rate=0.5):
        super(CombinedClassifier, self).__init__()
        # Initialize encoders for each modality with their respective encoded sizes
        self.audio_encoder = DynamicEncoder(encoded_size=encoded_audio_size, dropout_rate=dropout_rate)
        self.video_encoder = DynamicEncoder(encoded_size=encoded_video_size, dropout_rate=dropout_rate)
        self.text_encoder = DynamicEncoder(encoded_size=encoded_text_size, dropout_rate=dropout_rate)
        
        self.dropout_rate = dropout_rate
        self.num_classes = num_classes
        self.classifier = None  # This will be dynamically created

    def forward(self, audio_features, video_features, text_features):
        encoded_audio = self.audio_encoder(audio_features)
        encoded_video = self.video_encoder(video_features)
        encoded_text = self.text_encoder(text_features)

        # Initialize the classifier dynamically based on the combined feature size
        if self.classifier is None:
            combined_feature_size = encoded_audio.size(1) + encoded_video.size(1) + encoded_text.size(1)
            self.classifier = nn.Sequential(
                nn.Linear(combined_feature_size, 512),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(512, self.num_classes)
            ).to(audio_features.device)  # Assume all features are on the same device

        combined_features = torch.cat((encoded_audio, encoded_video, encoded_text), dim=1)
        return self.classifier(combined_features)

In [8]:
# Initialize datasets for 'bert_text_features_512'
train_dataset_512 = datasets['bert_text_features_512']['train']
validate_dataset_512 = datasets['bert_text_features_512']['validate']
# test_dataset_512 = datasets['bert_text_features_512']['test']  # for evaluation later

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "CPU")

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Initialize the model
num_classes = 3  # The number of classes in your classification task
CombinedClassifier_A1_V1_T1 = CombinedClassifier(num_classes, encoded_audio_size=128, encoded_video_size=128, encoded_text_size=128, dropout_rate=0.5).to(device)
modelName = 'CombinedClassifier_A1_V1_T1'

# Define the trainer
trainer = ModelTrainer(CombinedClassifier_A1_V1_T1, train_dataset_512, validate_dataset_512, modelName, epochs=25, save_interval=5, device=device)

# Start training
trainer.train(criterion)

No checkpoints found, starting from scratch.
Epoch 1/25, Train Loss: 0.0073, Train Accuracy: 0.5437, Val Loss: 0.0063, Val Accuracy: 0.6265
Epoch 2/25, Train Loss: 0.0064, Train Accuracy: 0.6277, Val Loss: 0.0062, Val Accuracy: 0.6315
Epoch 3/25, Train Loss: 0.0062, Train Accuracy: 0.6433, Val Loss: 0.0061, Val Accuracy: 0.6458
Epoch 4/25, Train Loss: 0.0061, Train Accuracy: 0.6524, Val Loss: 0.0061, Val Accuracy: 0.6422
Epoch 5/25, Train Loss: 0.0060, Train Accuracy: 0.6601, Val Loss: 0.0061, Val Accuracy: 0.6500
Epoch 6/25, Train Loss: 0.0059, Train Accuracy: 0.6702, Val Loss: 0.0060, Val Accuracy: 0.6429
Epoch 7/25, Train Loss: 0.0058, Train Accuracy: 0.6781, Val Loss: 0.0061, Val Accuracy: 0.6522
Epoch 8/25, Train Loss: 0.0058, Train Accuracy: 0.6781, Val Loss: 0.0059, Val Accuracy: 0.6436
Epoch 9/25, Train Loss: 0.0056, Train Accuracy: 0.6874, Val Loss: 0.0061, Val Accuracy: 0.6543
Epoch 10/25, Train Loss: 0.0055, Train Accuracy: 0.6919, Val Loss: 0.0062, Val Accuracy: 0.6322
Epoc

In [9]:
# Initialize datasets for 'bert_text_features_512'
train_dataset_512 = datasets['bert_text_features_512']['train']
validate_dataset_512 = datasets['bert_text_features_512']['validate']
# test_dataset_512 = datasets['bert_text_features_512']['test']  # for evaluation later

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "CPU")

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Initialize the model
num_classes = 3  # The number of classes in your classification task
CombinedClassifier_A1_V1_T1_r1 = CombinedClassifier(num_classes, encoded_audio_size=128, encoded_video_size=128, encoded_text_size=128, dropout_rate=0.7).to(device)
modelName = 'CombinedClassifier_A1_V1_T1_r1'

# Define the trainer
trainer1 = ModelTrainer(CombinedClassifier_A1_V1_T1_r1, train_dataset_512, validate_dataset_512, modelName, epochs=25, save_interval=5, device=device)

# Start training
trainer1.train(criterion)

No checkpoints found, starting from scratch.
Epoch 1/25, Train Loss: 0.0077, Train Accuracy: 0.5001, Val Loss: 0.0065, Val Accuracy: 0.6123
Epoch 2/25, Train Loss: 0.0068, Train Accuracy: 0.6018, Val Loss: 0.0064, Val Accuracy: 0.6151
Epoch 3/25, Train Loss: 0.0065, Train Accuracy: 0.6238, Val Loss: 0.0061, Val Accuracy: 0.6450
Epoch 4/25, Train Loss: 0.0064, Train Accuracy: 0.6273, Val Loss: 0.0062, Val Accuracy: 0.6443
Epoch 5/25, Train Loss: 0.0064, Train Accuracy: 0.6335, Val Loss: 0.0060, Val Accuracy: 0.6344
Epoch 6/25, Train Loss: 0.0063, Train Accuracy: 0.6417, Val Loss: 0.0060, Val Accuracy: 0.6515
Epoch 7/25, Train Loss: 0.0062, Train Accuracy: 0.6475, Val Loss: 0.0060, Val Accuracy: 0.6479
Epoch 8/25, Train Loss: 0.0062, Train Accuracy: 0.6462, Val Loss: 0.0061, Val Accuracy: 0.6450
Epoch 9/25, Train Loss: 0.0061, Train Accuracy: 0.6546, Val Loss: 0.0060, Val Accuracy: 0.6550
Epoch 10/25, Train Loss: 0.0061, Train Accuracy: 0.6592, Val Loss: 0.0060, Val Accuracy: 0.6450
Epoc

In [10]:
# Initialize datasets for 'bert_text_features_512'
train_dataset_512 = datasets['bert_text_features_512']['train']
validate_dataset_512 = datasets['bert_text_features_512']['validate']
# test_dataset_512 = datasets['bert_text_features_512']['test']  # for evaluation later

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "CPU")

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Initialize the model
num_classes = 3  # The number of classes in your classification task
CombinedClassifier_A2_V2_T1_r0 = CombinedClassifier(num_classes, encoded_audio_size=64, encoded_video_size=64, encoded_text_size=128, dropout_rate=0.4).to(device)
modelName = 'CombinedClassifier_A2_V2_T1_r0'

# Define the trainer
trainer2 = ModelTrainer(CombinedClassifier_A2_V2_T1_r0, train_dataset_512, validate_dataset_512, modelName, epochs=30, save_interval=5, device=device)

# Start training
trainer2.train(criterion)

No checkpoints found, starting from scratch.
Epoch 1/30, Train Loss: 0.0072, Train Accuracy: 0.5531, Val Loss: 0.0063, Val Accuracy: 0.6408
Epoch 2/30, Train Loss: 0.0063, Train Accuracy: 0.6378, Val Loss: 0.0063, Val Accuracy: 0.6272
Epoch 3/30, Train Loss: 0.0062, Train Accuracy: 0.6427, Val Loss: 0.0061, Val Accuracy: 0.6486
Epoch 4/30, Train Loss: 0.0060, Train Accuracy: 0.6566, Val Loss: 0.0061, Val Accuracy: 0.6344
Epoch 5/30, Train Loss: 0.0059, Train Accuracy: 0.6663, Val Loss: 0.0062, Val Accuracy: 0.6393
Epoch 6/30, Train Loss: 0.0058, Train Accuracy: 0.6745, Val Loss: 0.0059, Val Accuracy: 0.6536
Epoch 7/30, Train Loss: 0.0056, Train Accuracy: 0.6844, Val Loss: 0.0060, Val Accuracy: 0.6600
Epoch 8/30, Train Loss: 0.0055, Train Accuracy: 0.6944, Val Loss: 0.0060, Val Accuracy: 0.6600
Epoch 9/30, Train Loss: 0.0054, Train Accuracy: 0.7030, Val Loss: 0.0058, Val Accuracy: 0.6636
Epoch 10/30, Train Loss: 0.0053, Train Accuracy: 0.7138, Val Loss: 0.0060, Val Accuracy: 0.6372
Epoc