# Single Modality AutoEncoder ( T + V )

## Imports 

In [None]:
!pip install h5py

In [2]:
import h5py
import torch
from torch.utils.data import Dataset

import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from collections import defaultdict
import json
import torch.optim as optim

# Dataset

In [3]:
class CombinedDataset(Dataset):
    def __init__(self, file_path, bert_feature_size='bert_text_features_128', split='train', dtype=torch.float32):
        self.file_path = file_path
        self.bert_feature_size = bert_feature_size
        self.split = split
        self.dtype = dtype  
        self.data_keys = []
        with h5py.File(self.file_path, 'r') as file:
            for key in file.keys():
                if file[key].attrs['split'] == self.split:
                    self.data_keys.append(key)

    def __len__(self):
        return len(self.data_keys)

    def __getitem__(self, idx):
        with h5py.File(self.file_path, 'r') as file:

            group_key = self.data_keys[idx]
            group = file[group_key]

            label = group.attrs['label']
            text = group.attrs['text']
            audio_features = torch.from_numpy(group['audio_features_averaged'][()]).type(self.dtype)
            facial_features = torch.from_numpy(group['averaged_facial_features'][()]).type(self.dtype)
            bert_features = torch.from_numpy(group[self.bert_feature_size][()]).type(self.dtype)

            # Convert label to a numeric format, if necessary
            label_to_index = {'Positive': 2, 'Neutral': 1, 'Negative': 0}
            label_index = label_to_index[label]

            # Create a dictionary with the data
            sample = {
                'label': label_index,
                'text': text,
                'audio_features': audio_features,
                'facial_features': facial_features,
                'bert_features': bert_features
            }

            return sample

In [4]:
file_path = './combined_features.h5'

datasets = {}
bert_feature_sizes = ['bert_text_features_128', 'bert_text_features_256', 'bert_text_features_512']
splits = ['train', 'validate', 'test']

for feature_size in bert_feature_sizes:
    datasets[feature_size] = {}
    for split in splits:
        dataset_key = f"{split}_{feature_size}"
        datasets[feature_size][split] = CombinedDataset(file_path, bert_feature_size=feature_size, split=split)

### Common Trainer Class 

In [14]:
class ModelTrainer:
    def __init__(self, model, train_dataset, val_dataset, model_name, epochs, save_interval, lr=1e-3, device='cuda'):
        self.model = model.to(device)
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.model_name = model_name
        self.start_epoch = 0 
        self.epochs = epochs
        self.save_interval = save_interval
        self.lr = lr
        self.device = device
        self.history = defaultdict(list)
        self.checkpoint_dir = f'modelCheckPoints/{self.model_name}'
        os.makedirs(self.checkpoint_dir, exist_ok=True)
        self.optimizer = None 

    def initialize_optimizer(self):
        sample_batch = next(iter(DataLoader(self.train_dataset, batch_size=1, shuffle=True)))
        
        bert_features = sample_batch['bert_features'].to(self.device)
        facial_features = sample_batch['facial_features'].to(self.device)
  
        _ = self.model(bert_features,facial_features)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

        
    def save_checkpoint(self, epoch):
        state = {'epoch': epoch, 'state_dict': self.model.state_dict()}
        torch.save(state, f'{self.checkpoint_dir}/{epoch}.pt')


    def load_checkpoint(self):
        checkpoints = [ckpt for ckpt in os.listdir(self.checkpoint_dir) if ckpt.endswith('.pt')]
        if checkpoints:
            latest_checkpoint = max(checkpoints, key=lambda x: int(x.split('.')[0]))
            checkpoint = torch.load(f'{self.checkpoint_dir}/{latest_checkpoint}', map_location=self.device)
            self.model.load_state_dict(checkpoint['state_dict'])
            self.start_epoch = checkpoint['epoch'] + 1  
            print(f"Loaded checkpoint: {latest_checkpoint} at epoch {checkpoint['epoch']}")
        else:
            self.start_epoch = 0  
            print("No checkpoints found, starting from scratch.")

    def save_history(self):
        with open(f'{self.checkpoint_dir}/history.json', 'w') as f:
            json.dump(self.history, f)
            
    def train_one_epoch(self, dataloader, criterion):
        self.model.train()
        total_loss = 0
        correct_predictions = 0
    
        for batch in dataloader:
            bert_features = batch['bert_features'].to(self.device)
            facial_features = batch['facial_features'].to(self.device)
            labels = batch['label'].to(self.device)
            self.optimizer.zero_grad()
            outputs = self.model(bert_features, facial_features)
            
            loss = criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()
            
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
    
        avg_loss = total_loss / len(dataloader.dataset)
        accuracy = correct_predictions / len(dataloader.dataset)
        return avg_loss, accuracy


    
    def validate(self, dataloader, criterion):
        self.model.eval()
        total_loss = 0
        correct_predictions = 0
        with torch.no_grad():
            for batch in dataloader:
                bert_features = batch['bert_features'].to(self.device)
                facial_features = batch['facial_features'].to(self.device)
                labels = batch['label'].to(self.device)
                outputs = self.model(bert_features, facial_features)
                loss = criterion(outputs, labels)
                total_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == labels).sum().item()
    
        avg_loss = total_loss / len(dataloader.dataset)
        accuracy = correct_predictions / len(dataloader.dataset)
        return avg_loss, accuracy
    
    def train(self, criterion):
        self.initialize_optimizer()
        self.load_checkpoint()
    
        train_dataloader = DataLoader(self.train_dataset, batch_size=32, shuffle=True)
        val_dataloader = DataLoader(self.val_dataset, batch_size=32, shuffle=False)
    
        try:
            for epoch in range(self.start_epoch, self.epochs):
                train_loss, train_acc = self.train_one_epoch(train_dataloader, criterion)
                val_loss, val_acc = self.validate(val_dataloader, criterion)
    
                self.history['train_loss'].append(train_loss)
                self.history['train_acc'].append(train_acc)
                self.history['val_loss'].append(val_loss)
                self.history['val_acc'].append(val_acc)
    
                print(f"Epoch {epoch+1}/{self.epochs}, "
                      f"Train Loss: {train_loss:.4f}, "
                      f"Train Accuracy: {train_acc:.4f}, "
                      f"Val Loss: {val_loss:.4f}, "
                      f"Val Accuracy: {val_acc:.4f}")
    
                if (epoch + 1) % self.save_interval == 0:
                    self.save_checkpoint(epoch + 1)
    
                self.save_history()
    
        except KeyboardInterrupt:
            print("\nTraining interrupted by user. Saving last model state...")
            self.save_checkpoint(epoch + 1)
            self.save_history()

### Text AutoEncoder Model 1 

In [5]:
class TextAutoEncoder(nn.Module):
    def __init__(self):
        super(TextAutoEncoder, self).__init__()
        self.encoder = None
        self.decoder = None

    def forward(self, x):
        input_size = x.size(1)
        if self.encoder is None or self.decoder is None:
            self.encoder = nn.Sequential(
                nn.Linear(input_size, 128),
                nn.ReLU(),
                nn.Linear(128, 64)
            ).to(x.device) 
            self.decoder = nn.Sequential(
                nn.Linear(64, 128),
                nn.ReLU(),
                nn.Linear(128, input_size),
                nn.Sigmoid()
            ).to(x.device) 
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

### Video AutoEncoder Model 1 

In [114]:
class VideoAutoEncoder(nn.Module):
    def __init__(self):
        super(VideoAutoEncoder, self).__init__()
        self.encoder = None
        self.decoder = None

    def forward(self, x):
        input_size = x.size(1) 
        if self.encoder is None or self.decoder is None:
            self.encoder = nn.Sequential(
                nn.Linear(input_size, 256),
                nn.ReLU(),
                nn.Linear(256, 128)
            ).to(x.device)

            self.decoder = nn.Sequential(
                nn.Linear(128, 256),
                nn.ReLU(),
                nn.Linear(256, input_size),
                nn.Sigmoid()
            ).to(x.device)

        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [121]:
class CombinedClassifier(nn.Module):
    def __init__(self, num_classes):
        super(CombinedClassifier, self).__init__()
        self.text_autoencoder = TextAutoEncoder()
        self.video_autoencoder = VideoAutoEncoder()
        self.classifier = None
        self.num_classes = num_classes

    def forward(self,bert_features,facial_features):        
        text_encoded, _ = self.text_autoencoder(bert_features)
        video_encoded, _ = self.video_autoencoder(facial_features)
        
        combined_feature_size = text_encoded.size(1) + video_encoded.size(1)
        if self.classifier is None:
            self.classifier = nn.Sequential(
                nn.Linear(combined_feature_size, 256),
                nn.ReLU(),
                nn.Linear(256, self.num_classes)
            ).to(bert_features.device) 
        combined_features = torch.cat((text_encoded, video_encoded), dim=1)
    
        class_logits = self.classifier(combined_features)
        return class_logits

### Training with bert 512 

In [126]:
train_dataset_512 = datasets['bert_text_features_512']['train']
validate_dataset_512 = datasets['bert_text_features_512']['validate']

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_classes = 3
combinedClassifier_BERT512_TE1_VE1 = CombinedClassifier(num_classes=num_classes).to(device)
modelName = 'CombinedClassifier_BERT512_TE1_VE1'
criterion = nn.CrossEntropyLoss()
trainer = ModelTrainer(combinedClassifier_BERT512_TE1_VE1, train_dataset_512, validate_dataset_512, modelName, epochs=25, save_interval=5)
trainer.train(criterion)

No checkpoints found, starting from scratch.
Epoch 1/25, Train Loss: 0.0270, Train Accuracy: 0.5960, Val Loss: 0.0249, Val Accuracy: 0.6415
Epoch 2/25, Train Loss: 0.0246, Train Accuracy: 0.6422, Val Loss: 0.0245, Val Accuracy: 0.6450
Epoch 3/25, Train Loss: 0.0238, Train Accuracy: 0.6608, Val Loss: 0.0246, Val Accuracy: 0.6372
Epoch 4/25, Train Loss: 0.0231, Train Accuracy: 0.6700, Val Loss: 0.0251, Val Accuracy: 0.6315
Epoch 5/25, Train Loss: 0.0223, Train Accuracy: 0.6874, Val Loss: 0.0247, Val Accuracy: 0.6336
Epoch 6/25, Train Loss: 0.0215, Train Accuracy: 0.6995, Val Loss: 0.0252, Val Accuracy: 0.6408
Epoch 7/25, Train Loss: 0.0206, Train Accuracy: 0.7137, Val Loss: 0.0257, Val Accuracy: 0.6322
Epoch 8/25, Train Loss: 0.0197, Train Accuracy: 0.7319, Val Loss: 0.0260, Val Accuracy: 0.6222
Epoch 9/25, Train Loss: 0.0185, Train Accuracy: 0.7461, Val Loss: 0.0276, Val Accuracy: 0.6208
Epoch 10/25, Train Loss: 0.0176, Train Accuracy: 0.7615, Val Loss: 0.0280, Val Accuracy: 0.6101
Epoc

# Model 2 (More robust classifier, but including regularsiation)

In [128]:
class CombinedClassifier2(nn.Module):
    def __init__(self, num_classes, dropout_rate=0.5):
        super(CombinedClassifier2, self).__init__()
        self.text_autoencoder = TextAutoEncoder()
        self.video_autoencoder = VideoAutoEncoder()
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate
        self.classifier = None

    def forward(self, bert_features, facial_features):        
        text_encoded, _ = self.text_autoencoder(bert_features)
        video_encoded, _ = self.video_autoencoder(facial_features)
        
        combined_feature_size = text_encoded.size(1) + video_encoded.size(1)
        if self.classifier is None:
            self.classifier = nn.Sequential(
                nn.Linear(combined_feature_size, 512),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(512, 256),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(256, self.num_classes)
            ).to(bert_features.device) 
        combined_features = torch.cat((text_encoded, video_encoded), dim=1)
        
        class_logits = self.classifier(combined_features)
        return class_logits

In [130]:
num_classes = 3  
combinedClassifier2_BERT512_TE1_VE1 = CombinedClassifier2(num_classes=num_classes,dropout_rate=0.8).to(device)
modelName = 'combinedClassifier2_BERT512_TE1_VE1IncreasedRegularisation'
criterion = nn.CrossEntropyLoss()
trainer2 = ModelTrainer(combinedClassifier2_BERT512_TE1_VE1, train_dataset_512, validate_dataset_512, modelName, epochs=25, save_interval=5)
trainer2.train(criterion)

No checkpoints found, starting from scratch.
Epoch 1/25, Train Loss: 0.0298, Train Accuracy: 0.5233, Val Loss: 0.0253, Val Accuracy: 0.6279
Epoch 2/25, Train Loss: 0.0264, Train Accuracy: 0.6225, Val Loss: 0.0249, Val Accuracy: 0.6237
Epoch 3/25, Train Loss: 0.0256, Train Accuracy: 0.6380, Val Loss: 0.0246, Val Accuracy: 0.6458
Epoch 4/25, Train Loss: 0.0251, Train Accuracy: 0.6382, Val Loss: 0.0247, Val Accuracy: 0.6365
Epoch 5/25, Train Loss: 0.0244, Train Accuracy: 0.6537, Val Loss: 0.0257, Val Accuracy: 0.6101
Epoch 6/25, Train Loss: 0.0238, Train Accuracy: 0.6638, Val Loss: 0.0249, Val Accuracy: 0.6208
Epoch 7/25, Train Loss: 0.0235, Train Accuracy: 0.6717, Val Loss: 0.0250, Val Accuracy: 0.6329
Epoch 8/25, Train Loss: 0.0227, Train Accuracy: 0.6786, Val Loss: 0.0246, Val Accuracy: 0.6265
Epoch 9/25, Train Loss: 0.0223, Train Accuracy: 0.6897, Val Loss: 0.0259, Val Accuracy: 0.6165
Epoch 10/25, Train Loss: 0.0217, Train Accuracy: 0.7035, Val Loss: 0.0254, Val Accuracy: 0.6087

Tra

In [None]:
# Model 3 (modify the videoAutoEncoder to add a hidden layer)

In [6]:
class VideoAutoEncoder2(nn.Module):
    def __init__(self, dropout_rate=0.5):
        super(VideoAutoEncoder2, self).__init__()
        self.dropout_rate = dropout_rate
        self.encoder = None
        self.decoder = None

    def forward(self, x):
        input_size = x.size(1)  # Dynamically determine the input size

        if self.encoder is None or self.decoder is None:
            self.encoder = nn.Sequential(
                nn.Linear(input_size, 512),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(512, 256),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(256, 128)
            ).to(x.device)

            self.decoder = nn.Sequential(
                nn.Linear(128, 256),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(256, 512),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(512, input_size),
                nn.Sigmoid()
            ).to(x.device)

        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [7]:
class CombinedClassifier3(nn.Module):
    def __init__(self, num_classes, dropout_rate=0.5):
        super(CombinedClassifier3, self).__init__()
        self.text_autoencoder = TextAutoEncoder()
        self.video_autoencoder = VideoAutoEncoder2()
        self.num_classes = num_classes
        self.dropout_rate = dropout_rate
        self.classifier = None

    def forward(self, bert_features, facial_features):        
        text_encoded, _ = self.text_autoencoder(bert_features)
        video_encoded, _ = self.video_autoencoder(facial_features)
        
        combined_feature_size = text_encoded.size(1) + video_encoded.size(1)
        if self.classifier is None:
            self.classifier = nn.Sequential(
                nn.Linear(combined_feature_size, 512),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(512, 256),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(256, self.num_classes)
            ).to(bert_features.device)  

        combined_features = torch.cat((text_encoded, video_encoded), dim=1)
        
        class_logits = self.classifier(combined_features)
        return class_logits

In [13]:
train_dataset_512 = datasets['bert_text_features_512']['train']
validate_dataset_512 = datasets['bert_text_features_512']['validate']

device = torch.device("cuda" if torch.cuda.is_available() else "CPU")

num_classes = 3 
combinedClassifier3_BERT512_TE1_VE2 = CombinedClassifier3(num_classes=num_classes,dropout_rate=0.5).to(device)
modelName = 'combinedClassifier3_BERT512_TE1_VE2'

criterion = nn.CrossEntropyLoss()

trainer3 = ModelTrainer(combinedClassifier3_BERT512_TE1_VE2, train_dataset_512, validate_dataset_512, modelName, epochs=25, save_interval=3)

trainer3.train(criterion)

Loaded checkpoint: 6.pt
Epoch 1/25, Train Loss: 0.0224, Train Accuracy: 0.6879, Val Loss: 0.0250, Val Accuracy: 0.6436
Epoch 2/25, Train Loss: 0.0214, Train Accuracy: 0.7033, Val Loss: 0.0257, Val Accuracy: 0.6180
Epoch 3/25, Train Loss: 0.0208, Train Accuracy: 0.7110, Val Loss: 0.0261, Val Accuracy: 0.6230
Epoch 4/25, Train Loss: 0.0197, Train Accuracy: 0.7297, Val Loss: 0.0277, Val Accuracy: 0.6172
Epoch 5/25, Train Loss: 0.0191, Train Accuracy: 0.7389, Val Loss: 0.0270, Val Accuracy: 0.6215

Training interrupted by user. Saving last model state...


# Model 4: Directly using Bert Features , plus a more stronger video Encoder

In [15]:
class VideoAutoEncoder3(nn.Module):
    def __init__(self, dropout_rate=0.5):
        super(VideoAutoEncoder2, self).__init__()
        self.dropout_rate = dropout_rate
        self.encoder = None
        self.decoder = None

    def forward(self, x):
        input_size = x.size(1) 

        if self.encoder is None or self.decoder is None:
            self.encoder = nn.Sequential(
                nn.Linear(input_size, 256),  
                nn.LeakyReLU(0.1),  
                nn.Dropout(self.dropout_rate),
                nn.Linear(256, 128), 
                nn.LeakyReLU(0.1),
                nn.Dropout(self.dropout_rate)
            ).to(x.device)

            self.decoder = nn.Sequential(
                nn.Linear(128, 256),
                nn.LeakyReLU(0.1),
                nn.Dropout(self.dropout_rate),
                nn.Linear(256, input_size),
                nn.Sigmoid() 
            ).to(x.device)

        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded


In [18]:
class CombinedClassifier4(nn.Module):
    def __init__(self, num_classes, dropout_rate=0.5):
        super(CombinedClassifier4, self).__init__()
        self.video_autoencoder = VideoAutoEncoder3(dropout_rate=dropout_rate)
        self.num_classes = num_classes
        self.classifier = None
        self.dropout_rate = dropout_rate

    def forward(self, bert_features, facial_features):        
        video_encoded, _ = self.video_autoencoder(facial_features)
        
        combined_feature_size = bert_features.size(1) + video_encoded.size(1)
        if self.classifier is None:
            self.classifier = nn.Sequential(
                nn.Linear(combined_feature_size, 512),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(512, 256),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(256, self.num_classes)
            ).to(bert_features.device)
    
        combined_features = torch.cat((bert_features, video_encoded), dim=1)

        class_logits = self.classifier(combined_features)
        return class_logits

In [20]:
num_classes = 3 
cc4_TE0_VE3_r_04 = CombinedClassifier4(num_classes=num_classes,dropout_rate=0.4).to(device)
modelName = 'cc4_TE0_VE3_r_04'

criterion = nn.CrossEntropyLoss()

trainer5 = ModelTrainer(cc4_TE0_VE3_r_04, train_dataset_512, validate_dataset_512, modelName, epochs=25, save_interval=3)

trainer5.train(criterion)

No checkpoints found, starting from scratch.
Epoch 1/25, Train Loss: 0.0278, Train Accuracy: 0.5744, Val Loss: 0.0253, Val Accuracy: 0.6080
Epoch 2/25, Train Loss: 0.0255, Train Accuracy: 0.6295, Val Loss: 0.0247, Val Accuracy: 0.6386
Epoch 3/25, Train Loss: 0.0249, Train Accuracy: 0.6387, Val Loss: 0.0242, Val Accuracy: 0.6415
Epoch 4/25, Train Loss: 0.0244, Train Accuracy: 0.6431, Val Loss: 0.0246, Val Accuracy: 0.6436
Epoch 5/25, Train Loss: 0.0241, Train Accuracy: 0.6476, Val Loss: 0.0251, Val Accuracy: 0.6329
Epoch 6/25, Train Loss: 0.0239, Train Accuracy: 0.6521, Val Loss: 0.0241, Val Accuracy: 0.6422
Epoch 7/25, Train Loss: 0.0235, Train Accuracy: 0.6664, Val Loss: 0.0239, Val Accuracy: 0.6493
Epoch 8/25, Train Loss: 0.0232, Train Accuracy: 0.6672, Val Loss: 0.0248, Val Accuracy: 0.6336
Epoch 9/25, Train Loss: 0.0231, Train Accuracy: 0.6768, Val Loss: 0.0238, Val Accuracy: 0.6600
Epoch 10/25, Train Loss: 0.0224, Train Accuracy: 0.6827, Val Loss: 0.0239, Val Accuracy: 0.6536
Epoc

In [None]:
# Model 5: Directly using Bert Features , plus a more stronger video Encoder (2)

In [54]:
class VideoAutoEncoder4(nn.Module):
    def __init__(self, dropout_rate=0.5):
        super(VideoAutoEncoder4, self).__init__()
        self.dropout_rate = dropout_rate
        # Define the size of the intermediate layers
        self.mid_size = 256
        self.enc_out_size = 128
        self.encoder_first = None
        self.path1 = None
        self.path2 = None
        self.decoder = None

    def forward(self, x):
        if self.encoder_first is None:
            input_size = x.size(1)
            self.encoder_first = nn.Linear(input_size, self.mid_size).to(x.device)
            self.path1 = nn.Sequential(
                nn.LeakyReLU(0.1),
                nn.Dropout(self.dropout_rate),
                nn.Linear(self.mid_size, self.mid_size) 
            ).to(x.device)
            self.path2 = nn.Sequential(
                nn.ELU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(self.mid_size, self.mid_size) 
            ).to(x.device)
            self.decoder = nn.Sequential(
                nn.Linear(self.mid_size, input_size), 
                nn.Sigmoid()
            ).to(x.device)

        x_enc = self.encoder_first(x)
        path1_output = self.path1(x_enc)
        path2_output = self.path2(x_enc)
        encoded = path1_output + path2_output + x_enc

        decoded = self.decoder(encoded)
        return encoded, decoded

In [55]:
class CombinedClassifier5(nn.Module):
    def __init__(self, num_classes, dropout_rate=0.5):
        super(CombinedClassifier5, self).__init__()
        self.video_autoencoder = VideoAutoEncoder4(dropout_rate=dropout_rate)
        self.num_classes = num_classes
        self.classifier = None
        self.dropout_rate = dropout_rate

    def forward(self, bert_features, facial_features):        
        video_encoded, _ = self.video_autoencoder(facial_features)
        
        combined_feature_size = bert_features.size(1) + video_encoded.size(1)
        if self.classifier is None:
            self.classifier = nn.Sequential(
                nn.Linear(combined_feature_size, 512),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(512, 256),
                nn.ReLU(),
                nn.Dropout(self.dropout_rate),
                nn.Linear(256, self.num_classes)
            ).to(bert_features.device)
        
        combined_features = torch.cat((bert_features, video_encoded), dim=1)

        class_logits = self.classifier(combined_features)
        return class_logits

In [56]:
num_classes = 3 
cc5_TE0_VE4_r_04 = CombinedClassifier5(num_classes=num_classes,dropout_rate=0.4).to(device)
modelName = 'cc5_TE0_VE4_r_04'

criterion = nn.CrossEntropyLoss()
trainer6 = ModelTrainer(cc5_TE0_VE4_r_04, train_dataset_512, validate_dataset_512, modelName, epochs=25, save_interval=3)
trainer6.train(criterion)

No checkpoints found, starting from scratch.
Epoch 1/25, Train Loss: 0.0295, Train Accuracy: 0.5359, Val Loss: 0.0271, Val Accuracy: 0.5766
Epoch 2/25, Train Loss: 0.0269, Train Accuracy: 0.6000, Val Loss: 0.0263, Val Accuracy: 0.6023
Epoch 3/25, Train Loss: 0.0260, Train Accuracy: 0.6176, Val Loss: 0.0267, Val Accuracy: 0.5895
Epoch 4/25, Train Loss: 0.0256, Train Accuracy: 0.6268, Val Loss: 0.0245, Val Accuracy: 0.6329
Epoch 5/25, Train Loss: 0.0252, Train Accuracy: 0.6286, Val Loss: 0.0247, Val Accuracy: 0.6486
Epoch 6/25, Train Loss: 0.0249, Train Accuracy: 0.6404, Val Loss: 0.0254, Val Accuracy: 0.6101
Epoch 7/25, Train Loss: 0.0244, Train Accuracy: 0.6451, Val Loss: 0.0246, Val Accuracy: 0.6386
Epoch 8/25, Train Loss: 0.0242, Train Accuracy: 0.6505, Val Loss: 0.0249, Val Accuracy: 0.6336
Epoch 9/25, Train Loss: 0.0240, Train Accuracy: 0.6548, Val Loss: 0.0257, Val Accuracy: 0.6208
Epoch 10/25, Train Loss: 0.0241, Train Accuracy: 0.6500, Val Loss: 0.0252, Val Accuracy: 0.6379
Epoc