In [2]:
import pickle

# Replace 'path_to_your_file.pkl' with the actual file path of your .pkl file
file_path = 'aligned_50.pkl'

# Unpack the pickle file
with open(file_path, 'rb') as file:
    data = pickle.load(file)

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader , WeightedRandomSampler
import os
import json
from collections import defaultdict
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from collections import Counter

In [9]:
class MultimodalDataset(Dataset):
    def __init__(self, data):
        self.audio = torch.tensor(data['audio'])
        self.vision = torch.tensor(data['vision'])
        self.text_bert = torch.tensor(data['text_bert'])
        # Map annotations to integers
        label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
        self.labels = torch.tensor([label_mapping[ann] for ann in data['annotations']])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {
            'audio': self.audio[idx],
            'vision': self.vision[idx],
            'text_bert': self.text_bert[idx],
            'label': self.labels[idx]
        }
        return sample
        
def get_class_weights(dataset):
    # Count the number of occurrences of each class
    class_counts = Counter([dataset[i]['label'].item() for i in range(len(dataset))])
    num_samples = sum(class_counts.values())
    
    # Calculate weight for each class
    weights = {cls: num_samples / count for cls, count in class_counts.items()}
    
    # Assign weight to each sample in the dataset
    sample_weights = [weights[dataset[i]['label'].item()] for i in range(len(dataset))]
    
    return sample_weights

train_dataset = MultimodalDataset(data['train'])
valid_dataset = MultimodalDataset(data['valid'])
test_dataset = MultimodalDataset(data['test'])

# Calculate weights for each sample in the training and validation datasets
train_sample_weights = get_class_weights(train_dataset)
val_sample_weights = get_class_weights(valid_dataset)

# Create WeightedRandomSampler for training and validation datasets
train_sampler = WeightedRandomSampler(weights=train_sample_weights, num_samples=len(train_sample_weights), replacement=True)
val_sampler = WeightedRandomSampler(weights=val_sample_weights, num_samples=len(val_sample_weights), replacement=True)

# Create DataLoaders with WeightedRandomSampler
train_dataloader = DataLoader(train_dataset, batch_size=256, sampler=train_sampler)
val_dataloader = DataLoader(valid_dataset, batch_size=256, sampler=val_sampler)
test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [10]:
# Assuming train_dataloader is already created and initialized
# Let's fetch one batch from the DataLoader
for batch in train_dataloader:
    audio = batch['audio']
    vision = batch['vision']
    text_bert = batch['text_bert']
    labels = batch['label']

    print(f'Audio tensor shape: {audio.shape}')
    print(f'Vision tensor shape: {vision.shape}')
    print(f'Text BERT tensor shape: {text_bert.shape}')
    print(f'Labels tensor shape: {labels.shape}')

    # Break after the first batch to just see one sample of the data shapes
    break

Audio tensor shape: torch.Size([256, 50, 74])
Vision tensor shape: torch.Size([256, 50, 35])
Text BERT tensor shape: torch.Size([256, 3, 50])
Labels tensor shape: torch.Size([256])


In [11]:
class ModelTrainer:
    def __init__(self, model, train_dataset, val_dataset, model_name, epochs, save_interval, lr=1e-3, device='cuda'):
        self.model = model.to(device)
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.model_name = model_name
        self.start_epoch = 0
        self.epochs = epochs
        self.save_interval = save_interval
        self.lr = lr
        self.device = device
        self.history = defaultdict(list)
        self.checkpoint_dir = f'modelCheckPoints/{self.model_name}'
        os.makedirs(self.checkpoint_dir, exist_ok=True)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

    def save_checkpoint(self, epoch):
        state = {'epoch': epoch, 'state_dict': self.model.state_dict()}
        torch.save(state, f'{self.checkpoint_dir}/{epoch}.pt')

    def load_checkpoint(self):
        checkpoints = [ckpt for ckpt in os.listdir(self.checkpoint_dir) if ckpt.endswith('.pt')]
        if checkpoints:
            latest_checkpoint = max(checkpoints, key=lambda x: int(x.split('.')[0]))
            checkpoint = torch.load(f'{self.checkpoint_dir}/{latest_checkpoint}', map_location=self.device)
            self.model.load_state_dict(checkpoint['state_dict'])
            self.start_epoch = checkpoint['epoch'] + 1
            print(f"Loaded checkpoint: {latest_checkpoint} at epoch {checkpoint['epoch']}")
        else:
            print("No checkpoints found, starting from scratch.")

    def save_history(self):
        with open(f'{self.checkpoint_dir}/history.json', 'w') as f:
            json.dump(self.history, f)

    def train_one_epoch(self, dataloader, criterion, max_grad_norm=1.0):
        self.model.train()
        total_loss = 0
        correct_predictions = 0
    
        for batch in dataloader:
            audio = batch['audio'].to(self.device).float()
            vision = batch['vision'].to(self.device).float()
            text_bert = batch['text_bert'].to(self.device).float()
            labels = batch['label'].to(self.device)
    
            self.optimizer.zero_grad()
            outputs = self.model(audio, vision, text_bert)
    
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
            self.optimizer.step()
    
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
    
        avg_loss = total_loss / len(dataloader.dataset)
        accuracy = correct_predictions / len(dataloader.dataset)
        return avg_loss, accuracy

    def validate(self, dataloader, criterion):
        self.model.eval()
        total_loss = 0
        correct_predictions = 0
    
        with torch.no_grad():
            for batch in dataloader:
                audio = batch['audio'].to(self.device).float()
                vision = batch['vision'].to(self.device).float()
                text_bert = batch['text_bert'].to(self.device).float()
                labels = batch['label'].to(self.device)
    
                outputs = self.model(audio, vision, text_bert)
                loss = criterion(outputs, labels)
                total_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == labels).sum().item()
    
        avg_loss = total_loss / len(dataloader.dataset)
        accuracy = correct_predictions / len(dataloader.dataset)
        return avg_loss, accuracy

    def train(self, criterion):
        self.load_checkpoint()
        train_dataloader = DataLoader(self.train_dataset, batch_size=256, shuffle=True)
        val_dataloader = DataLoader(self.val_dataset, batch_size=256, shuffle=False)

        for epoch in range(self.start_epoch, self.epochs):
            train_loss, train_acc = self.train_one_epoch(train_dataloader, criterion)
            val_loss, val_acc = self.validate(val_dataloader, criterion)
    
            self.history['train_loss'].append(train_loss)
            self.history['train_acc'].append(train_acc)
            self.history['val_loss'].append(val_loss)
            self.history['val_acc'].append(val_acc)
    
            print(f"Epoch {epoch+1}/{self.epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")
    
            if (epoch + 1) % self.save_interval == 0:
                self.save_checkpoint(epoch + 1)
    
            self.save_history()

# LSTM 

In [12]:
class MultimodalClassifier(nn.Module):
    def __init__(self, audio_dim, vision_dim, text_dim, hidden_dim, num_classes):
        super(MultimodalClassifier, self).__init__()

        self.audio_lstm = nn.LSTM(audio_dim, hidden_dim, batch_first=True)
        self.vision_lstm = nn.LSTM(vision_dim, hidden_dim, batch_first=True)
        
        # Adjust the text_fc layer to match the flattened text_bert dimensions
        self.text_fc = nn.Linear(text_dim * 50, hidden_dim)  # Multiply by sequence length

        self.combined_fc = nn.Linear(hidden_dim * 3, num_classes)

    def forward(self, audio, vision, text_bert):
        _, (audio_hidden, _) = self.audio_lstm(audio)
        audio_repr = audio_hidden.squeeze(0)

        _, (vision_hidden, _) = self.vision_lstm(vision)
        vision_repr = vision_hidden.squeeze(0)

        # Flatten the text_bert features before passing through the linear layer
        text_flattened = text_bert.reshape(text_bert.size(0), -1)
        text_repr = F.relu(self.text_fc(text_flattened))

        combined_features = torch.cat((audio_repr, vision_repr, text_repr), dim=1)
        output = self.combined_fc(combined_features)

        return output

In [17]:
model = MultimodalClassifier(
    audio_dim=74,
    vision_dim=35,
    text_dim=3,  # Assuming each time step of BERT features has 3 dimensions
    hidden_dim=128,
    num_classes=3
)

# Define the loss criterion
criterion = nn.CrossEntropyLoss()

# Assuming train_dataset and valid_dataset are already defined and loaded
# Initialize the ModelTrainer
trainer = ModelTrainer(
    model=model,
    train_dataset=train_dataset,
    val_dataset=valid_dataset,
    model_name='multimodal_classifier',
    epochs=200,  # Set the number of epochs according to your needs
    save_interval=2,
    lr=1e-3,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# Start training
trainer.train(criterion)

Loaded checkpoint: 100.pt at epoch 100
Epoch 102/200, Train Loss: 0.0014, Train Accuracy: 0.8583, Val Loss: 0.0102, Val Accuracy: 0.4254
Epoch 103/200, Train Loss: 0.0013, Train Accuracy: 0.8690, Val Loss: 0.0108, Val Accuracy: 0.4313
Epoch 104/200, Train Loss: 0.0013, Train Accuracy: 0.8716, Val Loss: 0.0108, Val Accuracy: 0.4367
Epoch 105/200, Train Loss: 0.0012, Train Accuracy: 0.8755, Val Loss: 0.0112, Val Accuracy: 0.4196
Epoch 106/200, Train Loss: 0.0012, Train Accuracy: 0.8828, Val Loss: 0.0118, Val Accuracy: 0.4415
Epoch 107/200, Train Loss: 0.0012, Train Accuracy: 0.8807, Val Loss: 0.0114, Val Accuracy: 0.4105
Epoch 108/200, Train Loss: 0.0012, Train Accuracy: 0.8791, Val Loss: 0.0114, Val Accuracy: 0.4303
Epoch 109/200, Train Loss: 0.0012, Train Accuracy: 0.8842, Val Loss: 0.0117, Val Accuracy: 0.4270
Epoch 110/200, Train Loss: 0.0011, Train Accuracy: 0.8948, Val Loss: 0.0118, Val Accuracy: 0.4249
Epoch 111/200, Train Loss: 0.0010, Train Accuracy: 0.8956, Val Loss: 0.0123, Va

KeyboardInterrupt: 

# Transformer 

In [29]:
class MultimodalTransformer(nn.Module):
    def __init__(self, audio_feature_dim, vision_feature_dim, text_feature_dim, num_classes, hidden_dim=256, num_heads=4, num_layers=6, dropout_rate=0.3):
        super(MultimodalTransformer, self).__init__()
        
        # Define encoders for each modality
        self.audio_encoder = nn.Linear(audio_feature_dim, hidden_dim)
        self.vision_encoder = nn.Linear(vision_feature_dim, hidden_dim)
        self.text_encoder = nn.Linear(text_feature_dim, hidden_dim)

        # Dropout layer
        self.dropout = nn.Dropout(p=dropout_rate)
        
        # Transformer encoder layer
        encoder_layer = TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dropout=dropout_rate, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # Classifier layer
        self.classifier = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, audio, vision, text_bert):
        # Average the audio and vision features over time
        audio_feature = audio.mean(dim=1)
        vision_feature = vision.mean(dim=1)
        
        # Flatten the text features
        text_feature = text_bert.view(text_bert.size(0), -1)
        
        # Encode and apply dropout to the features
        audio_encoded = self.dropout(self.audio_encoder(audio_feature))
        vision_encoded = self.dropout(self.vision_encoder(vision_feature))
        text_encoded = self.dropout(self.text_encoder(text_feature))
        
        # Combine the features
        combined_features = audio_encoded + vision_encoded + text_encoded
        
        # Pass the combined features through the transformer encoder
        transformer_output = self.transformer_encoder(combined_features.unsqueeze(1))
        pooled_output = transformer_output.mean(dim=1)
        
        # Apply dropout before classification
        pooled_output = self.dropout(pooled_output)
        
        # Classify
        output = self.classifier(pooled_output)
        
        return output


In [None]:
model = MultimodalTransformer(
    audio_feature_dim=74,  # from Audio tensor shape [64, 50, 74]
    vision_feature_dim=35,  # from Vision tensor shape [64, 50, 35]
    text_feature_dim=3*50,  # from Text BERT tensor shape [64, 3, 50]
    num_classes=3  # for the classification task
)

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Assuming train_dataset and valid_dataset are defined and loaded
trainer = ModelTrainer(
    model=model,
    train_dataset=train_dataset,
    val_dataset=valid_dataset,
    model_name='multimodal_transformer5',
    epochs=100,  # Number of training epochs
    save_interval=10,  # Interval for saving the model checkpoint
    lr=1e-4,  # Learning rate
    device='cuda' if torch.cuda.is_available() else 'cpu'  # Use CUDA if available
)

# Start the training process
trainer.train(criterion)

No checkpoints found, starting from scratch.
Epoch 1/100, Train Loss: 0.0043, Train Accuracy: 0.4610, Val Loss: 0.0043, Val Accuracy: 0.5131
Epoch 2/100, Train Loss: 0.0041, Train Accuracy: 0.4891, Val Loss: 0.0044, Val Accuracy: 0.5110
Epoch 3/100, Train Loss: 0.0041, Train Accuracy: 0.4993, Val Loss: 0.0043, Val Accuracy: 0.5110
Epoch 4/100, Train Loss: 0.0040, Train Accuracy: 0.5017, Val Loss: 0.0043, Val Accuracy: 0.5158
Epoch 5/100, Train Loss: 0.0040, Train Accuracy: 0.5057, Val Loss: 0.0043, Val Accuracy: 0.5190
Epoch 6/100, Train Loss: 0.0040, Train Accuracy: 0.5075, Val Loss: 0.0043, Val Accuracy: 0.5115
Epoch 7/100, Train Loss: 0.0040, Train Accuracy: 0.5063, Val Loss: 0.0044, Val Accuracy: 0.5158
Epoch 8/100, Train Loss: 0.0040, Train Accuracy: 0.5082, Val Loss: 0.0044, Val Accuracy: 0.5174
Epoch 9/100, Train Loss: 0.0040, Train Accuracy: 0.5045, Val Loss: 0.0043, Val Accuracy: 0.5184
Epoch 10/100, Train Loss: 0.0040, Train Accuracy: 0.5069, Val Loss: 0.0043, Val Accuracy: 0