In [3]:
# Import required libraries
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch.optim import Adam
from torch.nn import DataParallel  # For multi-GPU support
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training


In [4]:
import pandas as pd
import os
from PIL import Image

In [5]:
# Import Vision Transformer
from torchvision.models import vit_b_16, ViT_B_16_Weights  # ViT model and pre-trained weights

In [13]:
# Initialize a pre-trained Vision Transformer model
class EmotionRecognitionViT(nn.Module):
    def __init__(self, num_classes=6):
        super(EmotionRecognitionViT, self).__init__()
        # Load the pre-trained Vision Transformer
        self.vit = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
        
        # Replace the classifier head with one suitable for our task
        in_features = self.vit.heads[0].in_features  # Access the input features of the last layer
        self.vit.heads = nn.Sequential(nn.Linear(in_features, num_classes))

    def forward(self, x):
        return self.vit(x)

# Initialize the ViT model
model = EmotionRecognitionViT(num_classes=6)


In [7]:
# Define the dataset class
class EmotionDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data  # Preprocessed frames
        self.labels = labels  # Corresponding emotion labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


In [14]:
# Path to the CSV file and GIF folder
csv_path = "/home/jecroisp/Thesis/processed_data/p_crema/GIF_Annotations.csv"
gif_folder = "/home/jecroisp/Thesis/processed_data/p_crema/CremaGifs"

# Load the CSV
metadata = pd.read_csv(csv_path)

# Filter valid files and map paths
metadata['GIF_Path'] = metadata['fileName'].apply(lambda x: os.path.join(gif_folder, f"{x}.gif"))
metadata = metadata[metadata['GIF_Path'].apply(os.path.exists)].reset_index(drop=True)

# Map emotion labels (emoVote) to integers
emotion_mapping = {
    "A": 0,  # Anger
    "D": 1,  # Disgust
    "F": 2,  # Fear
    "H": 3,  # Happiness
    "N": 4,  # Neutral
    "S": 5   # Sadness
}
metadata['Emotion_Label'] = metadata['emoVote'].map(emotion_mapping)

# Optional: Filter out low-agreement samples
metadata = metadata[metadata['agreement'] >= 0.6].reset_index(drop=True)



In [15]:
class GIFDataset(Dataset):
    def __init__(self, metadata, transform=None, max_frames=10):
        self.metadata = metadata
        self.transform = transform
        self.max_frames = max_frames

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        gif_path = self.metadata.loc[idx, 'GIF_Path']
        label = self.metadata.loc[idx, 'Emotion_Label']

        # Load GIF as a sequence of frames
        gif = Image.open(gif_path)
        frames = []
        try:
            while True:
                frame = gif.copy().convert("RGB")
                if self.transform:
                    frame = self.transform(frame)
                frames.append(frame)
                gif.seek(gif.tell() + 1)
        except EOFError:
            pass

        # If too many frames, sample evenly
        if len(frames) > self.max_frames:
            indices = torch.linspace(0, len(frames) - 1, self.max_frames).long()
            frames = [frames[i] for i in indices]

        # Aggregate frames (e.g., average pooling across frames)
        frames_tensor = torch.stack(frames, dim=0)  # Shape: [num_frames, channels, height, width]
        aggregated_tensor = frames_tensor.mean(dim=0)  # Shape: [channels, height, width]

        return aggregated_tensor, label




In [16]:
from torchvision import transforms

# Define transformations for GIF frames
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize frames to 224x224
    transforms.ToTensor(),          # Convert to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])

# Create the dataset and DataLoader
dataset = GIFDataset(metadata, transform=transform)
batch_size_per_gpu = 16  # Adjust based on memory capacity
dataloader = DataLoader(dataset, batch_size=batch_size_per_gpu * torch.cuda.device_count(), shuffle=True)


In [17]:
# # Set device and wrap model for multi-GPU training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

if torch.cuda.device_count() > 1:
    print(f"Multiple GPUs available. Using only GPU 0 for debugging.")
    device = torch.device("cuda:0")
    model = model.to(device)

Multiple GPUs available. Using only GPU 0 for debugging.


In [48]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-4)  # Lower learning rate for fine-tuning ViT

# Initialize the GradScaler for mixed precision
scaler = GradScaler()


  scaler = GradScaler()


In [50]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for frames, labels in dataloader:
        # Move data to the appropriate device
        frames = frames.to(device, dtype=torch.float32)  # Ensure frames are float32
        labels = labels.to(device, dtype=torch.long)    # Ensure labels are long (int)

        optimizer.zero_grad()

        # Forward pass with mixed precision
        with autocast():
            outputs = model(frames)
            loss = criterion(outputs, labels)  # CrossEntropyLoss expects float32 and long

        # Backward pass and optimization
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()


        running_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss / len(dataloader):.4f}")


  with autocast():


Epoch 1/10, Loss: 1.2780
Epoch 2/10, Loss: 0.8405
Epoch 3/10, Loss: 0.6777
Epoch 4/10, Loss: 0.5773
Epoch 5/10, Loss: 0.5119
Epoch 6/10, Loss: 0.4653
Epoch 7/10, Loss: 0.4308
Epoch 8/10, Loss: 0.4017
Epoch 9/10, Loss: 0.3893
Epoch 10/10, Loss: 0.3642


In [52]:

torch.save(model.state_dict(), "emotion_recognition_vit.pth")

print("Model saved successfully.")


Model saved successfully.


In [18]:
model = "/home/jecroisp/Thesis/processed_data/p_crema/emotion_recognition_vit.pth"

In [19]:
def evaluate_model(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():
        for frames, labels in dataloader:
            # Move data to the appropriate device
            frames = frames.to(device, dtype=torch.float32)
            labels = labels.to(device, dtype=torch.long)

            # Forward pass
            outputs = model(frames)
            _, predicted = torch.max(outputs, 1)  # Get the class with the highest score
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return accuracy


In [20]:
# Example test DataLoader
test_sampler = torch.utils.data.SequentialSampler(dataset)  # No shuffling for test
test_dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, sampler=test_sampler)


In [21]:
# Evaluate model
accuracy = evaluate_model(model, test_dataloader, device)
print(f"Model Accuracy on Test Dataset: {accuracy:.2f}%")


AttributeError: 'str' object has no attribute 'eval'

In [23]:
import numpy as np

def evaluate_model_per_class(model, dataloader, device, num_classes):
    model.eval()  # Set the model to evaluation mode
    class_correct = np.zeros(num_classes)
    class_total = np.zeros(num_classes)

    with torch.no_grad():
        for frames, labels in dataloader:
            # Move data to the appropriate device
            frames = frames.to(device, dtype=torch.float32)
            labels = labels.to(device, dtype=torch.long)

            # Forward pass
            outputs = model(frames)
            _, predicted = torch.max(outputs, 1)

            for i in range(len(labels)):
                label = labels[i].item()
                class_correct[label] += (predicted[i] == label).item()
                class_total[label] += 1

    # Print per-class accuracy
    for i in range(num_classes):
        accuracy = 100 * class_correct[i] / class_total[i] if class_total[i] > 0 else 0
        print(f"Accuracy of class {i}: {accuracy:.2f}%")


In [24]:
evaluate_model_per_class(model, test_dataloader, device, num_classes=6)


Accuracy of class 0: 88.34%
Accuracy of class 1: 95.51%
Accuracy of class 2: 87.17%
Accuracy of class 3: 99.12%
Accuracy of class 4: 83.69%
Accuracy of class 5: 65.80%


In [22]:
# Initialize the model architecture
model = EmotionRecognitionViT(num_classes=6)

# Load the trained weights
model.load_state_dict(torch.load("emotion_recognition_vit.pth"))

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Set the model to evaluation mode
model.eval()


  model.load_state_dict(torch.load("emotion_recognition_vit.pth"))


EmotionRecognitionViT(
  (vit): VisionTransformer(
    (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (encoder): Encoder(
      (dropout): Dropout(p=0.0, inplace=False)
      (layers): Sequential(
        (encoder_layer_0): EncoderBlock(
          (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (self_attention): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (dropout): Dropout(p=0.0, inplace=False)
          (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (mlp): MLPBlock(
            (0): Linear(in_features=768, out_features=3072, bias=True)
            (1): GELU(approximate='none')
            (2): Dropout(p=0.0, inplace=False)
            (3): Linear(in_features=3072, out_features=768, bias=True)
            (4): Dropout(p=0.0, inplace=False)
          )
        )
        (encoder_layer_1): EncoderBlock(
          (ln

In [25]:
from PIL import Image
import torch
from torchvision import transforms

# Define the same transformations used during training
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize frames
    transforms.ToTensor(),          # Convert to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])

def preprocess_gif(gif_path, max_frames=10):
    gif = Image.open(gif_path)
    frames = []

    # Extract frames and apply transformations
    try:
        while True:
            frame = gif.copy().convert("RGB")  # Convert frame to RGB
            frame = transform(frame)
            frames.append(frame)
            gif.seek(gif.tell() + 1)  # Move to the next frame
    except EOFError:
        pass

    # # Sample or pad frames to max_frames
    # if len(frames) > max_frames:
    #     indices = torch.linspace(0, len(frames) - 1, steps=max_frames).long()
    #     frames = [frames[i] for i in indices]
    # elif len(frames) < max_frames:
    #     padding = [torch.zeros_like(frames[0]) for _ in range(max_frames - len(frames))]
    #     frames.extend(padding)
    
    frames_tensor = torch.stack(frames)  # Shape: [num_frames, 3, 224, 224]
    aggregated_tensor = frames_tensor.mean(dim=0)  # Shape: [3, 224, 224]

    # Stack frames into a single tensor
    # return torch.stack(frames)  # Shape: [max_frames, 3, 224, 224]
    return aggregated_tensor


In [26]:
# Path to the unseen GIF
unseen_gif_path = "/home/jecroisp/Thesis/processed_data/p_crema/rant-black.gif"

# Preprocess the GIF
frames_tensor = preprocess_gif(unseen_gif_path).unsqueeze(0).to(device)  # Add batch dimension

# Predict emotion
with torch.no_grad():
    outputs = model(frames_tensor)  # Forward pass
    _, predicted = torch.max(outputs, 1)  # Get the predicted class
    predicted_class = predicted.item()

# Map the predicted class to the corresponding emotion
emotion_mapping = {
    0: "Anger",
    1: "Disgust",
    2: "Fear",
    3: "Happiness",
    4: "Neutral",
    5: "Sadness"
}
print(f"Predicted Emotion: {emotion_mapping[predicted_class]}")


Predicted Emotion: Anger


In [27]:
import torch.nn.functional as F

# Predict emotion
with torch.no_grad():
    outputs = model(frames_tensor)  # Forward pass
    probabilities = F.softmax(outputs, dim=1)  # Convert logits to probabilities
    confidence, predicted_class = torch.max(probabilities, 1)  # Get confidence and predicted class

# Map the predicted class to the corresponding emotion
emotion_mapping = {
    0: "Anger",
    1: "Disgust",
    2: "Fear",
    3: "Happiness",
    4: "Neutral",
    5: "Sadness"
}

predicted_emotion = emotion_mapping[predicted_class.item()]
confidence_score = confidence.item() * 100  # Convert to percentage
print(f"Predicted Emotion: {predicted_emotion} (Confidence: {confidence_score:.2f}%)")


Predicted Emotion: Anger (Confidence: 61.01%)


In [28]:
# Print confidence scores for all classes
print("Confidence Scores:")
for i, prob in enumerate(probabilities.squeeze(0)):  # Remove batch dimension
    print(f"{emotion_mapping[i]}: {prob.item() * 100:.2f}%")


Confidence Scores:
Anger: 61.01%
Disgust: 7.34%
Fear: 3.02%
Happiness: 4.32%
Neutral: 23.40%
Sadness: 0.90%
