### Install Packages

In [4]:
# Install required packages
!pip install torch torchvision transformers datasets opencv-python gradio uniface huggingface_hub



In [2]:
# Install old version of datasets to be able to use load_dataset()
!pip install datasets==3.6.0



### Import Packages

In [5]:
# Import required packages
import os
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from datasets import load_dataset
import cv2
import gradio as gr
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from uniface import RetinaFace
from huggingface_hub import notebook_login, HfApi, whoami

### Load Dataset from HuggingFace Hub

In [4]:
# Load Dataset from HuggingFace (FER2013)
# This loads the FER2013 facial expression dataset from HuggingFace Datasets.
dataset = load_dataset('Jeneral/fer-2013')

# Split the dataset into training and test sets
train_ds = dataset['train']
test_ds = dataset['test']

# Print the number of samples in each split
print(f"Train samples: {len(train_ds)} | Test samples: {len(test_ds)}")

# Class names come from the metadata of the 'labels' feature
class_names = train_ds.features['labels'].names

# Check class distribution in training set.
# The 'labels' column is read directly instead of iterating over whole rows,
# so the image bytes of every sample are not fetched just to count labels.
labels_train = list(train_ds['labels'])
label_counts_train = Counter(labels_train)  # label index -> number of samples
print("Class distribution in training set: ", label_counts_train)
for idx, count in label_counts_train.items():
    print(f"{class_names[idx]}: {count}")  # Print class name and count

# Check class distribution in test set (same column-only access)
labels_test = list(test_ds['labels'])
label_counts_test = Counter(labels_test)  # label index -> number of samples
print("Class distribution in test set: ", label_counts_test)
for idx, count in label_counts_test.items():
    print(f"{class_names[idx]}: {count}")  # Print class name and count

Train samples: 28709 | Test samples: 7178
Class distribution in training set:  Counter({3: 7215, 4: 4965, 5: 4830, 2: 4097, 0: 3995, 6: 3171, 1: 436})
angry: 3995
disgust: 436
fear: 4097
happy: 7215
neutral: 4965
sad: 4830
surprise: 3171
Class distribution in test set:  Counter({3: 1774, 5: 1247, 4: 1233, 2: 1024, 0: 958, 6: 831, 1: 111})
angry: 958
disgust: 111
fear: 1024
happy: 1774
neutral: 1233
sad: 1247
surprise: 831


### Data Preprocessing

In [5]:
# Data Preprocessing
# Training pipeline: augmentation on the PIL image, then tensor conversion,
# random erasing and normalization with the ImageNet channel statistics.
transform_train = transforms.Compose([
    # numpy array -> PIL image
    transforms.ToPILImage(),
    # random crop covering 90-100% of the image, resized to 128x128
    transforms.RandomResizedCrop(size=128, scale=(0.9, 1.0)),
    # horizontal flip with the default 50% probability
    transforms.RandomHorizontalFlip(p=0.5),
    # vertical flip with 10% probability
    transforms.RandomVerticalFlip(p=0.1),
    # rotation by up to 15 degrees
    transforms.RandomRotation(degrees=15),
    # translation, scaling and shear without additional rotation
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1), shear=10),
    # photometric jitter
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.15, hue=0.07),
    # PIL image -> tensor
    transforms.ToTensor(),
    # erase a random patch with 20% probability (regularization)
    transforms.RandomErasing(p=0.2, scale=(0.02, 0.15)),
    # normalize with ImageNet mean / std
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Test/validation pipeline: no augmentation, only resize and normalize.
transform_test = transforms.Compose([
    # numpy array -> PIL image
    transforms.ToPILImage(),
    # fixed 128x128 input size
    transforms.Resize(size=(128, 128)),
    # PIL image -> tensor
    transforms.ToTensor(),
    # normalize with ImageNet mean / std
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to preprocess image bytes for training or testing
def preprocess_image(img_bytes, train=True):
    """Decode an encoded image and run it through the train or test transforms.

    Args:
        img_bytes: Encoded image file content (bytes-like), decoded with
            ``cv2.imdecode``.
        train: If True apply ``transform_train``, otherwise ``transform_test``.

    Returns:
        The output of the selected transform pipeline.

    Raises:
        ValueError: If the bytes cannot be decoded into an image.
    """
    img_array = np.frombuffer(img_bytes, np.uint8)  # Raw bytes as a uint8 buffer
    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)  # Decode to a BGR image
    if img is None:
        # cv2.imdecode signals undecodable data by returning None; fail here with
        # a clear message instead of an obscure error inside cvtColor.
        raise ValueError("Could not decode image bytes")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    pipeline = transform_train if train else transform_test
    return pipeline(img)

# Custom PyTorch Dataset for FER2013
class FERDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs from one dataset split.

    Args:
        split: Key of the split to read from the module-level ``dataset``
            (``'train'`` or ``'test'`` in this notebook).
        train: If True the training transforms are applied, otherwise the
            test transforms.
    """

    def __init__(self, split, train=True):
        self.data = dataset[split]  # Load data split (train or test)
        self.train = train  # Flag to indicate training or testing transformations

    def __len__(self):
        """Return the number of samples in the split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the preprocessed image and its label for sample ``idx``."""
        # Fetch the row once; indexing self.data twice would load the same
        # sample (including its image bytes) two times per item.
        sample = self.data[idx]
        img = preprocess_image(sample['img_bytes'], train=self.train)
        return img, sample['labels']

# Batch loaders: the training split is shuffled, the test split (used for
# validation) is read without shuffling.
train_loader = DataLoader(
    dataset=FERDataset('train', train=True),
    batch_size=32,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=FERDataset('test', train=False),
    batch_size=32,
    shuffle=False,
)

### Training Model - Definition Part

In [6]:
# Training Model - Definition Part
# Set device to GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load VGG19 model pre-trained on ImageNet
model = models.vgg19(weights=models.VGG19_Weights.IMAGENET1K_V1)
# Re-create the dropout layer in front of the final layer with p=0.5.
# NOTE(review): torchvision's VGG builds its classifier with dropout 0.5 by
# default, so this presumably keeps the rate unchanged - raise p if stronger
# regularization is intended.
model.classifier[5] = nn.Dropout(0.5)
# Replace the final fully connected layer so it outputs one logit per class.
# Both sizes are derived (input width from the layer being replaced, output
# width from the dataset's class names) instead of being hard-coded.
num_classes = len(class_names)
model.classifier[6] = nn.Linear(model.classifier[6].in_features, num_classes)
# Move model to the selected device (GPU or CPU)
model = model.to(device)

# Calculate class weights for imbalanced dataset
# Get the count of each class from the training set
class_counts = [label_counts_train[i] for i in range(num_classes)]
if min(class_counts) <= 0:
    # A Counter returns 0 for a label that never occurs; 1/0 below would
    # otherwise surface as a bare ZeroDivisionError.
    raise ValueError("Every class needs at least one training sample to compute class weights")
# Compute weights inversely proportional to class frequency
class_weights = torch.tensor([1.0 / c for c in class_counts])
# Normalize weights so their sum equals the number of classes
class_weights = class_weights / class_weights.sum() * num_classes
# Move class weights to the selected device
class_weights = class_weights.to(device)

# Use class weights in the loss function to handle imbalance
criterion = nn.CrossEntropyLoss(weight=class_weights)

### Training Model - Execution Part

In [None]:
# Training Model - Execution Part
# Initialize optimizer (Adam) and learning rate scheduler (ReduceLROnPlateau)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)

# Checkpoint location. The directory is created up front so torch.save() does
# not fail when "models/" does not exist yet.
checkpoint_path = "models/best_fer32_model.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Track best validation accuracy and early stopping parameters
best_val_acc = 0
epochs = 50
early_stop_acc = 0.75     # Stop once train AND val accuracy reach this value
early_stop_patience = 10  # Number of epochs to wait for val_acc improvement
no_improve_epochs = 0     # Counter for epochs with no val_acc improvement

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    # Training phase
    model.train()
    correct, total = 0, 0
    train_loss = 0.0
    train_iter = tqdm(train_loader, desc='Training', leave=False)
    for imgs, labels in train_iter:
        imgs, labels = imgs.to(device), labels.to(device)  # Move data to device
        optimizer.zero_grad()  # Reset gradients
        outputs = model(imgs)  # Forward pass
        loss = criterion(outputs, labels)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights
        _, preds = torch.max(outputs, 1)  # Get predicted class
        correct += (preds == labels).sum().item()  # Count correct predictions
        total += labels.size(0)  # Count total samples
        train_loss += loss.item() * labels.size(0)  # Accumulate loss weighted by batch size
        train_iter.set_postfix({'loss': loss.item()})  # Show current loss in progress bar
    train_acc = correct / total  # Calculate training accuracy
    avg_train_loss = train_loss / total  # Calculate average training loss

    # Validation phase
    model.eval()
    correct, total = 0, 0
    val_loss = 0.0
    val_iter = tqdm(val_loader, desc='Validation', leave=False)
    with torch.no_grad():  # Disable gradient calculation for validation
        for imgs, labels in val_iter:
            imgs, labels = imgs.to(device), labels.to(device)  # Move data to device
            outputs = model(imgs)  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            _, preds = torch.max(outputs, 1)  # Get predicted class
            correct += (preds == labels).sum().item()  # Count correct predictions
            total += labels.size(0)  # Count total samples
            val_loss += loss.item() * labels.size(0)  # Accumulate loss weighted by batch size
            val_iter.set_postfix({'loss': loss.item()})  # Show current loss in progress bar
    val_acc = correct / total  # Calculate validation accuracy
    avg_val_loss = val_loss / total  # Calculate average validation loss

    # Update learning rate based on validation loss
    scheduler.step(avg_val_loss)

    # Print training and validation metrics for this epoch
    print(f"Train Acc={train_acc:.4f}, Val Acc={val_acc:.4f}")
    print(f"Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}")

    # Checkpointing: the file is only written when val_acc improves, so it
    # always holds the weights with the best validation accuracy seen so far.
    # This runs before the threshold check so that stopping at the threshold
    # can never overwrite a better checkpoint from an earlier epoch.
    if val_acc > best_val_acc:
        best_val_acc = val_acc  # Update best validation accuracy
        torch.save(model.state_dict(), checkpoint_path)  # Save model
        no_improve_epochs = 0  # Reset no improvement counter
    else:
        no_improve_epochs += 1  # Increment no improvement counter
        print(f"No improvement in val_acc for {no_improve_epochs} epoch(s)")

    # Early stopping: stop when both train and val accuracy reach threshold
    if train_acc >= early_stop_acc and val_acc >= early_stop_acc:
        print("Early stopping: both train and validation accuracy reached threshold.")
        break

    # Early stopping: stop if val_acc does not improve for early_stop_patience epochs
    if no_improve_epochs >= early_stop_patience:
        print(f"Early stopping: validation accuracy did not improve for {early_stop_patience} consecutive epochs.")
        break

### Publish Model to HuggingFace Hub

In [7]:
# Log in to the Hugging Face Hub: notebook_login() shows a widget in which the
# access token is entered. The publishing cell below relies on this login.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
# Publish the saved model weights to the Hugging Face Hub
api = HfApi()

# The target repo is placed under the account of the logged-in user
hf_username = whoami()["name"]
repo_name = f"{hf_username}/vgg19-finetuned-fer2013"

# Create the repo; exist_ok=True tolerates a repo that already exists
api.create_repo(repo_id=repo_name, exist_ok=True)

# Upload the local checkpoint as best_fer32_model.pth in the repo
api.upload_file(
    repo_id=repo_name,
    path_in_repo="best_fer32_model.pth",
    path_or_fileobj="models/best_fer32_model.pth",
)

print(f"Model pushed to https://huggingface.co/{repo_name}")
print("You can now share or load your model from the Hugging Face Hub.")

No files have been modified since last commit. Skipping to prevent empty commit.


Model pushed to https://huggingface.co/hanseltertius/vgg19-finetuned-fer2013
You can now share or load your model from the Hugging Face Hub.


### Gradio Implementation

In [None]:
# Gradio Interface
# Face detector: RetinaFace from uniface, configured explicitly
retinaface = RetinaFace(
    # detector variant
    model="retinaface_r34",
    # confidence threshold for face detection
    conf_thresh=0.3,
    # non-maximum suppression threshold
    nms_thresh=0.3,
    # input size for the face detector
    input_size=1024,
    # allow dynamic input size for different images
    dynamic_size=True,
)

# Emotion classifier: restore the best saved weights onto the current device
# and switch the network to evaluation mode.
best_weights = torch.load("models/best_fer32_model.pth", map_location=device)
model.load_state_dict(best_weights)
model.eval()

def detect_emotion(image):
    """Detect faces in an image and predict an emotion for each of them.

    Args:
        image: Image array as delivered by the ``gr.Image(type="numpy")`` input
            of the interface (Gradio provides RGB channel order), or None when
            no image was submitted.

    Returns:
        tuple: ``(annotated_image, info)``. On success ``info`` is
        ``{"result": [...]}`` with one entry per face holding ``emotion``,
        ``confidence`` (percent, rounded to two decimals) and ``bbox``; the
        annotated image is a copy of the input with boxes and labels drawn.
        If no image was given or no usable face was found, the input is
        returned unchanged together with ``{"error": ...}``.
    """
    if image is None:
        # Nothing was uploaded; avoid crashing inside the detector.
        return image, {"error": "No image provided"}

    # Detect faces in the input image using RetinaFace
    # NOTE(review): the image is passed to the detector unchanged (RGB from
    # Gradio); confirm whether uniface expects BGR input as produced by cv2.imread.
    faces = retinaface.detect(image)
    if faces is None or len(faces[0]) == 0:
        return image, {"error": "No face detected"}  # No face detected, return error message

    img_h, img_w = image.shape[:2]
    results = []
    boxes = faces[0]  # bounding boxes: [x1, y1, x2, y2, score]
    annotated_img = image.copy()  # Copy image for annotation
    for box in boxes:
        # Clamp the box to the image: a negative coordinate would otherwise be
        # treated as a from-the-end index when slicing and crop the wrong region.
        x1 = max(0, int(box[0]))
        y1 = max(0, int(box[1]))
        x2 = min(img_w, int(box[2]))
        y2 = min(img_h, int(box[3]))
        if x2 <= x1 or y2 <= y1:
            continue  # Empty crop; cv2.resize would raise on it

        # The crop is already RGB (Gradio numpy images are RGB), which is what
        # transform_test expects, so no BGR->RGB swap is applied here.
        face_img = image[y1:y2, x1:x2]
        face_img_resized = cv2.resize(face_img, (128, 128))  # Resize to model input size
        face_tensor = transform_test(face_img_resized).unsqueeze(0).to(device)  # Preprocess and add batch dimension
        with torch.no_grad():  # Disable gradients for inference
            outputs = model(face_tensor)  # Get model predictions
            probs = torch.softmax(outputs, dim=1)  # Convert logits to probabilities
            conf, pred = torch.max(probs, 1)  # Get highest probability and predicted class
        emotion = class_names[pred.item()]  # Get emotion label
        confidence = round(conf.item() * 100, 2)  # Convert confidence to percentage

        # Draw bounding box and label on annotated image
        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        # Keep the label inside the image when the face touches the top edge
        label_y = max(y1 - 10, 20)
        cv2.putText(annotated_img, f"{emotion} ({confidence}%)", (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
        results.append({
            "emotion": emotion,
            "confidence": confidence,
            "bbox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}
        })  # Add result for this face

    if not results:
        # Every detected box was degenerate after clamping
        return image, {"error": "No face detected"}
    return annotated_img, {"result": results}  # Return annotated image and results as 'result' array

# Gradio app for emotion detection: one image upload in, the annotated image
# and a JSON summary out.
image_input = gr.Image(type="numpy", label="Upload Image")
annotated_output = gr.Image(type="numpy", label="Annotated Image")
json_output = gr.JSON(label="Detection Results")

iface = gr.Interface(
    fn=detect_emotion,  # Called once per submitted image
    inputs=image_input,
    outputs=[annotated_output, json_output],
    title="Facial Emotion Recognition with RetinaFace & VGG19 Pretrained Model",
    description="Upload an image. The app will detect all human faces using RetinaFace powered by uniface library, which predict the emotion and confidence for each face using VGG19 model trained on FER2013 Dataset from HuggingFace. Returns annotated image and JSON with emotion, confidence, and bounding box coordinates.",
)

# Start the app
iface.launch()

# Publish Model 