In [1]:
import os

import cv2
import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
from ultralytics import YOLO

from approach.ResEmoteNet import ResEmoteNet

# Settings for the text overlays drawn with cv2.putText
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.7
font_color = (0, 0, 255)  # BGR format, i.e. red
thickness = 2
line_type = cv2.LINE_AA

# Emotion labels: index i names classifier output i (see get_max_emotion).
# NOTE(review): the order presumably has to match the order used when the
# checkpoint was trained -- confirm before editing this list.
emotions = ['happy', 'surprise', 'sad', 'anger', 'disgust', 'fear', 'neutral']

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the YOLO face-detection model
face_detecter = YOLO("/root/Project/class/Emotion_detection/yolov11m-face.pt")
face_detecter = face_detecter.to(device)

# Load the emotion classification model.
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint that was saved from a GPU run still loads on a CPU-only machine.
emotion_classifier = ResEmoteNet().to(device)
checkpoint = torch.load(
    'emotion_best_model.pth',
    map_location=device,
    weights_only=True,
)
emotion_classifier.load_state_dict(checkpoint['model_state_dict'])
emotion_classifier.eval()  # inference mode: disables dropout / batch-norm updates

# Image transformation pipeline applied to every PIL face crop:
# resize to 64x64, collapse to grayscale replicated over 3 channels,
# convert to a tensor and normalize per channel.
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Emotion detection function
def detect_emotion(image):
    """Classify a single face image and return per-emotion probabilities.

    Args:
        image: Input accepted by the module-level ``transform`` pipeline
            (the callers in this file pass a PIL image).

    Returns:
        A list with one softmax probability per classifier output, each
        rounded to two decimals.
    """
    # Preprocess, add a batch dimension, and move to the inference device.
    batch = transform(image).unsqueeze(0)
    batch = batch.to(device)

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = emotion_classifier(batch)
        probs = F.softmax(logits, dim=1)

    flat_probs = probs.cpu().numpy().flatten()
    return [round(p, 2) for p in flat_probs]

# Get the emotion with the highest probability
def get_max_emotion(x, y, w, h, image):
    """Return the label of the most probable emotion for one face region.

    Args:
        x: Column of the left edge of the face box, in pixels.
        y: Row of the top edge of the face box, in pixels.
        w: Width of the face box, in pixels.
        h: Height of the face box, in pixels.
        image: Frame as a 3-channel NumPy array in OpenCV's BGR channel
            order (the frames in this file come from ``cv2.VideoCapture``).

    Returns:
        The entry of ``emotions`` whose score from ``detect_emotion`` is
        highest (the first one on ties).
    """
    crop_img = image[y:y + h, x:x + w]
    # OpenCV delivers BGR but PIL assumes RGB. Without the swap, the
    # grayscale conversion in ``transform`` would weight the red and blue
    # channels the wrong way round.
    rgb_crop = cv2.cvtColor(crop_img, cv2.COLOR_BGR2RGB)
    pil_crop_img = Image.fromarray(rgb_crop)
    rounded_scores = detect_emotion(pil_crop_img)
    max_index = int(np.argmax(rounded_scores))
    return emotions[max_index]

# Display the max emotion on the image
def print_max_emotion(x, y, image, max_emotion):
    """Draw ``max_emotion`` on ``image`` 15 pixels above the point (x, y).

    The image is modified in place; the module-level text settings
    (font, scale, colour, thickness, line type) are used.
    """
    label_origin = (x, y - 15)
    cv2.putText(
        image,
        max_emotion,
        label_origin,
        font,
        font_scale,
        font_color,
        thickness=thickness,
        lineType=line_type,
    )

# Display all emotion scores on the image
def print_all_emotion(x, y, w, h, image):
    """Draw every emotion score for one face to the right of its box.

    The face region is classified with ``detect_emotion`` and one
    ``"<emotion>: <score>"`` line per label is written onto ``image`` in
    place, starting 10 pixels right of the box and 10 pixels below its
    top edge, with 30 pixels between lines.

    Args:
        x: Column of the left edge of the face box, in pixels.
        y: Row of the top edge of the face box, in pixels.
        w: Width of the face box, in pixels.
        h: Height of the face box, in pixels.
        image: Frame as a 3-channel NumPy array in OpenCV's BGR channel
            order (the frames in this file come from ``cv2.VideoCapture``).
    """
    crop_img = image[y:y + h, x:x + w]
    # OpenCV delivers BGR but PIL assumes RGB. Without the swap, the
    # grayscale conversion in ``transform`` would weight the red and blue
    # channels the wrong way round.
    rgb_crop = cv2.cvtColor(crop_img, cv2.COLOR_BGR2RGB)
    pil_crop_img = Image.fromarray(rgb_crop)
    rounded_scores = detect_emotion(pil_crop_img)

    text_x = x + w + 10
    # Start one line step above the first line so each iteration can simply
    # advance by 30 px; a separate name avoids clobbering the ``y`` parameter.
    text_y = y - 20
    for label, score in zip(emotions, rounded_scores):
        text_y += 30
        emotion_str = f'{label}: {score:.2f}'
        cv2.putText(image, emotion_str, (text_x, text_y), font, font_scale,
                    font_color, thickness, line_type)

# Detect faces and emotions using YOLO
def detect_bounding_box(image):
    """Detect faces in ``image`` and annotate each one with its emotions.

    The frame is modified in place: every detected face gets a green
    bounding box, the top emotion label above it, and the full score list
    to its right.

    Args:
        image: Frame as a NumPy array, as read by ``cv2.VideoCapture``.

    Returns:
        A list of ``(x, y, w, h)`` tuples of plain Python ints, one per
        annotated face. Boxes with zero or negative area are skipped.
    """
    # YOLO face detection
    results = face_detecter(image)
    faces = []

    # Extract bounding boxes from the YOLO results
    for result in results:
        for box in result.boxes.xyxy:  # corner coordinates (x1, y1, x2, y2)
            # Plain Python ints are safe for slicing and for OpenCV drawing calls.
            x1, y1, x2, y2 = box[:4].int().cpu().tolist()
            w, h = x2 - x1, y2 - y1
            if w <= 0 or h <= 0:
                # An empty crop cannot be resized or classified.
                continue
            faces.append((x1, y1, w, h))

    # Process each detected face
    for (x, y, w, h) in faces:
        # Classify BEFORE drawing this face's box and label: both helpers crop
        # the face out of ``image``, and the rectangle border would otherwise
        # end up inside the classifier input.
        max_emotion = get_max_emotion(x, y, w, h, image)
        print_all_emotion(x, y, w, h, image)

        # Draw bounding box and the top label
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
        print_max_emotion(x, y, image, max_emotion)

    return faces

# Process the video
input_video_path = '/root/Project/class/Emotion_detection/input/test.mp4'
output_video_path = './output/output_video.mp4'

# Open input video and fail loudly if it cannot be read; otherwise the loop
# below would be skipped and the script would still report success.
cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    raise IOError(f"Could not open input video: {input_video_path}")

# Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes the
# playback speed and duration of the output.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Some sources report 0 or NaN; fall back to a nominal rate so the
    # writer can be opened. NOTE(review): 30 is an arbitrary default.
    fps = 30.0
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Make sure the output directory exists; without it the writer cannot open
# the file and no video would be saved.
os.makedirs(os.path.dirname(output_video_path) or '.', exist_ok=True)

# Video writer for saving output
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
if not out.isOpened():
    cap.release()
    raise IOError(f"Could not open output video for writing: {output_video_path}")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop processing.
            break

        # Detect faces and emotions in the frame (annotates it in place)
        detect_bounding_box(frame)

        # Write the processed frame to the output video
        out.write(frame)
finally:
    # Release resources even if detection fails part-way through
    cap.release()
    out.release()

print(f"Processed video saved to {output_video_path}")



0: 384x640 (no detections), 49.5ms
Speed: 8.8ms preprocess, 49.5ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 11.0ms
Speed: 1.2ms preprocess, 11.0ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 10.2ms
Speed: 1.0ms preprocess, 10.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 11.7ms
Speed: 1.4ms preprocess, 11.7ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 10.5ms
Speed: 0.9ms preprocess, 10.5ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 10.4ms
Speed: 0.8ms preprocess, 10.4ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 11.6ms
Speed: 1.2ms preprocess, 11.6ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 10.4ms
Speed: 0.8ms preprocess, 10.4ms i