# Multimodal Emotion Detection

A comprehensive multimodal emotional and psychological analysis system that integrates facial expression recognition, body movement analysis, voice tone analysis, and NLP-based sentiment analysis to detect and interpret emotional and mental states from multiple input sources.

#### Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import cv2
import dlib
import tensorflow as tf
from tensorflow.keras.models import load_model
import timeit
import datetime
import mediapipe as mp
import time
import sounddevice as sd
import soundfile as sf

## Import scripts

### Video detection scripts

In [2]:
# Face expression detection script related to the 'face_expression_recognition' repository
from scripts.expression.face_script import * 
# Body movement detection script related to the 'body_movement_analysis' repository
from scripts.body.body_script import *

Model metadata saved to: models/expression/expression_model_RAF_metadata.pkl


### Audio detection scripts

In [3]:
# Voice tone emotion detection script related to the 'voice_emotion_recognition' repository
from scripts.voice.voice_script import *
# NLP sentiment detection script related to the 'NLP_emotion_recognition' repository
from scripts.nlp.nlp_script import *

## Multimodal emotion detection

In [4]:
# Performs real-time multimodal emotion detection from the webcam and microphone.
def emotion_detection(model_metadata, img_size, emotion_frames):
    """
    Run a live capture session and return the multimodal detections.

    Video frames are read from the default webcam while audio is recorded
    from the default microphone. Each frame is analysed for facial expression
    (pre-trained Keras model) and body movement (MediaPipe pose landmarks).
    The session ends when 'q'/'Q' is pressed in the video window or the camera
    stops delivering frames. The recorded audio is then saved to
    'data/audio/recording.wav' and passed to the voice tone and NLP models.

    Relies on names provided by the star-imported project scripts:
    calculate_movement, categorize_movement, voice_prediction,
    classify_audio_emotion, whisper_model, emotion_model, tokenizer and
    stop_words_nltk.

    Args:
        model_metadata: Mapping with the keys "model_path" (path given to
            load_model) and "emotion_labels" (sequence of labels indexed by
            the model's argmax output).
        img_size: Target size passed to cv2.resize for the face crop,
            e.g. (100, 100).
        emotion_frames: Length of the prediction queue and the number of
            consecutive frames the majority emotion must hold before it is
            displayed as the detected emotion.

    Returns:
        Tuple (emotion_summary, movement_analysis, tone_prediction,
        nlp_prediction):
            emotion_summary: dict mapping each emotion label to the share of
                face predictions as an "xx.xx%" string, or
                {"message": "No emotions were detected."} if no face was
                classified.
            movement_analysis: dict of formatted movement values taken from
                the last processed pair of pose frames, plus the time of the
                maximum overall movement.
            tone_prediction: return value of voice_prediction.
            nlp_prediction: return value of classify_audio_emotion.
    """
    # ------------------------------------------------------------------
    # Face expression detection state
    # ------------------------------------------------------------------
    detector = dlib.get_frontal_face_detector()       # Dlib's frontal face detector
    model = load_model(model_metadata["model_path"])  # Pre-trained expression model
    emotions = model_metadata["emotion_labels"]       # Emotion classes of the model

    # Per-label counters: used to summarise all raw predictions independently
    # of the stabilisation logic below
    emotion_count = {label: 0 for label in emotions}
    emotion_count_total = 0

    # Stabilisation: the displayed emotion only changes once the majority label
    # of the last 'emotion_frames' predictions has been stable long enough
    emotion_queue = []       # FIFO buffer of the most recent predicted labels
    emotion_prev = None      # Majority label of the previous iteration
    stable_frames = 0        # Number of iterations the majority label was unchanged
    detected_emotion = None  # Last stabilised emotion (shown on screen)

    # Face rectangle colour; OpenCV uses BGR order, so this is blue
    rect_color = (255, 0, 0)

    # ------------------------------------------------------------------
    # Body movement detection state
    # ------------------------------------------------------------------
    mp_pose = mp.solutions.pose
    pose = mp_pose.Pose()
    mp_drawing = mp.solutions.drawing_utils

    landmarks_prev = None           # Landmarks of the previous frame
    movement_data = {}              # Latest movement observation
    start_time_body = time.time()   # Reference for observation timestamps
    max_time = 0                    # Time of the maximum overall movement
    max_movement = 0                # Maximum overall movement score observed

    # Landmark pairs summed into one score per body part
    body_parts = {
        'Head': ('Left Eye', 'Right Eye'),
        'Shoulders': ('Left Shoulder', 'Right Shoulder'),
        'Hands': ('Left Wrist', 'Right Wrist'),
        'Elbows': ('Left Elbow', 'Right Elbow'),
        'Hips': ('Left Hip', 'Right Hip'),
        'Knees': ('Left Knee', 'Right Knee'),
        'Ankles': ('Left Ankle', 'Right Ankle'),
    }

    # ------------------------------------------------------------------
    # Audio recording state
    # ------------------------------------------------------------------
    sample_rate = 44100                    # Sample rate in Hz
    audio_recording = []                   # Recorded audio chunks
    audio_path = "data/audio/recording.wav"

    # Terminal start-up prints
    print("Starting audio recording...")
    print("Video started: Press 'Q' to quit.")

    # Start audio recording & webcam capture
    with sd.InputStream(samplerate=sample_rate, channels=1, dtype='float32') as stream:
        cap = cv2.VideoCapture(0)
        try:
            # Guide text shown during the first 10 seconds, centred at the bottom.
            # NOTE: the guide mentions 'T'/'F' feedback keys, but only the quit
            # key is handled in this function.
            start_time = timeit.default_timer()
            text, font, scale, thickness = "Press 'T' if the detected emotion is correct and 'F' if incorrect", cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1
            text_size = cv2.getTextSize(text, font, scale, thickness)[0]
            text_pos = (
                (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - text_size[0]) // 2,
                int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - 20,
            )

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # Record ~0.1 s of audio per frame. The overflow flag only reports
                # that input was lost *before* this read; the returned chunk itself
                # is valid, so it is always kept to avoid additional gaps.
                audio_chunk, _overflowed = stream.read(int(sample_rate * 0.1))
                audio_recording.append(audio_chunk)

                # Show the user guide for the first 10 seconds only
                if start_time is not None:
                    if timeit.default_timer() - start_time < 10:
                        cv2.putText(frame, text, text_pos, font, scale, (180, 190, 180), thickness)
                    else:
                        start_time = None

                # --------------------------------------------------------
                # Face expression detection
                # --------------------------------------------------------
                gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                frame_h, frame_w = gray_frame.shape[:2]

                for face in detector(gray_frame):
                    x, y, w, h = face.left(), face.top(), face.width(), face.height()

                    # Detector boxes can extend past the frame edge; clamp the crop
                    # so slicing never yields an empty region (cv2.resize would fail)
                    x0, y0 = max(x, 0), max(y, 0)
                    x1, y1 = min(x + w, frame_w), min(y + h, frame_h)
                    if x1 <= x0 or y1 <= y0:
                        continue

                    # Prepare the face crop for the model: resize, scale to [0, 1],
                    # add channel and batch dimensions
                    face_region = gray_frame[y0:y1, x0:x1]
                    face_resized = cv2.resize(face_region, img_size)
                    face_normalized = face_resized / 255.0
                    face_input = np.expand_dims(np.expand_dims(face_normalized, axis=-1), axis=0)

                    # Predict and map the most probable class to its label
                    prediction = model.predict(face_input, verbose=0)
                    prediction_label = emotions[np.argmax(prediction)]

                    # Keep only the last 'emotion_frames' predictions
                    emotion_queue.append(prediction_label)
                    if len(emotion_queue) > emotion_frames:
                        emotion_queue.pop(0)

                    # Current emotion = most frequent label in the queue
                    emotion_curr = max(set(emotion_queue), key=emotion_queue.count)

                    # Count how long the majority label has stayed the same
                    if emotion_curr == emotion_prev:
                        stable_frames += 1
                    else:
                        emotion_prev = emotion_curr
                        stable_frames = 1

                    # Promote the label once it has been stable long enough
                    if stable_frames >= emotion_frames:
                        detected_emotion = emotion_curr

                    # Display the stabilised emotion next to the "Detect emotion:" label
                    if detected_emotion:
                        cv2.putText(frame, detected_emotion, (265, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 255, 100), 2)

                    # Rectangle around the detected face
                    cv2.rectangle(frame, (x, y), (x + w, y + h), rect_color, 2)

                    # Update the raw prediction counters
                    emotion_count[prediction_label] += 1  # Per-label count
                    emotion_count_total += 1              # Total count

                # Display detection text
                cv2.putText(frame, "Detect emotion:", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 255, 100), 2)

                # --------------------------------------------------------
                # Body movement detection
                # --------------------------------------------------------
                image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image_pose = pose.process(image_rgb)

                if image_pose.pose_landmarks:
                    # Draw detected landmarks
                    mp_drawing.draw_landmarks(
                        frame, image_pose.pose_landmarks, mp_pose.POSE_CONNECTIONS)

                    landmarks_curr = image_pose.pose_landmarks.landmark

                    # Movement needs two consecutive landmark sets
                    if landmarks_prev is not None:
                        # Per-landmark movement between the previous and current frame
                        movement = calculate_movement(landmarks_curr, landmarks_prev)
                        movement_total = sum(movement.values())

                        # Overall movement classification
                        movement_class, _ = categorize_movement(movement_total)

                        # Left + right landmark score per body part
                        movement_parts = {
                            part: movement[left] + movement[right]
                            for part, (left, right) in body_parts.items()
                        }

                        # Most moved body part
                        movement_max = max(movement_parts, key=movement_parts.get)

                        # Track when the highest overall movement was observed
                        current_time = time.time() - start_time_body
                        if movement_total > max_movement:
                            max_movement = movement_total
                            max_time = current_time

                        # Display overall movement classification
                        cv2.putText(
                            frame, f'Movement: {movement_class}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA
                        )

                        # Display time of maximum movement observed
                        cv2.putText(
                            frame, f'Max Movement at: {max_time:.2f} sec', (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA
                        )

                        # Store the latest movement observation (overwritten every frame)
                        movement_data['total_movement'] = movement_total
                        movement_data['total_movement_class'] = movement_class
                        movement_data['most_moved_part'] = movement_max
                        movement_data['most_moved_score'] = movement_parts[movement_max]
                        movement_data['individual_movements'] = movement

                    # Update previous landmarks
                    landmarks_prev = landmarks_curr

                # Display the annotated frame
                cv2.imshow('Facial Expression Detection', frame)

                # Quit on 'q' (upper or lower case, matching the terminal hint)
                key = cv2.waitKey(1) & 0xFF
                if key in (ord('q'), ord('Q')):
                    break
        finally:
            # Always free the camera, windows and pose graph, even on errors
            cap.release()
            cv2.destroyAllWindows()
            pose.close()

    # ------------------------------------------------------------------
    # Audio recording post-processing (microphone stream is closed here)
    # ------------------------------------------------------------------
    if audio_recording:
        audio_data = np.concatenate(audio_recording, axis=0)
        os.makedirs(os.path.dirname(audio_path), exist_ok=True)  # Ensure target folder exists
        sf.write(audio_path, audio_data, sample_rate)
        print(f"Audio saved as {audio_path}")
    else:
        # NOTE: the audio models below still read 'audio_path'; if a file from an
        # earlier session exists there, it is what gets analysed.
        print("No audio recorded.")

    # ------------------------------------------------------------------
    # Face expression detection post-processing
    # ------------------------------------------------------------------
    if emotion_count_total > 0:
        # Percentage of all face predictions per emotion label
        emotion_summary = {
            emotion: f"{(count / emotion_count_total) * 100:.2f}%"
            for emotion, count in emotion_count.items()
        }
    else:
        emotion_summary = {"message": "No emotions were detected."}

    # ------------------------------------------------------------------
    # Body movement detection post-processing
    # ------------------------------------------------------------------
    movement_analysis = {
        "overall_movement_score": f"{movement_data.get('total_movement', 0):.2f}",
        "overall_movement_classification": movement_data.get('total_movement_class', 'N/A'),
        "max_time": f"{max_time:.2f} sec",
        "most_moved_part": movement_data.get('most_moved_part', 'N/A'),
        "most_moved_part_score": f"{movement_data.get('most_moved_score', 0):.2f}",
    }
    # Individual landmark scores, formatted with two decimals
    movement_analysis["individual_movements"] = {
        part: f"{score:.2f}"
        for part, score in movement_data.get('individual_movements', {}).items()
    }

    # ------------------------------------------------------------------
    # Voice tone detection on the saved audio recording
    # ------------------------------------------------------------------
    tone_prediction = voice_prediction(audio_path)

    # ------------------------------------------------------------------
    # NLP sentiment analysis (speech-to-text) on the saved audio recording
    # ------------------------------------------------------------------
    nlp_prediction = classify_audio_emotion(
        audio_file_path=audio_path,
        whisper_model=whisper_model,
        emotion_model=emotion_model,
        tokenizer=tokenizer,
        stop_words=stop_words_nltk,
        max_len=178,
        threshold=0.5
    )

    # Return multimodal detections
    return emotion_summary, movement_analysis, tone_prediction, nlp_prediction


In [5]:
# Run one capture session: faces are resized to 100x100 and the displayed
# emotion is stabilised over 15 frames
(
    emotion_summary,
    movement_analysis,
    tone_prediction,
    nlp_prediction,
) = emotion_detection(model_metadata, img_size=(100, 100), emotion_frames=15)

Starting audio recording...
Video started: Press 'Q' to quit.
Audio saved as data/audio/recording.wav


## Evaluation

### Evaluating 'happiness'

In [6]:
# Facial expression: share of all face predictions assigned to each emotion label
emotion_summary

{'Surprise': '0.00%',
 'Fear': '0.00%',
 'Disgust': '0.00%',
 'Happiness': '100.00%',
 'Sadness': '0.00%',
 'Anger': '0.00%',
 'Neutral': '0.00%'}

In [7]:
# Movement: body movement summary (overall score and class, time of maximum movement, most moved part, per-landmark scores)
movement_analysis

{'overall_movement_score': '0.05',
 'overall_movement_classification': 'Low',
 'max_time': '4.76 sec',
 'most_moved_part': 'Elbows',
 'most_moved_part_score': '0.03',
 'individual_movements': {'Left Eye': '0.00',
  'Right Eye': '0.00',
  'Left Shoulder': '0.00',
  'Right Shoulder': '0.00',
  'Left Elbow': '0.01',
  'Right Elbow': '0.02',
  'Left Wrist': '0.00',
  'Right Wrist': '0.00',
  'Left Hip': '0.00',
  'Right Hip': '0.00',
  'Left Knee': '0.00',
  'Right Knee': '0.00',
  'Left Ankle': '0.00',
  'Right Ankle': '0.00'}}

In [8]:
# Voice tone: result returned by voice_prediction for the saved recording
tone_prediction

{'angry': '0.00',
 'disgust': '0.75',
 'fear': '1.79',
 'happy': '0.04',
 'neutral': '89.75',
 'sad': '7.19',
 'surprise': '0.48',
 'predicted_emotion': 'neutral'}

In [9]:
# NLP sentiment: result returned by classify_audio_emotion for the saved recording
nlp_prediction

{'transcribed_text': 'I am very happy about the good weather today.',
 'predicted_probabilities': {'disgust': '0.00',
  'fear': '0.00',
  'anger': '0.00',
  'joy': '100.00',
  'sadness': '0.00',
  'surprise': '0.00'},
 'predicted_emotion': 'joy'}

### Evaluating 'sadness'

In [10]:
# Run a second capture session with the same settings: faces are resized to
# 100x100 and the displayed emotion is stabilised over 15 frames
(
    emotion_summary,
    movement_analysis,
    tone_prediction,
    nlp_prediction,
) = emotion_detection(model_metadata, img_size=(100, 100), emotion_frames=15)

Starting audio recording...
Video started: Press 'Q' to quit.
Audio saved as data/audio/recording.wav


In [11]:
# Facial expression: share of all face predictions assigned to each emotion label
emotion_summary

{'Surprise': '0.00%',
 'Fear': '0.00%',
 'Disgust': '0.00%',
 'Happiness': '0.00%',
 'Sadness': '82.27%',
 'Anger': '0.00%',
 'Neutral': '17.73%'}

In [12]:
# Movement: body movement summary (overall score and class, time of maximum movement, most moved part, per-landmark scores)
movement_analysis

{'overall_movement_score': '0.08',
 'overall_movement_classification': 'Low',
 'max_time': '17.34 sec',
 'most_moved_part': 'Ankles',
 'most_moved_part_score': '0.03',
 'individual_movements': {'Left Eye': '0.00',
  'Right Eye': '0.00',
  'Left Shoulder': '0.00',
  'Right Shoulder': '0.00',
  'Left Elbow': '0.00',
  'Right Elbow': '0.01',
  'Left Wrist': '0.00',
  'Right Wrist': '0.00',
  'Left Hip': '0.00',
  'Right Hip': '0.01',
  'Left Knee': '0.01',
  'Right Knee': '0.01',
  'Left Ankle': '0.01',
  'Right Ankle': '0.01'}}

In [13]:
# Voice tone: result returned by voice_prediction for the saved recording
tone_prediction

{'angry': '0.00',
 'disgust': '0.00',
 'fear': '0.05',
 'happy': '0.02',
 'neutral': '97.64',
 'sad': '2.04',
 'surprise': '0.26',
 'predicted_emotion': 'neutral'}

In [14]:
# NLP sentiment: result returned by classify_audio_emotion for the saved recording
nlp_prediction

{'transcribed_text': "I'm really sad about the battles that came today.",
 'predicted_probabilities': {'disgust': '0.00',
  'fear': '0.00',
  'anger': '0.00',
  'joy': '0.00',
  'sadness': '100.00',
  'surprise': '0.00'},
 'predicted_emotion': 'sadness'}

## Conclusion

The multimodal emotion detection pipeline extracts information from video and audio recordings. It effectively captures a variety of complementary signals about the emotional state of the patient: facial expression, body movement, voice tone, and the sentiment of what is said. These observations can provide valuable insight when conducting a psychiatric assessment, or help classify critical sections of an interview.