# Demo Emotion Recognition: audio + video

## Libraries and parameters

In [2]:
# Utilities
import os
import subprocess
import numpy as np
import keras

# Audio and video manipulation
import moviepy.editor as mp
import cv2
import librosa
from joblib import load
import tkinter as tk

In [3]:
# Labels dictionary
# emotions_tras: transposition table from a clip's emotion code to the class
# index used by `emotions`. The notebook indexes it with
# (emotion field of the filename - 1), so a filename code of 1 has no entry.
emotions_tras = {1:1, 2:4, 3:5, 4:0, 5:3, 6:2, 7:6}
# emotions: class index (position in the model output vector) -> readable name.
emotions = {0:'angry', 1:'calm', 2:'disgust', 3:'fear', 4:'happy', 5:'sad', 6:'surprise'}

# Paths
dataset_path = "Examples/"
haar_path = 'haarcascade_frontalface_default.xml'
# Forward slashes: the previous backslash form relied on invalid escape
# sequences ('\D', '\s') and only resolved on Windows.
parameters_path = "Digital_signal/Dataset/std_scaler.bin"
models_video_path = "Models/Video_stream/"
models_audio_path = "Models/Audio_stream/"
vlc_path = "C:/Program Files/VideoLAN/VLC/vlc.exe" # to play the selected video (insert your own path to vlc.exe)

# Audio video parameters
height_targ = 112   # target face-crop height in pixels
width_targ = 112    # target face-crop width in pixels
sr = 48000          # audio sampling rate (Hz) used for resampling and the mel spectrogram

## Select Clip

In [4]:
# Small Tk dialog asking which example clip to analyse.
# On success the global `example` holds the chosen index as an int.
root = tk.Tk()

canvas1 = tk.Canvas(root, width=400, height=300, relief='raised')
canvas1.pack()

label1 = tk.Label(root, text='Select clip to analize')
label1.config(font=('helvetica', 16))
canvas1.create_window(200, 25, window=label1)

label2 = tk.Label(root, text='Number from 0 to 3:')
label2.config(font=('helvetica', 11))
canvas1.create_window(200, 100, window=label2)

# Result of the dialog. Stays None until a valid number is submitted, so a
# window that was simply closed can be told apart from a real selection.
example = None


def display_text():
    """Store the number typed in the entry box into the global `example`.

    The window is closed only when the entry contains an integer. On invalid
    input the prompt label is updated and the dialog stays open for a retry.
    """
    global example
    try:
        example = int(example_entry.get())
    except ValueError:
        label2.config(text='Please enter a whole number from 0 to 3:')
        return
    root.destroy()


# The widget has its own name so that `example` is never both an Entry and an int.
example_entry = tk.Entry(root)
example_entry.pack()
canvas1.create_window(200, 140, window=example_entry)

button1 = tk.Button(root, text='Select', command=display_text, font=('helvetica', 12, 'bold'))
canvas1.create_window(200, 180, window=button1)

root.mainloop()

# Fail here with a clear message instead of later with an obscure indexing error.
if example is None:
    raise RuntimeError('No clip was selected: re-run this cell and press "Select".')

In [5]:
# os.listdir returns entries in arbitrary order; sort them so that a given
# index always selects the same clip on every machine and every run.
fn = sorted(os.listdir(dataset_path))
if not 0 <= example < len(fn):
    raise IndexError(
        'Clip index %d is out of range: %r contains %d file(s).' % (example, dataset_path, len(fn))
    )
filename = os.path.join(dataset_path, fn[example])
# The third '-'-separated field of the file name is the emotion code;
# emotions_tras transposes (code - 1) to the class index used by the models.
emotion_code = int(fn[example].split('-')[2])
label = emotions_tras[emotion_code - 1]

In [6]:
# Play the selected clip in VLC and block until the player exits;
# `player` receives VLC's process return code.
vlc_command = [vlc_path, filename, '--play-and-exit']
player = subprocess.run(vlc_command).returncode

## Data preparation

### Video

In [7]:
# Extract grayscale, face-cropped frames from the selected clip.
cap = cv2.VideoCapture(filename)
haar_cascade = cv2.CascadeClassifier(haar_path)
frames = []
count = 0
skip = 3  # sample one frame every `skip` frames once past the first 20

try:
    # Loop through all frames
    while True:
        # Capture frame
        ret, frame = cap.read()
        if not ret:
            # End of stream (or unreadable file): nothing more to decode.
            break
        if count % skip == 0 and count > 20:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # detect and crop face
            faces = haar_cascade.detectMultiScale(frame, scaleFactor=1.12, minNeighbors=9)
            if len(faces) != 1:
                # NOTE(review): `continue` skips the `count += 1` below, so the
                # very next frame is tried instead of waiting `skip` frames.
                # Kept exactly as in the original; confirm it matches the
                # preprocessing used when the model was trained.
                continue
            x, y, w, h = faces[0]
            face = frame[y:y + h, x:x + w]

            # cv2.resize expects dsize as (width, height). Resize 10 px larger
            # than the target, then trim a 5 px border on every side.
            face = cv2.resize(face, (width_targ + 10, height_targ + 10))
            face = face[5:-5, 5:-5]
            face = face/255.  # scale pixel values to [0, 1]
            frames.append(face)
        count += 1
finally:
    # Always free the decoder / file handle, even if processing fails.
    cap.release()

frames = np.array(frames)
num_frames = len(frames)
labels = [label] * num_frames
print('shape frames:', frames.shape)

shape frames: (34, 112, 112)


### Audio

In [8]:
# Decode the clip's audio track resampled to `sr`.
audiofile = mp.AudioFileClip(filename).set_fps(sr)
try:
    audio = audiofile.to_soundarray()
finally:
    # AudioFileClip holds an ffmpeg subprocess and a file lock until closed.
    audiofile.close()

# Keep 3*sr samples starting at sample sr/2 (3 s of audio, skipping the first 0.5 s).
start = int(sr/2)
stop = int(sr/2 + sr*3)
# Keep only the first channel (column 0) of the (samples, channels) array.
audio = np.array(audio[start:stop, 0])

In [9]:
# Log-scaled mel spectrogram of the audio excerpt. The sampling rate and the
# upper frequency bound come from the configured `sr` (fmax = sr/2) rather
# than repeating the literal values.
mel_power = librosa.feature.melspectrogram(
    y=audio, sr=sr, n_fft=1024, n_mels=128, fmin=50, fmax=sr // 2
)
mel = librosa.power_to_db(mel_power)

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load scaler files that come from a trusted source.
scaler = load(parameters_path)
mel = scaler.transform(mel)

# Add a leading batch axis and a trailing channel axis for the model input.
mel = np.expand_dims(mel, axis=0)
mel = np.expand_dims(mel, axis=3)
print(mel.shape)


(1, 128, 282, 1)


## Load models

### Video

In [13]:
# Build the path from the configured video-model directory instead of
# repeating the directory as a literal.
model_path = os.path.join(models_video_path, 'video_model_11-06-23_11-13_[0.5855]_face.hdf5')
model_video = keras.models.load_model(model_path)


### Audio

In [14]:
# Build the path from the configured audio-model directory. The previous
# literal used backslashes ('\A', '\m' are invalid escape sequences) and only
# resolved on Windows.
model_path = os.path.join(models_audio_path, 'model3_2.h5')
model_audio = keras.models.load_model(model_path)

## Predictions

### Video

In [15]:
# Per-frame class scores from the video model, averaged over all frames
# to obtain one score vector for the whole clip.
pred = model_video.predict(frames)
pred_video = pred.mean(axis=0)
pred_video



array([6.2284679e-03, 5.1373136e-01, 3.9748411e-05, 9.5153249e-05,
       3.2962778e-01, 4.6513291e-04, 1.4981234e-01], dtype=float32)

### Audio

In [16]:
# Class scores from the audio model; averaging over the batch axis
# collapses the output to a single score vector.
pred = model_audio.predict(mel)
pred_audio = pred.mean(axis=0)
pred_audio



array([3.73856664e-01, 3.11808378e-01, 1.81318656e-01, 6.13386277e-03,
       1.07560255e-01, 1.92554798e-02, 6.66772830e-05], dtype=float32)

### Global

In [17]:
# Late fusion: element-wise mean of the two streams' score vectors.
# (The previous code summed them while the comment said "mean"; the argmax
# is unchanged, but the values now stay on the same scale as the inputs.)
pred_global = (pred_video + pred_audio) / 2

In [18]:
# Report the most likely emotion for each stream and for their fusion,
# followed by the ground-truth label taken from the file name.
for heading, scores in (
    ('Video prediction:\t', pred_video),
    ('Audio prediction:\t', pred_audio),
    ('Global prediction:\t', pred_global),
):
    print(heading, emotions[scores.argmax()])

print('Ground truth:\t\t', emotions[label])

Video prediction:	 calm
Audio prediction:	 angry
Global prediction:	 calm
Ground truth:		 calm
