In [29]:
import pandas as pd
import numpy as np
import itertools
import os
from scipy.io import wavfile
import librosa
import librosa.display

import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
# Load the per-clip vote summary and the actor demographics tables.
# Forward slashes are used on purpose: they work on every OS (including Windows)
# and avoid backslash sequences such as "\D" or "\p", which Python parses as
# (invalid) string escapes and warns about.
# The first CSV column is read as the index and then discarded in favour of a
# fresh 0..n-1 RangeIndex.
vote_df = pd.read_csv("./Data/processedResults/summaryTable.csv", index_col=0).reset_index(drop=True)
demo_df = pd.read_csv("./Data/VideoDemographics.csv")

In [51]:
# Preview the first 10 rows of the vote summary table loaded from summaryTable.csv.
vote_df.head(10)

Unnamed: 0,FileName,VoiceVote,VoiceLevel,FaceVote,FaceLevel,MultiModalVote,MultiModalLevel
0,1001_IEO_NEU_XX,N,69.1,N,92.22,N,64.78
1,1001_IEO_HAP_LO,N,71.67,H,57,H,57.38
2,1001_IEO_HAP_MD,N,67.71,H,62.62,H,56.56
3,1001_IEO_HAP_HI,H,63.5,H,68.25,H,73.2
4,1001_IEO_SAD_LO,N,73.71,N,73.5,N,74.8
5,1001_IEO_SAD_MD,N,42.12,N,82.44,N,72.33
6,1001_IEO_SAD_HI,N,60.14,A:F,30.67:51.00,N,63.5
7,1001_IEO_ANG_LO,N,59.57,D,52.6,N,78.0
8,1001_IEO_ANG_MD,A:D,37.2:58.6,A,45.5,A,44.2
9,1001_IEO_ANG_HI,D,67,A,65.56,A,63.25


In [52]:
# Derive one 0/1 indicator column per emotion from the VoiceVote column.
# VoiceVote holds single-letter codes, and ties appear as colon-separated codes
# (e.g. "A:D"), so a substring test on the letter flags every emotion in a tie.
EMOTION_CODES = {
    "A": "ANG",
    "D": "DIS",
    "F": "FEA",
    "H": "HAP",
    "N": "NEU",
    "S": "SAD",
}

for vote_code, emotion_suffix in EMOTION_CODES.items():
    # regex=False: match the letter literally instead of compiling it as a pattern.
    # na=False: a missing vote counts as "not observed" rather than propagating NaN
    # (which would otherwise end up flagged as 1 for every emotion).
    vote_df[f"ObservedEmotion{emotion_suffix}"] = (
        vote_df["VoiceVote"]
        .str.contains(vote_code, regex=False, na=False)
        .astype(int)
    )

In [53]:
# Preview the first 10 rows again to check the ObservedEmotion* indicator columns against VoiceVote.
vote_df.head(10)

Unnamed: 0,FileName,VoiceVote,VoiceLevel,FaceVote,FaceLevel,MultiModalVote,MultiModalLevel,ObservedEmotionANG,ObservedEmotionDIS,ObservedEmotionFEA,ObservedEmotionHAP,ObservedEmotionNEU,ObservedEmotionSAD
0,1001_IEO_NEU_XX,N,69.1,N,92.22,N,64.78,0,0,0,0,1,0
1,1001_IEO_HAP_LO,N,71.67,H,57,H,57.38,0,0,0,0,1,0
2,1001_IEO_HAP_MD,N,67.71,H,62.62,H,56.56,0,0,0,0,1,0
3,1001_IEO_HAP_HI,H,63.5,H,68.25,H,73.2,0,0,0,1,0,0
4,1001_IEO_SAD_LO,N,73.71,N,73.5,N,74.8,0,0,0,0,1,0
5,1001_IEO_SAD_MD,N,42.12,N,82.44,N,72.33,0,0,0,0,1,0
6,1001_IEO_SAD_HI,N,60.14,A:F,30.67:51.00,N,63.5,0,0,0,0,1,0
7,1001_IEO_ANG_LO,N,59.57,D,52.6,N,78.0,0,0,0,0,1,0
8,1001_IEO_ANG_MD,A:D,37.2:58.6,A,45.5,A,44.2,1,1,0,0,0,0
9,1001_IEO_ANG_HI,D,67,A,65.56,A,63.25,0,1,0,0,0,0


In [54]:
# Preview the first rows of the demographics table loaded from VideoDemographics.csv.
demo_df.head()

Unnamed: 0,ActorID,Age,Sex,Race,Ethnicity
0,1001,51,Male,Caucasian,Not Hispanic
1,1002,21,Female,Caucasian,Not Hispanic
2,1003,21,Female,Caucasian,Not Hispanic
3,1004,42,Female,Caucasian,Not Hispanic
4,1005,29,Male,African American,Not Hispanic


In [55]:
# Directory whose files are enumerated to build the per-file DataFrame
# (the resulting Path column points at .wav files inside it).
Crema = "./Data/AudioWAV/"

In [76]:
# Build crema_df: one row per audio file, combining
#   * fields parsed from the file name (actor, intended emotion, path),
#   * the actor's demographics (demo_df, joined on ActorID),
#   * the observed-emotion indicator columns (vote_df, joined on FileName),
#   * one-hot columns for the intended emotion.

# Only .wav entries are kept so stray files in the folder cannot break the
# file-name parsing below. Sorting makes the row order reproducible, because
# os.listdir returns entries in arbitrary order.
crema_directory_list = sorted(
    entry for entry in os.listdir(Crema) if entry.lower().endswith(".wav")
)

observed_emotion = vote_df[
    [
        "FileName",
        "ObservedEmotionANG",
        "ObservedEmotionDIS",
        "ObservedEmotionFEA",
        "ObservedEmotionHAP",
        "ObservedEmotionNEU",
        "ObservedEmotionSAD",
    ]
]

# File stems are underscore-separated, e.g. "1001_DFA_ANG_XX":
# token 0 carries the actor ID and token 2 the intended emotion.
file_records = []
for wav_name in crema_directory_list:
    stem = os.path.splitext(wav_name)[0]
    tokens = stem.split("_")
    file_records.append(
        {
            "FileName": stem,
            # The actor ID is the four trailing characters of the first token.
            "ActorID": int(tokens[0][-4:]),
            "IntendedEmotion": tokens[2],
            "Path": os.path.join(Crema, wav_name),
        }
    )

# Explicit columns/dtype keep the frame mergeable even when no .wav file was found.
files_df = pd.DataFrame(
    file_records, columns=["FileName", "ActorID", "IntendedEmotion", "Path"]
).astype({"ActorID": "int64"})

# Inner joins (the pandas default, spelled out here): files without a
# demographics row or without a vote row are dropped from the result.
files_merged = (
    files_df
    .merge(demo_df, on="ActorID", how="inner")
    .merge(observed_emotion, on="FileName", how="inner")
)

# One-hot encode IntendedEmotion. dtype=int keeps these columns as 0/1 integers,
# matching the ObservedEmotion* columns (newer pandas would otherwise emit booleans).
intended_dummies = pd.get_dummies(
    files_merged["IntendedEmotion"],
    prefix="IntendedEmotion",
    prefix_sep="",
    dtype=int,
)
crema_df = pd.concat([files_merged, intended_dummies], axis=1)

crema_df.head()

Unnamed: 0,FileName,ActorID,IntendedEmotion,Path,Age,Sex,Race,Ethnicity,ObservedEmotionANG,ObservedEmotionDIS,ObservedEmotionFEA,ObservedEmotionHAP,ObservedEmotionNEU,ObservedEmotionSAD,IntendedEmotionANG,IntendedEmotionDIS,IntendedEmotionFEA,IntendedEmotionHAP,IntendedEmotionNEU,IntendedEmotionSAD
0,1001_DFA_ANG_XX,1001,ANG,./Data/AudioWAV/1001_DFA_ANG_XX.wav,51,Male,Caucasian,Not Hispanic,1,0,0,0,0,0,1,0,0,0,0,0
1,1001_DFA_DIS_XX,1001,DIS,./Data/AudioWAV/1001_DFA_DIS_XX.wav,51,Male,Caucasian,Not Hispanic,0,1,0,0,0,0,0,1,0,0,0,0
2,1001_DFA_FEA_XX,1001,FEA,./Data/AudioWAV/1001_DFA_FEA_XX.wav,51,Male,Caucasian,Not Hispanic,0,0,1,0,0,0,0,0,1,0,0,0
3,1001_DFA_HAP_XX,1001,HAP,./Data/AudioWAV/1001_DFA_HAP_XX.wav,51,Male,Caucasian,Not Hispanic,0,0,0,0,1,0,0,0,0,1,0,0
4,1001_DFA_NEU_XX,1001,NEU,./Data/AudioWAV/1001_DFA_NEU_XX.wav,51,Male,Caucasian,Not Hispanic,0,0,0,0,1,0,0,0,0,0,1,0


SVM: Audio signal preprocessing consists of the following
steps [24]:
applying a pre-emphasis filter, which amplifies
higher frequencies; splitting the audio signal into shorter windows
(also known as framing); and then multiplying each
window by a Hamming window function so that the start and
end of each window match up.

Finally, the Discrete
Fourier Transform is applied so that the output is a
frequency-domain representation of the audio signal. Once signal processing
is done, we perform audio feature extraction by extracting time
and frequency features from the respective domains of each
window. Time-domain features are extracted directly from the
audio signal inputs and include the following: fundamental
frequency, energy, zero-crossing rate, and linear predictive
coding. Frequency-domain features extracted include the
Mel-Frequency Cepstral Coefficients (MFCCs).