In [98]:
# Imports, ignore warnings
import warnings
warnings.filterwarnings('ignore')
import os
from tensorflow import keras
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from keras.models import model_from_json
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [99]:
# Read in the JSON file
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()

In [100]:
# Load the model from JSON 
loaded_model = model_from_json(loaded_model_json)

In [101]:
# Load weights into new model
loaded_model.load_weights('saved_models/Emotion_Voice_Detection_Model.h5')
print('Loaded model from disk')

Loaded model from disk


In [102]:
o = keras.optimizers.RMSprop(lr = 0.00001, decay = 1e-6)
loaded_model.compile(loss ='categorical_crossentropy', optimizer = o, metrics = ['accuracy'])

In [125]:
# Method for reading in the audio files and extracting features
"""
- d is the directory the audio files, default is the current working directory.
- dur is the duration in seconds that will be read in.
- For this CNN to work, dur must be 2.5
"""

def readAudioFiles(d, dur, sample_rate):
    if d is None:
        d = 'dir'
        
    df = pd.DataFrame(columns=['feature'])
    file_names = []
    i = 0
    for audiofile in os.listdir(d):
        # Load file using librosa
        print(audiofile, "loaded")
        file_names.append(audiofile)
        X, sr = librosa.load(os.path.join(d, audiofile), res_type = 'kaiser_fast', duration = dur , sr = sample_rate, offset = 0.5)
        sr = np.array(sr)
        # Extract the MFCCS
        mfccs = np.mean(librosa.feature.mfcc(y = X, 
                                            sr = sr, 
                                            n_mfcc = 13),
                        axis=0)
        feature = mfccs
        # Add to data frame
        df.loc[i] = [feature]
        i += 1
    df = pd.DataFrame(df['feature'].values.tolist())
    df = shuffle(df)
    df = df.fillna(0)
    return df, file_names 

In [116]:
audio_features, file_names = readAudioFiles(d = 'the-office-audio-clips', dur = 2.5, sample_rate = 44100)    

daffyduck.wav loaded
deranged.wav loaded
weapons.wav loaded
gamble.wav loaded




cage.wav loaded
punish.wav loaded
shesaid.wav loaded
smile.wav loaded
wild.wav loaded




In [117]:
audio_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
4,3.229875,-0.25044,-3.86676,-1.847083,-1.318632,-3.685853,-2.19231,-0.730599,-2.023446,-5.832206,...,-13.898515,-12.222639,-13.219418,-14.195075,-11.62548,-12.519917,-13.353108,-11.850022,-11.955689,-13.36469
5,-1.823485,-2.858779,-2.578187,-3.18072,-3.942394,-3.32116,-4.217803,-4.44454,-4.239151,-6.173795,...,-9.308763,-4.971852,-0.290382,1.64985,1.364283,1.380545,2.323864,0.95313,-0.293862,-3.324758
0,1.501406,0.817227,-0.033943,-0.573337,-0.992338,-1.820299,-0.825532,-1.126159,-2.236412,-1.79086,...,1.436221,2.606878,2.107405,2.170962,2.021762,1.636188,1.21244,0.841189,0.434088,0.131884
7,-2.409396,-2.300485,-2.479992,0.562576,1.272716,1.013227,0.298986,0.323682,-0.840398,-0.688106,...,-7.117177,-7.068227,-4.295846,-1.048497,0.295969,0.091689,-0.28353,-0.673306,-0.760514,1.282261
2,-4.842369,-6.932052,-10.699755,-11.399866,-12.337184,-12.564977,-13.07259,-11.631197,-11.871105,-12.651745,...,-9.536854,-12.701858,-13.665248,-11.29546,-13.528756,-13.409475,-12.887518,-15.160089,-15.461849,-14.582693


In [118]:
audio_features_cnn = np.expand_dims(audio_features, axis = 2)
audio_features_cnn

array([[[ 3.22987485e+00],
        [-2.50439703e-01],
        [-3.86676025e+00],
        ...,
        [-1.18500223e+01],
        [-1.19556894e+01],
        [-1.33646898e+01]],

       [[-1.82348454e+00],
        [-2.85877872e+00],
        [-2.57818699e+00],
        ...,
        [ 9.53130186e-01],
        [-2.93862075e-01],
        [-3.32475758e+00]],

       [[ 1.50140584e+00],
        [ 8.17227423e-01],
        [-3.39427739e-02],
        ...,
        [ 8.41189027e-01],
        [ 4.34088051e-01],
        [ 1.31884351e-01]],

       ...,

       [[-1.07034435e+01],
        [-8.51020718e+00],
        [-5.30014801e+00],
        ...,
        [-3.21832037e+00],
        [-1.79332304e+00],
        [-7.45307468e-03]],

       [[-4.59554720e+00],
        [-1.88232219e+00],
        [-1.98679745e+00],
        ...,
        [-9.92149353e+00],
        [-8.15657139e+00],
        [-8.68582344e+00]],

       [[-9.50154686e+00],
        [-9.01928425e+00],
        [-8.81004429e+00],
        ...,
        

In [119]:
preds = loaded_model.predict(audio_features_cnn, 
                             batch_size = 32, 
                             verbose = 1)



In [120]:
preds = preds.argmax(axis = 1)

In [142]:
emotions = {
    
    0: 'female_angry', 
    1: 'female_calm',
    2 : 'female_fearful',
    3 : 'female_happy',
    4 : 'female_sad',
    5 : 'male_angry',
    6 : 'male_calm',
    7 : 'male_fearful',
    8 :'male_happy',
    9 : 'male_sad'
}

def inverseTransform(preds, emotion_dict):
    decoded = []
    preds = preds.tolist()
    for i in range(9):
        key = preds[i]
        filename = file_names[i]
        val = emotion_dict[key]
        print('file name:', filename, '/', 'CNN prediction:', key, '/', 'predicted emotion:', val)
        decoded.append(val) 
    return filename, key, val
        

In [143]:
pred_emo = inverseTransform(preds, emotions)

file name: daffyduck.wav / CNN prediction: 7 / predicted emotion: male_fearful
file name: deranged.wav / CNN prediction: 8 / predicted emotion: male_happy
file name: weapons.wav / CNN prediction: 8 / predicted emotion: male_happy
file name: gamble.wav / CNN prediction: 5 / predicted emotion: male_angry
file name: cage.wav / CNN prediction: 5 / predicted emotion: male_angry
file name: punish.wav / CNN prediction: 8 / predicted emotion: male_happy
file name: shesaid.wav / CNN prediction: 8 / predicted emotion: male_happy
file name: smile.wav / CNN prediction: 7 / predicted emotion: male_fearful
file name: wild.wav / CNN prediction: 9 / predicted emotion: male_sad
