In [1]:
import librosa
import os
import pandas as pd

# Data Processing

In [2]:
SAVEE = './dataset/ALL/'
dir_list = os.listdir(SAVEE)

In [3]:
emotion=[]
path = []
for i in dir_list:
    if i[-8:-6]=='_a':
        emotion.append('angry')
    elif i[-8:-6]=='_d':
        emotion.append('disgust')
    elif i[-8:-6]=='_f':
        emotion.append('fear')
    elif i[-8:-6]=='_h':
        emotion.append('happy')
    elif i[-8:-6]=='_n':
        emotion.append('neutral')
    elif i[-8:-6]=='sa':
        emotion.append('sad')
    elif i[-8:-6]=='su':
        emotion.append('surprise')
    else:
        emotion.append('unknown') 
    path.append(SAVEE + i)

In [4]:
SAVEE_df = pd.DataFrame(emotion, columns = ['labels'])
SAVEE_df = pd.concat([SAVEE_df, pd.DataFrame(path, columns = ['path'])], axis = 1)
print('SAVEE dataset')
SAVEE_df.head()

SAVEE dataset


Unnamed: 0,labels,path
0,angry,./dataset/ALL/DC_a01.wav
1,angry,./dataset/ALL/DC_a02.wav
2,angry,./dataset/ALL/DC_a03.wav
3,angry,./dataset/ALL/DC_a04.wav
4,angry,./dataset/ALL/DC_a05.wav


In [5]:
TESS = './dataset/TESS Toronto emotional speech set data/'

In [6]:
path = []
emotion = []
dir_list = os.listdir(TESS)

for i in dir_list:
    fname = os.listdir(TESS + i)   
    for f in fname:
        if i == 'OAF_angry' or i == 'YAF_angry':
            emotion.append('angry')
        elif i == 'OAF_disgust' or i == 'YAF_disgust':
            emotion.append('disgust')
        elif i == 'OAF_Fear' or i == 'YAF_fear':
            emotion.append('fear')
        elif i == 'OAF_happy' or i == 'YAF_happy':
            emotion.append('happy')
        elif i == 'OAF_neutral' or i == 'YAF_neutral':
            emotion.append('neutral')                                
        elif i == 'OAF_Pleasant_surprise' or i == 'YAF_pleasant_surprised':
            emotion.append('surprise')               
        elif i == 'OAF_Sad' or i == 'YAF_sad':
            emotion.append('sad')
        else:
            emotion.append('Unknown')
        path.append(TESS + i + "/" + f)

In [7]:
TESS_df = pd.DataFrame(emotion, columns = ['labels'])
#TESS_df['source'] = 'TESS'
TESS_df = pd.concat([TESS_df,pd.DataFrame(path, columns = ['path'])],axis=1)
print('TESS dataset')
TESS_df.head()

TESS dataset


Unnamed: 0,labels,path
0,angry,./dataset/TESS Toronto emotional speech set da...
1,angry,./dataset/TESS Toronto emotional speech set da...
2,angry,./dataset/TESS Toronto emotional speech set da...
3,angry,./dataset/TESS Toronto emotional speech set da...
4,angry,./dataset/TESS Toronto emotional speech set da...


In [8]:
# Now lets merge all the dataframe
Males = pd.concat([SAVEE_df], axis = 0)
Males.to_csv("males_emotions_df.csv", index = False)

Females = pd.concat([TESS_df], axis = 0)
Females.to_csv("females_emotions_df.csv", index = False)

In [9]:
comb = [Males,Females]
Dataset=pd.concat(comb)
Dataset=Dataset.sort_values(by=['labels'])
Dataset.to_csv("Dataset_df.csv", index = False)

# Feature Extraction

In [10]:
import numpy as np
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical 
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
from sklearn.model_selection import train_test_split

In [11]:
def feature_extractor(file):
    data,sample_rate=librosa.load(file,res_type="kaiser_fast")
    
    #MFCC 
    mfccs_features = librosa.feature.mfcc(y=data,sr=sample_rate,n_mfcc=40)
    mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)

    return mfccs_scaled_features

In [12]:
extracted_features=[]
for index_num,row in tqdm(Dataset.iterrows()):
    file_name = os.path.join(str(row["path"]))
    final_class_labels=row['labels']
    data=feature_extractor(file_name)
    extracted_features.append([data,final_class_labels])

3280it [01:26, 37.97it/s]


In [13]:
extracted_features_df = pd.DataFrame(extracted_features,columns=['feature','class'])
extracted_features_df.head()
print(extracted_features_df.iloc[0,0])

[-3.18135529e+02  1.12588829e+02  1.23745937e+01  3.45954971e+01
  1.16200695e+01 -2.91651773e+00 -2.48573990e+01 -2.59529781e+00
 -1.00110602e+00 -1.15826321e+01 -2.48748541e+00 -9.69559789e-01
 -9.28020418e-01 -1.63937509e+00  5.17986345e+00  6.59545600e-01
 -3.40372515e+00  5.59151697e+00 -9.27957833e-01 -5.73660564e+00
  5.74505851e-02  5.40333331e-01  1.78466511e+00 -2.08028388e+00
 -1.25319338e+00 -3.53276229e+00 -4.19973946e+00 -1.58740610e-01
 -2.00895476e+00  5.86389303e-01  1.40789413e+00  1.71946561e+00
  3.23082638e+00  2.73758245e+00  3.46699786e+00  5.05381870e+00
  4.83345985e+00  5.60282660e+00  4.30647087e+00  3.39099669e+00]


In [14]:
#split in indep and dep dataset
X = np.array(extracted_features_df['feature'].tolist())
y = np.array(extracted_features_df['class'].tolist())

In [15]:
y=to_categorical(labelencoder.fit_transform(y))

In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

# Model

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [18]:
num_labels = y.shape[1]

In [23]:
model=Sequential()
#first layer
model.add(Dense(100,input_shape=(40,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
#second layer
model.add(Dense(200))
model.add(Activation('relu'))
model.add(Dropout(0.5))
#third layer
model.add(Dense(100))
model.add(Activation('relu'))
model.add(Dropout(0.5))

#final layer
model.add(Dense(num_labels))
model.add(Activation('softmax'))

In [24]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 100)               4100      
                                                                 
 activation_4 (Activation)   (None, 100)               0         
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_5 (Dense)             (None, 200)               20200     
                                                                 
 activation_5 (Activation)   (None, 200)               0         
                                                                 
 dropout_4 (Dropout)         (None, 200)               0         
                                                                 
 dense_6 (Dense)             (None, 100)              

In [25]:
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

# Train Model

In [26]:
#Train the model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs=100
num_batch_size=32

checkpointer = ModelCheckpoint(filepath='audio_classification.hdf5',verbose=1,save_best_only=True)
start = datetime.now()

model.fit(X_train,y_train,batch_size = num_batch_size,epochs=num_epochs,validation_data=(X_test,y_test),callbacks=[checkpointer])

duration = datetime.now()-start
print(duration)

Epoch 1/100
Epoch 1: val_loss improved from inf to 1.99515, saving model to audio_classification.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 1.99515 to 1.94748, saving model to audio_classification.hdf5
Epoch 3/100
Epoch 3: val_loss did not improve from 1.94748
Epoch 4/100
Epoch 4: val_loss did not improve from 1.94748
Epoch 5/100
Epoch 5: val_loss did not improve from 1.94748
Epoch 6/100
Epoch 6: val_loss did not improve from 1.94748
Epoch 7/100
Epoch 7: val_loss did not improve from 1.94748
Epoch 8/100
Epoch 8: val_loss did not improve from 1.94748
Epoch 9/100
Epoch 9: val_loss improved from 1.94748 to 1.94034, saving model to audio_classification.hdf5
Epoch 10/100
Epoch 10: val_loss improved from 1.94034 to 1.92262, saving model to audio_classification.hdf5
Epoch 11/100
Epoch 11: val_loss improved from 1.92262 to 1.84691, saving model to audio_classification.hdf5
Epoch 12/100
Epoch 12: val_loss improved from 1.84691 to 1.77576, saving model to audio_classification.hdf5
Epoch 13

In [27]:
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(round(test_accuracy[1]*100),'%')

91 %


In [28]:
def predictEmotion(filename):
        audio, sample_rate = librosa.load(filename, res_type='kaiser_fast') 
        mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)

        #print(mfccs_scaled_features)
        mfccs_scaled_features=mfccs_scaled_features.reshape(1,-1)
        #print(mfccs_scaled_features)
        #print(mfccs_scaled_features.shape)
        predicted_label=model.predict(mfccs_scaled_features)
        predicted_label=np.argmax(predicted_label,axis=-1)
        print(predicted_label)
        prediction_class = labelencoder.inverse_transform(predicted_label) 
        print(prediction_class)

# Prediction for samples in dataset

In [29]:
predictEmotion('YAF_bite_ps.wav')# surpise file (female)

[6]
['surprise']


In [66]:
predictEmotion('DC_a01.wav')#angry file (Male)

[3]
['happy']


In [67]:
predictEmotion('OAF_cab_sad.wav')#sad file (female)

[1]
['disgust']


In [68]:
predictEmotion('YAF_rose_happy.wav')#happy file (female)

[3]
['happy']


In [69]:
predictEmotion('DC_n24.wav')#neutral (Male)

[4]
['neutral']


# Prediction for samples not in dataset

In [39]:
predictEmotion('03-01-08-01-01-01-24.wav')# surprised

[3]
['happy']


In [40]:
predictEmotion('03-01-08-01-01-01-24.wav')# fearful

[3]
['happy']


In [41]:
predictEmotion('03-01-01-01-01-01-10.wav')#neutral

[1]
['disgust']


In [42]:
predictEmotion('03-01-05-01-01-01-05.wav')#angry

[3]
['happy']


In [43]:
predictEmotion('03-01-03-01-01-01-17.wav')#happy

[1]
['disgust']
