source reference: https://data-flair.training/blogs/python-mini-project-speech-emotion-recognition/

## Emotion Detection

In [1]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

In [2]:
def extract_feature(file_name, mfcc, chroma, mel, zcr):
        with soundfile.SoundFile(file_name) as sound_file:
            X = sound_file.read(dtype="float32")
            sample_rate=sound_file.samplerate
            if chroma:
                stft=np.abs(librosa.stft(X))
            result=np.array([])
            #Chromagram
            if chroma:
                chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
                result=np.hstack((result, chroma))
            #Mel Frequency Cepstral Coefficents
            if mfcc:
                mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
                result=np.hstack((result, mfccs))
            #Zero Crossing Point
            if zcr:
                mfccs=np.mean(librosa.feature.zero_crossing_rate(y=X).T, axis=0)
                result=np.hstack((result, mfccs))
            #Mel Frequency
            if mel:
                mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
                result=np.hstack((result, mel))
        return result



In [4]:
os.getcwd()

'C:\\Users\\hongh\\Documents\\GitHub\\Speech-Sentiment-Analysis-\\VoiceDetection'

In [3]:
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

observed_emotions=['happy', 'angry','surprised','disgust']

In [5]:
def load_data(test_size=0.25):
    x,y=[],[]
    for file in glob.glob("./dataset/Actor_*/*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True, zcr = True)
        x.append(feature)
        y.append(emotion)
    
    return train_test_split(np.array(x), y, test_size=test_size, random_state=999)

In [6]:
x_train,x_test,y_train,y_test=load_data(test_size=0.25)

In [7]:
print(x_test.shape)

(182, 181)


In [8]:
print(x_train)

[[7.03153968e-01 7.62202263e-01 7.38202572e-01 ... 1.88532031e-05
  1.22487099e-05 5.88301509e-06]
 [5.31505585e-01 4.88014698e-01 4.65663761e-01 ... 5.63904364e-03
  3.58180376e-03 2.04791874e-03]
 [6.18966401e-01 5.60614109e-01 5.16037405e-01 ... 1.34777301e-03
  5.97548962e-04 2.67410622e-04]
 ...
 [6.42070234e-01 6.17730200e-01 6.15742385e-01 ... 2.06411914e-05
  1.82138428e-05 8.16701595e-06]
 [6.12404168e-01 5.92265189e-01 6.30398452e-01 ... 3.26680165e-05
  1.84478868e-05 1.61383523e-05]
 [4.90284115e-01 4.59246576e-01 4.30279553e-01 ... 7.81805778e-04
  3.35281569e-04 2.09520818e-04]]


In [9]:
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 181


In [10]:
model=MLPClassifier(alpha=0.025, batch_size=128, epsilon=1e-08, hidden_layer_sizes=(700,), learning_rate='adaptive', max_iter=350)

In [11]:
model.fit(x_train,y_train)

MLPClassifier(activation='relu', alpha=0.025, batch_size=128, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(700,), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=350,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [12]:
y_pred=model.predict(x_test)

In [13]:
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 59.34%


In [14]:
result = pd.DataFrame({"actual":y_test, "predict":y_pred})
result

Unnamed: 0,actual,predict
0,angry,angry
1,surprised,surprised
2,surprised,surprised
3,disgust,angry
4,happy,angry
...,...,...
177,happy,disgust
178,surprised,surprised
179,surprised,surprised
180,angry,angry


In [17]:
import numpy as np
import sys

record =[]
for file in glob.glob("./dataset/Actor_02/03-01-08-02-01-02-02.wav"):
    feature=extract_feature(file, mfcc=True, chroma=True, mel=True, zcr =True)
    record.append(feature)
record = np.array(record)

In [18]:
import emoji
prediction = model.predict(record)
if prediction[0] in ('happy','surprised'):
    emo = emoji.emojize(':smiley:', use_aliases=True)
    label = "positive"
else:
    emo = emoji.emojize(':rage:', use_aliases=True)
    label = "negative"
print (label, emo)

negative 😡
