# Classifying Audio to Emotion using the RAVDESS dataset
### [Medium Article](https://medium.com/@tushar.gupta_47854/speech-emotion-detection-74337966cf2) | [GitHub](https://github.com/RoccoJay/Audio_to_Emotion)
###### Livingstone SR, Russo FA (2018) The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English. PLoS ONE 13(5): e0196391. https://doi.org/10.1371/journal.pone.0196391.

In [7]:
import pandas as pd
import numpy as np
import os
import glob 
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils.multiclass import unique_labels
from sklearn.neural_network import MLPClassifier
import joblib
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [8]:
# Load the pre-split feature matrices and label columns from disk.
X_train = pd.read_csv('a2e-data/X_train.csv')
X_test = pd.read_csv('a2e-data/X_test.csv')
y_train = pd.read_csv('a2e-data/y_train.csv')
y_test = pd.read_csv('a2e-data/y_test.csv')

# Report (train rows, test rows) and the number of feature columns.
print((len(X_train), len(X_test)))

print(f'Features extracted: {X_train.shape[1]}')

# Learn the scaling statistics on the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Tuned hyperparameters for the MLP.
# NOTE(review): per the scikit-learn docs, batch_size and learning_rate are
# not used by the 'lbfgs' solver -- confirm they are kept here on purpose.
mlp_params = dict(
    activation='relu',
    solver='lbfgs',
    hidden_layer_sizes=1283,
    alpha=0.3849485717707319,
    batch_size=163,
    learning_rate='constant',
    max_iter=1000,
)

model = MLPClassifier(**mlp_params)

# Fit on the scaled training data and predict labels for the scaled test data.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14, model='clf', save=True):
    """Draw a confusion matrix as an annotated seaborn heatmap and show it.

    Args:
        confusion_matrix: Matrix of integer counts, e.g. the value returned
            by ``sklearn.metrics.confusion_matrix``.
        class_names: Labels used for both the rows and the columns.
        figsize: Figure size passed to ``plt.subplots``.
        fontsize: Font size of the tick labels on both axes.
        model: Accepted for call-site compatibility; not used in the body.
        save: When equal to ``True``, the figure is also written to
            ``tuned_MLP_Classifier_confusion_matrix.jpg``.

    Raises:
        ValueError: If seaborn raises ``ValueError`` while drawing the
            heatmap with integer (``fmt="d"``) annotations.
    """
    frame = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    try:
        heatmap = sns.heatmap(frame, annot=True, ax=ax, fmt="d", cmap=plt.cm.Oranges)
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")

    # Re-apply the existing tick labels with the requested rotation and size.
    y_axis = heatmap.yaxis
    x_axis = heatmap.xaxis
    y_axis.set_ticklabels(y_axis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    x_axis.set_ticklabels(x_axis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

    # Workaround for the matplotlib bug that cuts off the top and bottom rows
    # of seaborn heatmaps: widen the y-limits by half a cell on each side.
    bottom, top = plt.ylim()
    plt.ylim(bottom + 0.5, top - 0.5)

    if save == True:
        plt.savefig('tuned_MLP_Classifier_confusion_matrix.jpg')
    plt.show()

def models(clf_model=model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, y_pred = y_pred, save=False, print_stat=False, inc_train=False, cv=False):
    """Report evaluation metrics for an already-fitted classifier.

    The defaults are bound to the module-level objects that exist when this
    function is defined (the fitted ``model``, the scaled splits and the test
    predictions), so ``models()`` can be called with no arguments.

    Args:
        clf_model: Fitted classifier; used for train-set predictions,
            cross-validation and as the optional return value.
        X_train: Training features.
        X_test: Test features. Not used in the body; kept so existing calls
            keep working.
        y_train: Training labels.
        y_test: Test labels.
        y_pred: Predictions for the test set.
        save: When true, ``clf_model`` is returned; otherwise the function
            returns ``None``. Nothing is written to disk by this flag.
        print_stat: When true, writes the test classification report to
            ``tuned_MLP_Classifier_classification_report.csv``, prints it and
            plots the test confusion matrix.
        inc_train: When true (and ``print_stat`` is true), also prints the
            classification report and confusion matrix for the training set.
        cv: When true, prints the mean 5-fold cross-validated accuracy
            computed on the training data.

    Returns:
        ``clf_model`` if ``save`` is true, else ``None``.
    """
    if print_stat:
        clf_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T
        clf_report.to_csv('tuned_MLP_Classifier_classification_report.csv')
        print("MLP_Classifier")
        print('\nTest Stats\n', classification_report(y_test, y_pred))
        print_confusion_matrix(confusion_matrix(y_test, y_pred), unique_labels(y_test, y_pred), model=clf_model)
        if inc_train:
            # Predict once and reuse the result for the report and the matrix.
            train_pred = clf_model.predict(X_train)
            print("MLP_Classifier")
            print('\nTrain Stats\n', classification_report(y_train, train_pred))
            # Label the axes from the train labels: confusion_matrix() orders
            # its rows by the labels present in *these* inputs, which need not
            # equal the labels present in the test split.
            # save=False so the train plot does not overwrite the test
            # confusion matrix image written just above.
            print_confusion_matrix(
                confusion_matrix(y_train, train_pred),
                unique_labels(y_train, train_pred),
                model=clf_model,
                save=False,
            )
    if cv:
        print("MLP_Classifier" + ' CV Accuracy:',
              np.mean(cross_val_score(clf_model, X_train, y_train, cv=5, scoring='accuracy')))
    if save:
        return clf_model
    return None
    
# With every flag left at its default (False), models() prints nothing and
# returns None, so `a` is None here.
a = models()

# Accuracy of the test-set predictions against the true test labels.
print(accuracy_score(y_test, y_pred))

(1961, 491)
Features extracted: 180
0.7535641547861507


In [9]:
def extract_feature(file_name):
    """Build one flat feature vector for an audio file.

    The file is loaded with ``librosa.load``. Three feature matrices are
    computed (40 MFCCs, chroma from the STFT magnitude, mel spectrogram), each
    is averaged down to one value per feature row, and the three mean vectors
    are concatenated in that order.

    Args:
        file_name: Path of the audio file to load.

    Returns:
        1-D float64 numpy array: MFCC means, then chroma means, then mel means.
    """
    signal, sample_rate = librosa.load(file_name)
    magnitude = np.abs(librosa.stft(signal))

    def row_means(feature_matrix):
        # Mean over the first axis of the transposed matrix.
        return np.mean(feature_matrix.T, axis=0)

    mfccs = row_means(librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40))
    chroma = row_means(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate))
    mel = row_means(librosa.feature.melspectrogram(y=signal, sr=sample_rate))

    # Leading empty float64 array keeps the result dtype at float64.
    return np.hstack((np.array([]), mfccs, chroma, mel))

# Two-digit emotion code -> emotion name. load_data() looks these codes up
# from the third dash-separated field of each file name.
emotions = {
    f'{code:02d}': label
    for code, label in enumerate(
        (
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
            'surprised',
        ),
        start=1,
    )
}

def gender(g):
    """Return 'female' when the number in the first two characters of g is even, else 'male'."""
    number = int(g[:2])
    return 'male' if number % 2 else 'female'
    
def load_data(test_size=0.2, pattern="*.m4a"):
    """Extract features and labels from audio files and split them.

    Each matching file name is split on ``-``. The third field is looked up in
    ``emotions`` and the last field is passed to ``gender()``; the label is
    ``"<emotion>_<gender>"``. Features come from ``extract_feature()``.

    Args:
        test_size: Passed through to ``train_test_split``.
        pattern: Glob pattern selecting the audio files. Defaults to the
            ``*.m4a`` files in the current working directory.

    Returns:
        The value of ``train_test_split``: features train, features test,
        labels train, labels test (in that order), split with
        ``random_state=9``.
    """
    x, y = [], []
    # glob returns paths in arbitrary, platform-dependent order. Sorting makes
    # the fixed random_state below produce the same split on every machine.
    for file in tqdm(sorted(glob.glob(pattern))):
        fields = os.path.basename(file).split("-")
        emotion = emotions[fields[2]] + '_' + gender(fields[-1])
        x.append(extract_feature(file))
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

---

In [10]:
import joblib, librosa, numpy as np, pandas as pd, warnings

warnings.filterwarnings("ignore")

class emotion_classifier():
    """Predict emotions for a single audio file with a pickled classifier.

    The model's class labels are expected to look like ``"<emotion>_<gender>"``;
    the methods below collapse them to the emotion part.

    NOTE(review): the training code in this notebook fits the MLP on
    StandardScaler-transformed features. If the pickled model was produced
    that way, predictions on raw features are unreliable unless the matching
    scaler is supplied through ``scaler_fp`` -- confirm how the pickle was
    created.
    """

    def __init__(self, audio_fp, model_fp, scaler_fp=None):
        """Load the model (and optional scaler) and remember the audio path.

        Args:
            audio_fp: Path of the audio file to classify.
            model_fp: Path of the joblib-pickled classifier. It must expose
                ``classes_`` and ``predict_proba``.
            scaler_fp: Optional path of a joblib-pickled transformer whose
                ``transform`` is applied to the features before prediction.
                ``None`` (default) feeds the raw features to the model.
        """
        self.file_name = audio_fp
        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code. Only load model/scaler files from a trusted source.
        self.model = joblib.load(model_fp)
        self.scaler = None if scaler_fp is None else joblib.load(scaler_fp)
        # Unique emotion names in class order. Unlike slicing with [::2], this
        # does not depend on every emotion having exactly two gender classes.
        self.possible_emotions = list(
            dict.fromkeys(label.split('_')[0] for label in self.model.classes_)
        )

    def extract_features(self):
        """Return the flat feature vector for ``self.file_name``.

        Computes 40 MFCCs, chroma (from the STFT magnitude) and the mel
        spectrogram, averages each down to one value per feature row, and
        concatenates the three mean vectors in that order.
        """
        X, sample_rate = librosa.load(self.file_name)
        stft = np.abs(librosa.stft(X))
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        # Leading empty float64 array keeps the result dtype at float64.
        return np.hstack((np.array([]), mfccs, chroma, mel))

    def extract_prob_emotions(self):
        """Return class probabilities as a Series sorted from high to low.

        The index holds the model's class labels. Features are re-extracted
        from the audio file on every call.
        """
        features = self.extract_features().reshape(1, -1)
        if self.scaler is not None:
            features = self.scaler.transform(features)
        probabilities = self.model.predict_proba(features)[0]
        return pd.Series(probabilities, index=self.model.classes_).sort_values(ascending=False)

    def top3emotions(self):
        """Return up to three distinct emotion names, most probable first."""
        top3 = []
        for label in self.extract_prob_emotions().index:
            emotion = label.split('_')[0]
            if emotion in top3:
                continue
            top3.append(emotion)
            if len(top3) == 3:
                break
        return top3

    def weighted_emotions(self):
        """Return a dict of emotion name -> probability summed over its classes.

        Keys appear in order of each emotion's most probable class.
        """
        total_prob_emotions = {}
        for label, prob in self.extract_prob_emotions().items():
            emotion = label.split('_')[0]
            total_prob_emotions[emotion] = total_prob_emotions.get(emotion, 0) + prob
        return total_prob_emotions

In [11]:
# Build a classifier for one recording, loading the pickled model from disk.
test = emotion_classifier('test_journal.m4a', 'models/mlp_emotion_classifier.pkl')

In [12]:
# First the top three distinct emotions, then the per-emotion probabilities
# summed over the gender-suffixed classes. Each call re-extracts the features
# from the audio file.
print(test.top3emotions())
print(test.weighted_emotions())

['sad', 'calm', 'neutral']
{'sad': 1.0, 'calm': 6.250119897169803e-61, 'neutral': 1.6360460307621584e-202, 'angry': 0.0, 'disgust': 0.0, 'fearful': 0.0, 'happy': 0.0, 'surprised': 0.0}


### Write Data to csv Files

In [6]:
# for file, name in zip([X_train, X_test, y_train, y_test],['a2e-data/X_train.csv', 'a2e-data/X_test.csv', 'a2e-data/y_train.csv', 'a2e-data/y_test.csv']):
#     pd.DataFrame(file).to_csv(name, index=False)