Detection of Emotion of Speech for RAVDESS Audio Using Hybrid Convolution Neural Network

https://www.hindawi.com/journals/jhe/2022/8472947/

In [1]:
import pandas as pd
import numpy as np

import os
import sys

from datetime import datetime
import pickle

import librosa
import librosa.display

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim as opt
from torch.utils.data import Dataset, DataLoader 
import torchvision
from torchvision import transforms as T, datasets  
from torch.utils.tensorboard import SummaryWriter 

import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
Ravdess = "/kaggle/input/ravdess-emotional-speech-audio/"
Tess = "/kaggle/input/toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/"

In [3]:
ravdess_directory_list = os.listdir(Ravdess)
ravdess_emotion = ['neutral','calm','happy','sad','angry','fear','disgust','surprise']

file_emotion = []
file_path = []
for dir in ravdess_directory_list:
    if (dir == "audio_speech_actors_01-24"):
        continue
    actor = os.listdir(Ravdess + dir)
    for file in actor:
        # get the emotion of this file
        part = file.split('.')[0]
        part = part.split('-')
        id = int(part[2])
        file_emotion.append(ravdess_emotion[id-1])
        # get file's path
        file_path.append(Ravdess + dir + '/' + file)
        
# convert to dataframe
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])
path_df = pd.DataFrame(file_path, columns=['Path'])
Ravdess_df = pd.concat([emotion_df, path_df], axis=1)

Ravdess_df.shape

(1440, 2)

In [4]:
tess_directory_list = os.listdir(Tess)

file_emotion = []
file_path = []

for dir in tess_directory_list:
    directories = os.listdir(Tess + dir)
    for file in directories:
        part = file.split('.')[0]
        part = part.split('_')[2]
        if part=='ps':
            file_emotion.append('surprise')
        else:
            file_emotion.append(part)
        file_path.append(Tess + dir + '/' + file)
        
# convert to dataframe
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])
path_df = pd.DataFrame(file_path, columns=['Path'])
Tess_df = pd.concat([emotion_df, path_df], axis=1)

Tess_df.shape

(2800, 2)

In [5]:
data_path = pd.concat([Ravdess_df, Tess_df], axis = 0)
data_path.to_csv("data_path.csv",index=False)
data_path.head()

Unnamed: 0,Emotions,Path
0,surprise,/kaggle/input/ravdess-emotional-speech-audio/A...
1,neutral,/kaggle/input/ravdess-emotional-speech-audio/A...
2,disgust,/kaggle/input/ravdess-emotional-speech-audio/A...
3,disgust,/kaggle/input/ravdess-emotional-speech-audio/A...
4,neutral,/kaggle/input/ravdess-emotional-speech-audio/A...


In [6]:
def extract_features(data,sample_rate):
    result = np.array([])
    
    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y = data, sr = sample_rate).T, axis = 0)
    result = np.hstack((result, mfcc))
    
    # Log Mel-Spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y = data, sr = sample_rate).T, axis = 0)
    result = np.hstack((result, mel)) 
    
    # Chroma
    chroma_stft = np.mean(librosa.feature.chroma_stft(S = np.abs(librosa.stft(data)), sr = sample_rate).T, axis = 0)
    result = np.hstack((result, chroma_stft))
    
    return result

In [7]:
def noise(data):
    noise_amp = 0.03 * np.random.uniform() * np.amax(data)
    data = data + noise_amp * np.random.normal(size = data.shape[0])
    return data

def pitch(data, sampling_rate, pitch_factor = 0.7):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

In [8]:
X, Y = [], []
for path, emotion in zip(data_path.Path, data_path.Emotions):
    # load data
    data, sample_rate = librosa.load(path, duration=3)
    
    # augmentation
    noise_data = noise(data)
    pitch_data = pitch(data, sample_rate)
    
    # original speech
    feature = extract_features(data, sample_rate)
    feature = np.array(feature)
    X.append(feature)
    Y.append(emotion)
    
    # noise speech
    feature_noise = extract_features(noise_data, sample_rate)
    feature_noise = np.array(feature_noise)
    X.append(feature_noise)
    Y.append(emotion)
    
    # pitch speech
    feature_pitch = extract_features(pitch_data, sample_rate)
    feature_pitch = np.array(feature_pitch)
    X.append(feature_pitch)
    Y.append(emotion)

In [9]:
# convert to df and save
Features = pd.DataFrame(X)
Features['labels'] = Y
Features.to_csv('features.csv', index=False)
Features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,151,152,153,154,155,156,157,158,159,labels
0,-546.596863,44.37738,-19.342091,9.734666,-9.818512,-9.713405,-5.113235,-6.8435,-4.441024,-4.30716,...,0.471864,0.448686,0.505751,0.540664,0.537667,0.536776,0.54137,0.569535,0.59844,surprise
1,-506.400634,38.243272,-17.707741,7.183844,-9.801996,-9.96139,-5.3986,-6.819918,-5.188599,-4.623633,...,0.66722,0.691312,0.684664,0.69075,0.689066,0.691375,0.70238,0.734903,0.731455,surprise
2,-570.769043,42.897747,-20.612906,9.520269,-13.445296,-8.302529,-5.846328,-6.734608,-5.509935,-4.831536,...,0.483759,0.429982,0.501124,0.572247,0.574985,0.578177,0.579576,0.587847,0.613666,surprise
3,-617.853333,61.302486,-15.036719,8.042834,-7.553607,-5.726502,-8.15655,-10.893152,-5.65801,-0.963206,...,0.519677,0.492402,0.504126,0.517783,0.567078,0.568422,0.528793,0.507262,0.484923,neutral
4,-374.570737,16.911391,-0.257796,-0.946276,-3.302215,-4.228215,-5.007686,-5.444537,-3.429903,-1.826473,...,0.811161,0.783862,0.701154,0.70939,0.743575,0.745451,0.740256,0.743368,0.735214,neutral


In [72]:
# Load features from file
Features = pd.read_csv("./features.csv")
X = Features.iloc[: ,:-1].values
Y = Features['labels'].values
len(X), len(Y), data_path.Path.shape

(12720, 12720, (4240,))

In [73]:
# One hot endcoding for Y.
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

In [74]:
# splitting data
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 0, shuffle = True, test_size=0.3)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((8904, 160), (8904, 8), (3816, 160), (3816, 8))

In [75]:
import pickle

# scaling our data and save the scaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

pickle.dump(scaler, open('scaler_Hybrid-1.pkl','wb'))
scaler = pickle.load(open('scaler_Hybrid-1.pkl','rb'))

x_test = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((8904, 160), (8904, 8), (3816, 160), (3816, 8))

In [56]:
class Dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        X = torch.tensor(self.X[idx]).type(torch.float)
        y = torch.tensor(self.y[idx]).type(torch.float)

        return X, y

In [76]:
#Convert X to tensor
X_train_2 = torch.from_numpy(x_train)
X_test_2 = torch.from_numpy(x_test)
print(X_train_2.shape)

torch.Size([8904, 160])


In [77]:
BATCH_SIZE = 64
train_data = Dataset(X_train_2, y_train)
test_data = Dataset(X_test_2, y_test)

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, num_workers=os.cpu_count(), shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, num_workers=os.cpu_count())

In [59]:
class CNN(nn.Module):
    
    def __init__(self, ):
        super(CNN, self).__init__()
        
        # Block #1: 
        self.layer1 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=1024, kernel_size=5),
            nn.ReLU()
        )
        
        # Block #2: 
        self.layer2 = nn.Sequential(
            nn.Conv1d(in_channels=1024, out_channels=512, kernel_size=5),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2)
        )
        
        
        # Block #3: 
        self.layer3 = nn.Sequential(
            nn.Conv1d(in_channels=512, out_channels=256, kernel_size=5),
            nn.ReLU()
        )
        
        # Block #4: 
        self.layer4 = nn.Sequential(
            nn.Conv1d(in_channels=256, out_channels=128, kernel_size=5),
            nn.ReLU()
        )
        
        # Block #5: 
        self.layer5 = nn.Sequential(
            nn.Conv1d(in_channels=128, out_channels=64, kernel_size=5),
            nn.ReLU()
        )
        
        # Block #6:  
        self.layer6 = nn.Sequential(
            nn.Conv1d(in_channels=64, out_channels=64, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5, stride=2)
        )
        
        # Block #7: 
        self.layer7 = nn.Sequential(
            nn.Conv1d(in_channels=64, out_channels=32, kernel_size=5),
            nn.ReLU()
        )
        
        # Block #8: 
        self.layer8 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=16, kernel_size=5),
            nn.ReLU()
        )

        # FC 8 → softmax
        self.fc = nn.Linear(in_features=16*19, out_features=8)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        
        # Channel x H = 1 x 160
        out = self.layer1(x.view(-1, 1, 160))
        
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = self.layer6(out)
        out = self.layer7(out)
        out = self.layer8(out)
        
        out = out.view(out.size(0), -1) 
        out = self.fc(out)
        out = self.softmax(out)
        
        return out

In [60]:
# model = CNN()
# print(model)
# !pip install torch-summary
from torchsummary import summary

model = CNN()
summary(model, (64, 160))

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 1024, 156]           --
|    └─Conv1d: 2-1                       [-1, 1024, 156]           6,144
|    └─ReLU: 2-2                         [-1, 1024, 156]           --
├─Sequential: 1-2                        [-1, 512, 74]             --
|    └─Conv1d: 2-3                       [-1, 512, 152]            2,621,952
|    └─BatchNorm1d: 2-4                  [-1, 512, 152]            1,024
|    └─ReLU: 2-5                         [-1, 512, 152]            --
|    └─MaxPool1d: 2-6                    [-1, 512, 74]             --
├─Sequential: 1-3                        [-1, 256, 70]             --
|    └─Conv1d: 2-7                       [-1, 256, 70]             655,616
|    └─ReLU: 2-8                         [-1, 256, 70]             --
├─Sequential: 1-4                        [-1, 128, 66]             --
|    └─Conv1d: 2-9                       [-1, 128, 66]             

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 1024, 156]           --
|    └─Conv1d: 2-1                       [-1, 1024, 156]           6,144
|    └─ReLU: 2-2                         [-1, 1024, 156]           --
├─Sequential: 1-2                        [-1, 512, 74]             --
|    └─Conv1d: 2-3                       [-1, 512, 152]            2,621,952
|    └─BatchNorm1d: 2-4                  [-1, 512, 152]            1,024
|    └─ReLU: 2-5                         [-1, 512, 152]            --
|    └─MaxPool1d: 2-6                    [-1, 512, 74]             --
├─Sequential: 1-3                        [-1, 256, 70]             --
|    └─Conv1d: 2-7                       [-1, 256, 70]             655,616
|    └─ReLU: 2-8                         [-1, 256, 70]             --
├─Sequential: 1-4                        [-1, 128, 66]             --
|    └─Conv1d: 2-9                       [-1, 128, 66]             

In [61]:
from sklearn.metrics import accuracy_score
class Trainer:
    def __init__(self, train_dataloader, test_dataloader,
                 model, loss_fn, optimizer, scheduler, logger, device='cpu'):
        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader
        self.logger = logger
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device

    def train_epoch(self):
        # Train data
        n_samples = len(self.train_dataloader.dataset)
        train_loss = 0

        for batch_idx, (X, y) in enumerate(self.train_dataloader):
            X = X.to(self.device)
            y = y.to(self.device)
            # Forward
            pred = self.model(X)
            loss = self.loss_fn(pred, torch.argmax(y, dim=1))
            
            # Backward
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            train_loss += loss
        self.scheduler.step()
        return train_loss / n_samples

    def test_epoch(self):
        # Test data
        n_samples = len(self.test_dataloader.dataset)
        test_loss = 0

        for batch_idx, (X,y) in enumerate(self.test_dataloader):
            X = X.to(self.device)
            y = y.to(self.device)
            with torch.no_grad():
                # Forward
                pred = self.model(X)
                loss = self.loss_fn(pred, torch.argmax(y, dim=1))

            test_loss += loss

        return test_loss / n_samples

    def evaluation(self, dataloader):
        y_true = []
        y_pred = []
        for X, y in dataloader:
            X = X.to(self.device)
            y_true.append(y.detach().cpu())
            y_pred.append(self.model(X).detach().cpu())
        
        y_true = torch.cat(y_true, dim=0)
        y_pred = torch.cat(y_pred, dim=0)

        true_labels = torch.argmax(y_true, dim=1)
        pred_labels = torch.argmax(y_pred, dim=1)
        accuracy = accuracy_score(true_labels.cpu(), pred_labels.cpu())

        return accuracy

    def train(self, epochs=10):
        for i in range(epochs):
            self.current_epoch = i+1
            # Training
            train_loss = self.train_epoch()
            test_loss = self.test_epoch()

            # Evaluation
            train_acc = self.evaluation(self.train_dataloader)
            test_acc = self.evaluation(self.test_dataloader)
            
            # Logging
            self.logger.add_scalar('Loss/train', train_loss.item(), i+1)
            self.logger.add_scalar('Loss/test', test_loss.item(), i+1)
            self.logger.add_scalar('Accuracy/train', train_acc.item(), i+1)
            self.logger.add_scalar('Accuracy/test', test_acc.item(), i+1)

            ## Log histogram
            for name, params in model.named_parameters():
                if 'weight' in name:
                    self.logger.add_histogram(name, params, i+1)

            # if ((i+1) % 10 == 0):
            print(f"Epoch {i+1}: Train Loss = {train_loss.item():.5f}, Test Loss = {test_loss.item():.5f}, "
            f"Train accuracy score = {train_acc.item():.5f}, Test accuracy score = {test_acc.item():.5f}")

In [78]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LEARNING_RATE = 1e-3
LOG_DIR = "./logs/"
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 
print(DEVICE)

cuda


In [79]:
from datetime import datetime

model = CNN()
loss_fn = nn.CrossEntropyLoss() 
optimizer = opt.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = opt.lr_scheduler.StepLR(optimizer, step_size=20, gamma=1, last_epoch=- 1, verbose=False)
logger = SummaryWriter(os.path.join(LOG_DIR, datetime.now().strftime("%d_%m_%Y_%H_%M_%S"))) # Logger

# Trainer
trainer = Trainer(
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    scheduler=scheduler,
    logger=logger,
    device=DEVICE
)

In [80]:
import warnings
warnings.filterwarnings('ignore')
trainer.train(epochs=15)

Epoch 1: Train Loss = 0.03100, Test Loss = 0.03078, Train accuracy score = 0.30323, Test accuracy score = 0.29271
Epoch 2: Train Loss = 0.03039, Test Loss = 0.02992, Train accuracy score = 0.38039, Test accuracy score = 0.36845
Epoch 3: Train Loss = 0.03029, Test Loss = 0.03070, Train accuracy score = 0.32233, Test accuracy score = 0.31840
Epoch 4: Train Loss = 0.03053, Test Loss = 0.03043, Train accuracy score = 0.35355, Test accuracy score = 0.33281
Epoch 5: Train Loss = 0.03057, Test Loss = 0.03101, Train accuracy score = 0.31121, Test accuracy score = 0.29769
Epoch 6: Train Loss = 0.03057, Test Loss = 0.03034, Train accuracy score = 0.35108, Test accuracy score = 0.33910
Epoch 7: Train Loss = 0.02989, Test Loss = 0.03000, Train accuracy score = 0.36253, Test accuracy score = 0.36347
Epoch 8: Train Loss = 0.02947, Test Loss = 0.02903, Train accuracy score = 0.41801, Test accuracy score = 0.42584
Epoch 9: Train Loss = 0.02944, Test Loss = 0.02979, Train accuracy score = 0.37478, Test

In [81]:
import pickle
pickle.dump(model, open('model_Hybrid.pkl', 'wb'))

In [83]:
import librosa
import pickle

Emo = ['neutral','calm','happy','sad','angry','fear','disgust','surprise']

def extract_features(data,sample_rate):
    result = np.array([])
    
    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y = data, sr = sample_rate).T, axis = 0)
    result = np.hstack((result, mfcc))
    
    # Log Mel-Spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y = data, sr = sample_rate).T, axis = 0)
    result = np.hstack((result, mel)) 
    
    # Chroma
    chroma_stft = np.mean(librosa.feature.chroma_stft(S = np.abs(librosa.stft(data)), sr = sample_rate).T, axis = 0)
    result = np.hstack((result, chroma_stft))
    
    return result

def emotion_recognition(audio_file):
    trained_model = pickle.load(open('model_Hybrid.pkl', 'rb'))
    scaler = pickle.load(open('scaler_Hybrid-1.pkl','rb'))
    
    # load audio files with librosa
    data, sample_rate = librosa.load(audio_file)
    feat = extract_features(data,sample_rate)
    feat = np.array(feat)
    feat = feat[None,:]
    sc_feat = scaler.transform(feat)
    sc_feat = torch.from_numpy(sc_feat.astype('float32'))
    prediction = trained_model(sc_feat.cuda())
    pred = torch.argmax(prediction, dim=1)
    return Emo[pred]

In [84]:
audio_file = "/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-02-02-01-02-02.wav"
label = emotion_recognition(audio_file)
print(label)

disgust
