# CNN LSTM Model

50.039 Theory and Practice of Deep Learning Project

In [None]:
!pip install torchinfo

In [1]:
import pandas as pd
import numpy as np
import os
import librosa
import librosa.display
import seaborn as sn
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from Preprocessor import *
from Constants import *
from MelTrainHelper import Train_Helper

from tqdm import tqdm
from torchinfo import summary
from IPython.display import Audio
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit


In [2]:
#Convert Dataset into Pandas Dataframe
Tess = "./TESS"

tess_directory_list = os.listdir(Tess)
preprocessor = DataPreprocessor()   
file_paths, labels = preprocessor.get_file_paths_and_labels(Tess)

EMOTIONS= {'neutral':0,'happy':1, 'sad':2, 'angry':3, 'fear':4, 'disgust':5, 'ps':6}
encoded_labels = [EMOTIONS[label] for label in labels]
emotion_df = pd.DataFrame(labels, columns=['Emotions'])
print("ALL EMOTIONS:", set(labels))

path_df = pd.DataFrame(file_paths, columns=['Path'])
Tess_df = pd.concat([emotion_df, path_df], axis=1)
Tess_df.head()

ALL EMOTIONS: {'angry', 'ps', 'disgust', 'neutral', 'happy', 'fear', 'sad'}


Unnamed: 0,Emotions,Path
0,sad,./TESS/YAF_sad/YAF_wife_sad.wav
1,sad,./TESS/YAF_sad/YAF_hate_sad.wav
2,sad,./TESS/YAF_sad/YAF_phone_sad.wav
3,sad,./TESS/YAF_sad/YAF_week_sad.wav
4,sad,./TESS/YAF_sad/YAF_raid_sad.wav


# Dataset

In [None]:
# #Convert Dataset into Pandas Dataframe
# RAVDESS = "./RAVDESS"

# preprocessor = DataPreprocessor()   
# file_paths, labels = preprocessor.get_file_paths_and_labels_RAVDESS(RAVDESS)

# CREMA_D_EMOTION_ENCODING = {'FEA':0,'DIS':1, 'HAP':2, 'SAD':3, 'ANG':4, 'NEU':5}
# encoded_labels = [CREMA_D_EMOTION_ENCODING[label] for label in labels]
# emotion_df = pd.DataFrame(labels, columns=['Emotions'])
# print("ALL EMOTIONS:", set(labels))

# path_df = pd.DataFrame(file_paths, columns=['Path'])
# CREMA_D_df = pd.concat([emotion_df, path_df], axis=1)
# CREMA_D_df.head()

In [None]:
# #Convert Dataset into Pandas Dataframe
# CREMA_D = "./CREMA_D"

# preprocessor = DataPreprocessor()   
# file_paths, labels = preprocessor.get_file_paths_and_labels_CREMA(CREMA_D)

# CREMA_D_EMOTION_ENCODING = {'FEA':0,'DIS':1, 'HAP':2, 'SAD':3, 'ANG':4, 'NEU':5}
# encoded_labels = [CREMA_D_EMOTION_ENCODING[label] for label in labels]
# emotion_df = pd.DataFrame(labels, columns=['Emotions'])
# print("ALL EMOTIONS:", set(labels))

# path_df = pd.DataFrame(file_paths, columns=['Path'])
# CREMA_D_df = pd.concat([emotion_df, path_df], axis=1)
# CREMA_D_df.head()

# Data Preprocessing

In [None]:
# #Extracting mel spectrograms from Dataset
# signals = preprocessor.extract_audio_signals(Tess_df['Path'])
# #signals = preprocessor.extract_audio_signals(Tess_df['Path'])
# mel_spectograms = preprocessor.extract_mel_spectograms(signals)

In [None]:
# # Rune only if the converted mel_spectrograms are not saved into numpy array yet
# with open('crema_mel_spec_all.npy', 'wb') as f:
#     np.save(f, mel_spectograms)

In [3]:
with open('mel_spec_all.npy', 'rb') as f:
    mel_spectograms = np.load(f)

In [4]:
#Spliting extracted mel spectrogram data into training, testing and validation set
X_train, X_val, X_test, Y_train, Y_val, Y_test = preprocessor.train_val_test_split(mel_spectograms, encoded_labels)
#Scaling and reshape Data for training
X_train = preprocessor.reshape_scale_data(X_train)
X_test = preprocessor.reshape_scale_data(X_test)
X_val = preprocessor.reshape_scale_data(X_val)

# Create the model

In [12]:
class SEQUENTIAL_CNN_LSTM(nn.Module):
    def __init__(self,num_emotions):
        super().__init__()
        hidden_size = 128
        
        conv2d_1 = nn.Conv2d(in_channels=1, out_channels=16, stride=1, padding=1, kernel_size=3)
        conv2d_2 = nn.Conv2d(in_channels=16, out_channels=32, stride=1, padding=1, kernel_size=3)
        conv2d_3 = nn.Conv2d(in_channels=32, out_channels=64, stride=1, padding=1, kernel_size=3)
        conv2d_4 = nn.Conv2d(in_channels=64, out_channels=64, stride=1, padding=1, kernel_size=3)
        relu = nn.ReLU()
        maxPool2_2 = nn.MaxPool2d(kernel_size=2, stride=2)
        maxPool4_4 = nn.MaxPool2d(kernel_size=4, stride=2)
        dropOut = nn.Dropout(p=0.3)
        batchNorm16 = nn.BatchNorm2d(16)
        batchNorm32 = nn.BatchNorm2d(32)
        batchNorm64 = nn.BatchNorm2d(64)
        
        #Convolution Block
        self.conv2Dblock = nn.Sequential(conv2d_1, batchNorm16, relu, maxPool2_2, dropOut,
                                         conv2d_2, batchNorm32, relu, maxPool4_4, dropOut,
                                         conv2d_3, batchNorm64, relu, maxPool4_4, dropOut,
                                         conv2d_4, batchNorm64, relu, maxPool4_4, dropOut)
        #LSTM Block
        hidden_size = 64
        self.lstm = nn.LSTM(input_size=198,
                            hidden_size=hidden_size,
                            bidirectional=False, 
                            batch_first=True) 
        self.dropout_lstm = nn.Dropout(p=0.3)
        self.out_linear = nn.Linear(hidden_size,num_emotions)
    def forward(self,x):
        conv_out = self.conv2Dblock(x)
        conv_out = torch.flatten(conv_out, start_dim=2)
        lstm_out, (h,c) = self.lstm(conv_eout)
        lstm_out = self.dropout_lstm(lstm_out)
        lstm_output = lstm_out[:,-1,:] 
        output_logits = self.out_linear(lstm_output)
        output_softmax = nn.functional.softmax(output_logits,dim=1)
        return output_logits, output_softmax, 1

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
train_helper = Train_Helper()

cuda


# Learning Stopping Implementation

In [None]:
# from torchsample.modules import ModuleTrainer
# trainer = ModuleTrainer(model)
# model = ModuleTrainer(Network())
# model.compile(loss='nll_loss', optimizer='adam')
# callbacks = [EarlyStopping(monitor='val_loss', patience=5)]
# model.set_callbacks(callbacks)
# model.fit(x_train, y_train, val_data=(x_test, y_test),num_epoch=200, batch_size=128, verbose=1)
# loss = model.evaluate(x_train, y_train)
# y_pred = model.predict(x_train)

# TRAINING

In [8]:
model = SEQUENTIAL_CNN_LSTM(num_emotions=7).to(device)
epochs= 150
batch_size = 32
trained_model, losses, val_losses = train_helper.model_train(model, epochs, batch_size, train_helper.loss_func,  X_train, Y_train, X_val, Y_val, device)

Number of trainable params:  128487
 Epoch 0: iteration 69/70
Epoch 0 --> loss:1.5374, acc:39.24%, val_loss:1.4049, val_acc:48.21%
 Epoch 1: iteration 69/70
Epoch 1 --> loss:1.0547, acc:61.21%, val_loss:1.5572, val_acc:32.50%
 Epoch 2: iteration 69/70
Epoch 2 --> loss:0.8261, acc:71.07%, val_loss:1.2731, val_acc:44.64%
 Epoch 3: iteration 69/70
Epoch 3 --> loss:0.6742, acc:76.38%, val_loss:0.8877, val_acc:67.14%
 Epoch 4: iteration 69/70
Epoch 4 --> loss:0.5396, acc:80.76%, val_loss:0.8652, val_acc:70.00%
 Epoch 5: iteration 69/70
Epoch 5 --> loss:0.4601, acc:84.33%, val_loss:1.3728, val_acc:47.86%
 Epoch 6: iteration 69/70
Epoch 6 --> loss:0.3724, acc:87.50%, val_loss:1.1725, val_acc:60.36%
 Epoch 7: iteration 69/70
Epoch 7 --> loss:0.3303, acc:88.62%, val_loss:0.8964, val_acc:69.29%
 Epoch 8: iteration 69/70
Epoch 8 --> loss:0.2921, acc:90.85%, val_loss:1.2712, val_acc:57.14%
 Epoch 9: iteration 69/70
Epoch 9 --> loss:0.2843, acc:91.12%, val_loss:1.0886, val_acc:64.64%
 Epoch 10: ite

 Epoch 85: iteration 69/70
Epoch 85 --> loss:0.0082, acc:99.78%, val_loss:0.9440, val_acc:80.00%
 Epoch 86: iteration 69/70
Epoch 86 --> loss:0.0247, acc:98.93%, val_loss:1.1179, val_acc:73.93%
 Epoch 87: iteration 69/70
Epoch 87 --> loss:0.0131, acc:99.55%, val_loss:0.5926, val_acc:86.07%
 Epoch 88: iteration 69/70
Epoch 88 --> loss:0.0151, acc:99.64%, val_loss:0.8168, val_acc:82.50%
 Epoch 89: iteration 69/70
Epoch 89 --> loss:0.0070, acc:99.78%, val_loss:0.6294, val_acc:86.79%
 Epoch 90: iteration 69/70
Epoch 90 --> loss:0.0141, acc:99.55%, val_loss:0.6117, val_acc:87.50%
 Epoch 91: iteration 69/70
Epoch 91 --> loss:0.0083, acc:99.82%, val_loss:0.9340, val_acc:81.79%
 Epoch 92: iteration 69/70
Epoch 92 --> loss:0.0168, acc:99.33%, val_loss:0.4705, val_acc:89.29%
 Epoch 93: iteration 69/70
Epoch 93 --> loss:0.0291, acc:99.11%, val_loss:0.4808, val_acc:87.50%
 Epoch 94: iteration 69/70
Epoch 94 --> loss:0.0271, acc:99.15%, val_loss:0.6916, val_acc:81.79%
 Epoch 95: iteration 69/70
Epo

# Saving the trained model's weights

In [9]:
saved_trained_weights_file = "sequential_cnn_lstm_150_epochs.pt"

os.makedirs('models', exist_ok=True)
file_path = os.path.join(os.getcwd(),'models')
torch.save(trained_model.state_dict(), os.path.join(file_path, saved_trained_weights_file))
print('Trained Weights saved to'.format(os.path.join(file_path, saved_trained_weights_file)))

Trained Weights saved to


# Loading model's weights

In [6]:
loaded_trained_weights_file = "sequential_cnn_lstm_150_epochs.pt"

LOAD_PATH = os.path.join(os.getcwd(),'models')
model = CNN_LSTM(len(EMOTIONS))
model.load_state_dict(torch.load(os.path.join(LOAD_PATH, loaded_trained_weights_file)))
print('Weights for Model is loaded from {}'.format(os.path.join(LOAD_PATH, loaded_trained_weights_file)))

Weights for Model is loaded from /home/jovyan/DL Project/models/cnn_lstm_50epochs_batchsize2.pt


In [63]:
summary(model)

Layer (type:depth-idx)                   Param #
CNN_LSTM                                 --
├─Sequential: 1-1                        --
│    └─Conv2d: 2-1                       160
│    └─BatchNorm2d: 2-2                  32
│    └─ReLU: 2-3                         --
│    └─MaxPool2d: 2-4                    --
│    └─Dropout: 2-5                      --
│    └─Conv2d: 2-6                       4,640
│    └─BatchNorm2d: 2-7                  64
│    └─ReLU: 2-8                         --
│    └─MaxPool2d: 2-9                    --
│    └─Dropout: 2-10                     --
│    └─Conv2d: 2-11                      18,496
│    └─BatchNorm2d: 2-12                 128
│    └─ReLU: 2-13                        --
│    └─MaxPool2d: 2-14                   --
│    └─Dropout: 2-15                     --
│    └─Conv2d: 2-16                      36,928
│    └─BatchNorm2d: 2-17                 (recursive)
│    └─ReLU: 2-18                        --
│    └─MaxPool2d: 2-19                   --
│    

# Testing

In [10]:
validate = train_helper.make_validate_func(model, train_helper.loss_func)
X_test_tensor = torch.tensor(X_test,device="cpu").float()
Y_test_tensor = torch.tensor(Y_test, dtype=torch.long, device="cpu")
testing_loss, testing_acc, predictions = validate(X_test_tensor,Y_test_tensor)
print(f'Testing loss is {testing_loss:.3f}')
print(f'Testing accuracy is {testing_acc:.3f}%')

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same

In [7]:
def infer(model, path, all_mel_specs):
    EMOTIONS= {0:'neutral',1:'happy', 2:'sad', 3:'angry', 4:'fear', 5:'disgust', 6:'ps'}
    n=len(path)
    train_helper = Train_Helper()
    preprocessor = DataPreprocessor()
    validate_top3 = train_helper.make_validate_func_top3(model,train_helper.loss_func)
    if len(path) == 1:
        path = path[0]
        audio, sample_rate = librosa.load(path, duration=3, offset=0.5, sr=48000)
        audio_signal = np.zeros((int(sample_rate*3,)))
        audio_signal[:len(audio)] = audio
        mel_specs = preprocessor.extract_mel_spectogram(audio_signal)
        X_test = mel_specs
        X_test = np.expand_dims(X_test, axis=0)
        X_test = np.concatenate((X_test, all_mel_specs))
        X_test = preprocessor.reshape_scale_data(X_test)
        X_test = X_test[:1]
    else:
        audio_signals = preprocessor.extract_audio_signals(path)
        mel_specs = preprocessor.extract_mel_spectograms(audio_signals)
        mel_specs = np.concatenate((mel_specs, all_mel_specs))
        X_test = preprocessor.reshape_scale_data(mel_specs)
        X_test = X_test[:len(path)]
    Y_test = [1]*n
    X_test_tensor = torch.tensor(X_test,device='cpu').float()
    Y_test_tensor = torch.tensor(Y_test,dtype=torch.long,device='cpu')
    test_loss, test_acc, predictions, output_softmax = validate_top3(X_test_tensor,Y_test_tensor)
    
    if len(predictions.tolist()) == 1:
        ground_truth = path.split("_")[-1].split(".")[-2]
        top3_prob, top3 = torch.topk(output_softmax, 3)
        top3 = top3.detach().numpy()
        top3_prob = top3_prob.detach().numpy()
        print("Audio File: ", path)
        print("Predicted Emotions: ",EMOTIONS[top3[0,0]], "\t| Ground Truth: ", ground_truth)
        print("Top 1: ", EMOTIONS[top3[0,0]], "Prob: ", round(top3_prob[0,0]*100, 3),"%", "\t| Top 2: ", EMOTIONS[top3[0,1]], "Prob: ", round(top3_prob[0,1]*100, 3),"%", "\t| Top 3: ", EMOTIONS[top3[0,2]], "Prob: ", round(top3_prob[0,2]*100, 3),"%")
    else:
        print("\n")
        i = 0 
        for pred, file_name in zip(predictions.tolist(), path):
            ground_truth = file_name.split("_")[-1].split(".")[-2]
            
            top3_prob, top3 = torch.topk(output_softmax[i], 3)
            top3 = top3.detach().numpy()
            top3_prob = top3_prob.detach().numpy()
            i += 1
            print("Audio File: ", file_name)
            print("Predicted Emotions: ",EMOTIONS[top3[0]], "\t| Ground Truth: ", ground_truth)
            print("Top 1:", EMOTIONS[top3[0]], "(Prob:", round(top3_prob[0]*100, 2),"%)", "\t| Top 2:", EMOTIONS[top3[1]], "(Prob:", round(top3_prob[1]*100, 2),"%)", "\t| Top 3:", EMOTIONS[top3[2]], "(Prob:", round(top3_prob[2]*100, 2),"%)\n")
    return

# Confusion Matrix

In [None]:
predictions = predictions.cpu().numpy()
cm = confusion_matrix(Y_test, predictions)
print(EMOTIONS)
EMOTIONS= {0:'neutral',1:'happy', 2:'sad', 3:'angry', 4:'fear', 5:'disgust', 6:'ps'}
names = [EMOTIONS[ind] for ind in range(len(EMOTIONS))]
df_cm = pd.DataFrame(cm, index=names, columns=names)
plt.figure(figsize=(10,7))
sn.set(font_scale=1.4) # for label size
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}) # font size
plt.show()

# Plot loss

In [None]:
plt.plot(losses,'b')
plt.plot(val_losses,'r')
plt.legend(['train loss','val loss'])

# Visualizing mel spectrograms

In [None]:
audio = preprocessor.extract_audio_signals(["./TESS/YAF_happy/YAF_wife_happy.wav"])

Audio(Y,rate=48000)

In [None]:
librosa.display.specshow(mel_spectograms[2].squeeze(), y_axis='mel', x_axis='time',cmap='magma')
Audio(audio,rate=48000)

# Inferring Single or Multiple Samples

In [11]:
#Inferring Single Sample
#single_test = "./TESS/YAF_sad/YAF_wife_sad.wav"
single_test = "./CREMA_D/happy/1004_IEO_HAP_LO.wav"
infer(model,[single_test],mel_spectograms)
audio = preprocessor.extract_audio_signals([single_test])
Audio(audio,rate=48000)

Audio File:  ./CREMA_D/happy/1004_IEO_HAP_LO.wav
Predicted Emotions:  disgust 	| Ground Truth:  LO
Top 1:  disgust Prob:  54.003 % 	| Top 2:  ps Prob:  41.182 % 	| Top 3:  neutral Prob:  2.27 %
 extract_audio_signals: Processed 0/1 files

In [62]:
#Inferring a list of Samples
file_ls = ["./TESS/YAF_sad/YAF_wife_sad.wav","./TESS/YAF_fear/YAF_wife_fear.wav","./TESS/YAF_happy/YAF_wife_happy.wav","./TESS/YAF_neutral/YAF_wife_neutral.wav"]
infer(model, file_ls ,mel_spectograms)

 extract_mel_spectograms: Processed 3/4 files

Audio File:  ./TESS/YAF_sad/YAF_wife_sad.wav
Predicted Emotions:  sad 	| Ground Truth:  sad
Top 1: sad (Prob: 99.77 %) 	| Top 2: angry (Prob: 0.06 %) 	| Top 3: fear (Prob: 0.06 %)

Audio File:  ./TESS/YAF_fear/YAF_wife_fear.wav
Predicted Emotions:  fear 	| Ground Truth:  fear
Top 1: fear (Prob: 99.92 %) 	| Top 2: angry (Prob: 0.04 %) 	| Top 3: ps (Prob: 0.02 %)

Audio File:  ./TESS/YAF_happy/YAF_wife_happy.wav
Predicted Emotions:  happy 	| Ground Truth:  happy
Top 1: happy (Prob: 100.0 %) 	| Top 2: angry (Prob: 0.0 %) 	| Top 3: fear (Prob: 0.0 %)

Audio File:  ./TESS/YAF_neutral/YAF_wife_neutral.wav
Predicted Emotions:  neutral 	| Ground Truth:  neutral
Top 1: neutral (Prob: 98.16 %) 	| Top 2: angry (Prob: 0.85 %) 	| Top 3: ps (Prob: 0.69 %)



In [13]:
#Inferring a folder of Samples
folder_name = "./TESS/YAF_happy/"
#folder_name = "./CREMA_D/happy/"
folder = os.listdir(folder_name)
folder = [folder_name+file for file in folder]
infer(model, folder, mel_spectograms)

 extract_mel_spectograms: Processed 499/500 files

Audio File:  ./CREMA_D/happy/1015_ITS_HAP_XX.wav
Predicted Emotions:  disgust 	| Ground Truth:  XX
Top 1: disgust (Prob: 85.94 %) 	| Top 2: fear (Prob: 5.46 %) 	| Top 3: angry (Prob: 5.38 %)

Audio File:  ./CREMA_D/happy/1025_IWW_HAP_XX.wav
Predicted Emotions:  ps 	| Ground Truth:  XX
Top 1: ps (Prob: 59.73 %) 	| Top 2: disgust (Prob: 24.68 %) 	| Top 3: angry (Prob: 7.03 %)

Audio File:  ./CREMA_D/happy/1046_IOM_HAP_XX.wav
Predicted Emotions:  disgust 	| Ground Truth:  XX
Top 1: disgust (Prob: 50.21 %) 	| Top 2: fear (Prob: 33.33 %) 	| Top 3: neutral (Prob: 11.04 %)

Audio File:  ./CREMA_D/happy/1018_ITH_HAP_XX.wav
Predicted Emotions:  disgust 	| Ground Truth:  XX
Top 1: disgust (Prob: 39.08 %) 	| Top 2: fear (Prob: 32.28 %) 	| Top 3: ps (Prob: 26.33 %)

Audio File:  ./CREMA_D/happy/1074_IEO_HAP_HI.wav
Predicted Emotions:  ps 	| Ground Truth:  HI
Top 1: ps (Prob: 96.8 %) 	| Top 2: fear (Prob: 1.32 %) 	| Top 3: angry (Prob: 1.06 %)

Aud