In [1]:
import numpy as np
import cv2
import pandas as pd

import torch
from torch import nn


from scipy.io import wavfile

from torchsummary import summary

from tqdm import tqdm

import os
import shutil
import tarfile

import librosa
import random

import pickle

import time

import IPython.display as ipd

import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
import seaborn as sns

%matplotlib inline

In [61]:
def get_paths_to_wavs(path_to_dataset):
    """Recursively collect the paths of all '.wav' files below a directory.

    Args:
        path_to_dataset: root directory that is traversed with os.walk.

    Returns:
        A list with one path per matching file (the directory reported by
        os.walk joined with the file name), in os.walk order.
    """
    wav_paths = []

    for current_dir, _subdirs, file_names in os.walk(path_to_dataset):
        wav_paths.extend(
            os.path.join(current_dir, name)
            for name in file_names
            if name.endswith('.wav')
        )

    return wav_paths

def get_paths_to_npys(path_to_dataset):
    """Recursively collect the paths of all '.npy' files below a directory.

    Args:
        path_to_dataset: root directory that is traversed with os.walk.

    Returns:
        A list with one path per matching file (the directory reported by
        os.walk joined with the file name), in os.walk order.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, file_names in os.walk(path_to_dataset)
        for name in file_names
        if name.endswith('.npy')
    ]

class numpy_ravdess_dataset(torch.utils.data.Dataset):
    '''
    RAVDESS emotion dataset that yields log-magnitude STFT spectrograms.

    Every item is read from a wav file, optionally augmented (when
    ``mode == 'TRAIN'``), converted to a log-magnitude spectrogram and then
    cropped or zero-padded along the time axis to a fixed number of frames.

    Subclasses for other corpora override ``emotions_dict`` and
    ``get_class_label``.
    '''

    # class index -> emotion name (the index is the file-name emotion code minus one)
    emotions_dict = {
        0: 'neutral',
        1: 'calm',
        2: 'happy',
        3: 'sad',
        4: 'angry',
        5: 'fearful',
        6: 'disgust',
        7: 'surprised'
        }

    def __init__(self, paths_to_wavs_list, spectrogram_shape, mode):
        '''
        Args:
            paths_to_wavs_list: list of paths to the wav files of this split.
            spectrogram_shape: (rows, cols) = (frequency bins, time frames)
                of the spectrogram returned for every item.
            mode: 'TRAIN' enables random augmentation and random time
                offsets; any other value gives deterministic, centred crops.
        '''
        super(numpy_ravdess_dataset, self).__init__()

        self.paths_to_wavs_list = paths_to_wavs_list

        self.mfcc_rows = spectrogram_shape[0]
        self.mfcc_cols = spectrogram_shape[1]
        self.mode = mode

    def __len__(self):
        return len(self.paths_to_wavs_list)

    def read_audio(self, path_to_wav):
        '''
        Read a wav file with scipy and scale the samples to float32.

        Returns:
            (wav, sr): the samples divided by 32768 as float32, and the
            sample rate reported by the file.
        '''
        sr, wav = wavfile.read(path_to_wav)
        # NOTE(review): the 32768 divisor presumably assumes 16-bit PCM input - confirm
        wav = (wav / 32768).astype(np.float32)
        return wav, sr

    def get_class_label(self, path_to_file):
        '''
        Parse the class index from a RAVDESS file name.

        The file name has the following pattern:
        modality-vocal_channel-emotion-intensity-statement-repetition-actor.wav
        e.g., '02-01-06-01-02-01-12.wav' -> emotion code 6 -> class index 5.
        '''
        file_name = os.path.split(path_to_file)[1]
        file_name = file_name[:-4]  # drop the 4-character extension ('.wav')
        class_label = int(file_name.split('-')[2]) - 1  # field 2 is the emotion code
        return class_label

    def _augment(self, wav, sr):
        '''Randomly add noise, time-stretch and pitch-shift; each is applied with probability 0.5.'''
        # additive Gaussian noise
        if np.random.randint(0, 2) == 1:
            sigma = np.random.uniform(0.0009, 0.0051)
            noise = sigma * np.random.randn(len(wav))
            wav += noise
        # time stretching
        if np.random.randint(0, 2) == 1:
            factor = np.random.uniform(0.5, 1.2)
            # use the sampled factor (it was previously ignored in favour of a
            # constant 2); `rate` is passed by keyword, which also works on
            # librosa versions where it is keyword-only
            wav = librosa.effects.time_stretch(wav, rate=factor)
        # pitch shifting
        if np.random.randint(0, 2) == 1:
            factor = np.random.uniform(-1.5, 1.1)
            wav = librosa.effects.pitch_shift(wav, sr=sr, n_steps=factor)
        return wav

    def _log_spectrogram(self, wav):
        '''Return log(|STFT| + 0.1) as float32 with exactly ``self.mfcc_rows`` frequency rows.'''
        # n_fft = 2 * rows gives rows + 1 frequency bins; the last one is dropped below
        spec = librosa.core.stft(wav, n_fft=self.mfcc_rows * 2)
        spec = np.log(np.abs(spec) + 0.1)
        spec = spec[:-1]
        # keep one dtype on every code path so that batches collate to float32
        return spec.astype(np.float32, copy=False)

    def _fit_to_width(self, spec):
        '''
        Crop or zero-pad ``spec`` along the time axis to ``self.mfcc_cols`` frames.

        In 'TRAIN' mode the crop / paste offset is random (a primitive
        time-shifting augmentation); otherwise it is centred.
        '''
        actual_cols = spec.shape[1]
        target_real_diff = actual_cols - self.mfcc_cols

        if target_real_diff > 0:
            # too long: cut a window of the target width
            if self.mode == 'TRAIN':
                beginning_col = np.random.randint(target_real_diff)
            else:
                beginning_col = actual_cols // 2 - self.mfcc_cols // 2
            return spec[:, beginning_col:beginning_col + self.mfcc_cols]

        if target_real_diff < 0:
            # too short: paste into a zero canvas of the target shape
            canvas = np.zeros((self.mfcc_rows, self.mfcc_cols), dtype=np.float32)
            if self.mode == 'TRAIN':
                beginning_col = np.random.randint(self.mfcc_cols - actual_cols)
            else:
                beginning_col = self.mfcc_cols // 2 - actual_cols // 2
            canvas[..., beginning_col:beginning_col + actual_cols] = spec
            return canvas

        return spec

    def __getitem__(self, idx):
        '''
        Returns:
            (spectrogram, class_label): the spectrogram is a tensor of shape
            (mfcc_cols, mfcc_rows), i.e. indexed by time frame first, and
            class_label is the integer returned by ``get_class_label``.
        '''
        path_to_wav = self.paths_to_wavs_list[idx]

        wav, sr = self.read_audio(path_to_wav)

        if self.mode == 'TRAIN':
            wav = self._augment(wav, sr)

        mfccs = self._fit_to_width(self._log_spectrogram(wav))

        class_label = self.get_class_label(path_to_wav)

        # transpose so that the first index runs over the time windows
        return torch.from_numpy(mfccs).transpose(1, 0), class_label

class numpy_crema_dataset(numpy_ravdess_dataset):
    '''CREMA-D variant of the dataset; only the label parsing differs from the parent.'''

    # emotion code found in the file name -> class index
    emotions_dict = dict(ANG=0, DIS=1, FEA=2, SAD=3, HAP=4, NEU=5)

    # class index -> emotion code (the inverse of emotions_dict)
    label2str = {index: code for code, index in emotions_dict.items()}

    def get_class_label(self, path_to_file):
        '''
        Return the class index encoded in a CREMA-D file name.

        The name (without its last four characters) is split on '_' and the
        third token is looked up in ``emotions_dict``.
        '''
        stem = os.path.basename(path_to_file)[:-4]
        emotion_code = stem.split('_')[2]  # token 2 holds the emotion code
        return self.emotions_dict[emotion_code]

class numpy_iemocap_dataset(numpy_ravdess_dataset):
    '''
    IEMOCAP variant of the dataset; only the label parsing differs from the parent.

    Six emotion codes are mapped to class indices. A file whose name ends in
    any other code makes ``get_class_label`` raise KeyError.
    '''

    # emotion code found in the file name -> class index
    # (a wider mapping that also contained 'sur', 'fea' and 'dis' was disabled earlier)
    emotions_dict = {
        code: index
        for index, code in enumerate(('exc', 'sad', 'fru', 'hap', 'neu', 'ang'))
    }

    def get_class_label(self, path_to_file):
        '''
        Return the class index encoded in an IEMOCAP file name.

        The emotion code is the last '_'-separated token of the file name
        after its last four characters have been removed.
        '''
        stem = os.path.basename(path_to_file)[:-4]
        emotion_code = stem.rsplit('_', 1)[-1]
        return self.emotions_dict[emotion_code]

class crema_gender_dataset(numpy_ravdess_dataset):
    '''
    CREMA-D dataset whose classes are (emotion, speaker sex) pairs.

    The emotion code is parsed from the file name and the speaker's sex is
    looked up in ``gender_df`` by actor id, giving 12 classes in total.
    '''

    # '<emotion code>_<Sex value>' -> class index
    emotions_dict = {
        'ANG_Male': 0,
        'DIS_Male': 1,
        'FEA_Male': 2,
        'SAD_Male': 3,
        'HAP_Male': 4,
        'NEU_Male': 5,
        'ANG_Female': 6,
        'DIS_Female': 7,
        'FEA_Female': 8,
        'SAD_Female': 9,
        'HAP_Female': 10,
        'NEU_Female': 11
    }

    # class index -> emotion code, for all 12 labels.
    # Entries 6-11 were missing, so label2str[label] raised KeyError for every
    # '*_Female' sample; they map to the same emotion codes as 0-5, which are
    # left unchanged.
    label2str = {
        0: 'ANG',
        1: 'DIS',
        2: 'FEA',
        3: 'SAD',
        4: 'HAP',
        5: 'NEU',
        6: 'ANG',
        7: 'DIS',
        8: 'FEA',
        9: 'SAD',
        10: 'HAP',
        11: 'NEU'
    }

    def __init__(self, paths_to_wavs_list, spectrogram_shape, mode, gender_df):
        '''
        Args:
            paths_to_wavs_list, spectrogram_shape, mode: forwarded to the parent.
            gender_df: DataFrame with an 'ActorID' column (compared with the
                integer parsed from the file name) and a 'Sex' column whose
                values are used to build the keys of ``emotions_dict``.
        '''
        super().__init__(paths_to_wavs_list, spectrogram_shape, mode)
        self.gender_df = gender_df

    def get_class_label(self, path_to_file):
        '''
        Return the (emotion, sex) class index for a CREMA-D file.

        Raises:
            KeyError: if the actor id is not present in ``gender_df`` or the
                resulting '<emotion>_<sex>' key is not in ``emotions_dict``.
        '''
        file_name = os.path.split(path_to_file)[1]
        file_name = file_name[:-4]
        name_list = file_name.split('_')
        emotion_name = name_list[2]  # token 2 holds the emotion code
        actor_id = int(name_list[0])  # token 0 holds the actor id

        sexes = self.gender_df.loc[self.gender_df['ActorID'] == actor_id, 'Sex'].values
        if len(sexes) == 0:
            # fail with a readable message instead of an IndexError from .values[0]
            raise KeyError('ActorID {} is not present in gender_df'.format(actor_id))
        gender = sexes[0]

        return self.emotions_dict['{}_{}'.format(emotion_name, gender)]

In [88]:
class audio_rnn(nn.Module):
    '''
    Recurrent classifier over a sequence of spectrogram frames.

    The input batch is (batch, seq_len, input_dim). It is fed to an LSTM or
    GRU and the final hidden representation is classified by a linear layer.
    Subclasses change the read-out by overriding ``compute_output``.
    '''

    def __init__(self, rnn, layer_num, input_dim, hidden_dim, class_num, device, bidirectional=False):
        '''
        Args:
            rnn: recurrent layer class, ``nn.LSTM`` or ``nn.GRU``.
            layer_num: number of stacked recurrent layers.
            input_dim: number of features per time step.
            hidden_dim: hidden size of the recurrent layer (per direction).
            class_num: number of output classes.
            device: device on which the initial hidden states are created.
            bidirectional: whether the recurrent layer is bidirectional.
        '''
        super().__init__()

        self.layer_num = layer_num
        self.hidden_dim = hidden_dim
        self.device = device
        self.num_directions = 2 if bidirectional else 1

        self.rnn = rnn(input_size=input_dim,
                       hidden_size=hidden_dim,
                       num_layers=layer_num,
                       batch_first=False,
                       bidirectional=bidirectional,
                       # dropout only acts between stacked layers; passing a
                       # non-zero value with a single layer just emits a warning
                       dropout=0.5 if layer_num > 1 else 0.0)

        # A bidirectional network feeds twice as many features to the classifier
        self.fc = nn.Linear(in_features=hidden_dim * self.num_directions, out_features=class_num)

    def init_hidden(self, batch_size):
        '''Return zero-initialised (hidden, cell) states created on ``self.device``.'''
        shape = (self.num_directions * self.layer_num, batch_size, self.hidden_dim)
        hidden = torch.zeros(shape, device=self.device)
        cell = torch.zeros(shape, device=self.device)
        return (hidden, cell)

    def compute_output(self, output):
        '''
        Map the recurrent outputs (seq_len, batch, num_directions * hidden_dim)
        to class scores (batch, class_num).
        '''
        if self.num_directions == 2:
            # For a bidirectional network the first half of the feature axis
            # belongs to the forward pass and the second half to the backward
            # pass. The forward pass has seen the whole sequence at the LAST
            # time step, the backward pass at the FIRST time step, so the
            # classifier gets output[-1, :, :half] and output[0, :, half:].
            half = output.size(2) // 2
            forward_last = output[-1, :, :half]
            backward_last = output[0, :, half:]
            result = self.fc(torch.cat([forward_last, backward_last], dim=1))
        else:
            result = self.fc(output[-1])

        return result

    def forward(self, batch):
        '''
        Args:
            batch: tensor of shape (batch, seq_len, input_dim).

        Returns:
            Class scores of shape (batch, class_num).

        Raises:
            ValueError: if ``self.rnn`` is neither ``nn.LSTM`` nor ``nn.GRU``.
        '''
        batch_size = batch.shape[0]

        # the recurrent layer was built with batch_first=False
        batch = batch.transpose(1, 0)

        h0, c0 = self.init_hidden(batch_size=batch_size)

        # A GRU has no memory cell, so only the hidden state h is passed
        if isinstance(self.rnn, nn.GRU):
            output, hn = self.rnn(batch, h0)
        elif isinstance(self.rnn, nn.LSTM):
            output, (hn, cn) = self.rnn(batch, (h0, c0))
        else:
            raise ValueError('self.rnn should be torch.nn.LSTM or torch.nn.GRU')

        return self.compute_output(output)

# The base class was `audio_rnn_last`, which is not defined in this notebook, so
# the cell raised NameError. `audio_rnn` is the last-step read-out model here.
class audio_rnn_avg(audio_rnn):
    '''Variant of ``audio_rnn`` that classifies the time-average of the recurrent outputs.'''

    def compute_output(self, output):
        '''
        Args:
            output: recurrent outputs with the time axis first
                (the recurrent layer is built with batch_first=False).

        Returns:
            Class scores computed from the mean over the time axis.
        '''
        output = output.mean(dim=0)
        result = self.fc(output)

        return result

class audio_rnn_attention(audio_rnn):
    '''
    Variant of ``audio_rnn`` with attention pooling over time.

    A linear layer scores every time step, the scores are soft-maxed over the
    sequence, and the weighted sum of the recurrent outputs is classified.
    '''

    def __init__(self, rnn, layer_num, input_dim, hidden_dim, class_num, device, bidirectional=False):
        # forward the caller's `bidirectional` flag (it used to be forced to False)
        super().__init__(rnn, layer_num, input_dim, hidden_dim, class_num, device, bidirectional=bidirectional)
        # the recurrent output has hidden_dim features per direction
        self.attention = nn.Linear(in_features=hidden_dim * self.num_directions, out_features=1)

    def compute_output(self, x):
        '''
        Args:
            x: recurrent outputs of shape (seq_len, batch, features).

        Returns:
            Class scores of shape (batch, class_num).
        '''
        # (seq_len, batch, features) -> (batch, seq_len, features)
        x = x.transpose(1, 0)

        # attention coefficients, normalised over the time axis: (batch, seq_len, 1)
        alphas = torch.softmax(self.attention(x), dim=1)

        # weight every time step by its coefficient and sum over time;
        # broadcasting over the feature axis replaces the earlier per-step bmm
        context = (x * alphas).sum(dim=1)

        return self.fc(context)

In [95]:

# CREMA-D
target_path = '/media/mikhail/files/datasets/emotion_recognition/CREMA-D/AudioWAV'
# IEMOCAP
#target_path = '/media/mikhail/files/datasets/emotion_recognition/IEMOCAP/IEMOCAP_full_release/audios'

# (frequency rows, time frames) of every spectrogram, and the loader settings,
# shared by the train and test splits
spectrogram_shape = (128, 256)
batch_size = 256
num_workers = 4

# os.walk lists files in a filesystem-dependent order, so sort before the
# seeded shuffle; otherwise the same seed gives a different train/test split
# on another machine or after the directory changes.
# NOTE(review): this changes the split compared with earlier unsorted runs, so
# do not mix it with weights that were trained on the old split.
npys_list = sorted(get_paths_to_wavs(target_path))

# shuffle the dataset for the stability of the learning process
random.seed(10)
random.shuffle(npys_list)

dataset_size = len(npys_list)

train_size = int(0.8 * dataset_size)

print(dataset_size)

train_dataset = numpy_crema_dataset(npys_list[:train_size], spectrogram_shape, mode='TRAIN')
#train_dataset = numpy_iemocap_dataset(npys_list[:train_size], (256, 256), mode='TRAIN')

train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

# set up test dataset and test dataloader
test_dataset = numpy_crema_dataset(npys_list[train_size:], spectrogram_shape, mode='TEST')
#test_dataset = numpy_iemocap_dataset(npys_list[train_size:], (256, 256), mode='TEST')

test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

7442


In [96]:
# set-up devices
cuda = torch.device('cuda:0')
cpu = torch.device('cpu')

# fall back to the CPU when no GPU is visible, so the notebook still runs
device = cuda if torch.cuda.is_available() else cpu

rnn = nn.GRU  # Available models: nn.LSTM, nn.GRU; bidirectional = True/False
bidirectional = False
layer_num = 1

# input_dim must equal the number of spectrogram rows produced by the dataset
model = audio_rnn(layer_num=layer_num, rnn=rnn, input_dim=128, hidden_dim=64, class_num=len(train_dataset.emotions_dict), device=device, bidirectional=bidirectional)

#summary(model, input_size=(256, 1, 128), batch_size=32, device='cpu')

model.to(device)

criterion = nn.CrossEntropyLoss()
# define an optimization algorithm and bind it with the NN parameters
optimizer = torch.optim.Adam(params=model.parameters())

starting_epoch = 0
ending_epoch = 1000
epoch_step = 1

# NOTE(review): the name says 'LSTM' although `rnn` above is nn.GRU. It is left
# unchanged because it is the directory and file prefix used when weights and
# history files are saved; renaming it would point to a different directory.
basic_name = '{}_log_mel_spec_256_emotion_LSTM'.format('CREMA')

path_to_weights = basic_name
path_to_pkl = basic_name

# create the output directories if they do not exist yet
os.makedirs(path_to_weights, exist_ok=True)
os.makedirs(path_to_pkl, exist_ok=True)

In [98]:
start_epoch = 500
epochs = 1000
epoch_step = 1

# NOTE(review): start_epoch > 0 only continues the epoch numbering. This cell
# does not load '<basic_name>_current.pth'; the weights are whatever `model`
# currently holds in the kernel.

print('Start learning')

train_dataset_size = len(train_dataloader.dataset)
test_dataset_size = len(test_dataloader.dataset)


def _history_path(metric_name):
    '''Path of the pickle file that stores the per-epoch history of one metric.'''
    return os.path.join(path_to_pkl, basic_name + '_{}.pkl'.format(metric_name))


def _load_history(metric_name):
    '''Return the stored history list of a metric, or an empty list if there is none.'''
    path = _history_path(metric_name)
    if not os.path.exists(path):
        return []
    # NOTE: pickle.load can execute arbitrary code - only load files written by this notebook
    with open(path, 'rb') as f:
        return pickle.load(f)


def _save_history(metric_name, values):
    '''Overwrite the stored history list of a metric.'''
    with open(_history_path(metric_name), 'wb') as f:
        pickle.dump(values, f)


# continue the histories of a previous run if they exist
train_loss_list = _load_history('train_loss')
train_acc_list = _load_history('train_acc')
val_loss_list = _load_history('val_loss')
val_acc_list = _load_history('val_acc')

# Resume the "best so far" threshold from the stored history. Starting from 0.0
# on every resume made the first resumed epoch always count as a new best.
best_acc = max(val_acc_list, default=0.0)

t = 0.0

for epoch_idx in range(start_epoch, epochs, epoch_step):

    print('#############################################')
    print('#\tStart training process')
    print('#############################################\n\n')

    # iterate over epochs
    for epoch in range(epoch_step):
        print('Epoch #{}'.format(epoch_idx + epoch))
        t0 = time.time()
        model.train()
        # loss and number of correct predictions accumulated over the epoch
        epoch_train_loss = 0.0
        correct = 0
        total = 0

        # iterate over batches
        for data, labels in train_dataloader:
            data = data.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            pred = model(data)
            loss = criterion(pred, labels)
            loss.backward()
            optimizer.step()
            # criterion returns the batch mean, so weight it by the batch size
            epoch_train_loss += loss.item() * data.size(0)
            total += labels.size(0)
            pred_labels = pred.argmax(dim=1)

            correct += (pred_labels == labels).sum().item()

        t1 = time.time()

        print('Epoch time = {:.3f} s'.format(t1 - t0))

        train_loss = epoch_train_loss / train_dataset_size
        train_acc = correct/total

        train_acc_list.append(train_acc)
        train_loss_list.append(train_loss)

        print('Loss = %f\tTraining acc = %f' % (train_loss, train_acc))
        print('----------------------------------------')

        print('#############################################')
        print('#\tStart validation on %d epoch' % (epoch_idx + epoch))
        print('#############################################')

        model.eval()
        with torch.no_grad():
            epoch_test_loss = 0.0
            correct = 0
            total = 0
            for data, labels in test_dataloader:
                data = data.to(device)
                labels = labels.to(device)
                # run the model
                pred = model(data)
                loss = criterion(pred, labels)
                epoch_test_loss += loss.item() * data.size(0)
                total += labels.size(0)
                pred_labels = pred.argmax(dim=1)
                correct += (pred_labels == labels).sum().item()
        val_acc = correct/total
        val_loss = epoch_test_loss / test_dataset_size
        val_acc_list.append(val_acc)
        val_loss_list.append(val_loss)

        print('\tLoss = {:.4f}\tValidation acc = {:.3f}'.format(val_loss, val_acc))
        print('---------------------------------------------')

        # Save the current weights after every epoch so that training can later
        # be resumed from the last epoch rather than from the epoch with the
        # best weights.
        path_to_saving_model = os.path.join(path_to_weights, basic_name + '_current.pth')
        torch.save(model.state_dict(), path_to_saving_model)

        if val_acc > best_acc:
            print('#############################################')
            print('#\tBest accuracy has achieved')
            print('#\tSaving weights...')
            print('#############################################\n\n')

            model_name = basic_name + '_ep-{}_loss-{:.3}_acc-{:.3}.pth'.format(epoch_idx + epoch, val_loss, val_acc)
            path_to_saving_model = os.path.join(path_to_weights, model_name)

            torch.save(model.state_dict(), path_to_saving_model)
            print('model {} have been saved'.format(path_to_saving_model))
            best_acc = val_acc

        # persist the histories after every epoch
        _save_history('train_loss', train_loss_list)
        _save_history('train_acc', train_acc_list)
        _save_history('val_loss', val_loss_list)
        _save_history('val_acc', val_acc_list)
    

 validation on 954 epoch
#############################################
	Loss = 1.3418	Validation acc = 0.510
---------------------------------------------
#############################################
#	Start training process
#############################################


Epoch #955
Epoch time = 60.712 s
Loss = 1.000806	Training acc = 0.618512
----------------------------------------
#############################################
#	Start validation on 955 epoch
#############################################
	Loss = 1.3811	Validation acc = 0.504
---------------------------------------------
#############################################
#	Start training process
#############################################


Epoch #956
Epoch time = 61.079 s
Loss = 1.031189	Training acc = 0.606585
----------------------------------------
#############################################
#	Start validation on 956 epoch
#############################################
	Loss = 1.3664	Validation acc = 0.498
---------

In [91]:
# Debug cell: run the model once on `data`. `data` is not defined in this cell;
# presumably it is the last batch left over from a training/validation loop in
# the kernel - TODO confirm (this cell does not work on a fresh kernel by itself).
result = model(data.to(device))

In [92]:
# Inspect the shape of the model output computed in the previous cell.
result.shape

torch.Size([64, 6])