In [None]:
# Extract the speech archive from the Drive path below into the local `data` directory.
!unzip -d data /content/drive/MyDrive/Neuron/Audio_Speech_Actors_01-24.zip

In [1]:
import os
import pandas as pd
import numpy as np

import librosa

from sklearn.utils import shuffle

In [2]:
# Root directory of the extracted dataset, relative to the notebook's working directory.
current_path = 'data/'

In [3]:
def create_file_path_list(path, seed=None, train_fraction=0.8):
    """Index the audio files under ``path`` and write the path/label CSVs.

    ``path`` is expected to contain one sub-directory per actor. Inside each
    sub-directory, only files whose stem (text before the first ``.``) splits
    into exactly seven ``-``-separated fields are used; the third field is the
    integer emotion code.

    Three CSV files with columns ``Emotions`` and ``Path`` are written to the
    current working directory:

    * ``data_path.csv``  - every indexed file,
    * ``train_path.csv`` - the rows selected for training,
    * ``test_path.csv``  - the remaining rows.

    Parameters
    ----------
    path : str
        Dataset root directory (with or without a trailing slash).
    seed : int or None, optional
        Seed for the shuffle and the train/test draw. ``None`` (the default)
        gives a different split on every call.
    train_fraction : float, optional
        Probability that a given row lands in the training set. The split is
        drawn independently per row, so the realised ratio is only
        approximately ``train_fraction``.

    Returns
    -------
    None
    """
    emotion_names = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
                     5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'}

    records = []
    # Sorted listings make the row order (and therefore a seeded split)
    # independent of the file system's enumeration order.
    for actor_dir in sorted(os.listdir(path)):
        actor_path = os.path.join(path, actor_dir)
        if not os.path.isdir(actor_path):
            # Stray files next to the actor folders would make listdir() fail.
            continue
        for file_name in sorted(os.listdir(actor_path)):
            fields = file_name.split('.')[0].split('-')
            if len(fields) != 7:
                continue
            code = int(fields[2])
            # Unknown codes are kept as the raw integer rather than dropped.
            label = emotion_names.get(code, code)
            records.append((label, os.path.join(actor_path, file_name)))

    # Labels are resolved before the frame is built, so no chained in-place
    # ``replace`` on a column is needed (that pattern does not update the
    # frame under pandas copy-on-write).
    ravdess_df = pd.DataFrame(records, columns=['Emotions', 'Path'])
    ravdess_df.to_csv("data_path.csv", index=False)

    rng = np.random.default_rng(seed)
    shuffled = ravdess_df.iloc[rng.permutation(len(ravdess_df))]
    in_train = rng.random(len(shuffled)) < train_fraction

    shuffled[in_train].to_csv("train_path.csv", index=False)
    shuffled[~in_train].to_csv("test_path.csv", index=False)

In [4]:
# Scan the dataset and write data_path.csv, train_path.csv and test_path.csv to the working directory.
create_file_path_list(current_path)

In [5]:
# Full file index written by create_file_path_list (every labelled recording).
data_path = pd.read_csv('data_path.csv')
# Train / test partitions of that index.
train_df = pd.read_csv('train_path.csv')
test_df = pd.read_csv('test_path.csv')

In [6]:
def noise(data):
    """Return ``data`` with random Gaussian noise added.

    The noise scale is a random fraction (drawn uniformly, below 3.5%) of the
    largest sample value in ``data``. A new array is returned; the input is
    left unmodified.
    """
    peak = np.amax(data)
    scale = 0.035 * np.random.uniform() * peak
    jitter = np.random.normal(size=data.shape[0])
    return data + scale * jitter

def stretch(data, rate=0.8):
    """Time-stretch the audio signal ``data`` by the factor ``rate``.

    Per librosa's documentation a ``rate`` below 1 slows the signal down and a
    ``rate`` above 1 speeds it up; pitch is left unchanged.
    """
    # `rate` must be passed by keyword: it is keyword-only in librosa >= 0.10,
    # and the keyword form is accepted by older releases as well.
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly rotate ``data`` by a random number of samples.

    The offset is drawn uniformly from roughly -5000 to +5000 samples;
    elements rolled past one end re-enter at the other.
    """
    offset = int(1000 * np.random.uniform(-5, 5))
    rolled = np.roll(data, offset)
    return rolled

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift the audio signal ``data``.

    Parameters
    ----------
    data : np.ndarray
        Audio samples.
    sampling_rate : int
        Sampling rate of ``data``.
    pitch_factor : float, optional
        Shift amount, forwarded to librosa as ``n_steps`` (fractional steps
        are allowed by librosa).
    """
    # `sr` and `n_steps` are keyword-only in librosa >= 0.10; the keyword form
    # is accepted by older releases as well.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

# Load one example clip (the second row of the index) to try the augmentation helpers on.
path = data_path.Path.iloc[1]
data, sample_rate = librosa.load(path)

In [7]:
def extract_features(data, sample_rate=22050):
    """Summarise an audio signal as one flat feature vector.

    Each feature below is computed frame-by-frame with librosa and then
    averaged over time; the per-feature means are concatenated in this order:

    1. zero-crossing rate
    2. chroma (from the STFT magnitude)
    3. 13 MFCCs
    4. mel spectrogram bands
    5. spectral contrast

    Parameters
    ----------
    data : np.ndarray
        Mono audio samples.
    sample_rate : int, optional
        Sampling rate of ``data``. Defaults to 22050, the rate
        ``librosa.load`` resamples to when no ``sr`` is given, so no file has
        to be read from disk just to discover it.

    Returns
    -------
    np.ndarray
        1-D float64 array of the concatenated time-averaged features.
    """
    stft_magnitude = np.abs(librosa.stft(data))

    frame_features = [
        # ZCR
        librosa.feature.zero_crossing_rate(y=data),
        # Chroma_stft
        librosa.feature.chroma_stft(S=stft_magnitude, sr=sample_rate),
        # MFCC
        librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=13),
        # Mel spectrogram
        librosa.feature.melspectrogram(y=data, sr=sample_rate),
        # Spectral contrast (`y` must be a keyword argument in librosa >= 0.10)
        librosa.feature.spectral_contrast(y=data, sr=sample_rate),
    ]

    # Average every feature over the time axis, then join them horizontally.
    means = [np.mean(feature.T, axis=0) for feature in frame_features]
    # Always return float64 regardless of the input dtype.
    return np.hstack(means).astype(np.float64, copy=False)

def get_features(path, data_train=True):
    """Load an audio file and return its feature vector(s).

    Parameters
    ----------
    path : str
        Audio file to load.
    data_train : bool, optional
        When equal to ``False`` only the features of the unmodified clip are
        returned (1-D array). Otherwise the result is a 2-D array with three
        rows: the unmodified clip, a noisy copy, and a time-stretched then
        pitch-shifted copy.
    """
    # Skip the first 0.6 s and read at most 2.5 s, to drop the silent lead-in
    # and tail of the recordings.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    # Features of the clip as loaded (no augmentation).
    plain = np.array(extract_features(data))

    # Deliberately an equality test: only values equal to False disable
    # augmentation.
    if data_train == False:
        return plain

    # Augmentation 1: additive noise.
    noisy = extract_features(noise(data))

    # Augmentation 2: time stretch followed by pitch shift.
    stretched_then_pitched = pitch(stretch(data), sample_rate)
    warped = extract_features(stretched_then_pitched)

    return np.vstack((plain, noisy, warped))

In [8]:
# Build the training table: get_features returns several feature rows per clip
# (the clip itself plus its augmented copies), and each row gets the clip's label.
X_train, Y_train = [], []
i = 0
for path, emotion in zip(train_df["Path"], train_df["Emotions"]):
    feature = get_features(path, data_train=True)
    if not i % 100:
        print(f"{i} processed elements.")
    for ele in feature:
        X_train.append(ele)
        # one label per feature row
        Y_train.append(emotion)
    i += 1

dataset = pd.DataFrame(X_train).assign(labels=Y_train)
dataset.to_csv('train_dataset_augmented.csv', index=False)

0 processed elements.
100 processed elements.
200 processed elements.
300 processed elements.
400 processed elements.
500 processed elements.
600 processed elements.
700 processed elements.
800 processed elements.
900 processed elements.
1000 processed elements.
1100 processed elements.


In [9]:
# Build the test table: one un-augmented feature vector per clip.
X_test, Y_test = [], []
i = 0
for path, emotion in zip(test_df.Path, test_df.Emotions):
    # data_train=False -> a single 1-D feature vector for this clip
    feature = get_features(path, data_train=False)
    if i%100 == 0:
        print(str(i) + " processed elements.")
    # Append this clip's own features (not a variable left over from another cell).
    X_test.append(feature)
    Y_test.append(emotion)
    i += 1
dataset = pd.DataFrame(X_test)
dataset['labels'] = Y_test
# File name kept as-is so anything reading it later still finds it.
dataset.to_csv('test_dataset_augmented.csv', index=False)

0 processed elements.
100 processed elements.
200 processed elements.
300 processed elements.
