# Configuration

NOTE: The warnings shown after the imports appear because TensorFlow 2.x looks for a GPU on the system by default. They can be ignored if you are not going to use a GPU.

In [83]:
# NOTE(review): IPython runs each `!` command in its own short-lived subshell, so this
# activation presumably does not carry over to the kernel or to later cells — confirm
# that the kernel itself was started from `myenv`.
!source myenv/bin/activate

In [84]:
import os
import librosa
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
sns.set_style('whitegrid')
import IPython.display as ipd
import librosa.display
import numpy as np
import pickle
import scipy
import ipywidgets
import math

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold


from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Conv2D, AveragePooling1D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import SGD, Adam 
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import regularizers

# from livelossplot import PlotLossesKeras
# Ask TensorFlow which GPUs it can see; the recorded output of this cell is an empty
# list, i.e. no GPU was visible when the notebook was last run.
tf.config.list_physical_devices('GPU')

[]

# Utils

# Build the dataset dataframes and split them into Train, Val and Test

In [85]:
# Root folder that holds every speech-emotion dataset used by this notebook.
main_path = '/media/helemanc/OS/Users/i2CAT/Desktop/Datasets SER/'

# Per-dataset locations, each resolved relative to `main_path`.
TESS, RAV, SAVEE, CREMA = (
    os.path.join(main_path, relative_dir)
    for relative_dir in (
        "tess/TESS Toronto emotional speech set data/",
        "ravdess-emotional-speech-audio/audio_speech_actors_01-24",
        "savee/ALL/",
        "creamd/AudioWAV/",
    )
)

In [86]:
# Parallel accumulators: position i in every list describes the same audio file.
lst = []
emotion, voc_channel, modality = [], [], []
intensity, phrase, actors = [], [], []
full_path = []

for root, dirs, files in tqdm(os.walk(RAV)):
    for file in files:
        # Every metadata code is read from a fixed character offset of the file name
        # (presumably the RAVDESS "MM-VV-EE-II-SS-RR-AA.wav" scheme — TODO confirm).
        try:
            codes = [
                int(file[1:2]),    # modality
                int(file[4:5]),    # vocal channel
                int(file[7:8]),    # emotion label
                int(file[10:11]),  # intensity
                int(file[13:14]),  # phrase
                int(file[18:20]),  # actor id
            ]
        except ValueError:
            # The name does not carry digits at the expected offsets: skip the file.
            continue

        targets = (modality, voc_channel, emotion, intensity, phrase, actors)
        for target, code in zip(targets, codes):
            target.append(code)

        # Keep (directory, file name); a later cell joins the pair into a full path.
        full_path.append((root, file))

25it [00:00, 1083.71it/s]


In [87]:
# Emotion codes: 01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry,
# 06 = fearful, 07 = disgust, 08 = surprised.
# "calm" is merged into "neutral", which is why 'neutral' appears twice below.
emotions_list = ['neutral', 'neutral', 'happy', 'sadness', 'angry', 'fear', 'disgust', 'surprise']
emotion_dict = {code: name for code, name in enumerate(emotions_list, start=1)}

# One row per audio file; `actors` is passed twice because the second copy is
# overwritten with the gender derived from the actor id further down.
df = pd.DataFrame([emotion, voc_channel, modality, intensity, actors, actors,phrase, full_path]).T
df.columns = ['emotion', 'voc_channel', 'modality', 'intensity', 'actors', 'gender', 'phrase', 'path']

# Replace the numeric codes with readable labels, column by column.
code_maps = {
    'emotion': emotion_dict,
    'voc_channel': {1: 'speech', 2:'song'},
    'modality': {1: 'full AV', 2:'video only', 3:'audio only'},
    'intensity': {1: 'normal', 2:'strong'},
    'phrase': {1: 'Kids are talking by the door', 2:'Dogs are sitting by the door'},
}
for column, mapping in code_maps.items():
    df[column] = df[column].map(mapping)

# Even actor ids are labelled female, odd ones male.
df['gender'] = df['actors'].apply(lambda actor_id: 'male' if actor_id % 2 else 'female')

# Turn each (directory, file name) pair into a single path string.
df['path'] = df['path'].apply(lambda dir_and_file: dir_and_file[0] + '/' + dir_and_file[1])

In [88]:
# Discard any file whose path mentions 'noise', so that the same noise can later be
# applied to every remaining file for data augmentation.
df = df.loc[~df['path'].str.contains('noise')]

In [89]:
# Keep the spoken recordings only (rows whose vocal channel is 'speech').
RAV_df = df.loc[df['voc_channel'] == 'speech']

In [90]:
# Copy the emotion column into a leading `emotion_label` column (modifies RAV_df in place).
RAV_df.insert(loc=0, column="emotion_label", value=RAV_df["emotion"], allow_duplicates=True)

In [91]:
# Keep only the columns used downstream: emotion_label, actors, gender and path.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 and later reject with a TypeError.
RAV_df = RAV_df.drop(columns=['emotion', 'voc_channel', 'modality', 'intensity', 'phrase'])

In [92]:
# Row buckets for the three splits; they are filled by the actor-based loop in the next cell.
RAV_train, RAV_val, RAV_test = [], [], []

In [93]:
# Speaker-independent split: actors 1-20 go to train, 21-22 to validation and
# 23-24 to test, so no actor appears in more than one subset.
split_by_actor_range = (
    (range(1, 21), RAV_train),
    (range(21, 23), RAV_val),
    (range(23, 25), RAV_test),
)
for index, row in RAV_df.iterrows():
    actor_id = row['actors']
    for actor_range, subset_rows in split_by_actor_range:
        if actor_id in actor_range:
            subset_rows.append(row)
            break
len(RAV_train), len(RAV_val), len(RAV_test)

(1200, 120, 120)

In [94]:
# Turn each list of collected rows into a DataFrame (the names are rebound in place).
RAV_train, RAV_val, RAV_test = (
    pd.DataFrame(rows) for rows in (RAV_train, RAV_val, RAV_test)
)

In [95]:
# The actor id was only needed to build the split, so remove it from every subset.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 and later reject with a TypeError.
RAV_train = RAV_train.drop(columns=['actors'])
RAV_val = RAV_val.drop(columns=['actors'])
RAV_test = RAV_test.drop(columns=['actors'])

In [96]:
# Give every subset a fresh 0..n-1 index; the old labels are discarded.
df_train, df_val, df_test = (
    subset.reset_index(drop=True) for subset in (RAV_train, RAV_val, RAV_test)
)

df_train.head()

Unnamed: 0,emotion_label,gender,path
0,disgust,male,/media/helemanc/OS/Users/i2CAT/Desktop/Dataset...
1,disgust,male,/media/helemanc/OS/Users/i2CAT/Desktop/Dataset...
2,disgust,male,/media/helemanc/OS/Users/i2CAT/Desktop/Dataset...
3,disgust,male,/media/helemanc/OS/Users/i2CAT/Desktop/Dataset...
4,disgust,male,/media/helemanc/OS/Users/i2CAT/Desktop/Dataset...


# Create Noise Files

In [99]:
from pydub import AudioSegment
import random 
from pydub.utils import make_chunks


# Noise recordings and output folder used when the caller does not pass any.
_DEFAULT_NOISE_PATHS = (
    '/home/helemanc/Desktop/Binary_Model/noise_sounds/freight_train.wav',
    '/home/helemanc/Desktop/Binary_Model/noise_sounds/inside_train.wav',
    '/home/helemanc/Desktop/Binary_Model/noise_sounds/small_crowd.wav',
)
_DEFAULT_NOISE_OUTPUT_DIR = '/home/helemanc/Desktop/Binary_Model/noise_datasets/ravdess/train'


def _noise_gain_db(speech_dbfs, noise_dbfs, headroom_db):
    """Return the gain (in dB) that places the noise `headroom_db` below the speech."""
    if noise_dbfs == float('-inf'):
        # Digitally silent noise: no finite gain exists and none is needed.
        return 0.0
    # Signed on purpose: the result is negative when the noise is louder than the
    # target level and positive when it is quieter.
    return speech_dbfs - headroom_db - noise_dbfs


def _pick_noise_chunk(noise, length_ms, rng):
    """Pick a random `length_ms`-long slice of `noise`, ignoring a shorter tail."""
    chunks = make_chunks(noise, length_ms)
    if not chunks:
        raise ValueError('noise recording is empty')
    # The last chunk holds whatever is left over and may be shorter than the clip;
    # overlaying it would leave part of the clip without noise.
    if len(chunks) > 1 and len(chunks[-1]) < len(chunks[0]):
        chunks = chunks[:-1]
    return rng.choice(chunks)


def create_noise_files(df_train, df_val, df_test, noise_paths=None,
                       output_dir=None, headroom_db=2, seed=None):
    '''
    Write a noisy copy of every training clip and return the enlarged training frame.

    Noise is applied to the training files only, so the training set doubles in size
    while the validation and test frames are returned untouched.

    For each row of ``df_train`` the clip at ``row['path']`` is overlaid with a random
    slice of a randomly chosen noise recording. The slice is first scaled so that its
    level sits ``headroom_db`` dB below the level of the clip, and the result is
    exported as a wav file with the same file name inside ``output_dir``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows with the columns 'emotion_label', 'gender' and 'path'.
    df_val, df_test : pandas.DataFrame
        Returned unchanged.
    noise_paths : sequence of str, optional
        Noise recordings to draw from. Defaults to the three recordings under
        ``Binary_Model/noise_sounds``.
    output_dir : str, optional
        Folder receiving the noisy wav files; created when missing. Defaults to
        ``Binary_Model/noise_datasets/ravdess/train``.
    headroom_db : float, optional
        How many dB below the clip level the noise is placed (default 2).
    seed : int, optional
        Seed for a private random generator, for reproducible augmentation. When
        omitted, the global ``random`` module state is used.

    Returns
    -------
    tuple
        ``(df_train_combined, df_val, df_test)`` where ``df_train_combined`` holds the
        original training rows followed by one row per noisy file, with a fresh index.

    Raises
    ------
    ValueError
        If ``noise_paths`` is empty or a noise recording contains no audio.
    '''
    noise_paths = list(_DEFAULT_NOISE_PATHS if noise_paths is None else noise_paths)
    if not noise_paths:
        raise ValueError('noise_paths must contain at least one file')
    if output_dir is None:
        output_dir = _DEFAULT_NOISE_OUTPUT_DIR

    rng = random if seed is None else random.Random(seed)

    # Exporting fails when the target folder is missing, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)

    noise_cache = {}  # noise path -> decoded AudioSegment, so each file is read once
    noisy_rows = []

    for index, row in tqdm(df_train.iterrows(), total=len(df_train)):
        path = row['path']
        speech = AudioSegment.from_file(path)

        # Pick a noise recording at random and decode it on first use.
        noise_path = rng.choice(noise_paths)
        if noise_path not in noise_cache:
            noise_cache[noise_path] = AudioSegment.from_file(noise_path)

        # len() of an AudioSegment is its duration in milliseconds, so the slice is
        # as long as the clip itself.
        noise_chunk = _pick_noise_chunk(noise_cache[noise_path], len(speech), rng)

        # Scale the noise relative to the clip so it does not cover the voice.
        gain_db = _noise_gain_db(speech.dBFS, noise_chunk.dBFS, headroom_db)
        combined = speech.overlay(noise_chunk + gain_db)

        new_path = os.path.join(output_dir, os.path.basename(path))
        combined.export(new_path, format= 'wav')

        noisy_rows.append((row['emotion_label'], row['gender'], new_path))

    # Explicit columns keep the frame well-formed even when df_train has no rows.
    df_train_noise = pd.DataFrame(noisy_rows, columns=['emotion_label', 'gender', 'path'])
    df_train_combined = pd.concat([df_train, df_train_noise], ignore_index=True)

    return df_train_combined, df_val, df_test
# have to save df 

In [100]:
# Write one noisy copy per training clip and get back the enlarged training frame;
# the validation and test frames are returned by the function unchanged.
new_df_train, new_df_val, new_df_test = create_noise_files(df_train, df_val, df_test)

1200it [00:04, 293.93it/s]


In [101]:
# Sanity check on the split sizes: the recorded output shows 2400 training rows
# (the 1200 original clips plus one noisy copy each) and 120 rows each for val and test.
new_df_train.shape, new_df_val.shape, new_df_test.shape

((2400, 3), (120, 3), (120, 3))

## Save dataframes

In [102]:
# Folder that receives the train/val/test CSV files written by the following cells.
preprocess_path = "/home/helemanc/Desktop/Binary_Model/df_csv_noise/ravdess"

In [103]:
# Persist the augmented training split, without the index column.
train_csv_path = os.path.join(preprocess_path,"df_train.csv")
new_df_train.to_csv(train_csv_path, index=False)

In [104]:
# Persist the validation split, without the index column.
val_csv_path = os.path.join(preprocess_path,"df_val.csv")
new_df_val.to_csv(val_csv_path, index=False)

In [105]:
# Persist the test split, without the index column.
test_csv_path = os.path.join(preprocess_path,"df_test.csv")
new_df_test.to_csv(test_csv_path, index=False)

## Trial Code

In [17]:
# The three background-noise recordings used by the trial cell below.
path_noise_sound_1, path_noise_sound_2, path_noise_sound_3 = (
    '/home/helemanc/Desktop/Binary_Model/noise_sounds/freight_train.wav',
    '/home/helemanc/Desktop/Binary_Model/noise_sounds/inside_train.wav',
    '/home/helemanc/Desktop/Binary_Model/noise_sounds/small_crowd.wav',
)

In [18]:
# Folder into which the trial cell below exports its noisy wav file.
path_noise_dataset = '/home/helemanc/Desktop/Binary_Model/noise_datasets/ravdess/train'

In [67]:
# load a file 
from pydub import AudioSegment
import random 
from pydub.utils import make_chunks


# Scratch run of the noise-overlay steps on a single clip, printing the levels involved.
# `.iloc[0]` selects the first row by position: RAV_df keeps the index labels of the
# frame it was filtered from, so a label lookup with `[0]` is not guaranteed to exist.
sample_path = RAV_df.path.iloc[0]
sound1 = AudioSegment.from_file(sample_path)
samples, sr = librosa.load(sample_path, res_type='kaiser_fast', sr=16000)
duration = librosa.get_duration(y = samples, sr = sr)

# Pick one of the noise recordings at random.
noise_list = [path_noise_sound_1, path_noise_sound_2, path_noise_sound_3]
random_noise = random.choice(noise_list)
sound2 = AudioSegment.from_file(random_noise)

# Cut the noise into pieces as long as the clip.
chunk_length_ms = duration*1000 #ms
chunks = make_chunks(sound2, chunk_length_ms)
# The last piece holds the leftover audio and may be shorter than the clip; skip it
# so the noise always spans the whole clip.
if len(chunks) > 1 and len(chunks[-1]) < len(chunks[0]):
    chunks = chunks[:-1]

random_chunk = random.choice(chunks)
print(random_chunk.dBFS)

difference = random_chunk.dBFS - sound1.dBFS

abs_difference = abs(difference)
print(abs_difference)

# Use the signed difference so the noise always ends up 2 dB below the clip.
# Subtracting abs(difference) would push an already-quieter chunk even further down.
lower = random_chunk - difference - 2
print(lower.dBFS)
combined = sound1.overlay(lower)

# Export under the original file name inside the noise dataset folder.
parts = sample_path.split('/')
fname = parts[-1]

new_path = path_noise_dataset + '/' + fname

combined.export(new_path, format= 'wav')
print(sound1.dBFS)

-23.519969873065545
22.706629173010317
-48.23292428007521
-46.22659904607586
