In [90]:
import math
import os
from os import listdir
from os.path import isfile, join
from random import randint, seed

import keras.backend as K
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.signal
import tensorflow as tf
from pandas import DataFrame
from tensorflow.keras import layers

In [91]:
# Read Clean speech, Noise and Noisy speech training files
# Count how many samples of Clean speech are used for training and how many snr levels,
# there are snr levels of Noise and Noisy speech for every Clean speech audio
def _list_files(directory):
    """Return the sorted names of the regular files directly inside `directory`.

    os.listdir returns entries in arbitrary order; sorting makes the listing
    deterministic, which matters because later cells index the noisy speech
    and noise listings by position.
    """
    return sorted(f for f in listdir(directory) if isfile(join(directory, f)))


clean_speech_list = _list_files("CleanSpeech_training/")
noise_list = _list_files("Noise_training/")
noisy_speech_list = _list_files("NoisySpeech_training/")

In [92]:
## Make audios set complete

# Erase duplicate audios: keep only the files whose name has a digit right
# before the 4-character extension (e.g. "clnsp10.wav")

clean_speech_list_complete = [item for item in clean_speech_list if item[-5:-4].isdigit()]
noise_list_complete = [item for item in noise_list if item[-5:-4].isdigit()]

# Noisy file names are split on '_' into: noisy id, type, snr level, clean reference
noisy_speech_list_split = [item.split('_') for item in noisy_speech_list]
df = pd.DataFrame(noisy_speech_list_split)
df.columns = ['noisy','type','level','clean']
# .copy() so the column assignments below write to an independent frame instead of
# a slice of the unfiltered one (avoids pandas' SettingWithCopyWarning)
df = df[df['clean'].apply(lambda x : x[-5:-4].isdigit())].copy()

# Get the noisy audios that match with clean and noise audios

noisy_prefix = df['noisy'] + '_' + df['type'] + '_' + df['level']
df['noisy_audio'] = (noisy_prefix + '_' + df['clean']).values
df['noise_audio'] = (noisy_prefix + '.wav').values

# Get the noisy audios that have 3 times the same clean audio reference

df_grouped = df.groupby(['clean']).count()
df_count = df_grouped[df_grouped['noisy']==3]
clean_reference = list(df_count.index)
df = df[df['clean'].isin(clean_reference)]
noise_reference = list(df['noise_audio'])

# Filter noise and clean audios according to references from noisy audios
# (sets give constant-time membership tests; the reference lists are kept as lists)
clean_reference_set = set(clean_reference)
noise_reference_set = set(noise_reference)
clean_speech_list_complete = [item for item in clean_speech_list_complete if item in clean_reference_set]
noise_list_complete = [item for item in noise_list_complete if item in noise_reference_set]

# Same file names that were stored in the 'noisy_audio' column above
noisy_speech_list_complete = df['noisy_audio'].values

# Compare original vs complete audios

print('noisy speech original {}'.format(len(noisy_speech_list)))
print('noisy speech complete {}'.format(len(noisy_speech_list_complete)))
print('clean speech original {}'.format(len(clean_speech_list)))
print('clean speech complete {}'.format(len(clean_speech_list_complete)))
print('noise original {}'.format(len(noise_list)))
print('noise complete {}'.format(len(noise_list_complete)))

# Update audios

clean_speech_list = clean_speech_list_complete
noise_list = noise_list_complete
noisy_speech_list = noisy_speech_list_complete

noisy speech original 3701
noisy speech complete 3696
clean speech original 1237
clean speech complete 1232
noise original 3705
noise complete 3696


In [93]:
# Number of clean speech samples and number of snr levels available per sample.
samples = len(clean_speech_list)
# snr_levels is a count that is later used as a random upper bound and as an
# index multiplier, so it is kept as an integer (floor division) instead of a float.
assert len(noisy_speech_list) % samples == 0, "every clean sample needs the same number of snr levels"
snr_levels = len(noisy_speech_list) // samples

In [94]:
# Framing configuration (audios are assumed to be sampled at 16KHz)
frames_per_sec = 32        # frames in one second of audio
sample_length = 30         # length of a sample, in seconds
segment_length = 5         # length, in seconds, of the audio segment considered in the network training
wave_max_length = 600000   # a number larger than the longest audio wave

# Derived frame counts
frames_per_sample = sample_length * frames_per_sec
segment_frames_lenght = frames_per_sec * segment_length

In [95]:
# Pick randomly one of the snr levels of the Noise and Noisy speech samples.
# The draw uses numpy's generator, so numpy has to be seeded for the selection to be
# reproducible; Python's own generator is still seeded as before.
seed(1)
np.random.seed(1)

# The files of sample i occupy positions [i * snr_levels, (i + 1) * snr_levels) of the listings:
# draw one offset inside each group and shift it to the start of the group.
levels_per_sample = int(snr_levels)
random_snr = np.random.randint(0, levels_per_sample, size=samples) + np.arange(samples) * levels_per_sample
random_snr = random_snr.astype(np.int64)

# The same positions are used for both listings, so the noise file picked for a sample
# is the one sitting at the position of the picked noisy speech file.
rand_noisy_speech_list = [noisy_speech_list[i] for i in random_snr]
rand_noise_list = [noise_list[i] for i in random_snr]
noisy_speech_df = DataFrame(rand_noisy_speech_list,columns=['Sample'])
noise_df = DataFrame(rand_noise_list,columns=['Sample'])
clean_speech_df = DataFrame(clean_speech_list, columns=["Sample"])

In [96]:
# Generate a list of integers using the trailing digits of the file name
# file name format: noisy10_SNRdb_20.0_clnsp10.wav
# Every trailing digit is read, so the sample number is not limited to 5 digits
noisy_speech_aux_list = []
for file_name in noisy_speech_df["Sample"]:
    stem = os.path.splitext(file_name)[0]
    # Whatever rstrip removes from the end of the stem is the run of trailing digits
    digits = stem[len(stem.rstrip("0123456789")):]
    if not digits:
        # Fail loudly instead of silently leaving a non-numeric entry in the list
        raise ValueError("No sample number at the end of file name: {!r}".format(file_name))
    noisy_speech_aux_list.append(int(digits))
# Both names refer to the same list of sample numbers
sample_column = noisy_speech_aux_list

In [97]:
# Attach the sample number to the Noisy speech and Noise frames and order both by it.
# Noise files follow the same naming as Noisy speech files, so the same column is reused.
noisy_speech_df = noisy_speech_df.assign(Sample_Number=sample_column).sort_values(by="Sample_Number")
noise_df = noise_df.assign(Sample_Number=sample_column).sort_values(by="Sample_Number")

In [98]:
# Generate a list of integers using the trailing digits of the file name
# file name format: clnsp1.wav
# Every trailing digit is read, so the sample number is not limited to 5 digits
clean_speech_aux_list = []
for file_name in clean_speech_df["Sample"]:
    stem = os.path.splitext(file_name)[0]
    # Whatever rstrip removes from the end of the stem is the run of trailing digits
    digits = stem[len(stem.rstrip("0123456789")):]
    if not digits:
        # Fail loudly instead of silently leaving a non-numeric entry in the list
        raise ValueError("No sample number at the end of file name: {!r}".format(file_name))
    clean_speech_aux_list.append(int(digits))
# Both names refer to the same list of sample numbers
clean_speach_sample_number = clean_speech_aux_list

In [99]:
# Attach the sample number to the clean speech frame and order the frame by it
clean_speech_df = clean_speech_df.assign(Sample_Number=clean_speach_sample_number).sort_values(by="Sample_Number")

In [100]:
# Take the file-name column ("Sample") of every ordered data frame
clean_speech_ordered_list, noisy_speech_ordered_list, noise_ordered_list = (
    frame["Sample"] for frame in (clean_speech_df, noisy_speech_df, noise_df)
)

In [101]:
# Turn the clean speech file names into absolute paths.
# os.path.join uses the separator of the running platform instead of hard-coded
# Windows backslashes, and leaves an already absolute path untouched, so
# re-running the cell does not prefix the directory twice.
pathAudio = "CleanSpeech_training/"
clean_speech_ordered_list = [os.path.join(os.path.realpath(pathAudio), item) for item in clean_speech_ordered_list]

In [102]:
# Turn the noisy speech file names into absolute paths.
# os.path.join uses the separator of the running platform instead of hard-coded
# Windows backslashes, and leaves an already absolute path untouched, so
# re-running the cell does not prefix the directory twice.
pathAudio = "NoisySpeech_training/"
noisy_speech_ordered_list = [os.path.join(os.path.realpath(pathAudio), item) for item in noisy_speech_ordered_list]

In [103]:
# Turn the noise file names into absolute paths.
# os.path.join uses the separator of the running platform instead of hard-coded
# Windows backslashes, and leaves an already absolute path untouched, so
# re-running the cell does not prefix the directory twice.
pathAudio = "Noise_training/"
noise_ordered_list = [os.path.join(os.path.realpath(pathAudio), item) for item in noise_ordered_list]

In [104]:
def wave_function(audio, sample_rate=16000):
    """Load an audio file as mono and return its short-time Fourier transform.

    Parameters
    ----------
    audio : path of the audio file, passed to librosa.load.
    sample_rate : rate the audio is resampled to when loading, also used as the
        sampling frequency of the STFT. Defaults to 16000.

    Returns
    -------
    (stft, f, t) : the STFT matrix, the array of sample frequencies and the
        array of segment times, as returned by scipy.signal.stft.
    """
    wave, _ = librosa.load(audio, sr=sample_rate, mono=True)
    # noverlap is a number of samples, not a fraction: the 0.75 that used to be passed
    # here was truncated to 0 by scipy, so segments do not overlap. The 0 is written
    # explicitly to keep that behaviour (and the resulting frame rate) unchanged.
    # NOTE(review): if a 75% overlap was intended, pass noverlap=384 and revisit frames_per_sec.
    f, t, stft = scipy.signal.stft(wave, fs=sample_rate, window='hamm', nperseg=512, noverlap=0)

    return (stft, f, t)

In [105]:
def frequency_independent_normalization(sample_number):
    """Normalize the noisy speech, clean speech and noise STFTs of one sample.

    The three STFTs are centred and scaled with a single mean and a single
    standard deviation, both computed over every entry of the noisy speech
    STFT (frequency independent).

    Returns the tuple (noisy speech, clean speech, noise), all normalized.
    """
    noisy_stft, _, _ = wave_function(noisy_speech_ordered_list[sample_number])
    # Statistics taken over the whole noisy speech matrix (no axis given)
    global_mean = np.mean(noisy_stft)
    global_std = np.std(noisy_stft)

    def _normalize(zxx):
        # Same statistics for every signal, always the noisy speech ones
        return (zxx - global_mean) / global_std

    noise_stft, _, _ = wave_function(noise_ordered_list[sample_number])
    clean_stft, _, _ = wave_function(clean_speech_ordered_list[sample_number])

    return (_normalize(noisy_stft), _normalize(clean_stft), _normalize(noise_stft))

In [106]:
def vad(sample_number, min_frequency=300, max_frequency=5000, threshold_db=2000):
    """Voice Activity Detector (VAD) applied to the normalized clean speech of one sample.

    For every frame of the clean speech STFT, the (negated) db values of the
    frequencies strictly between `min_frequency` and `max_frequency` (Hz) are
    added up, the result is smoothed over 3 frames, and the frames whose
    smoothed value is not above (maximum - `threshold_db`) are set to zero in
    the normalized clean speech.

    Parameters
    ----------
    sample_number : index of the sample in the ordered audio lists.
    min_frequency, max_frequency : limits (exclusive) of the frequency band, in Hz.
    threshold_db : distance below the maximum smoothed value that is still kept.

    Returns
    -------
    The normalized clean speech STFT with the rejected frames zeroed.
    """
    clean_speech_zxx, clean_speech_frequency, _ = wave_function(clean_speech_ordered_list[sample_number])
    # NOTE(review): the db values are negated, so larger accumulated values correspond to
    # lower amplitudes - confirm that keeping the frames close to the maximum is intended.
    clean_speech_zxx_db = -librosa.amplitude_to_db(np.abs(clean_speech_zxx))  # , ref=np.max

    # Accumulated energy (db) of every frame over the selected band, summed in float64
    in_band = (clean_speech_frequency > min_frequency) & (clean_speech_frequency < max_frequency)
    accumulated_energy_db = clean_speech_zxx_db[in_band, :].sum(axis=0, dtype=np.float64)

    # Smooth the accumulated energy over 3 frames. The first and last frame keep their
    # own value; every interior frame is averaged with its neighbours, including the
    # second to last one, which the previous loop bound skipped and left at zero.
    smooth_energy_db = accumulated_energy_db.copy()
    smooth_energy_db[1:-1] = (
        accumulated_energy_db[:-2] + accumulated_energy_db[1:-1] + accumulated_energy_db[2:]
    ) / 3

    # 1 for the frames with a smoothed energy above (max - threshold), 0 otherwise
    max_smooth_energy_db = smooth_energy_db.max(axis=0)
    frame_bin = np.where(smooth_energy_db > max_smooth_energy_db - threshold_db, 1.0, 0.0)

    # Make zero the frames below the threshold: the per-frame mask is repeated for every
    # frequency row of the STFT (row count taken from the STFT instead of a fixed 257)
    zxx_bin = np.repeat(np.expand_dims(frame_bin, axis=0), clean_speech_zxx.shape[0], 0)
    _, clean_speech_normalized_fi, _ = frequency_independent_normalization(sample_number)
    vad_clean_speech_normalized_fi = clean_speech_normalized_fi * zxx_bin

    return vad_clean_speech_normalized_fi

In [107]:
def adjusted_input_matrixes(sample_number, segment_frames_lenght):
    """Cut the normalized STFTs of one sample into overlapping segments for the network.

    Segment j holds the frames j .. j + segment_frames_lenght - 1, stored as
    (frame, frequency). With N frames in the noisy speech, N - segment_frames_lenght
    segments are produced.

    Parameters
    ----------
    sample_number : index of the sample in the ordered audio lists.
    segment_frames_lenght : number of frames in every segment.

    Returns
    -------
    (noisy speech, clean speech, VAD clean speech) : three complex arrays of shape
        (N - segment_frames_lenght, segment_frames_lenght, number of frequencies).

    Raises
    ------
    ValueError : when the audio has fewer frames than one segment.
    IndexError : when the clean speech has fewer frames than the segments need.
    """
    noisy_speech_normalized_fi, clean_speech_normalized_fi, _ = frequency_independent_normalization(sample_number)
    vad_clean_speech_normalized = vad(sample_number)

    number_of_frames = noisy_speech_normalized_fi.shape[1]
    number_of_segments = number_of_frames - segment_frames_lenght
    if number_of_segments < 0:
        raise ValueError(
            "audio sample {} has {} frames, fewer than the {} frames of a segment".format(
                sample_number, number_of_frames, segment_frames_lenght))

    segments_shape = (number_of_segments, segment_frames_lenght, noisy_speech_normalized_fi.shape[0])
    sources = (noisy_speech_normalized_fi, clean_speech_normalized_fi, vad_clean_speech_normalized)
    segments = tuple(np.zeros(segments_shape, complex) for _ in sources)

    # The last frame read is frame N - 2, so every source needs at least N - 1 frames.
    # Checked up front because a short slice would otherwise be broadcast silently.
    if number_of_segments > 0 and segment_frames_lenght > 0:
        for source in sources:
            if source.shape[1] < number_of_frames - 1:
                raise IndexError(
                    "audio sample {} has a signal with {} frames, {} are needed".format(
                        sample_number, source.shape[1], number_of_frames - 1))

    for source, target in zip(sources, segments):
        for start in range(number_of_segments):
            # (frequency, frame) slice transposed to (frame, frequency)
            target[start] = source[:, start:start + segment_frames_lenght].T

    return segments

In [108]:
# Build the network input matrixes for one audio sample, using the segment
# length (in frames) of the configuration instead of a repeated literal
sample_index = 100
noisy, clean, clean_vad = adjusted_input_matrixes(sample_index, segment_frames_lenght)

In [109]:
#Fix dimensions: keep only the last frame of every clean / VAD clean segment.
# The ndim checks make the cell safe to run again, when both arrays are already 2-D.
if clean.ndim == 3:
    clean = clean[:, -1, :]
if clean_vad.ndim == 3:
    clean_vad = clean_vad[:, -1, :]

#Fix batch multiplicy: drop the trailing segments that do not fill a whole batch
batch_size = 32
number_batches = len(noisy) // batch_size
number_samples = batch_size*number_batches
noisy = noisy[:number_samples,:,:]
clean = clean[:number_samples,:]
clean_vad = clean_vad[:number_samples,:]

print(noisy.shape)
print(clean.shape)
print(clean_vad.shape)

(224, 160, 257)
(224, 257)
(224, 257)


In [110]:
# Convert the three numpy arrays to complex64 tensors
# NOTE(review): the network inputs are created without a complex dtype - confirm how
# the imaginary part of these tensors is handled when they are fed to the model.
data, labels, side_input = (
    tf.convert_to_tensor(array, dtype=np.complex64)
    for array in (noisy, clean, clean_vad)
)

In [111]:
# Network architecture
# Three stacked GRU layers with two residual additions, followed by a sigmoid
# layer that outputs one gain per frequency for the last frame of the segment.
# Shapes are given as real tuples, and the batch_input_shape / input_shape arguments
# were dropped: they did not describe the real shapes and have no effect on layers
# that are called on tensors.

num_freq_bins = 257   # features per frame; GRU width must match it for the residual additions

noisy_speech = layers.Input(shape=(segment_frames_lenght, num_freq_bins))
# Last frame of the noisy segment, used by the loss
noisy_speech_current = layers.Reshape((num_freq_bins,))(noisy_speech[:,-1,:])
clean_speech = layers.Input(shape=(num_freq_bins,))
vad_clean_speech = layers.Input(shape=(num_freq_bins,))

gru1 = layers.GRU(num_freq_bins, return_sequences=True)(noisy_speech)
gru1_red = layers.add([noisy_speech, gru1])
gru2 = layers.GRU(num_freq_bins, return_sequences=True)(gru1_red)
gru2_red = layers.add([gru2, gru1_red])
gru3 = layers.GRU(num_freq_bins, return_sequences=False)(gru2_red)
gain = layers.Dense(num_freq_bins, activation='sigmoid')(gru3)

# clean_speech and vad_clean_speech are model inputs only because the loss needs them
model = tf.keras.Model(inputs=[noisy_speech, clean_speech, vad_clean_speech], outputs=gain)

In [112]:
# Weighted sum of two mean squared terms, both averaged over the last axis
alpha = 0.35   # weight of the VAD clean speech term; the other term gets 1 - alpha
# Squared difference between the VAD clean speech and the VAD clean speech scaled by the gain
vad_term = K.mean(K.square(vad_clean_speech - vad_clean_speech * gain), axis=-1)
# Squared (current noisy frame - clean speech), scaled by the gain
residual_term = K.mean(K.square((noisy_speech_current - clean_speech) * gain), axis=-1)
loss = alpha * vad_term + (1 - alpha) * residual_term

In [113]:
# Register the loss tensor built from the model inputs and the gain output on the model
model.add_loss(loss)

In [114]:
# Only the optimizer is given here: no loss argument is passed to compile
model.compile(optimizer='sgd')



In [115]:
# The three tensors are passed in the order of the model inputs (noisy speech, clean
# speech, VAD clean speech); no targets, epochs or batch_size are given to fit
model.fit([data, labels, side_input]) 

Train on 224 samples


<tensorflow.python.keras.callbacks.History at 0x14ba40cbe48>