In [177]:
import scipy
import numpy as np
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
import keras.backend as K
import os
from os import listdir
from os.path import isfile, join
from pandas import DataFrame
import keras.backend as K

In [178]:
# Read Clean speech, Noise and Noisy speech training files
# Count how many samples of Clean speech are used for training and how many snr levels, 
# there are snr levels of Noise and Noisy speech for every Clean speech audio
clean_speech_list = [f for f in listdir("CleanSpeech_training/") if isfile(join("CleanSpeech_training/", f))]
noise_list = [f for f in listdir("Noise_training/") if isfile(join("Noise_training/", f))]
noisy_speech_list = [f for f in listdir("NoisySpeech_training/") if isfile(join("NoisySpeech_training/", f))]
samples = len(clean_speech_list)
snr_levels = len(noisy_speech_list) / samples

In [179]:
# Pick randonmly one of the snr level of the Noise and Noisy speech samples 
from random import seed
from random import randint
seed(1)
random_snr = np.random.randint(1, snr_levels+1, size=samples)
for i in range(0,samples):
    random_snr[i] = random_snr[i] + ((i) * snr_levels)
random_snr = random_snr - 1
random_snr = random_snr.astype(np.int64)

rand_noisy_speech_list = list(noisy_speech_list[i] for i in random_snr)
rand_noise_list = list(noise_list[i] for i in random_snr)
noisy_speech_df = DataFrame(rand_noisy_speech_list,columns=['Sample'])
noise_df = DataFrame(rand_noise_list,columns=['Sample'])
clean_speech_df = DataFrame(clean_speech_list, columns=["Sample"])

In [180]:
# Generate a list of integers using the last part of the file name
# file name format: noisy10_SNRdb_20.0_clnsp10.wav 
# List maximum possible elements is 99,999  
noisy_speech_aux_list = [x[-9:-4] for x in noisy_speech_df["Sample"]]
sample_column = noisy_speech_aux_list
for i in range(0,len(noisy_speech_aux_list)):
    try:
        sample_column[i] = int(noisy_speech_aux_list[i])
    except ValueError:
        try:
            sample_column[i] = int(noisy_speech_aux_list[i][-4:])
        except ValueError:
            try:
                sample_column[i] = int(noisy_speech_aux_list[i][-3:])
            except ValueError:
                try:
                    sample_column[i] = int(noisy_speech_aux_list[i][-2:])
                except ValueError:
                    try:
                        sample_column[i] = int(noisy_speech_aux_list[i][-1:])
                    except ValueError:
                        "error"

In [181]:
# Order the df by sample_column number
# Noise has the same file formatting than Noisy speech so it is sorted using the same df column
noisy_speech_df["Sample_Number"] = sample_column
noise_df["Sample_Number"] = sample_column
noisy_speech_df.sort_values(by=['Sample_Number'], inplace=True)
noise_df.sort_values(by=['Sample_Number'], inplace=True)

In [182]:
# Generate a list of integers using the last part of the file name
# file name format: clnsp1.wav
# List maximum possible elements is 99,999
clean_speech_aux_list = [x[-9:-4] for x in clean_speech_df["Sample"]]
clean_speach_sample_number = clean_speech_aux_list
for i in range(0,len(clean_speech_aux_list)):
    try:
        clean_speach_sample_number[i] = int(clean_speech_aux_list[i])
    except ValueError:
        try:
            clean_speach_sample_number[i] = int(clean_speech_aux_list[i][-4:])
        except ValueError:
            try:
                clean_speach_sample_number[i] = int(clean_speech_aux_list[i][-3:])
            except ValueError:
                try:
                    clean_speach_sample_number[i] = int(clean_speech_aux_list[i][-2:])
                except ValueError:
                    try:
                        clean_speach_sample_number[i] = int(clean_speech_aux_list[i][-1:])
                    except ValueError:
                        "error"

In [183]:
# Order clean speech df by sample_column number
clean_speech_df["Sample_Number"] = clean_speach_sample_number
clean_speech_df.sort_values(by=['Sample_Number'], inplace=True)

In [184]:
# Generate lists based on the data frames
clean_speech_ordered_list = clean_speech_df["Sample"]
noisy_speech_ordered_list = noisy_speech_df["Sample"]
noise_ordered_list = noise_df['Sample']

In [185]:
# Read the audio files in the list
pathAudio = "CleanSpeech_training/"
os.path.dirname(os.path.realpath(pathAudio))
clean_speech_ordered_list = [os.path.dirname(os.path.realpath(pathAudio))+'\\CleanSpeech_training\\'+item for item in clean_speech_ordered_list]

In [186]:
# Read the audio files in the list
pathAudio = "NoisySpeech_training/"
os.path.dirname(os.path.realpath(pathAudio))
noisy_speech_ordered_list = [os.path.dirname(os.path.realpath(pathAudio))+'\\NoisySpeech_training\\'+item for item in noisy_speech_ordered_list]

In [187]:
# Read the audio files in the list
pathAudio = "Noise_training/"
os.path.dirname(os.path.realpath(pathAudio))
noise_ordered_list = [os.path.dirname(os.path.realpath(pathAudio))+'\\Noise_training\\'+item for item in noise_ordered_list]

In [188]:
# Load the audio files as floating point time series
clean_speech_wave = np.zeros((len(clean_speech_ordered_list), 500000))
i = 0
for y in clean_speech_ordered_list: 
    wave, sr = librosa.load(y, sr = 16000,mono = True)
    clean_speech_wave[i,:len(wave)] = wave
    i = i + 1

In [189]:
# Generate Short Time Fourier Transform of each audio wave 
# First clean_samples_wave is used to find the stft dimensions
# STFT is reduced to 160 frames, equal to 5sec sample
f, t, z = scipy.signal.stft(clean_speech_wave[0,:], fs=16000, window='hamm', nperseg=512, noverlap=0.75)
clean_speech_frequency = np.zeros((f.shape[0],clean_speech_wave.shape[0]))
clean_speech_time = np.zeros((t.shape[0],clean_speech_wave.shape[0]))
clean_speech_zxx = np.zeros((f.shape[0],t.shape[0],clean_speech_wave.shape[0]), dtype=complex)
for i in (0,clean_speech_wave.shape[0]-1):
    f, t, z  = scipy.signal.stft(clean_speech_wave[i,:], fs=16000, window='hamm', nperseg=512, noverlap=0.75)
    clean_speech_frequency[:,i] = f
    clean_speech_time[:,i] = t
    clean_speech_zxx[:,:,i] = z
clean_speech_time = clean_speech_time[:160]
clean_speech_zxx = clean_speech_zxx[:,:160,:]

In [190]:
# Load the audio files as floating point time series
noisy_speech_wave = np.zeros((len(noisy_speech_ordered_list), 500000))
i = 0
for y in noisy_speech_ordered_list: 
    wave, sr = librosa.load(y, sr = 16000,mono = True)
    noisy_speech_wave[i,:len(wave)] = wave
    i = i + 1

In [191]:
# Generate Short Time Fourier Transform of each audio wave 
# First clean_samples_wave is used to find the stft dimensions
# STFT is reduced to 160 frames, equal to 5sec sample
f, t, z = scipy.signal.stft(noisy_speech_wave[0,:], fs=16000, window='hamm', nperseg=512, noverlap=0.75)
noisy_speech_frequency = np.zeros((f.shape[0],noisy_speech_wave.shape[0]))
noisy_speech_time = np.zeros((t.shape[0],noisy_speech_wave.shape[0]))
noisy_speech_zxx = np.zeros((f.shape[0],t.shape[0],noisy_speech_wave.shape[0]), dtype=complex)
for i in (0,noisy_speech_wave.shape[0]-1):
    f, t, z  = scipy.signal.stft(noisy_speech_wave[i,:], fs=16000, window='hamm', nperseg=512, noverlap=0.75)
    noisy_speech_frequency[:,i] = f
    noisy_speech_time[:,i] = t
    noisy_speech_zxx[:,:,i] = z
noisy_speech_time = noisy_speech_time[:160]
noisy_speech_zxx = noisy_speech_zxx[:,:160,:]

In [192]:
# Load the audio files as floating point time series
noise_wave = np.zeros((len(noise_ordered_list), 500000))
i = 0
for y in noise_ordered_list: 
    wave, sr = librosa.load(y, sr = 16000,mono = True)
    noise_wave[i,:len(wave)] = wave
    i = i + 1

In [193]:
# Generate Short Time Fourier Transform of each audio wave 
# First clean_samples_wave is used to find the stft dimensions
# STFT is reduced to 160 frames, equal to 5sec sample
f, t, z = scipy.signal.stft(noise_wave[0,:], fs=16000, window='hamm', nperseg=512, noverlap=0.75)
noise_frequency = np.zeros((f.shape[0],noise_wave.shape[0]))
noise_time = np.zeros((t.shape[0],noise_wave.shape[0]))
noise_zxx = np.zeros((f.shape[0],t.shape[0],noise_wave.shape[0]), dtype=complex)
for i in (0,noise_wave.shape[0]-1):
    f, t, z  = scipy.signal.stft(noise_wave[i,:], fs=16000, window='hamm', nperseg=512, noverlap=0.75)
    noise_frequency[:,i] = f
    noise_time[:,i] = t
    noise_zxx[:,:,i] = z
noise_time = noise_time[:160]
noise_zxx = noise_zxx[:,:160,:]

In [211]:
# Frequency Independent Normalization
noisy_speech_zxx_mean_fi = np.mean(noisy_speech_zxx)    
noisy_speech_zxx_std_fi = np.std(noisy_speech_zxx)
noisy_speech_normalized_fi = (noisy_speech_zxx - noisy_speech_zxx_mean_fi) / noisy_speech_zxx_std_fi
noise_normalized_fi = (noise_zxx - noisy_speech_zxx_mean_fi) / noisy_speech_zxx_std_fi
clean_speech_normalized_fi = (clean_speech_zxx - noisy_speech_zxx_mean_fi) / noisy_speech_zxx_std_fi

In [201]:
# Voice Activity Detector (VAD) based on accumulated energy (measured in db) for frequencies between 300Hz and 5000Hz

# Calculate acumulated energy (db) for each frame of each audio
vad_min_frequency_limit = 300
vad_max_frequency_limit = 5000
clean_speech_zxx_db = -librosa.amplitude_to_db(np.abs(clean_speech_zxx), ref=np.max)
acumulated_energy_db = np.zeros((clean_speech_zxx.shape[1],clean_speech_zxx.shape[2]))
for s in range(0,clean_speech_frequency.shape[1]):
    for f in range(0,clean_speech_frequency.shape[0]):
        if clean_speech_frequency[f,s] > vad_min_frequency_limit and clean_speech_frequency[f,s] < vad_max_frequency_limit:
            for j in range(0,clean_speech_zxx.shape[1]-1):
                acumulated_energy_db[j,s] = acumulated_energy_db[j,s] + clean_speech_zxx_db[f,j,s]

In [234]:
# Smooth the acumulated energy over 3 frames
smooth_energy_db = np.zeros((acumulated_energy_db.shape))
smooth_energy_db[0,:] = acumulated_energy_db[0,:]
smooth_energy_db[acumulated_energy_db.shape[0]-1,:] = acumulated_energy_db[acumulated_energy_db.shape[0]-1,:]
for j in range(1,acumulated_energy_db.shape[0]-2):
    smooth_energy_db[j,:] = (acumulated_energy_db[j-1,:] + acumulated_energy_db[j,:] + acumulated_energy_db[j+1,:]) / 3

In [275]:
# Generate a matrix [0,1] for frames with an accumulated energy above (Max - threshold)
treshold_db = 1000
max_smooth_energy_db = smooth_energy_db.max(axis=0)
frame_bin = np.zeros((smooth_energy_db.shape))
for j in range (0, acumulated_energy_db.shape[0]):
    frame_bin[j,:] = np.where(smooth_energy_db[j,:] > max_smooth_energy_db[:] - treshold_db, 1, 0)

In [293]:
# Make zero the frames below the threshold
zxx_bin = np.repeat(np.expand_dims(frame_bin, axis=0), 257, 0)
vad_clean_speech_normalized_fi = clean_speech_normalized_fi * zxx_bin

In [291]:
# Network architecture

x = layers.Input(shape=(160,257))
cell = layers.GRU(257, batch_input_shape=(12,160,257), return_sequences=True)(x)
cell_residual = layers.add([x, cell])
cell = layers.GRU(257, batch_input_shape=(12,160), return_sequences=True)(cell_residual)
cell_residual = layers.add([cell_residual, cell])
cell = layers.GRU(257, batch_input_shape=(12, 160), return_sequences=True)(cell_residual)
cell = layers.Dense(257, activation='sigmoid')(cell)
model = tf.keras.Model(inputs=x, outputs=cell)

In [292]:
# Parameter alfa (0,1) is the weight between audio distortion and noise cancellation (1-alfa) 
alfa = 0.35

In [296]:
# Define custom loss
def custom_loss(alfa, noise_normalized_fi):

    # Create a loss function that adds the MSE loss to the mean of all squared activations of a specific layer
    def loss(vad_clean_speech_normalized_fi,vad_clean_speech_pred):
        return alfa * K.mean(K.square(vad_clean_speech_normalized_fi - vad_clean_speech_pred), axis=-1) + (1-alfa) * np.mean(np.square(noise_normalized_fi), axix=-1)
   
    # Return a function
    return loss    

In [295]:
# Compile the model
model.compile(optimizer='adam',
              loss=custom_loss(1), # Call the loss function with the selected layer
              metrics=['accuracy'])   # use instead custom_loss

# train
model.fit(data, labels)  

TypeError: custom_loss() missing 1 required positional argument: 'noise_normalized_fi'