In [1]:
import scipy
import numpy as np
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
import keras.backend as K
import os
from os import listdir
from os.path import isfile, join
from pandas import DataFrame
import keras.backend as K

Using TensorFlow backend.


In [2]:
# Read the Clean speech, Noise and Noisy speech training file names.
# `samples` is the number of Clean speech files; `snr_levels` is the number of
# Noisy speech files available per Clean speech file.
clean_speech_list = [f for f in listdir("CleanSpeech_training/") if isfile(join("CleanSpeech_training/", f))]
noise_list = [f for f in listdir("Noise_training/") if isfile(join("Noise_training/", f))]
noisy_speech_list = [f for f in listdir("NoisySpeech_training/") if isfile(join("NoisySpeech_training/", f))]
samples = len(clean_speech_list)
if samples == 0:
    # Fail with a readable message instead of a bare ZeroDivisionError below
    raise FileNotFoundError("No clean speech files found in CleanSpeech_training/")
# Integer division: snr_levels is used later as a random-integer bound and as an
# index stride, so it must be an int (true division `/` yields a float in Python 3).
snr_levels = len(noisy_speech_list) // samples

In [12]:
# Framing configuration (audio is assumed to be sampled at 16 kHz)
sample_length = 30                # length of one audio sample, in seconds
segment_length = 5                # length, in seconds, of the audio segment used in the network training
frames_per_sec = 32               # number of frames per second
wave_max_length = 600000          # a number larger than the longest audio wave

# Derived sizes, expressed in frames
segment_frames_lenght = frames_per_sec * segment_length
frames_per_sample = sample_length * frames_per_sec

In [3]:
# Pick randomly one of the SNR levels of the Noise and Noisy speech files
# for every Clean speech sample.
from random import seed
from random import randint
seed(1)
# The draw below uses NumPy's generator, so NumPy has to be seeded too:
# seeding only the stdlib `random` module does not make np.random reproducible.
np.random.seed(1)

# snr_levels may arrive as a float (e.g. 5.0); indices and bounds must be ints
levels_per_sample = int(snr_levels)

# Sample i owns the index range [i * levels, (i + 1) * levels); pick one index in it.
# NOTE(review): this assumes noisy_speech_list and noise_list are grouped in
# consecutive runs of `snr_levels` files per sample and are parallel to each other;
# os.listdir gives no ordering guarantee -- confirm.
random_snr = np.random.randint(0, levels_per_sample, size=samples) + np.arange(samples) * levels_per_sample
random_snr = random_snr.astype(np.int64)

rand_noisy_speech_list = [noisy_speech_list[i] for i in random_snr]
rand_noise_list = [noise_list[i] for i in random_snr]
noisy_speech_df = DataFrame(rand_noisy_speech_list, columns=['Sample'])
noise_df = DataFrame(rand_noise_list, columns=['Sample'])
clean_speech_df = DataFrame(clean_speech_list, columns=["Sample"])

In [4]:
# Generate a list of integers from the digits that end each noisy speech file name
# (just before the extension).
# File name format: noisy10_SNRdb_20.0_clnsp10.wav  ->  10
# Any number of trailing digits is supported.
sample_column = []
for file_name in noisy_speech_df["Sample"]:
    stem = os.path.splitext(file_name)[0]
    # Everything rstrip() removed is the run of digits that ends the stem
    trailing_digits = stem[len(stem.rstrip("0123456789")):]
    if not trailing_digits:
        # Fail loudly: a non-numeric entry would otherwise surface later as an
        # obscure error when the column is sorted.
        raise ValueError("Cannot read a sample number from file name: {!r}".format(file_name))
    sample_column.append(int(trailing_digits))
# Both names referred to the same list in the original cell; keep that true
noisy_speech_aux_list = sample_column

In [5]:
# Attach the sample numbers to the Noisy speech and Noise frames and sort both by them.
# Noise files follow the same naming scheme as Noisy speech files, so the same
# sample_column is used for the two frames.
for frame in (noisy_speech_df, noise_df):
    frame["Sample_Number"] = sample_column
    frame.sort_values(by="Sample_Number", inplace=True)

In [6]:
# Generate a list of integers from the digits that end each clean speech file name
# (just before the extension).
# File name format: clnsp1.wav  ->  1
# Any number of trailing digits is supported.
clean_speach_sample_number = []
for file_name in clean_speech_df["Sample"]:
    stem = os.path.splitext(file_name)[0]
    # Everything rstrip() removed is the run of digits that ends the stem
    trailing_digits = stem[len(stem.rstrip("0123456789")):]
    if not trailing_digits:
        # Fail loudly: a non-numeric entry would otherwise surface later as an
        # obscure error when the column is sorted.
        raise ValueError("Cannot read a sample number from file name: {!r}".format(file_name))
    clean_speach_sample_number.append(int(trailing_digits))
# Both names referred to the same list in the original cell; keep that true
clean_speech_aux_list = clean_speach_sample_number

In [7]:
# Attach the parsed sample numbers to the clean speech frame, then sort it (in place) by them
clean_speech_df["Sample_Number"] = clean_speach_sample_number
clean_speech_df.sort_values(by="Sample_Number", ascending=True, inplace=True)

In [8]:
# Extract the 'Sample' (file name) column of each data frame.
# Despite the *_list names, these are pandas Series at this point.
clean_speech_ordered_list, noisy_speech_ordered_list, noise_ordered_list = (
    frame["Sample"] for frame in (clean_speech_df, noisy_speech_df, noise_df)
)

In [9]:
# Turn the clean speech file names into absolute paths
pathAudio = "CleanSpeech_training/"
clean_speech_dir = os.path.realpath(pathAudio)
# os.path.join uses the platform separator (the hard-coded '\\' only worked on Windows)
# and returns an already-absolute entry unchanged, so re-running this cell does not
# prepend the directory twice.
clean_speech_ordered_list = [os.path.join(clean_speech_dir, item) for item in clean_speech_ordered_list]

In [10]:
# Turn the noisy speech file names into absolute paths
pathAudio = "NoisySpeech_training/"
noisy_speech_dir = os.path.realpath(pathAudio)
# os.path.join uses the platform separator (the hard-coded '\\' only worked on Windows)
# and returns an already-absolute entry unchanged, so re-running this cell does not
# prepend the directory twice.
noisy_speech_ordered_list = [os.path.join(noisy_speech_dir, item) for item in noisy_speech_ordered_list]

In [11]:
# Turn the noise file names into absolute paths
pathAudio = "Noise_training/"
noise_dir = os.path.realpath(pathAudio)
# os.path.join uses the platform separator (the hard-coded '\\' only worked on Windows)
# and returns an already-absolute entry unchanged, so re-running this cell does not
# prepend the directory twice.
noise_ordered_list = [os.path.join(noise_dir, item) for item in noise_ordered_list]

In [16]:
def wave_function(audio, sample_rate=16000, nperseg=512, noverlap=0):
    """Load an audio file as mono and compute its Short-Time Fourier Transform.

    Parameters
    ----------
    audio : str
        Path of the audio file, passed to ``librosa.load``.
    sample_rate : int, optional
        Rate the audio is resampled to on load, also used as ``fs`` for the STFT.
    nperseg : int, optional
        Length of each STFT segment, in samples.
    noverlap : int, optional
        Number of samples shared by consecutive segments. This is a sample
        count, not a fraction.

    Returns
    -------
    tuple
        ``(stft, f, t)``: the STFT returned by ``scipy.signal.stft``, the array
        of sample frequencies and the array of segment times.
    """
    # The sample rate returned by librosa equals the requested one, so it is not kept
    wave, _ = librosa.load(audio, sr=sample_rate, mono=True)
    # The original call passed noverlap=0.75. scipy expects a number of samples and
    # converts the value with int(), so the effective overlap was 0 samples; that
    # value is now the explicit default.
    # NOTE(review): if a 75% overlap was intended, pass noverlap=3 * nperseg // 4
    # and revisit frames_per_sec accordingly.
    f, t, stft = scipy.signal.stft(wave, fs=sample_rate, window='hamm', nperseg=nperseg, noverlap=noverlap)

    return (stft, f, t)

In [18]:
def frequency_independent_normalization(sample_number):
    """Normalize the noisy speech, clean speech and noise STFTs of one sample.

    All three STFTs are shifted and scaled with the global (frequency
    independent) mean and standard deviation of the *noisy speech* STFT,
    computed over every element of that array.

    Parameters
    ----------
    sample_number : int
        Index into ``noisy_speech_ordered_list``, ``noise_ordered_list`` and
        ``clean_speech_ordered_list``.

    Returns
    -------
    tuple
        ``(noisy_speech_normalized_fi, clean_speech_normalized_fi, noise_normalized_fi)``.

    Notes
    -----
    A noisy speech STFT with zero standard deviation (e.g. digital silence)
    leads to a division by zero; this case is not handled here.
    """
    # wave_function returns (stft, f, t); only the STFT (element 0) is normalized.
    # Using the whole tuple would feed a ragged sequence to np.mean / np.std.
    noisy_speech_zxx = wave_function(noisy_speech_ordered_list[sample_number])[0]
    noisy_speech_zxx_mean_fi = np.mean(noisy_speech_zxx)
    noisy_speech_zxx_std_fi = np.std(noisy_speech_zxx)
    noisy_speech_normalized_fi = (noisy_speech_zxx - noisy_speech_zxx_mean_fi) / noisy_speech_zxx_std_fi

    # Noise and clean speech reuse the noisy speech statistics on purpose
    noise_zxx = wave_function(noise_ordered_list[sample_number])[0]
    noise_normalized_fi = (noise_zxx - noisy_speech_zxx_mean_fi) / noisy_speech_zxx_std_fi

    clean_speech_zxx = wave_function(clean_speech_ordered_list[sample_number])[0]
    clean_speech_normalized_fi = (clean_speech_zxx - noisy_speech_zxx_mean_fi) / noisy_speech_zxx_std_fi

    return (noisy_speech_normalized_fi, clean_speech_normalized_fi, noise_normalized_fi)

In [20]:
def vad(sample_number):
    """Voice Activity Detector (VAD) applied to the normalized clean speech STFT.

    For every frame, the (negated) dB magnitudes of the clean speech STFT are
    accumulated over the frequency bins strictly between 300 Hz and 5000 Hz,
    smoothed over 3 frames and compared against ``max - treshold_db``. Frames
    that do not exceed that level are zeroed in the normalized clean speech STFT.

    This implementation expects the 2-D STFT ``(frequency bins, frames)`` and
    1-D frequency vector that ``wave_function`` produces for a single mono
    file. The earlier code indexed a third (sample) axis that this
    ``wave_function`` does not return.

    Parameters
    ----------
    sample_number : int
        Index into ``clean_speech_ordered_list`` (and, through
        ``frequency_independent_normalization``, the noisy/noise lists).

    Returns
    -------
    numpy.ndarray
        Normalized clean speech STFT with the rejected frames set to zero,
        same shape as the clean speech STFT.
    """
    vad_min_frequency_limit = 300     # Hz, exclusive lower bound of the band
    vad_max_frequency_limit = 5000    # Hz, exclusive upper bound of the band
    treshold_db = 1000                # distance below the maximum smoothed energy

    clean_speech_zxx, clean_speech_frequency, _ = wave_function(clean_speech_ordered_list[sample_number])
    # NOTE(review): the dB values are negated (kept from the original), so a larger
    # accumulated value corresponds to a lower magnitude -- confirm this is intended.
    clean_speech_zxx_db = -librosa.amplitude_to_db(np.abs(clean_speech_zxx))  # , ref=np.max

    # Accumulated energy (dB) per frame over the bins inside the band
    in_band = (clean_speech_frequency > vad_min_frequency_limit) & (clean_speech_frequency < vad_max_frequency_limit)
    acumulated_energy_db = clean_speech_zxx_db[in_band, :].sum(axis=0, dtype=np.float64)

    # Smooth the accumulated energy over 3 frames; the first and last frame keep
    # their raw value. Every interior frame is covered (the previous loop stopped
    # one frame early and left the second-to-last frame at 0).
    smooth_energy_db = acumulated_energy_db.copy()
    if acumulated_energy_db.shape[0] > 2:
        smooth_energy_db[1:-1] = (
            acumulated_energy_db[:-2] + acumulated_energy_db[1:-1] + acumulated_energy_db[2:]
        ) / 3

    # 1 for frames whose smoothed energy is above (max - threshold), 0 otherwise
    max_smooth_energy_db = smooth_energy_db.max()
    frame_bin = np.where(smooth_energy_db > max_smooth_energy_db - treshold_db, 1, 0)

    # Zero the rejected frames; broadcasting applies the per-frame mask to every
    # frequency bin, whatever their number (no hard-coded 257).
    noisy_speech_normalized_fi, clean_speech_normalized_fi, noise_normalized_fi = frequency_independent_normalization(sample_number)
    vad_clean_speech_normalized_fi = clean_speech_normalized_fi * frame_bin[np.newaxis, :]

    return(vad_clean_speech_normalized_fi)

In [23]:
def adjust_matrix_size(batch_size, sample_number):
    """Build the sliding-window noisy speech array meant as network input.

    Allocates ``network_noisy_speech_input`` as a complex zero array and copies
    into it, for each of ``batch_size`` consecutive samples starting at
    ``sample_number``, windows of ``segment_frames_lenght`` frames taken from
    the normalized noisy speech at every frame offset ``j``.

    NOTE(review): ``number_of_audios`` is not defined anywhere in the visible
    cells; ``batch_size`` may have been intended -- confirm.
    NOTE(review): no ``return`` statement is visible in this cell, so the array
    built here is not handed back to the caller as far as this view shows.
    """
    # Adjust the noisy speech matrix size to input it in the network
    
    # First call is only used to size the output array (shape[0] and shape[1])
    noisy_speech_normalized_fi, clean_speech_normalized_fi, noise_normalized_fi = frequency_independent_normalization(sample_number)

    # Axes: (shape[0] of the normalized array, frames per segment, windows across all audios)
    network_noisy_speech_input = np.zeros((noisy_speech_normalized_fi.shape[0],segment_frames_lenght,number_of_audios*(noisy_speech_normalized_fi.shape[1]-segment_frames_lenght)), complex)

    # noisy_speech_normalized_fi[2,5,1] = network_input[2,5,800]
    for i in range(batch_size):
        for j in range(noisy_speech_normalized_fi.shape[1]-segment_frames_lenght):
            for fr in range(segment_frames_lenght):
                # NOTE(review): this reloads and re-normalizes sample (sample_number + i) on
                # every innermost iteration although the arguments only depend on i.
                noisy_speech_normalized_fi, clean_speech_normalized_fi, noise_normalized_fi = frequency_independent_normalization(sample_number+i)                
                # NOTE(review): the [:, fr+j, i] index needs a 3-D array -- confirm the shape
                # that frequency_independent_normalization actually returns.
                network_noisy_speech_input[:,fr,i*(noisy_speech_normalized_fi.shape[1]-segment_frames_lenght)+j] = noisy_speech_normalized_fi[:,fr+j,i]
