In [1]:
%load_ext tensorboard
import scipy
import numpy as np
import librosa
#import matplotlib.pyplot as plt
import tensorflow as tf
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Every visible GPU gets the same setting, and a machine with no GPU simply
# skips the loop instead of failing on gpu_devices[0].
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
from tensorflow.keras import layers
from tensorflow.keras import optimizers
import keras.backend as K
import os
from os import listdir
from os.path import isfile, join
from pandas import DataFrame
import pandas as pd
import math
import datetime

import webrtcvad
import collections
import contextlib
import sys
import wave

from vad_utils import VadGenerator
from audio_processing import AudioProcessing
from dataset_generator import DatasetGenerator

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.decorators import jit as optional_jit
Using TensorFlow backend.


In [129]:
# Read Clean speech, Noise and Noisy speech training files
# Count how many samples of Clean speech are used for training and how many snr levels,
# there are snr levels of Noise and Noisy speech for every Clean speech audio
def _sorted_files_in(folder):
    """Return the names of the regular files directly inside `folder`, sorted.

    os.listdir() returns entries in arbitrary order; sorting makes the listing
    deterministic, which matters because later cells pair the noisy-speech and
    noise lists by position.
    """
    return sorted(entry for entry in listdir(folder) if isfile(join(folder, entry)))


clean_speech_list = _sorted_files_in("CleanSpeech_training/")
noise_list = _sorted_files_in("Noise_training/")
noisy_speech_list = _sorted_files_in("NoisySpeech_training/")

In [130]:
## Make audios set complete

# Number of noisy versions a clean audio must have to be kept
EXPECTED_SNR_LEVELS = 3


def _stem_ends_in_digit(file_name):
    """Return True when the character right before the 4-character extension is a digit."""
    return file_name[-5:-4].isdigit()


#Erase duplicate audios (names whose stem does not end in a digit are dropped)

clean_speech_list_complete = [item for item in clean_speech_list if _stem_ends_in_digit(item)]
noise_list_complete = [item for item in noise_list if _stem_ends_in_digit(item)]

# Noisy names are split on '_' into noisy / type / level / clean.
# Names that do not have exactly four parts cannot be matched and are skipped
# (they would otherwise break the four-column frame below).
noisy_speech_list_split = [
    parts
    for parts in (item.split('_') for item in noisy_speech_list)
    if len(parts) == 4
]
df = pd.DataFrame(noisy_speech_list_split, columns=['noisy', 'type', 'level', 'clean'])
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unfiltered frame.
df = df[df['clean'].map(_stem_ends_in_digit).astype(bool)].copy()

#Get the noisy audios that match with clean and noise audios

df['noisy_audio'] = df['noisy'] + '_' + df['type'] + '_' + df['level'] + '_' + df['clean']
df['noise_audio'] = df['noisy'] + '_' + df['type'] + '_' + df['level'] + '.wav'

#Get the noisy audios that have EXPECTED_SNR_LEVELS times the same clean audio reference

df_grouped = df.groupby(['clean']).count()
df_count = df_grouped[df_grouped['noisy'] == EXPECTED_SNR_LEVELS]
clean_reference = list(df_count.index)
df = df[df['clean'].isin(clean_reference)]
noise_reference = list(df['noise_audio'])

#Filter noise and clean audios according to references from noisy audios
# (sets keep the membership tests O(1) per file)
clean_reference_lookup = set(clean_reference)
noise_reference_lookup = set(noise_reference)
clean_speech_list_complete = [item for item in clean_speech_list_complete if item in clean_reference_lookup]
noise_list_complete = [item for item in noise_list_complete if item in noise_reference_lookup]

noisy_speech_list_complete = df['noisy_audio'].values

#Compare original vs complete audios

print('noisy speech original {}'.format(len(noisy_speech_list)))
print('noisy speech complete {}'.format(len(noisy_speech_list_complete)))
print('clean speech original {}'.format(len(clean_speech_list)))
print('clean speech complete {}'.format(len(clean_speech_list_complete)))
print('noise original {}'.format(len(noise_list)))
print('noise complete {}'.format(len(noise_list_complete)))

#Update audios

clean_speech_list = clean_speech_list_complete
noise_list = noise_list_complete
noisy_speech_list = noisy_speech_list_complete

noisy speech original 3701
noisy speech complete 3690
clean speech original 1237
clean speech complete 1230
noise original 3705
noise complete 3690


In [131]:
# Number of clean samples and number of noisy versions (SNR levels) per clean sample.
samples = len(clean_speech_list)
# The later positional indexing assumes every clean sample has the same number of
# noisy versions, so fail early if the counts do not divide evenly.
assert samples > 0 and len(noisy_speech_list) % samples == 0, \
    'noisy speech files ({}) are not a whole multiple of clean speech files ({})'.format(
        len(noisy_speech_list), samples)
# Integer division keeps the count an int; it is used as a randint bound and index stride.
snr_levels = len(noisy_speech_list) // samples

In [132]:
# Framing parameters (16 kHz audio is assumed)
frames_per_sec, sample_length, segment_length = (
    32,   # number of frames per second
    30,   # sample length in seconds
    5,    # audio segment length, in seconds, considered in the network training
)
# Number larger than the longest audio wave
wave_max_length = 600000

# Frame counts derived from the durations above
segment_frames_lenght = frames_per_sec * segment_length
frames_per_sample = sample_length * frames_per_sec

In [133]:
# Pick randomly one of the snr levels of the Noise and Noisy speech samples
# for every clean sample.
from random import seed
from random import randint
seed(1)  # seeds Python's `random` module only

# The draw below uses NumPy, which random.seed() does not affect, so a dedicated
# seeded generator is used to make the selection reproducible.
snr_rng = np.random.RandomState(1)
levels_per_sample = int(snr_levels)

# Files are laid out in consecutive groups of `levels_per_sample` per clean sample:
# position = (group start) + (random level inside the group).
random_snr = (
    np.arange(samples) * levels_per_sample
    + snr_rng.randint(0, levels_per_sample, size=samples)
).astype(np.int64)

rand_noisy_speech_list = [noisy_speech_list[position] for position in random_snr]
rand_noise_list = [noise_list[position] for position in random_snr]
noisy_speech_df = DataFrame(rand_noisy_speech_list, columns=['Sample'])
noise_df = DataFrame(rand_noise_list, columns=['Sample'])
clean_speech_df = DataFrame(clean_speech_list, columns=["Sample"])

In [134]:
# Generate a list of integers using the trailing digits of the file name
# file name format: noisy10_SNRdb_20.0_clnsp10.wav
def _noisy_sample_number(file_name):
    """Return the integer formed by the digits right before the 4-character extension.

    Raises:
        ValueError: if the file name has no digits before its extension, instead of
            silently leaving a non-numeric value in the sort column.
    """
    stem = file_name[:-4]
    digits = stem[len(stem.rstrip('0123456789')):]
    if not digits:
        raise ValueError('no sample number found in file name: {!r}'.format(file_name))
    return int(digits)


# Raw five-character suffixes, kept under their original name
noisy_speech_aux_list = [x[-9:-4] for x in noisy_speech_df["Sample"]]
# Independent list of parsed numbers (no longer an alias of the list above)
sample_column = [_noisy_sample_number(x) for x in noisy_speech_df["Sample"]]

In [135]:
# Order the df by sample_column number
# Noise has the same file formatting than Noisy speech so it is sorted using the same df column
assert len(sample_column) == len(noisy_speech_df) == len(noise_df), \
    'sample_column must have one entry per noisy/noise row'
# A Series on the default 0..n-1 index is aligned by label on assignment, so
# re-running this cell after the frames were sorted still gives every row its own number.
sample_number_by_row = pd.Series(sample_column, index=range(len(sample_column)))
noisy_speech_df = noisy_speech_df.assign(Sample_Number=sample_number_by_row).sort_values(by=['Sample_Number'])
noise_df = noise_df.assign(Sample_Number=sample_number_by_row).sort_values(by=['Sample_Number'])

In [136]:
# Generate a list of integers using the trailing digits of the file name
# file name format: clnsp1.wav
def _clean_sample_number(file_name):
    """Return the integer formed by the digits right before the 4-character extension.

    Raises:
        ValueError: if the file name has no digits before its extension, instead of
            silently leaving a non-numeric value in the sort column.
    """
    stem = file_name[:-4]
    digits = stem[len(stem.rstrip('0123456789')):]
    if not digits:
        raise ValueError('no sample number found in file name: {!r}'.format(file_name))
    return int(digits)


# Raw five-character suffixes, kept under their original name
clean_speech_aux_list = [x[-9:-4] for x in clean_speech_df["Sample"]]
# Independent list of parsed numbers (no longer an alias of the list above)
clean_speach_sample_number = [_clean_sample_number(x) for x in clean_speech_df["Sample"]]

In [137]:
# Order clean speech df by sample number
assert len(clean_speach_sample_number) == len(clean_speech_df), \
    'clean_speach_sample_number must have one entry per clean speech row'
# A Series on the default 0..n-1 index is aligned by label on assignment, so
# re-running this cell after the frame was sorted still gives every row its own number.
clean_number_by_row = pd.Series(clean_speach_sample_number, index=range(len(clean_speach_sample_number)))
clean_speech_df = clean_speech_df.assign(Sample_Number=clean_number_by_row).sort_values(by=['Sample_Number'])

In [138]:
# Take the "Sample" column (a pandas Series of file names) from each ordered data frame
clean_speech_ordered_list, noisy_speech_ordered_list, noise_ordered_list = (
    frame["Sample"] for frame in (clean_speech_df, noisy_speech_df, noise_df)
)

In [139]:
# Build the absolute path of every clean speech audio file in the list
pathAudio = "CleanSpeech_training/"
# os.path.join picks the separator of the running OS (the original hard-coded '\\')
clean_speech_dir = os.path.join(os.path.dirname(os.path.realpath(pathAudio)), 'CleanSpeech_training')
# basename() keeps the cell safe to re-run: an already-prefixed entry is not prefixed twice
clean_speech_ordered_list = [
    os.path.join(clean_speech_dir, os.path.basename(item))
    for item in clean_speech_ordered_list
]

In [140]:
# Build the absolute path of every noisy speech audio file in the list
pathAudio = "NoisySpeech_training/"
# os.path.join picks the separator of the running OS (the original hard-coded '\\')
noisy_speech_dir = os.path.join(os.path.dirname(os.path.realpath(pathAudio)), 'NoisySpeech_training')
# basename() keeps the cell safe to re-run: an already-prefixed entry is not prefixed twice
noisy_speech_ordered_list = [
    os.path.join(noisy_speech_dir, os.path.basename(item))
    for item in noisy_speech_ordered_list
]

In [141]:
# Build the absolute path of every noise audio file in the list
pathAudio = "Noise_training/"
# os.path.join picks the separator of the running OS (the original hard-coded '\\')
noise_dir = os.path.join(os.path.dirname(os.path.realpath(pathAudio)), 'Noise_training')
# basename() keeps the cell safe to re-run: an already-prefixed entry is not prefixed twice
noise_ordered_list = [
    os.path.join(noise_dir, os.path.basename(item))
    for item in noise_ordered_list
]

In [142]:
# Network architecture
# The model outputs a per-feature gain in [0, 1] (sigmoid) computed from a window of
# noisy-speech frames. clean_speech, vad_clean_speech and noise_speech are extra
# inputs that never feed a layer here; they are declared so a loss can reference them.
window_lenght = 32
feature_dim = 257  # size of the last axis of every input and of the gain output

noisy_speech = layers.Input(shape=(window_lenght, feature_dim))
# shape must be a tuple: (257) is just the int 257, (257,) is a 1-tuple
clean_speech = layers.Input(shape=(feature_dim,))
vad_clean_speech = layers.Input(shape=(feature_dim,))
noise_speech = layers.Input(shape=(feature_dim,))
# Last frame of the window; not consumed by any layer below
noisy_speech_current = layers.Reshape((feature_dim,), input_shape=(1, feature_dim))(noisy_speech[:,-1,:])

# batch_input_shape was dropped from the GRU layers: in the functional API the input
# shape comes from the tensor each layer is called on.
gru1 = layers.GRU(feature_dim, return_sequences=True)(noisy_speech)
gru1_red = layers.add([noisy_speech, gru1])
gru2 = layers.GRU(feature_dim, return_sequences=True)(gru1_red)
gru2_red = layers.add([gru2, gru1_red])
# NOTE(review): gru3 reads noisy_speech directly, so gru1/gru2 and their residual sums
# are not on the path to `gain` and are absent from the model. If the stacked GRUs are
# meant to be used, gru3 should take gru2_red instead — confirm the intent.
gru3 = layers.GRU(feature_dim, return_sequences=False)(noisy_speech)
#dense1 = layers.Dense(257, activation='sigmoid')(gru3)
gain = layers.Dense(feature_dim, activation='sigmoid')(gru3)

model = tf.keras.Model(inputs=[noisy_speech, clean_speech, vad_clean_speech, noise_speech], outputs=gain)

In [143]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections) of `model`
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           [(None, 32, 257)]    0                                            
__________________________________________________________________________________________________
gru_17 (GRU)                    (None, 257)          397836      input_21[0][0]                   
__________________________________________________________________________________________________
input_22 (InputLayer)           [(None, 257)]        0                                            
__________________________________________________________________________________________________
input_23 (InputLayer)           [(None, 257)]        0                                            
____________________________________________________________________________________________

In [144]:
# Custom loss: weighted sum of two terms, each averaged over the last axis
#   alpha       * mean((vad_clean_speech - vad_clean_speech * gain) ** 2)
#   (1 - alpha) * mean((noise_speech * gain) ** 2)
# tf.keras.backend is used (not the standalone `keras` backend) because the model and
# its tensors are tf.keras objects. The abs() before square() was redundant for
# real-valued tensors and is dropped.
alpha = 0.35
#loss = alpha * K.mean( K.sqrt( K.sum(K.square(vad_clean_speech-vad_clean_speech*gain), axis=1) ) , axis=-1) + (1-alpha) * K.mean( K.sqrt( K.sum(K.square(noise_speech*gain), axis=1)), axis=-1)
speech_residual = vad_clean_speech - vad_clean_speech * gain
residual_noise = noise_speech * gain
loss = (
    alpha * tf.keras.backend.mean(tf.keras.backend.square(speech_residual), axis=-1)
    + (1 - alpha) * tf.keras.backend.mean(tf.keras.backend.square(residual_noise), axis=-1)
)
#loss = alpha * K.mean( K.square( K.abs( clean_speech-clean_speech*gain) ), axis=-1) + (1-alpha) * K.mean(K.square( K.abs( noise_speech*gain) ) , axis=-1)
#loss = alpha * K.mean(  K.l2_normalize(clean_speech-clean_speech*gain, axis=1), axis=-1)  + (1-alpha) * K.mean( K.l2_normalize(noise_speech*gain, axis=1) , axis=-1)

In [146]:
# Attach the custom loss tensor to the model; compile() in this notebook is called without a `loss` argument
model.add_loss(loss)

In [147]:
# Two optimizers are instantiated; only `sgd` is handed to compile()
adam = optimizers.Adam(
    learning_rate=0.01,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
sgd = optimizers.SGD(
    learning_rate=0.01,
    momentum=0.0,
    nesterov=False,
)
model.compile(optimizer=sgd)



In [148]:
# Build the dataset generator used for training from the three ordered file lists
batch_size = 1
training_batch_generator = DatasetGenerator(
    clean_speech_ordered_list,
    noisy_speech_ordered_list,
    noise_ordered_list,
    frames_per_sec,
    batch_size,
)

In [149]:
# One TensorBoard log directory per run, named after the start time.
# os.path.join picks the separator of the running OS (the original hard-coded '\\').
log_dir = os.path.join("logs", "fit", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
%%time
# Train the model from the batch generator and log the run through the TensorBoard callback.
# NOTE(review): steps_per_epoch is derived from a hard-coded 1200 — confirm it matches the
# number of samples the generator is expected to serve per epoch.
model.fit_generator(generator = training_batch_generator,
                    steps_per_epoch = int(1200 // batch_size),
                    epochs = 20,
                    verbose = 1,
                    callbacks=[tensorboard_callback])

Train for 1200 steps
Epoch 1/20

In [154]:
# Save the model to an HDF5 file in the working directory.
# NOTE(review): the file name says "epochs60" while the fit call in this notebook uses
# epochs = 20 — confirm the name reflects the training that produced the file.
model.save('model_audios1200_epochs60.h5') 