The model should be signal in, signal out: it takes a raw waveform as input and returns a waveform as output.

In [2]:
# Helper functions
import numpy as np
import tensorflow as tf
from keras import layers
import keras as keras



# GENERATE DILATED LAYER FROM 1D SIGNAL
def signal_to_dilated(signal, dilation, n_channels):
    """Fold a rank-4 signal into `dilation` interleaved rows.

    Axis 2 of `signal` is zero-padded at its end up to the next multiple of
    `dilation`, reshaped to [batch, -1, dilation, n_channels] and transposed
    so that the `dilation` axis comes second.

    Args:
        signal: rank-4 tensor; assumed (batch, 1, time, n_channels) as built
            by the caller in this notebook — TODO confirm for other callers.
        dilation: dilation factor (number of interleaved rows).
        n_channels: size of the last axis of `signal`.

    Returns:
        A tuple (dilated, pad_elements); `pad_elements` is the number of
        zeros appended to axis 2, needed later to undo the padding.
    """
    dims = tf.shape(signal)
    # Trailing zeros needed so that axis 2 becomes a multiple of `dilation`
    # (evaluates to 0 when it already is one).
    pad_elements = dilation - 1 - (dims[2] + dilation - 1) % dilation
    paddings = [[0, 0], [0, 0], [0, pad_elements], [0, 0]]
    padded = tf.pad(signal, paddings)
    folded = tf.reshape(padded, [dims[0], -1, dilation, n_channels])
    return tf.transpose(folded, perm=[0, 2, 1, 3]), pad_elements


# COLLAPSE DILATED LAYER TO 1D SIGNAL
def dilated_to_signal(dilated, pad_elements, n_channels):
    """Collapse a dilated (folded) tensor back into a single-row signal.

    Swaps axes 1 and 2, flattens them into one axis of a
    [batch, 1, -1, n_channels] tensor and drops the last `pad_elements`
    entries of that axis.

    Args:
        dilated: rank-4 tensor, e.g. the first value returned by
            `signal_to_dilated`.
        pad_elements: number of trailing elements to remove.
        n_channels: size of the last axis of `dilated`.

    Returns:
        Tensor of shape [batch, 1, dims[1] * dims[2] - pad_elements, n_channels].
    """
    dims = tf.shape(dilated)
    # Length of the flattened axis once the padding is stripped again.
    valid_length = dims[1] * dims[2] - pad_elements
    interleaved = tf.transpose(dilated, perm=[0, 2, 1, 3])
    flat = tf.reshape(interleaved, [dims[0], 1, -1, n_channels])
    return flat[:, :, :valid_length, :]


# IDENTITY INITIALIZATION OF CONV LAYERS
def identity_initializer():
    """Build an initializer that makes a conv kernel act as the identity.

    The returned callable creates an all-zero kernel of the requested 4-D
    `shape` and sets the centre tap (shape[0] // 2, shape[1] // 2) to 1 for
    every channel index i < min(shape[2], shape[3]), mapping channel i to
    channel i.
    """
    def _initializer(shape, dtype=tf.float32, partition_info=None):
        # `partition_info` is accepted but not used.
        kernel = np.zeros(shape, dtype=float)
        center_row = shape[0] // 2
        center_col = shape[1] // 2
        # Channel indices along the diagonal of the last two axes.
        diag = np.arange(np.minimum(shape[2], shape[3]))
        kernel[center_row, center_col, diag, diag] = 1
        return tf.constant(kernel, dtype=dtype)
    return _initializer


# “In our experiments, simple training losses (e.g., L1) led to noticeably degraded output quality at lower signal-to-noise ratios (SNRs).” ([Germain et al., 2018, p. 2](zotero://select/library/items/A6D78SNY)) ([pdf](zotero://open-pdf/library/items/P4HPP4P3?page=2&annotation=DZN467TR))
def loss_function(target, current, type):
    """Return the mean element-wise loss between `target` and `current`.

    Args:
        target: reference tensor.
        current: tensor to compare against `target`.
        type: 'L1' for mean absolute error or 'L2' for mean squared error.
            (The name shadows the builtin but is kept because callers pass
            it by keyword.)

    Returns:
        Scalar tensor holding the mean loss.

    Raises:
        ValueError: if `type` is neither 'L1' nor 'L2'. Without this an
            unknown type would fall through and return None, which only
            fails later at the call site.
    """
    if type == 'L1':
        return tf.reduce_mean(tf.abs(target - current))
    if type == 'L2':
        return tf.reduce_mean(tf.square(target - current))
    raise ValueError("Unsupported loss type: %r (expected 'L1' or 'L2')" % (type,))

“receptive field of the pipeline is 2^14 + 1 samples, i.e., about 1 s of audio for fs = 16 kHz.” ([Germain et al., 2018, p. 2](zotero://select/library/items/A6D78SNY)) ([pdf](zotero://open-pdf/library/items/P4HPP4P3?page=2&annotation=WTGLQ8JQ))

In [24]:
# De-noising Network
import numpy as np
import tensorflow as tf
from keras import layers
class AdaptiveNormalization(tf.keras.layers.Layer):
    """Learned blend of the identity and batch normalization.

    Computes ``alpha * x + beta * BatchNormalization(x)`` with two scalar
    variables. They start at alpha = 1.0 and beta = 0.0, so the layer's
    output initially equals its input.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Scalar mixing weights for the identity and normalized paths.
        self.alpha = tf.Variable(1.0, name='alpha')
        self.beta = tf.Variable(0.0, name='beta')
        self.batch_norm = tf.keras.layers.BatchNormalization()

    def call(self, x):
        identity_term = self.alpha * x
        normalized_term = self.beta * self.batch_norm(x)
        return identity_term + normalized_term
def LeakyReLU(x):
    """Point-wise leaky rectified linear unit, max(0.2x, x).

    Follows the LReLU definition quoted from Germain et al., 2018, p. 2.
    """
    leak = 0.2 * x
    return tf.maximum(leak, x)

n_layers = 7  # number of internal layers
n_channels = 16  # number of feature maps (64 in the paper)
norm_type = "AN"


def _no_normalization(name=None):
    """Return a pass-through callable used when normalization is disabled."""
    return lambda tensor: tensor


if norm_type == "AN":  # Adaptive Norm
    norm_fn = AdaptiveNormalization
elif norm_type == "SBN":  # Std Batch Norm
    norm_fn = layers.BatchNormalization
else:  # NO LAYER NORMALIZATION
    # norm_fn is always used as norm_fn(name=...)(tensor) below, so it has to
    # stay callable; a bare None would raise TypeError on the first layer.
    norm_fn = _no_normalization

# Input is a single-channel waveform of shape (time, 1) per example.
model_input = tf.keras.layers.Input(shape=(None, 1), dtype=tf.float32)
input = tf.expand_dims(model_input, axis=-1)  # add a conv feature dimension: (batch, time, 1, 1)
input = tf.transpose(input, [0, 2, 1, 3])  # transpose to (batch, 1, time, 1)
for current_layer in range(n_layers):
    if current_layer == 0:
        # First layer: plain (undilated) 1x3 convolution on the raw signal.
        net = tf.keras.layers.Conv2D(n_channels, kernel_size=[1, 3], activation=LeakyReLU,
                                     name='se_conv_%d' % current_layer, padding='SAME')(input)
        # net = tf.keras.layers.Dropout(0.3)(net) # I added this dropout layer
        net = norm_fn(name='se_norm_%d' % current_layer)(net)
    else:
        # The content of each intermediate layer is computed from the previous layer
        # via a dilated convolution with 3 x 1 convolutional kernels.
        # "Here, we increase the dilation factor exponentially with depth from 2^0 for
        # the 1st intermediate layer to 2^12 for the 13th one." (Germain et al., 2018, p. 2)
        dilation_factor = 2 ** current_layer
        # The dilation is realised by folding the signal, convolving each row,
        # and unfolding again.
        net, pad_elements = signal_to_dilated(net, n_channels=n_channels, dilation=dilation_factor)
        net = layers.Conv2D(n_channels, kernel_size=[1, 3], activation=LeakyReLU,
                            name='se_conv_%d' % current_layer, padding='SAME')(net)
        # net = tf.keras.layers.Dropout(0.2)(net) # I added this dropout layer
        net = norm_fn(name='se_norm_%d' % current_layer)(net)
        net = dilated_to_signal(net, n_channels=n_channels, pad_elements=pad_elements)
net = layers.Conv2D(n_channels, kernel_size=[1, 3], activation=LeakyReLU,
                    name='se_conv_last', padding='SAME')(net)
net = norm_fn(name='se_norm_last')(net)
# NOTE(review): sigmoid bounds the output to (0, 1); confirm the target
# signals are scaled to that range.
net = layers.Conv2D(1, kernel_size=[1, 1], activation='sigmoid',
                    name='se_fc_last', padding='SAME')(net)
# Undo the transpose and squeeze the feature dimension: (batch, time, 1).
output = tf.squeeze(tf.transpose(net, [0, 2, 1, 3]), axis=-1)
model = keras.Model(inputs=model_input, outputs=output)
# model.summary()

In [25]:
import numpy as np
import os
# NOTE(review): environment variables are usually read when TensorFlow
# initialises; if TensorFlow was already imported by an earlier cell this
# assignment may have no effect — confirm it is still needed.
os.environ['XLA_FLAGS'] = '--xla=false'
import tensorflow as tf
data_path = os.path.join(os.getcwd(),'data')


def _load_ragged_signals(filename):
    """Load an object array of 1-D signals as a ragged tensor with a trailing channel axis."""
    # allow_pickle=True can execute code embedded in the file: only load
    # trusted, locally generated data.
    arrays = np.load(os.path.join(data_path, filename), allow_pickle=True)
    ragged = tf.ragged.stack([tf.constant(a) for a in arrays], axis=0)
    return tf.expand_dims(ragged, axis=-1)


X = _load_ragged_signals('inputs_100_1000_signal.npy')

Y = _load_ragged_signals('targets_100_1000_signal.npy')
dataset = tf.data.Dataset.from_tensor_slices((X, Y))
# Shuffle once with a fixed order. With the default
# reshuffle_each_iteration=True the order changes on every pass, so take()
# and skip() below would pick different elements each epoch and validation
# examples would leak into the training split.
dataset = dataset.shuffle(seed=70, buffer_size=1000, reshuffle_each_iteration=False)
data_size = dataset.cardinality().numpy()
train_size = int(0.75 * data_size)

# NOTE(review): the splits are not batched here, while the inference cell
# adds a leading batch axis with tf.expand_dims before calling the model —
# confirm whether a .batch(...) step is needed before training.
train_set = dataset.take(train_size)
val_set = dataset.skip(train_size)

In [26]:
# A larger batch size can lead to faster training but may give less accurate
# models; a smaller batch size trains more slowly but may give more accurate
# models. Start at 64/128.
# In general, start with a small number of epochs (e.g. 10-20) and increase it
# until the model begins to overfit the training data.
tf.debugging.set_log_device_placement(True)
print(tf.config.list_physical_devices('GPU'))
print(tf.test.is_built_with_cuda())
print(tf.test.gpu_device_name())

from keras import callbacks
import matplotlib.pyplot as plt
import os
SE_LOSS_LAYERS = 6 # NUMBER OF FEATURE LOSS LAYERS
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
def L1_loss(y_true, y_pred):
    """Mean absolute error between prediction and target."""
    return tf.reduce_mean(tf.abs(y_pred - y_true))
# Register the custom loss by name (it is not the loss compiled below).
custom_objects = tf.keras.utils.get_custom_objects()
custom_objects['L1_loss'] = L1_loss
# NOTE(review): 'accuracy' is kept for log compatibility, but it is not a
# meaningful metric for continuous waveform targets.
model.compile(loss=tf.keras.losses.MSE, optimizer=optimizer, metrics=['mse', 'mae', 'accuracy' ])
model_path = os.path.join(os.getcwd(),'saved','models', 'MSE2_FCNN_AN.model')
# Resume from a previous run when a checkpoint already exists.
if os.path.exists(model_path):
    model.load_weights(model_path)
    print('Model loaded from: ', model_path)
checkpoint = callbacks.ModelCheckpoint(model_path, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False,mode='min')
stop = callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)
# callbacks=[checkpoint, stop]
# batch_size is deliberately not passed: Keras documents that it must not be
# specified for tf.data.Dataset inputs, because the dataset defines the
# batches itself. Batch the datasets with Dataset.batch / padded_batch instead.
# NOTE(review): train_set / val_set are created without .batch(...) in the
# data cell — confirm the intended batching.
history = model.fit(train_set, epochs=150, validation_data=val_set, callbacks=[checkpoint, stop], verbose=1)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
True
/device:GPU:0




















































Model loaded from:  c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\MSE2_FCNN_AN.model
Epoch 1/150
Epoch 1: val_loss improved from inf to 0.00247, saving model to c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\MSE2_FCNN_AN.model
















































































































































INFO:tensorflow:Assets written to: c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\MSE2_FCNN_AN.model\assets


INFO:tensorflow:Assets written to: c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\MSE2_FCNN_AN.model\assets


Epoch 2/150
Epoch 2: val_loss improved from 0.00247 to 0.00242, saving model to c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\MSE2_FCNN_AN.model




INFO:tensorflow:Assets written to: c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\MSE2_FCNN_AN.model\assets


INFO:tensorflow:Assets written to: c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\MSE2_FCNN_AN.model\assets


Epoch 3/150
Epoch 3: val_loss improved from 0.00242 to 0.00229, saving model to c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\MSE2_FCNN_AN.model




INFO:tensorflow:Assets written to: c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\MSE2_FCNN_AN.model\assets


INFO:tensorflow:Assets written to: c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\MSE2_FCNN_AN.model\assets


Epoch 4/150
Epoch 4: val_loss did not improve from 0.00229
Epoch 5/150

In [13]:
# model_path = os.path.join(os.getcwd(),'saved','models', 'L1_FCNN_AN.model')
# model.save(model_path)



INFO:tensorflow:Assets written to: c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\L1_FCNN_AN.model\assets


INFO:tensorflow:Assets written to: c:\Users\Julio\Documents\Projects\audio-denoiser\saved\models\L1_FCNN_AN.model\assets


In [23]:
import librosa
import numpy as np
import IPython.display as ipd
import soundfile as sf

SAMPLE_RATE = 16000  # sample rate used for every file written below

idx = 69
demo_tensor = X[idx]
demo_target = Y[idx]
# The model expects a leading batch axis.
model_input = tf.expand_dims(demo_tensor, axis=0)
prediction = model.predict(model_input).squeeze()
input = demo_tensor.numpy().squeeze()
target = demo_target.numpy().squeeze()
sf.write('input.wav', input, SAMPLE_RATE)
sf.write('prediction.wav', prediction, SAMPLE_RATE)
# Write the target to its own file; writing it to 'prediction.wav' would
# overwrite the prediction saved on the previous line.
sf.write('target.wav', target, SAMPLE_RATE)
ipd.Audio(prediction, rate=SAMPLE_RATE)



“To compute the loss between two waveforms, we apply a pretrained audio classification network to each waveform and compare the internal activation patterns induced in the network by the two signals” ([Germain et al., 2018, p. 1](zotero://select/library/items/A6D78SNY)) ([pdf](zotero://open-pdf/library/items/P4HPP4P3?page=1&annotation=Y3F49L4C))

“The network consists of 15 convolutional layers with 3×1 kernels, batch normalization, LReLU units, and zero padding” ([Germain et al., 2018, p. 2](zotero://select/library/items/A6D78SNY)) ([pdf](zotero://open-pdf/library/items/P4HPP4P3?page=2&annotation=J3JNI54Q))


In [None]:
# FEATURE LOSS NETWORK
import keras as keras
# TODO
# Still in the process of converting to keras
# still need to train the network
# still need to figure out how to connect the feature loss network to the main network
n_layers=14
training=True
reuse=False
# "The number of channels is doubled every 5 layers, with 32 channels in the first intermediate layer."
base_channels=32
doubling_rate=5
conv_layers = []
norm_type="SBN"
if norm_type == "AN": # ADAPTIVE BATCH NORM
    norm_fn = AdaptiveNormalization
else: # Std BN
    norm_fn = layers.BatchNormalization


# STRUCTURE OF THE FEATURE LOSS NETWORK VERY SIMILAR TO THE MAIN NETWORK
# NOTE(review): `input` is the (batch, 1, time, 1) tensor created in the
# de-noising network cell; the playback cell rebinds `input` to a NumPy
# array, so this cell depends on execution order — confirm.
for current_layer in range(n_layers):
    n_channels = base_channels * (2 ** (current_layer // doubling_rate)) # UPDATE CHANNEL COUNT
    is_last_layer = current_layer == n_layers - 1
    layer_input = input if current_layer == 0 else conv_layers[-1]
    # "Each layer is decimated by 2" - just means a stride of 2 in the time
    # dimension. The last layer is not decimated.
    strides = [1, 1] if is_last_layer else [1, 2]
    # Keras layers are constructed first and then called on their input; the
    # stride keyword is `strides`.
    net = layers.Conv2D(n_channels, kernel_size=[1, 3], strides=strides, activation=LeakyReLU,
                        name='loss_conv_%d' % current_layer, padding='SAME')(layer_input)
    net = norm_fn()(net)
    conv_layers.append(net)
    # TODO
    # "Each channel in the last layer is averaged-pooled to produce the output feature vector."

def FeatureLoss(target, current, loss_weights, loss_layers):
    """Weighted feature-space loss between `current` and `target`.

    Both signals are passed through `LossNetwork`, and the L1 loss between
    the first `loss_layers` activations is divided by the matching entry of
    `loss_weights`.

    "The weights λm are set to balance the contribution of each layer to the
    loss. They are set to the inverse of the relative values of
    ‖Φm(s) − Φm(g(x; θ))‖1 after 10 training epochs. (For these first 10
    epochs, the weights are set to 1.)"

    NOTE(review): `LossNetwork` and `blk_channels` are not defined in the
    visible part of this notebook, and this relies on `loss_function`
    supporting type="L1" — confirm before use.

    Returns:
        A list whose first entry is the total loss (sum of all per-layer
        terms) followed by the `loss_layers` individual per-layer terms.
    """
    feat_current = LossNetwork(current, reuse=False, n_layers=n_layers, norm_type=norm_type)

    feat_target = LossNetwork(target, reuse=True, n_layers=n_layers, norm_type=norm_type,
                              base_channels=base_channels, blk_channels=blk_channels)

    # Per-layer weighted L1 distances between the two activation sets.
    per_layer = [
        loss_function(feat_current[layer], feat_target[layer], type="L1") / loss_weights[layer]
        for layer in range(loss_layers)
    ]

    # Denoising loss: the sum of all per-layer terms goes first.
    total = 0
    for term in per_layer:
        total += term

    return [total] + per_layer
