<a href="https://colab.research.google.com/github/jkohler-u/Denoise_tf/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#connect to colab
!git clone https://github.com/jkohler-u/Denoise_tf.git
import sys
sys.path.append('/content/Denoise_tf')

Cloning into 'Denoise_tf'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 65 (delta 32), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (65/65), 6.32 MiB | 2.23 MiB/s, done.


In [None]:
#our own pyfiles
import preprocessing
import unet
import postprocessing

In [None]:
# Load the TensorBoard notebook extension; it provides the `%tensorboard` magic
# that is used at the end of this notebook to inspect the training logs.
%load_ext tensorboard

In [None]:
#working with files
from zipfile import ZipFile
from google.colab import drive
import os

# Mount Google Drive and make its root the working directory; both training
# archives are opened by relative name from there.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive')

# Unpack each archive to disk, into its own folder below the working directory:
# clean speech goes to 'a1', noisy speech to 'a2'.
# (`filename` holds the destination folder of the archive being extracted.)
for archive_name, filename in (
    ('CleanSpeech_training.zip', 'a1'),
    ('NoisySpeech_training.zip', 'a2'),
):
  with ZipFile(archive_name, 'r') as zip_file:
    zip_file.extractall(filename)

Mounted at /content/drive


In [None]:
#creating the dataset
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf

# Length argument handed to preprocessing.make_same for every clip
# (presumably the number of audio samples per clip - confirm in preprocessing.py).
target_length = 196500

#load the data ('a2' holds the noisy speech, 'a1' the clean speech)
train_noisy = preprocessing.prepare_data('a2')
train_clean = preprocessing.prepare_data('a1')
train_noisy = preprocessing.make_same(train_noisy, target_length)
train_clean = preprocessing.make_same(train_clean, target_length)

# Turn all the data into spectrograms
clean_spec = preprocessing.spectrogram(np.array(train_clean))
noisy_spec = preprocessing.spectrogram(np.array(train_noisy))

#convert the data into a tf dataset of (input, target) = (noisy, clean) pairs
tf_dataset = tf.data.Dataset.from_tensor_slices((noisy_spec, clean_spec))

# Calculate the total number of samples in the dataset
num_samples = tf.data.experimental.cardinality(tf_dataset).numpy()

# Shuffle ONCE. By default `shuffle` draws a new order on every iteration (i.e.
# every epoch), which would make the take/skip split below select different
# samples each epoch and leak validation samples into training.
tf_dataset = tf_dataset.shuffle(num_samples, reshuffle_each_iteration=False)

#randomly split the dataset into train (80%) and test (20%) sets
train_size = int(0.8 * num_samples)
test_size = num_samples - train_size
train_dataset = tf_dataset.take(train_size)
test_dataset = tf_dataset.skip(train_size)

#batch
batch_size = 10
# Re-shuffle only the training split each epoch; the split itself stays fixed.
# (buffer size must be positive, hence the max() guard for tiny datasets)
train_dataset = train_dataset.shuffle(max(train_size, 1)).batch(batch_size)
test_dataset = test_dataset.batch(batch_size)


In [None]:
# Train the model defined in `unet` and record the run for TensorBoard.
#tf
import datetime
import tensorflow as tf
from tensorflow import keras

#added tensorboard log: one timestamped directory per run below logs/fit/
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

#run the model
# Note: `shuffle` is intentionally not passed. Keras ignores that argument when
# the input is a tf.data.Dataset; sample order is decided by the dataset pipeline.
unet.model.fit(
    train_dataset,
    epochs=20,
    validation_data=test_dataset,
    callbacks=[tensorboard_callback],
)

Epoch 1/20
Epoch 2/20

In [None]:
# Pull one example out of the training data: the model input (an object that
# supports `.batch`, presumably a tf dataset) together with the matching noisy
# and clean spectrograms that are plotted and played back further down.
prediciton, noisy, clean = postprocessing.get_one_of_each(train_dataset)

# Feed the single example to the model as a batch of one ...
single_batch = prediciton.batch(1)
model_output = unet.model.predict(single_batch)

# ... and drop the size-1 dimensions again.
pred = tf.squeeze(model_output)

In [None]:
#plotting the data
import matplotlib.pyplot as plt

# One row with three panels: noisy input, clean target, model prediction.
f, axarr = plt.subplots(1, 3, figsize=(20, 4))

panels = (
    (noisy, "noisy"),
    (clean, "clean"),
    (pred, "prediction"),
)
for axis, (spec, title) in zip(axarr, panels):
    # origin='lower' puts the first row of the array at the bottom of the image
    axis.imshow(spec, aspect='auto', origin='lower', cmap='viridis')
    axis.set_title(title)


In [None]:
#displaying audio
import IPython
import soundfile as sf

#convert the spectrograms back to audio (noisy, clean, prediction - in that order)
n, c, p = [postprocessing.convert_to_audio(spec) for spec in (noisy, clean, pred)]

#create audio files, all written with the same sample rate
sr = 16000
wav_names = ('noisy.wav', 'clean.wav', 'pred.wav')
for wav_name, samples in zip(wav_names, (n, c, p)):
    sf.write(wav_name, samples, sr)

#noisy audio
IPython.display.Audio('noisy.wav')

In [None]:
# Play back clean.wav, the clean reference audio written by the audio-export cell above.
IPython.display.Audio('clean.wav')

In [None]:
# Play back pred.wav, the audio reconstructed from the model's prediction
# (written by the audio-export cell above).
IPython.display.Audio('pred.wav')

In [None]:
# Open TensorBoard on the directory the training cell writes its run logs to.
%tensorboard --logdir logs/fit

Reconstructing audio from a mel-spectrogram always gives it a tinny sound. This is because converting audio into a spectrogram loses the phase information. To fix this, we would convert the audio into a larger, regular (non-mel) spectrogram and later reconstruct the phase information with the Griffin-Lim algorithm. We show the code for that as a proof of concept below.

In [None]:
import librosa
import IPython
import soundfile as sf

# STFT settings shared by every transform below, so analysis and synthesis match.
N_FFT = 1024
HOP_LENGTH = 256

#we take one preprocessed audio sample
S = np.array(train_clean[0])

#convert audio to a complex spectrogram and keep its linear magnitude
spectrogram = librosa.stft(S, n_fft=N_FFT, hop_length=HOP_LENGTH, window='hann')
magnitude = np.abs(spectrogram)

#convert the magnitude to dB scale, measured relative to its peak value
spectrogramdb = librosa.amplitude_to_db(magnitude, ref=np.max)

#convert the spectrogram back from dB scale. The reference has to be the peak
#magnitude that was used as the 0 dB point above (not the largest dB value).
spectrogram_linear = librosa.db_to_amplitude(spectrogramdb, ref=np.max(magnitude))

#estimate a signal consistent with that magnitude using the Griffin-Lim algorithm.
#Griffin-Lim expects linear, non-negative STFT magnitudes (not dB values) and
#returns a waveform, not a phase. Its start phase is random, so the seed is fixed
#to make the result reproducible.
griffinlim_audio = librosa.griffinlim(
    spectrogram_linear,
    n_iter=100,
    hop_length=HOP_LENGTH,
    win_length=N_FFT,
    window='hann',
    length=S.shape[-1],
    random_state=0,
)

#the phase estimate is the angle of the STFT of the Griffin-Lim waveform
s = librosa.stft(griffinlim_audio, n_fft=N_FFT, hop_length=HOP_LENGTH, window='hann')
phase = np.angle(s)

# Combine the magnitude and the estimated phase to obtain the complex spectrogram,
# then convert it back to audio
spectrogram_complex = spectrogram_linear * np.exp(1j * phase)
reconstructed_audio = librosa.istft(spectrogram_complex, hop_length=HOP_LENGTH, window='hann')

#play the audio
sr = 16000
sf.write('reconstructed.wav', reconstructed_audio, sr)
IPython.display.Audio('reconstructed.wav')