In [5]:
import os
import numpy as np
import tensorflow as tf
import librosa

# Step 1: Get list of file paths

def get_audio_file_paths(directory):
    """
    Get a sorted list of audio file paths from a given directory.

    ``os.listdir`` returns entries in arbitrary order. The result is sorted so
    that two directories holding identically named recordings produce lists
    that line up element by element when paired positionally (e.g. ``zip``).

    Args:
        directory (str): Path to the directory containing .wav audio files.

    Returns:
        List[str]: Sorted list of full file paths for .wav files in the directory.
    """
    return sorted(
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.endswith('.wav')
    )


# Step 2: Preprocess audio files

def preprocess_audio(file_path, target_sample_rate=16000, duration=2.0):
    """
    Turn an audio file into a fixed-length, peak-normalized waveform.

    The file is loaded at ``target_sample_rate``, scaled with
    ``librosa.util.normalize`` and then forced to exactly
    ``int(target_sample_rate * duration)`` samples: longer signals are cut at
    the end, shorter ones are zero-padded at the end.

    Args:
        file_path (str): Path to the audio file.
        target_sample_rate (int): Sample rate requested from ``librosa.load``.
        duration (float): Length of the returned signal, in seconds.

    Returns:
        np.ndarray: 1-D waveform containing exactly the target number of samples.
    """
    waveform, _ = librosa.load(file_path, sr=target_sample_rate)
    waveform = librosa.util.normalize(waveform)

    n_samples = int(target_sample_rate * duration)
    shortfall = n_samples - len(waveform)

    # Negative shortfall means the clip is too long: keep only the leading part.
    if shortfall < 0:
        return waveform[:n_samples]

    # Otherwise append zeros (possibly none) so every clip has the same length.
    return np.pad(waveform, (0, shortfall), 'constant')


# Step 3: Get file lists for clean and noisy datasets

clean_dir = 'clean_testset_wav'
noisy_dir = 'noisy_testset_wav'

# os.listdir() order is arbitrary, so sort both listings: the loop below pairs
# clean and noisy recordings purely by position.
# NOTE(review): this assumes paired recordings sort into the same order in both
# folders (e.g. identical filenames) - confirm for your dataset.
clean_files = sorted(get_audio_file_paths(clean_dir))
noisy_files = sorted(get_audio_file_paths(noisy_dir))

# zip() would silently drop surplus files and shift every pair after a missing
# recording, so refuse to continue on an empty or unequal listing.
if not clean_files or len(clean_files) != len(noisy_files):
    raise ValueError(
        f"Expected the same non-zero number of .wav files in {clean_dir!r} "
        f"({len(clean_files)}) and {noisy_dir!r} ({len(noisy_files)})"
    )

# Step 4: Prepare dataset

# Process all pairs into fixed-length waveforms
clean_signals = []
noisy_signals = []

for clean_fp, noisy_fp in zip(clean_files, noisy_files):
    clean_signals.append(preprocess_audio(clean_fp))
    noisy_signals.append(preprocess_audio(noisy_fp))

# Convert lists to numpy arrays for training: (num_files, timesteps)
clean_signals = np.array(clean_signals)
noisy_signals = np.array(noisy_signals)

# Add channel dimension for TensorFlow (batch_size, timesteps, channels)
clean_signals = np.expand_dims(clean_signals, -1)
noisy_signals = np.expand_dims(noisy_signals, -1)


# Step 5: Build Autoencoder Model using TensorFlow

def build_autoencoder(input_shape):
    """
    Build a 1-D convolutional autoencoder for audio denoising using TensorFlow.

    The encoder halves the time axis three times with 'same'-padded max
    pooling; the decoder doubles it three times. When the number of timesteps
    is not a multiple of 8 the decoder output would be longer than the input,
    so the surplus trailing samples are cropped and the output always has the
    same length as the input (and as the training target).

    Args:
        input_shape (tuple): Shape of input audio (timesteps, channels).
            ``timesteps`` may be ``None`` for variable-length input, in which
            case no cropping layer is added.

    Returns:
        tf.keras.Model: Autoencoder compiled with the Adam optimizer and MSE loss.
    """
    inputs = tf.keras.layers.Input(shape=input_shape)

    # Encoder
    x = tf.keras.layers.Conv1D(16, 3, padding='same', activation='relu')(inputs)
    x = tf.keras.layers.MaxPooling1D(2, padding='same')(x)
    x = tf.keras.layers.Conv1D(8, 3, padding='same', activation='relu')(x)
    x = tf.keras.layers.MaxPooling1D(2, padding='same')(x)
    x = tf.keras.layers.Conv1D(8, 3, padding='same', activation='relu')(x)
    encoded = tf.keras.layers.MaxPooling1D(2, padding='same')(x)

    # Decoder
    x = tf.keras.layers.Conv1D(8, 3, padding='same', activation='relu')(encoded)
    x = tf.keras.layers.UpSampling1D(2)(x)
    x = tf.keras.layers.Conv1D(8, 3, padding='same', activation='relu')(x)
    x = tf.keras.layers.UpSampling1D(2)(x)
    x = tf.keras.layers.Conv1D(16, 3, padding='same', activation='relu')(x)
    x = tf.keras.layers.UpSampling1D(2)(x)
    # tanh keeps the reconstruction in [-1, 1]
    decoded = tf.keras.layers.Conv1D(1, 3, padding='same', activation='tanh')(x)

    # Each 'same'-padded pooling rounds the length up, so after upsampling the
    # signal can be up to 7 samples longer than the input. Trim that surplus;
    # for lengths divisible by 8 nothing is added and the graph is unchanged.
    timesteps = input_shape[0]
    if timesteps is not None:
        pooled_length = timesteps
        for _ in range(3):
            pooled_length = -(-pooled_length // 2)  # ceil division
        surplus = pooled_length * 8 - timesteps
        if surplus > 0:
            decoded = tf.keras.layers.Cropping1D((0, surplus))(decoded)

    # Model
    autoencoder = tf.keras.Model(inputs, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    return autoencoder


# Step 6: Instantiate and train the autoencoder

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across kernel restarts.
tf.keras.utils.set_random_seed(42)

input_shape = (clean_signals.shape[1], 1)
autoencoder = build_autoencoder(input_shape)

# Train the model (noisy input -> clean target). The History object is kept so
# the loss curves can be inspected later instead of being discarded.
history = autoencoder.fit(noisy_signals, clean_signals, epochs=50, batch_size=16, validation_split=0.1)



Epoch 1/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 60ms/step - loss: 0.0165 - val_loss: 0.0055
Epoch 2/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 63ms/step - loss: 0.0049 - val_loss: 0.0041
Epoch 3/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 64ms/step - loss: 0.0039 - val_loss: 0.0039
Epoch 4/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 60ms/step - loss: 0.0039 - val_loss: 0.0038
Epoch 5/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - loss: 0.0036 - val_loss: 0.0037
Epoch 6/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 58ms/step - loss: 0.0037 - val_loss: 0.0036
Epoch 7/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 58ms/step - loss: 0.0036 - val_loss: 0.0036
Epoch 8/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 58ms/step - loss: 0.0035 - val_loss: 0.0035
Epoch 9/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x30352eee0>

In [50]:
import soundfile as sf

def denoise_audio(file_path, model, target_sample_rate=16000, duration=10.0, output_path='denoised_output.wav'):
    """
    Denoise a new audio file using the trained autoencoder model and save the output.

    The audio is loaded and cropped/padded to ``duration`` seconds. If the
    model was built with a fixed number of timesteps that differs from the
    resulting signal length, the signal is zero-padded to a whole number of
    model-sized windows, denoised window by window in one batch, and trimmed
    back to the requested length. Otherwise the whole signal is passed through
    the model in a single call.

    Args:
        file_path (str): Path to the noisy input audio file.
        model (tf.keras.Model): Trained autoencoder model.
        target_sample_rate (int): Target sample rate (Hz) for audio processing.
        duration (float): Duration (in seconds) to which audio will be cropped/padded.
        output_path (str): Path to save the denoised output .wav file.

    Returns:
        None
    """
    # Step 1: Preprocess the noisy input audio
    noisy_audio = preprocess_audio(file_path, target_sample_rate, duration)
    total_length = len(noisy_audio)

    # Step 2: Work out how many timesteps the model accepts per example.
    # A single-input Keras model reports (batch, timesteps, channels).
    window = None
    model_input_shape = getattr(model, 'input_shape', None)
    if isinstance(model_input_shape, tuple) and len(model_input_shape) == 3:
        window = model_input_shape[1]

    if window is None or window == total_length:
        # Step 3a: Lengths agree (or the model is length-agnostic): one example.
        batch = noisy_audio[np.newaxis, :, np.newaxis]
        denoised_audio = np.squeeze(model.predict(batch))
    else:
        # Step 3b: Fixed-length model: split into windows it can accept.
        num_windows = -(-total_length // window)  # ceil division
        padded = np.pad(noisy_audio, (0, num_windows * window - total_length), 'constant')
        batch = padded.reshape(num_windows, window, 1)
        # Stitch the windows back together and drop the padding added above.
        denoised_audio = np.reshape(model.predict(batch), -1)[:total_length]

    # Step 4: Save the output audio as .wav file
    sf.write(output_path, denoised_audio, target_sample_rate)
    print(f"Denoised audio saved to {output_path}")

# Demo: denoise a single recording from the noisy set with the trained model.
new_noisy_file = 'noisy_testset_wav/p232_010.wav'  # replace with your noisy input
denoise_audio(
    file_path=new_noisy_file,
    model=autoencoder,
    output_path='denoised_result.wav',
)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Denoised audio saved to denoised_result.wav


## Hyperparameter tuning

In [65]:
import os
import numpy as np
import tensorflow as tf
import librosa
import keras_tuner as kt

# Step 1: Get audio file paths
def get_audio_file_paths(directory):
    """Return the sorted full paths of all ``.wav`` files in ``directory``.

    ``os.listdir`` gives no ordering guarantee; sorting keeps the listing
    stable so two folders with identically named files line up when zipped.
    """
    return sorted(os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.wav'))

# Step 2: Preprocess audio
def preprocess_audio(file_path, target_sample_rate=16000, duration=2.0):
    """Load a file at ``target_sample_rate``, normalize it and fix its length.

    The returned 1-D array always has ``int(target_sample_rate * duration)``
    samples: longer clips are truncated, shorter ones are zero-padded at the end.
    """
    samples, _ = librosa.load(file_path, sr=target_sample_rate)
    samples = librosa.util.normalize(samples)
    target_length = int(target_sample_rate * duration)
    if len(samples) <= target_length:
        # Pad with trailing zeros (a no-op amount when the lengths already match).
        return np.pad(samples, (0, target_length - len(samples)), 'constant')
    return samples[:target_length]

# Step 3: Load clean & noisy files
clean_dir = 'clean_testset_wav'
noisy_dir = 'noisy_testset_wav'

# Sort both listings: os.listdir() order is arbitrary and the pairs below are
# formed purely by position.
# NOTE(review): assumes paired recordings sort into the same order in both
# folders (e.g. identical filenames) - confirm for your dataset.
clean_files = sorted(get_audio_file_paths(clean_dir))
noisy_files = sorted(get_audio_file_paths(noisy_dir))

# zip() truncates silently and would misalign every pair after a missing file.
if not clean_files or len(clean_files) != len(noisy_files):
    raise ValueError(
        f"Expected the same non-zero number of .wav files in {clean_dir!r} "
        f"({len(clean_files)}) and {noisy_dir!r} ({len(noisy_files)})"
    )

clean_signals, noisy_signals = [], []
for c_fp, n_fp in zip(clean_files, noisy_files):
    clean_signals.append(preprocess_audio(c_fp))
    noisy_signals.append(preprocess_audio(n_fp))

# (num_files, timesteps) -> (num_files, timesteps, 1) for Conv1D layers
clean_signals = np.expand_dims(np.array(clean_signals), -1)
noisy_signals = np.expand_dims(np.array(noisy_signals), -1)


# Step 4: Autoencoder Model with Balanced Pool/UpSampling
def build_autoencoder(hp):
    """
    Build a tunable 1-D convolutional denoising autoencoder for Keras Tuner.

    Hyperparameters registered on ``hp``: ``num_layers`` (2-4), and for each
    encoder layer ``filters_{i}`` (16-64, step 16) and ``kernel_size_{i}``
    (3 or 5), plus a log-sampled ``learning_rate`` (1e-5 to 1e-3). The decoder
    mirrors the encoder's filter counts with a fixed kernel size of 3.

    The input length is taken from the module-level ``clean_signals`` array.
    If that length is not divisible by ``2 ** num_layers`` the decoder output
    is cropped so it always matches the input/target length.

    Args:
        hp (keras_tuner.HyperParameters): Hyperparameter container for the trial.

    Returns:
        tf.keras.Model: Model compiled with Adam and MSE loss.
    """
    timesteps = clean_signals.shape[1]
    inputs = tf.keras.layers.Input(shape=(timesteps, 1))

    # Force same number of encoder and decoder layers
    num_layers = hp.Int('num_layers', min_value=2, max_value=4, step=1)
    x = inputs

    # Encoder
    filter_list = []
    pooled_length = timesteps
    for i in range(num_layers):
        filters = hp.Int(f'filters_{i}', min_value=16, max_value=64, step=16)
        filter_list.append(filters)
        kernel_size = hp.Choice(f'kernel_size_{i}', [3, 5])
        x = tf.keras.layers.Conv1D(filters, kernel_size, padding='same', activation='relu')(x)
        x = tf.keras.layers.MaxPooling1D(pool_size=2, padding='same')(x)
        # 'same' pooling rounds the length up at every stage
        pooled_length = -(-pooled_length // 2)

    # Bottleneck: same width as the deepest encoder layer (stated explicitly
    # rather than relying on the loop variable surviving the loop)
    x = tf.keras.layers.Conv1D(filter_list[-1], 3, padding='same', activation='relu')(x)

    # Decoder (mirrors encoder)
    for i in reversed(range(num_layers)):
        x = tf.keras.layers.UpSampling1D(2)(x)
        x = tf.keras.layers.Conv1D(filter_list[i], 3, padding='same', activation='relu')(x)

    # Output layer
    outputs = tf.keras.layers.Conv1D(1, 3, padding='same', activation='tanh')(x)

    # Drop the samples introduced by pooling round-up so the output length
    # equals the input length; no layer is added when the lengths already match.
    surplus = pooled_length * 2 ** num_layers - timesteps
    if surplus > 0:
        outputs = tf.keras.layers.Cropping1D((0, surplus))(outputs)

    model = tf.keras.Model(inputs, outputs)
    learning_rate = hp.Float('learning_rate', 1e-5, 1e-3, sampling='log')
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')

    return model


# Step 5: Random-search tuner over build_autoencoder, scored on validation loss
tuner = kt.RandomSearch(
    build_autoencoder,
    objective='val_loss',
    max_trials=3,
    directory='autoencoder_tuning',
    project_name='audio_denoiser_v2',
)

# Step 6: Run the search, stopping a trial once val_loss stalls for 5 epochs
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# The search and the follow-up fit use the same training budget
train_options = dict(epochs=10, batch_size=16, validation_split=0.1)

tuner.search(noisy_signals, clean_signals, callbacks=[early_stop], **train_options)

# Step 7: Rebuild a fresh model from the winning configuration and train it
best_hp = tuner.get_best_hyperparameters(1)[0]
print("Best Hyperparameters:", best_hp.values)

best_model = tuner.hypermodel.build(best_hp)
best_model.fit(noisy_signals, clean_signals, callbacks=[early_stop], **train_options)


Reloading Tuner from autoencoder_tuning/audio_denoiser_v2/tuner0.json
Best Hyperparameters: {'num_layers': 3, 'filters_0': 64, 'kernel_size_0': 3, 'filters_1': 64, 'kernel_size_1': 3, 'learning_rate': 0.0001562434526149395, 'filters_2': 64, 'kernel_size_2': 5, 'filters_3': 16, 'kernel_size_3': 5}
Epoch 1/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 486ms/step - loss: 0.0147 - val_loss: 0.0041
Epoch 2/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 527ms/step - loss: 0.0038 - val_loss: 0.0033
Epoch 3/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 563ms/step - loss: 0.0032 - val_loss: 0.0029
Epoch 4/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 577ms/step - loss: 0.0027 - val_loss: 0.0026
Epoch 5/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 581ms/step - loss: 0.0025 - val_loss: 0.0024
Epoch 6/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 587ms/step - loss: 0.

<keras.src.callbacks.history.History at 0x322392400>

In [67]:
# Winning hyperparameter set from the tuner
best_hp = tuner.get_best_hyperparameters(1)[0]

# Fresh, untrained model with that configuration
final_model = build_autoencoder(best_hp)
final_model.summary()  # Optional: show architecture

# Retrain with a longer epoch budget. 10% of the samples are still held out
# (validation_split) so early stopping has a val_loss to monitor.
final_fit_options = dict(
    epochs=50,
    batch_size=16,
    validation_split=0.1,
    callbacks=[early_stop],
)
final_model.fit(noisy_signals, clean_signals, **final_fit_options)

# Persist the trained model to disk
final_model.save("audio_denoiser_best_model.h5")


Epoch 1/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 588ms/step - loss: 0.0155 - val_loss: 0.0042
Epoch 2/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 726ms/step - loss: 0.0039 - val_loss: 0.0034
Epoch 3/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 763ms/step - loss: 0.0033 - val_loss: 0.0030
Epoch 4/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 783ms/step - loss: 0.0029 - val_loss: 0.0028
Epoch 5/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 842ms/step - loss: 0.0028 - val_loss: 0.0026
Epoch 6/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 850ms/step - loss: 0.0026 - val_loss: 0.0025
Epoch 7/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 885ms/step - loss: 0.0023 - val_loss: 0.0024
Epoch 8/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 773ms/step - loss: 0.0024 - val_loss: 0.0023
Epoch 9/50
[1m47/47[0m [32m━━



In [68]:
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError

# Reload the saved denoiser. 'mse' is mapped to a loss object through
# custom_objects - presumably so the loss name stored with the model resolves
# on load; confirm against your Keras version.
model = load_model(
    "audio_denoiser_best_model.h5",
    custom_objects={'mse': MeanSquaredError()},
)




In [69]:
import librosa
import numpy as np
import soundfile as sf  # Recommended for saving audio


In [76]:
def preprocess_audio(file_path, target_sample_rate=32000):
    """Load ``file_path`` at ``target_sample_rate`` and return it normalized.

    Unlike the fixed-length variant used for training, the signal keeps its
    full length here; splitting into windows is left to ``chunk_audio``.
    """
    waveform, _ = librosa.load(file_path, sr=target_sample_rate)
    return librosa.util.normalize(waveform)

def chunk_audio(audio, chunk_size=32000):
    """Split a 1-D signal into consecutive, non-overlapping rows of ``chunk_size``.

    The signal is zero-padded at the end up to the next multiple of
    ``chunk_size`` so the final row is complete.

    Returns:
        np.ndarray: Array of shape (num_chunks, chunk_size).
    """
    total = len(audio)
    n_rows = int(np.ceil(total / chunk_size))
    tail = n_rows * chunk_size - total  # zeros needed to fill the last row
    return np.pad(audio, (0, tail), 'constant').reshape(n_rows, chunk_size)

def denoise_audio(input_file, output_file, model, target_sample_rate=32000):
    """
    Denoise an audio file of any length in fixed-size chunks and save the result.

    The signal is split into chunks matching the model's fixed input length
    (falling back to 32000 samples when the model does not report one), all
    chunks are denoised in a single ``model.predict`` call, and the zero
    padding added to complete the last chunk is removed before the output is
    normalized and written.

    Args:
        input_file (str): Path to the noisy input audio file.
        output_file (str): Path of the .wav file to write.
        model: Trained Keras model mapping (batch, timesteps, 1) to the same shape.
        target_sample_rate (int): Sample rate used for loading and for the saved file.

    Returns:
        None

    Raises:
        ValueError: If the input file contains no samples.
    """
    # NOTE(review): the training cells in this notebook load audio with
    # preprocess_audio's 16000 Hz default, while this default is 32000 Hz.
    # Confirm which rate the saved model expects before relying on the default.

    # Step 1: Load and normalize the noisy input
    audio = preprocess_audio(input_file, target_sample_rate)
    original_length = len(audio)
    if original_length == 0:
        raise ValueError(f"No audio samples found in {input_file!r}")

    # Step 2: Pick the chunk length the model accepts. A single-input Keras
    # model reports (batch, timesteps, channels).
    chunk_size = 32000
    model_input_shape = getattr(model, 'input_shape', None)
    if (
        isinstance(model_input_shape, tuple)
        and len(model_input_shape) == 3
        and model_input_shape[1] is not None
    ):
        chunk_size = model_input_shape[1]

    chunks = chunk_audio(audio, chunk_size=chunk_size)

    # Step 3: Denoise every chunk in one batched call: (num_chunks, chunk_size, 1)
    denoised_chunks = model.predict(chunks[..., np.newaxis])

    # Step 4: Stitch the chunks together and drop the padding added for the
    # final chunk, so the output is as long as the input
    denoised_audio = np.reshape(denoised_chunks, -1)[:original_length]
    denoised_audio = librosa.util.normalize(denoised_audio)

    # Step 5: Save the denoised audio
    sf.write(output_file, denoised_audio, target_sample_rate)
    print(f"Denoised audio saved to: {output_file}")

# Demo: run the chunked denoiser on one noisy recording with the reloaded model.
input_wav = "noisy_testset_wav/p232_010.wav"
output_wav = "chunked_denoised_audio.wav"

denoise_audio(input_file=input_wav, output_file=output_wav, model=model)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Denoised audio saved to: chunked_denoised_audio.wav
