5.1  Frequency Masking

Frequency masking simulates the effect of missing frequency bands in audio signals. This can occur in real-world scenarios like microphone limitations or environmental factors.

In [1]:

import os
import librosa
import numpy as np
import soundfile as sf
import random

def add_frequency_mask(audio, sr, freq_mask_width=1000):
    """Zero out one randomly placed frequency band of ``audio``.

    The signal is transformed with ``librosa.stft``, a contiguous run of
    frequency rows is set to zero in the magnitude, and the result is
    transformed back with ``librosa.istft`` using the original phase.

    Args:
        audio: 1-D array of samples.
        sr: Sample rate of ``audio`` in Hz.
        freq_mask_width: Width of the band to remove, in Hz. Values that
            exceed the available spectrum are clamped so that the whole
            spectrum is masked; non-positive values mask nothing.

    Returns:
        The masked signal, with the same number of samples as ``audio``.
    """
    stft = librosa.stft(audio)
    stft_magnitude, stft_phase = librosa.magphase(stft)

    num_freq_bins = stft_magnitude.shape[0]
    # The rows cover 0..sr/2 inclusive, so there are (num_freq_bins - 1)
    # intervals between them; dividing by num_freq_bins under-estimates
    # the spacing.
    hz_per_bin = sr / (2 * max(num_freq_bins - 1, 1))
    freq_mask_bins = int(freq_mask_width / hz_per_bin)
    # Clamp so random.randint below never receives an empty range.
    freq_mask_bins = max(0, min(freq_mask_bins, num_freq_bins))

    # randint is inclusive on both ends; this bound lets the band touch the
    # top row of the spectrum.
    start_bin = random.randint(0, num_freq_bins - freq_mask_bins)
    stft_magnitude[start_bin:start_bin + freq_mask_bins, :] = 0

    masked_stft = stft_magnitude * stft_phase
    # Pass the original length so the inverse transform does not drop the
    # trailing partial hop.
    masked_audio = librosa.istft(masked_stft, length=len(audio))

    return masked_audio

def process_audio_with_frequency_mask(input_folder, output_folder, freq_mask_width=1000):
    """Apply ``add_frequency_mask`` to every ``.wav`` file in a folder.

    Each file is loaded at 16 kHz, masked, and written to ``output_folder``
    under its original file name. ``output_folder`` is created if missing.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.wav`` files.
        output_folder: Directory that receives the masked files.
        freq_mask_width: Forwarded to ``add_frequency_mask``.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    wav_names = [name for name in os.listdir(input_folder) if name.endswith(".wav")]

    for wav_name in wav_names:
        source_path = os.path.join(input_folder, wav_name)
        print(f"Processing file: {wav_name}")

        # Resample on load so every clip is handled at the same rate.
        signal, rate = librosa.load(source_path, sr=16000)
        masked = add_frequency_mask(signal, rate, freq_mask_width)

        target_path = os.path.join(output_folder, wav_name)
        sf.write(target_path, masked, rate)
        print(f"Saved masked audio to: {target_path}")

# Batch settings for the frequency-masking run.
input_folder = "./frequency_masking"
output_folder = "./frequency_masking_ready"
freq_mask_width = 1000  # mask width handed to process_audio_with_frequency_mask

process_audio_with_frequency_mask(
    input_folder=input_folder,
    output_folder=output_folder,
    freq_mask_width=freq_mask_width,
)


Processing file: f10_script2_iphone_livingroom1.wav
Saved masked audio to: ./frequency_masking_ready\f10_script2_iphone_livingroom1.wav
Processing file: f1_script2_ipad_office1.wav
Saved masked audio to: ./frequency_masking_ready\f1_script2_ipad_office1.wav
Processing file: f1_script2_iphone_balcony1.wav
Saved masked audio to: ./frequency_masking_ready\f1_script2_iphone_balcony1.wav
Processing file: f1_script3_ipad_office1.wav
Saved masked audio to: ./frequency_masking_ready\f1_script3_ipad_office1.wav
Processing file: f1_script4_ipad_confroom2.wav
Saved masked audio to: ./frequency_masking_ready\f1_script4_ipad_confroom2.wav
Processing file: f2_script1_iphone_livingroom1.wav
Saved masked audio to: ./frequency_masking_ready\f2_script1_iphone_livingroom1.wav
Processing file: f2_script4_ipad_confroom2.wav
Saved masked audio to: ./frequency_masking_ready\f2_script4_ipad_confroom2.wav
Processing file: f3_script1_iphone_livingroom1.wav
Saved masked audio to: ./frequency_masking_ready\f3_scr

5.2 Time Masking

Time masking mimics situations where parts of the audio signal are cut or silenced. A randomly positioned segment of each recording is set to zero in the time domain.

In [2]:
import os
import librosa
import numpy as np
import soundfile as sf
import random

def add_time_mask(audio, sr, time_mask_duration=0.5):
    """Return a copy of ``audio`` with one random contiguous segment zeroed.

    Args:
        audio: 1-D array of samples; it is not modified.
        sr: Sample rate of ``audio`` in Hz.
        time_mask_duration: Length of the silenced segment in seconds. A
            duration longer than the clip silences the whole clip; a
            non-positive duration masks nothing.

    Returns:
        A new array of the same length as ``audio``.
    """
    num_samples = len(audio)
    # Clamp the mask to the clip length: without this, a mask as long as
    # (or longer than) the clip hands random.randint an empty range.
    mask_samples = max(0, min(int(time_mask_duration * sr), num_samples))

    # randint is inclusive on both ends, so this bound allows the mask to
    # reach the final sample of the clip.
    start_sample = random.randint(0, num_samples - mask_samples)

    masked_audio = np.copy(audio)
    masked_audio[start_sample:start_sample + mask_samples] = 0

    return masked_audio

def process_audio_with_time_mask(input_folder, output_folder, time_mask_duration=0.5):
    """Apply ``add_time_mask`` to every ``.wav`` file in a folder.

    Each file is loaded at 16 kHz, masked, and written to ``output_folder``
    under its original file name. ``output_folder`` is created if missing.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.wav`` files.
        output_folder: Directory that receives the masked files.
        time_mask_duration: Forwarded to ``add_time_mask`` (seconds).
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    wav_names = [name for name in os.listdir(input_folder) if name.endswith(".wav")]

    for wav_name in wav_names:
        source_path = os.path.join(input_folder, wav_name)
        print(f"Processing file: {wav_name}")

        signal, rate = librosa.load(source_path, sr=16000)
        masked = add_time_mask(signal, rate, time_mask_duration)

        target_path = os.path.join(output_folder, wav_name)
        sf.write(target_path, masked, rate)
        print(f"Saved masked audio to: {target_path}")

# Batch settings for the time-masking run.
input_folder = "./time_masking"
output_folder = "./time_masking_ready"
time_mask_duration = 1  # seconds of audio zeroed in each file

process_audio_with_time_mask(
    input_folder=input_folder,
    output_folder=output_folder,
    time_mask_duration=time_mask_duration,
)


Processing file: f10_script2_iphone_livingroom1.wav
Saved masked audio to: ./time_masking_ready\f10_script2_iphone_livingroom1.wav
Processing file: f10_script4_ipadflat_confroom1.wav
Saved masked audio to: ./time_masking_ready\f10_script4_ipadflat_confroom1.wav
Processing file: f10_script5_ipadflat_confroom1.wav
Saved masked audio to: ./time_masking_ready\f10_script5_ipadflat_confroom1.wav
Processing file: f1_script4_ipad_balcony1.wav
Saved masked audio to: ./time_masking_ready\f1_script4_ipad_balcony1.wav
Processing file: f1_script4_ipad_confroom2.wav
Saved masked audio to: ./time_masking_ready\f1_script4_ipad_confroom2.wav
Processing file: f2_script1_ipad_balcony1.wav
Saved masked audio to: ./time_masking_ready\f2_script1_ipad_balcony1.wav
Processing file: f2_script1_iphone_livingroom1.wav
Saved masked audio to: ./time_masking_ready\f2_script1_iphone_livingroom1.wav
Processing file: f2_script4_ipad_confroom2.wav
Saved masked audio to: ./time_masking_ready\f2_script4_ipad_confroom2.wa

5.3 Noise Injection. Adding noise simulates real-world environments like background chatter, machinery, or wind.

In [3]:
import os
import librosa
import numpy as np
import soundfile as sf

def add_looped_noise(audio, sr, noise_file, noise_level=0.2):
    """Mix a looped, scaled noise recording into ``audio``.

    The noise file is loaded at ``sr``, repeated until it covers the whole
    clip, trimmed to the clip length, multiplied by ``noise_level`` and
    added to ``audio``. The sum is hard-clipped to ``[-1.0, 1.0]``.

    Args:
        audio: 1-D array of samples.
        sr: Sample rate of ``audio`` in Hz; the noise is resampled to it.
        noise_file: Path of the noise recording.
        noise_level: Gain applied to the noise before mixing.

    Returns:
        The noisy signal, same length as ``audio``.

    Raises:
        ValueError: If the noise file contains no samples.
    """
    noise, _ = librosa.load(noise_file, sr=sr)
    if len(noise) == 0:
        # Looping an empty recording is impossible; fail with a clear
        # message rather than a ZeroDivisionError from the repeat count.
        raise ValueError(f"Noise file contains no samples: {noise_file}")

    repeat_count = int(np.ceil(len(audio) / len(noise)))
    looped_noise = np.tile(noise, repeat_count)[:len(audio)]

    looped_noise = looped_noise * noise_level

    noisy_audio = audio + looped_noise
    # Keep the mix inside the normalised float sample range.
    noisy_audio = np.clip(noisy_audio, -1.0, 1.0)

    return noisy_audio

def process_folder_with_multiple_noises(input_folder, output_folder, noise_folder, noise_level=0.2):
    """Mix every noise recording into every ``.wav`` file of a folder.

    For each input file and each ``.wav`` in ``noise_folder`` one output
    file named ``<input>_<noise>_noise_<pct>pct.wav`` is written to
    ``output_folder``, where ``<pct>`` is ``noise_level`` as a percentage.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.wav`` files.
        output_folder: Directory that receives the noisy files; created if
            missing.
        noise_folder: Directory scanned for ``.wav`` noise recordings.
        noise_level: Forwarded to ``add_looped_noise``.
    """
    os.makedirs(output_folder, exist_ok=True)

    noise_files = [os.path.join(noise_folder, f) for f in os.listdir(noise_folder) if f.endswith(".wav")]

    # round() rather than int(): 0.29 * 100 evaluates to 28.999..., which
    # int() would truncate to a misleading "28pct" label.
    noise_percentage = round(noise_level * 100)

    for file_name in os.listdir(input_folder):
        if not file_name.endswith(".wav"):
            continue

        input_file_path = os.path.join(input_folder, file_name)
        print(f"Processing file: {file_name}")

        audio, sr = librosa.load(input_file_path, sr=16000)
        base_name = os.path.splitext(file_name)[0]

        for noise_file in noise_files:
            noise_name = os.path.splitext(os.path.basename(noise_file))[0]

            noisy_audio = add_looped_noise(audio, sr, noise_file, noise_level)

            output_file_name = f"{base_name}_{noise_name}_noise_{noise_percentage}pct.wav"
            output_file_path = os.path.join(output_folder, output_file_name)

            sf.write(output_file_path, noisy_audio, sr)
            print(f"Saved noisy audio to: {output_file_path}")

# Batch settings for the noise-injection run.
input_folder = "./noise_injection"
output_folder = "./noise_injection_ready"
noise_folder = "./noise"
noise_level = 0.2  # gain applied to each noise recording before mixing (20%)

process_folder_with_multiple_noises(
    input_folder=input_folder,
    output_folder=output_folder,
    noise_folder=noise_folder,
    noise_level=noise_level,
)


Processing file: f10_script2_iphone_livingroom1.wav
Saved noisy audio to: ./noise_injection_ready\f10_script2_iphone_livingroom1_AirConditioner_1_noise_20pct.wav
Saved noisy audio to: ./noise_injection_ready\f10_script2_iphone_livingroom1_AirportAnnouncements_9_noise_20pct.wav
Saved noisy audio to: ./noise_injection_ready\f10_script2_iphone_livingroom1_Babble_6_noise_20pct.wav
Saved noisy audio to: ./noise_injection_ready\f10_script2_iphone_livingroom1_Car_1_noise_20pct.wav
Saved noisy audio to: ./noise_injection_ready\f10_script2_iphone_livingroom1_Metro_1_noise_20pct.wav
Saved noisy audio to: ./noise_injection_ready\f10_script2_iphone_livingroom1_ShuttingDoor_1_noise_20pct.wav
Saved noisy audio to: ./noise_injection_ready\f10_script2_iphone_livingroom1_Typing_1_noise_20pct.wav
Saved noisy audio to: ./noise_injection_ready\f10_script2_iphone_livingroom1_VacuumCleaner_1_noise_20pct.wav
Saved noisy audio to: ./noise_injection_ready\f10_script2_iphone_livingroom1_WasherDryer_4_noise_20pc

5.4 Speed Perturbation. Changing the speed of audio tests the model’s adaptability to faster or slower speech.

In [4]:
import os
import librosa
import soundfile as sf

def change_audio_speed(audio, sr, speed_factor):
    """Time-stretch ``audio`` by ``speed_factor`` using a phase vocoder.

    Args:
        audio: 1-D array of samples.
        sr: Sample rate of ``audio``; accepted for a uniform signature but
            not used by the stretch itself.
        speed_factor: Playback-rate multiplier handed to
            ``librosa.phase_vocoder`` (>1 speeds up, <1 slows down).

    Returns:
        The stretched signal as produced by ``librosa.istft``.

    Raises:
        ValueError: If ``speed_factor`` is not a positive number.
    """
    # "not > 0" also rejects NaN, which would otherwise slip past "<= 0".
    if not speed_factor > 0:
        raise ValueError(f"speed_factor must be positive, got {speed_factor!r}")

    stft_audio = librosa.stft(audio)
    stretched_stft = librosa.phase_vocoder(stft_audio, rate=speed_factor)
    return librosa.istft(stretched_stft)

def process_audio_with_speed_change(input_folder, output_folder, speed_factor):
    """Apply ``change_audio_speed`` to every ``.wav`` file in a folder.

    Files are loaded at their native sample rate and written to
    ``output_folder`` as ``<input>_<faster|slower>_<pct>pct.wav``, where
    ``<pct>`` is ``speed_factor`` as a percentage.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.wav`` files.
        output_folder: Directory that receives the processed files; created
            if missing.
        speed_factor: Forwarded to ``change_audio_speed``.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The label depends only on speed_factor, so build it once.
    speed_type = "faster" if speed_factor > 1 else "slower"
    # round() rather than int(): 1.15 * 100 evaluates to 114.999..., which
    # int() would truncate to a misleading "114pct" label.
    speed_percentage = round(speed_factor * 100)

    for file_name in os.listdir(input_folder):
        if not file_name.endswith(".wav"):
            continue

        input_file_path = os.path.join(input_folder, file_name)
        print(f"Processing file: {file_name}")

        audio, sr = librosa.load(input_file_path, sr=None)

        modified_audio = change_audio_speed(audio, sr, speed_factor)

        output_file_name = f"{os.path.splitext(file_name)[0]}_{speed_type}_{speed_percentage}pct.wav"
        output_file_path = os.path.join(output_folder, output_file_name)

        sf.write(output_file_path, modified_audio, sr)
        print(f"Saved processed audio to: {output_file_path}")

# Batch settings for the speed-perturbation run.
input_folder = "./speed"
output_folder = "./speed_ready"
speed_factor = 1.5  # above 1 speeds the audio up, below 1 slows it down

process_audio_with_speed_change(
    input_folder=input_folder,
    output_folder=output_folder,
    speed_factor=speed_factor,
)


Processing file: f10_script2_iphone_livingroom1.wav
Saved processed audio to: ./speed_ready\f10_script2_iphone_livingroom1_faster_150pct.wav
Processing file: f1_script4_ipad_confroom2.wav
Saved processed audio to: ./speed_ready\f1_script4_ipad_confroom2_faster_150pct.wav
Processing file: f2_script1_ipad_balcony1.wav
Saved processed audio to: ./speed_ready\f2_script1_ipad_balcony1_faster_150pct.wav
Processing file: f2_script1_iphone_livingroom1.wav
Saved processed audio to: ./speed_ready\f2_script1_iphone_livingroom1_faster_150pct.wav
Processing file: f2_script4_ipad_confroom2.wav
Saved processed audio to: ./speed_ready\f2_script4_ipad_confroom2_faster_150pct.wav
Processing file: f3_script1_iphone_livingroom1.wav
Saved processed audio to: ./speed_ready\f3_script1_iphone_livingroom1_faster_150pct.wav
Processing file: f4_script5_ipad_balcony1.wav
Saved processed audio to: ./speed_ready\f4_script5_ipad_balcony1_faster_150pct.wav
Processing file: f6_script1_ipad_confroom2.wav
Saved processe

5.5 Reverberation. Adding reverberation simulates audio captured in echo-prone environments like large halls or empty rooms.

In [7]:
import os
import librosa
import soundfile as sf
import numpy as np

def add_reverb(audio, sr, delay=0.03, decay=0.6, num_echoes=5):
    """Add a series of decaying, delayed copies of ``audio`` to itself.

    Echo ``i`` (1-based) is the original signal shifted by ``i * delay``
    seconds and scaled by ``decay ** i``. If the peak of the result
    exceeds 1.0 the whole signal is rescaled so that the peak is 1.0.

    Args:
        audio: 1-D sequence of samples; it is not modified. Non-float input
            is converted to float64 so the echoes can be accumulated.
        sr: Sample rate of ``audio`` in Hz.
        delay: Spacing between successive echoes, in seconds.
        decay: Per-echo gain factor.
        num_echoes: Number of echoes to add.

    Returns:
        A new array with the same length as ``audio``.

    Raises:
        ValueError: If ``delay * sr`` is negative.
    """
    audio = np.asarray(audio)
    if not np.issubdtype(audio.dtype, np.floating):
        # In-place float accumulation into an integer buffer would fail.
        audio = audio.astype(np.float64)

    audio_length = len(audio)
    delay_samples = int(delay * sr)
    if delay_samples < 0:
        raise ValueError(f"delay must be non-negative, got {delay!r}")

    reverb_audio = np.copy(audio)
    if audio_length == 0:
        # Nothing to echo, and np.max below would reject an empty array.
        return reverb_audio

    for i in range(1, num_echoes + 1):
        start_idx = i * delay_samples
        if start_idx >= audio_length:
            # Later echoes start even further out, so none of them fit.
            break
        # Slice by explicit length: audio[:-start_idx] is empty when
        # start_idx == 0, which made sub-sample delays crash.
        reverb_audio[start_idx:] += audio[:audio_length - start_idx] * (decay ** i)

    max_amplitude = np.max(np.abs(reverb_audio))
    if max_amplitude > 1.0:
        reverb_audio = reverb_audio / max_amplitude

    return reverb_audio

def process_audio_with_reverb(input_folder, output_folder, delay=0.03, decay=0.6, num_echoes=5):
    """Apply ``add_reverb`` to every ``.wav`` file in a folder.

    Files are loaded at their native sample rate and written to
    ``output_folder`` as ``<input>_reverb.wav``. ``output_folder`` is
    created if missing.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.wav`` files.
        output_folder: Directory that receives the processed files.
        delay: Forwarded to ``add_reverb``.
        decay: Forwarded to ``add_reverb``.
        num_echoes: Forwarded to ``add_reverb``.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    wav_names = [name for name in os.listdir(input_folder) if name.endswith(".wav")]

    for wav_name in wav_names:
        source_path = os.path.join(input_folder, wav_name)
        print(f"Processing file: {wav_name}")

        # sr=None keeps the file's own sample rate.
        signal, rate = librosa.load(source_path, sr=None)
        echoed = add_reverb(signal, rate, delay, decay, num_echoes)

        stem = os.path.splitext(wav_name)[0]
        target_path = os.path.join(output_folder, f"{stem}_reverb.wav")

        sf.write(target_path, echoed, rate)
        print(f"Saved processed audio to: {target_path}")

# Batch settings for the reverberation run.
input_folder = "./reverberation"
output_folder = "./reverberation_ready"
delay = 0.04  # seconds between successive echoes
decay = 0.5  # gain factor applied once more per echo
num_echoes = 6

process_audio_with_reverb(
    input_folder=input_folder,
    output_folder=output_folder,
    delay=delay,
    decay=decay,
    num_echoes=num_echoes,
)


Processing file: f10_script2_iphone_livingroom1.wav
Saved processed audio to: ./reverberation_ready\f10_script2_iphone_livingroom1_reverb.wav
Processing file: f10_script4_ipad_livingroom1.wav
Saved processed audio to: ./reverberation_ready\f10_script4_ipad_livingroom1_reverb.wav
Processing file: f1_script3_ipad_livingroom1.wav
Saved processed audio to: ./reverberation_ready\f1_script3_ipad_livingroom1_reverb.wav
Processing file: f1_script4_ipad_confroom2.wav
Saved processed audio to: ./reverberation_ready\f1_script4_ipad_confroom2_reverb.wav
Processing file: f2_script1_iphone_livingroom1.wav
Saved processed audio to: ./reverberation_ready\f2_script1_iphone_livingroom1_reverb.wav
Processing file: f2_script4_ipad_confroom2.wav
Saved processed audio to: ./reverberation_ready\f2_script4_ipad_confroom2_reverb.wav
Processing file: f3_script1_iphone_livingroom1.wav
Saved processed audio to: ./reverberation_ready\f3_script1_iphone_livingroom1_reverb.wav
Processing file: f4_script4_ipad_livingr

5.6 Silence injection. Inserting a block of silence at a random position in the speech simulates pauses and inconsistent speech.

In [8]:
import os
import librosa
import soundfile as sf
import numpy as np
import random

def add_silence(audio, sr, silence_duration=0.5):
    """Insert a block of silence at a random position in ``audio``.

    The existing samples are kept; the result is longer than the input by
    ``int(silence_duration * sr)`` samples.

    Args:
        audio: 1-D sequence of samples; it is not modified.
        sr: Sample rate of ``audio`` in Hz.
        silence_duration: Length of the inserted silence in seconds.

    Returns:
        A new array with the silence spliced in, using the dtype of
        ``audio``.
    """
    audio = np.asarray(audio)
    silence_samples = int(silence_duration * sr)
    # Match the input dtype; the float64 default of np.zeros would make
    # np.concatenate upcast float32 audio.
    silence = np.zeros(silence_samples, dtype=audio.dtype)

    # Inclusive upper bound: the silence may also be appended at the end.
    insert_position = random.randint(0, len(audio))

    audio_with_silence = np.concatenate((audio[:insert_position], silence, audio[insert_position:]))
    return audio_with_silence

def process_audio_with_silence_injection(input_folder, output_folder, silence_duration=0.5):
    """Apply ``add_silence`` to every ``.wav`` file in a folder.

    Files are loaded at their native sample rate and written to
    ``output_folder`` as ``<input>_silence_<ms>ms.wav``, where ``<ms>`` is
    ``silence_duration`` in milliseconds.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.wav`` files.
        output_folder: Directory that receives the processed files; created
            if missing.
        silence_duration: Forwarded to ``add_silence`` (seconds).
    """
    os.makedirs(output_folder, exist_ok=True)

    # round() rather than int(): 1.001 * 1000 evaluates to 1000.999...,
    # which int() would truncate to a misleading "1000ms" label.
    silence_ms = round(silence_duration * 1000)

    for file_name in os.listdir(input_folder):
        if not file_name.endswith(".wav"):
            continue

        input_file_path = os.path.join(input_folder, file_name)
        print(f"Processing file: {file_name}")

        audio, sr = librosa.load(input_file_path, sr=None)

        modified_audio = add_silence(audio, sr, silence_duration)

        output_file_name = f"{os.path.splitext(file_name)[0]}_silence_{silence_ms}ms.wav"
        output_file_path = os.path.join(output_folder, output_file_name)
        sf.write(output_file_path, modified_audio, sr)
        print(f"Saved processed audio to: {output_file_path}")

# Batch settings for the silence-injection run.
input_folder = "./silence"
output_folder = "./silence_ready"
silence_duration = 1.0  # seconds of silence inserted into each file

process_audio_with_silence_injection(
    input_folder=input_folder,
    output_folder=output_folder,
    silence_duration=silence_duration,
)


Processing file: f10_script1_iphone_balcony1.wav
Saved processed audio to: ./silence_ready\f10_script1_iphone_balcony1_silence_1000ms.wav
Processing file: f10_script2_iphone_livingroom1.wav
Saved processed audio to: ./silence_ready\f10_script2_iphone_livingroom1_silence_1000ms.wav
Processing file: f1_script4_ipad_confroom2.wav
Saved processed audio to: ./silence_ready\f1_script4_ipad_confroom2_silence_1000ms.wav
Processing file: f2_script1_iphone_livingroom1.wav
Saved processed audio to: ./silence_ready\f2_script1_iphone_livingroom1_silence_1000ms.wav
Processing file: f2_script4_ipad_confroom2.wav
Saved processed audio to: ./silence_ready\f2_script4_ipad_confroom2_silence_1000ms.wav
Processing file: f3_script1_iphone_livingroom1.wav
Saved processed audio to: ./silence_ready\f3_script1_iphone_livingroom1_silence_1000ms.wav
Processing file: f3_script3_iphone_balcony1.wav
Saved processed audio to: ./silence_ready\f3_script3_iphone_balcony1_silence_1000ms.wav
Processing file: f6_script1_ip