In [3]:
## Skeleton for future Training Dataset pipeline generator

# TODO: Create split_spectrogram_frames function to split the spectrogram frames into fixed length sequences
import os
import soundfile as sf
from scipy.signal import stft
import numpy as np
from keras.preprocessing.sequence import TimeseriesGenerator
# --- Pipeline configuration -------------------------------------------------
# Which representation to build: "stft" or "signal".
TARGET_FEATURE = "signal"
# Width of the spectrogram images when using "stft" as feature.
FEATURE_LENGTHS = 128
# Target sample rate (presumably in Hz -- TODO confirm; not referenced by the
# code visible in this cell).
TARGET_SAMPLE_RATE = 16000
# Flag for sequence output (not referenced by the code visible in this cell).
OUTPUT_SEQUENCES = True

def split_spectrogram_frames(spectrogram_frames: "np.ndarray | list", sequence_length=128):
    """Split a 2D spectrogram into consecutive, non-overlapping fixed-length sequences.

    Every row (e.g. one frequency bin) is cut along the second axis into
    chunks of ``sequence_length`` columns. Trailing columns that do not fill
    a complete chunk are dropped.

    Args:
        spectrogram_frames (np.ndarray or list): 2D data of shape
            (num_rows, num_frames). Lists are converted to an array first.
        sequence_length (int): Number of columns in each sequence; must be >= 1.

    Returns:
        np.ndarray: Array of shape (num_rows, num_sequences, sequence_length)
        with ``num_sequences = num_frames // sequence_length``. Real-valued
        input is returned as float64; complex input stays complex so the
        phase of an STFT is not discarded.

    Raises:
        ValueError: If the input is not 2D or ``sequence_length`` is < 1.
    """
    spectrogram_frames = np.asarray(spectrogram_frames)
    # Validate for every input kind, not only for lists.
    if spectrogram_frames.ndim != 2:
        raise ValueError("spectrogram_frames must be a 2D array")
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    num_rows, num_frames = spectrogram_frames.shape
    num_subsequences = num_frames // sequence_length
    # Only the leading columns that fill whole sequences are kept.
    usable_frames = num_subsequences * sequence_length

    # At least float64 (the dtype of the former np.zeros buffer), promoted to
    # complex when the input is complex.
    out_dtype = np.result_type(spectrogram_frames.dtype, np.float64)

    # A single reshape replaces the nested copy loops; astype() returns a new
    # array, so the result never aliases the caller's data.
    return (
        spectrogram_frames[:, :usable_frames]
        .reshape(num_rows, num_subsequences, sequence_length)
        .astype(out_dtype)
    )



# # List all the files in the clean and distorted speech directories
# clean_speech_files = os.listdir(clean_speech_dir)
# distorted_speech_files = os.listdir(distorted_speech_dir)

# assert len(clean_speech_files) == len(distorted_speech_files), 'The number of clean and distorted speech files must be the same.'

# Module-level placeholder for the generated dataset; the next cell rebinds
# this name to the value returned by create_dataset().
dataset = list()
def create_dataset(clean_speech_dir, distorted_speech_dir, seed=42, file_limit=None):
    """Build paired (distorted, clean) examples from two directories of audio files.

    File names are drawn at random, without repetition, from
    ``distorted_speech_dir``; each drawn name must also exist in
    ``clean_speech_dir``. Both versions of every selected file are read and
    turned into one ``(inputs, targets)`` pair.

    Args:
        clean_speech_dir (str): Directory holding the clean recordings.
        distorted_speech_dir (str): Directory holding the distorted recordings,
            using the same file names as ``clean_speech_dir``.
        seed (int): Seed passed to ``np.random.seed`` before drawing files.
        file_limit (int or None): Number of files to use. ``None`` uses every
            file; a value larger than the number of available files is
            clamped to that number.

    Returns:
        list: One ``(inputs, targets)`` tuple per selected file. When
        ``TARGET_FEATURE == "stft"`` both items are the fixed-length STFT
        sequences produced by ``split_spectrogram_frames``; otherwise they are
        the audio arrays exactly as read from disk.
        NOTE(review): returning raw waveform pairs for non-"stft" features is
        a choice made here -- the earlier skeleton produced nothing; confirm.

    Raises:
        FileNotFoundError: If a drawn distorted file has no clean counterpart.
        AssertionError: If the split input and target sequences differ in shape.
    """
    # List each directory once. Sorting makes the draw reproducible, because
    # os.listdir() returns entries in arbitrary order.
    distorted_files = sorted(os.listdir(distorted_speech_dir))
    clean_files = set(os.listdir(clean_speech_dir))

    num_available = len(distorted_files)
    # None means "use everything"; clamping prevents the rejection loop below
    # from spinning forever when more files are requested than exist.
    num_wanted = num_available if file_limit is None else min(file_limit, num_available)

    np.random.seed(seed)
    selected_files = {}
    while len(selected_files) < num_wanted:
        # collect `num_wanted` random unique files from distorted_speech_dir
        rand_index = np.random.randint(0, num_available)
        if rand_index in selected_files:
            continue  # already picked, draw again
        random_file = distorted_files[rand_index]
        if random_file not in clean_files:
            raise FileNotFoundError(
                f"'{random_file}' exists in '{distorted_speech_dir}' "
                f"but not in '{clean_speech_dir}'"
            )
        selected_files[rand_index] = random_file

    examples = []
    for file_name in selected_files.values():
        # Read and load the clean and distorted audio files (same name in both dirs)
        clean_speech, _ = sf.read(os.path.join(clean_speech_dir, file_name))
        distorted_speech, _ = sf.read(os.path.join(distorted_speech_dir, file_name))

        if TARGET_FEATURE == "stft":
            # scipy.signal.stft returns (frequencies, times, Zxx); only Zxx
            # holds the spectrogram frames.
            _, _, clean_speech_frames = stft(clean_speech)
            _, _, distorted_speech_frames = stft(distorted_speech)
            # Split the spectrogram frames into sequences of fixed length
            inputs_sequences = split_spectrogram_frames(distorted_speech_frames, sequence_length=FEATURE_LENGTHS)
            targets_sequences = split_spectrogram_frames(clean_speech_frames, sequence_length=FEATURE_LENGTHS)
            assert inputs_sequences.shape == targets_sequences.shape, 'Split training and target stft must have the same shape.'
            examples.append((inputs_sequences, targets_sequences))
        else:
            examples.append((distorted_speech, clean_speech))

    # TODO: create a data generator (e.g. keras TimeseriesGenerator) from the pairs
    return examples


In [4]:
# Root folder expected to contain the 'noisy' and 'clean' sub-directories.
# Raw string: '\d' is not a valid escape sequence, so a plain literal triggers
# a SyntaxWarning on Python 3.12+ (DeprecationWarning before); the value is unchanged.
base_dir = r'E:\dataset'
pwd = os.getcwd()
seed = 4242 # random seed
limit = 100 # number of files to be used for training
noised_data_dir = os.path.join(base_dir, 'noisy')
clean_data_dir = os.path.join(base_dir, 'clean')
# Generate random files to be used for training
dataset = create_dataset(
    clean_speech_dir=clean_data_dir,
    distorted_speech_dir=noised_data_dir,
    seed=seed,
    file_limit=limit,
)