In [79]:
import numpy as np 
import os 
import librosa
import soundfile as sf
import matplotlib.pyplot as plt
from tqdm import tqdm

### Introducing local scaling/normalization functions. 

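`local_scaling` min-max scales a signal so that its minimum maps to -1 and its maximum maps to 1; it is the better choice if we wish to preserve the dynamic range of the audio data. `local_norming` subtracts the mean and then divides by the peak absolute value; it is the better choice if we want to preserve the distances between the data points while keeping the signal centred around 0.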


In [80]:
def local_scaling(x):
    """Min-max scale an array so that its values lie between -1 and 1.

    The minimum of ``x`` is mapped to -1 and the maximum to 1. The input is
    not modified.

    Args:
        x (array-like): values to scale.

    Returns:
        np.ndarray: scaled copy of ``x``. If ``x`` is empty or constant
        (max == min) there is no range to stretch, so an array of zeros
        (the midpoint of [-1, 1]) is returned instead of NaNs.
    """
    x = np.asarray(x)
    # True division promotes integer input to float64; floats keep their dtype.
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64

    if x.size == 0:
        return np.zeros(x.shape, dtype=out_dtype)

    x_min = np.min(x)
    x_max = np.max(x)
    span = x_max - x_min

    # Guard against 0 / 0 on a flat signal, which would otherwise yield NaNs.
    if span == 0:
        return np.zeros(x.shape, dtype=out_dtype)

    x_normed = (x - x_min) / span  # now in [0, 1]
    x_scaled = 2 * x_normed - 1    # stretch to [-1, 1]

    return x_scaled

def local_norming(x):
    """Centre an array on its mean and scale it by its peak absolute value.

    The result has zero mean and a maximum absolute value of 1. The work is
    done on a copy, so the caller's array is left untouched (re-running a
    notebook cell therefore gives the same result every time).

    Args:
        x (array-like): values to normalise.

    Returns:
        np.ndarray: normalised copy of ``x``. Floating-point inputs keep their
        dtype; other inputs are converted to float64. If ``x`` is empty or
        constant (peak of the centred signal is 0), the centred all-zero
        array is returned instead of NaNs.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # In-place float arithmetic on integer arrays is not possible.
        x = x.astype(np.float64)

    if x.size == 0:
        return x.copy()

    # Out-of-place subtraction creates a new array; the input is not mutated.
    centered = x - np.mean(x)
    peak = np.max(np.abs(centered))

    # A flat (e.g. completely silent) signal has nothing to scale.
    if peak == 0:
        return centered

    return centered / peak

### Functions to remove silence

`remove_silence` is the only working function so far. An alternative, still work in progress (WIP/KIV), is to label weak signal points (low absolute value) as noise instead of removing them.

In [81]:
def remove_silence(signal, segment_duration=0.5, threshold=0.001, fs=48000):
    """Brute-force removal of silent segments from an audio signal.

    The signal is cut into consecutive, non-overlapping segments of
    ``segment_duration`` seconds. A segment is kept if the absolute value of
    at least one sample, in at least one channel, exceeds ``threshold``;
    otherwise it is dropped. The kept segments are joined back together in
    their original order. Samples after the last complete segment are
    discarded.

    Args:
        signal (np.ndarray): audio signal of shape (channels, samples). A 1-D
            array is treated as a single channel and a 1-D array is returned.
        segment_duration (float, optional): Duration of a segment in seconds.
            Essentially the duration of the periods of empty noise which we
            are trying to remove. Defaults to 0.5.
        threshold (float, optional): Absolute amplitude at or below which a
            sample is deemed to be noise/ambient sound. Defaults to 0.001.
        fs (int, optional): Sample rate in Hz used to convert
            ``segment_duration`` into a number of samples. Defaults to 48000.

    Returns:
        new_signal (np.ndarray): float64 audio signal with the silent
        segments removed, shape (channels, kept_samples). If no segment
        exceeds the threshold (or the signal is shorter than one segment)
        the result has zero samples.

    Raises:
        ValueError: if ``signal`` is not 1-D or 2-D, or if
            ``segment_duration * fs`` is less than one sample.
    """
    signal = np.asarray(signal, dtype=np.float64)

    is_mono_1d = signal.ndim == 1
    if is_mono_1d:
        signal = signal[np.newaxis, :]
    if signal.ndim != 2:
        raise ValueError(
            "signal must have shape (channels, samples) or (samples,), "
            "got {} dimensions".format(signal.ndim)
        )

    segment_length = int(segment_duration * fs)
    if segment_length <= 0:
        raise ValueError(
            "segment_duration * fs must be at least one sample, "
            "got {}".format(segment_length)
        )

    num_channels, num_samples = signal.shape

    # Number of complete segments; the trailing partial segment is dropped
    num_segments = num_samples // segment_length

    # View the usable part of the signal as (channels, segments, samples)
    usable = signal[:, :num_segments * segment_length]
    segments = usable.reshape(num_channels, num_segments, segment_length)

    # Absolute peak of each segment for each channel -> (channels, segments)
    max_values = np.max(np.abs(segments), axis=2)

    # Keep a segment if the peak is above the threshold for any channel
    keep_segments = np.any(max_values > threshold, axis=0)
    num_kept = int(np.count_nonzero(keep_segments))

    # Boolean indexing copies the kept segments; flattening the last two axes
    # joins them back to back. The explicit size keeps this valid when
    # nothing is kept (result is then (channels, 0)).
    new_signal = segments[:, keep_segments, :].reshape(
        num_channels, num_kept * segment_length
    )

    if is_mono_1d:
        return new_signal[0]
    return new_signal

### Removing silence
Now we actually cycle through the original dataset directories, for each class, and remove the silence in each concatenated track

In [86]:
# This should be the originally re-recorded data dir
original_audio_dir = "./data/Dataset_concatenated_tracks/"

# Final audio output dir
final_dir = './data/remove_silence/'

# Active classes mapped to (silence duration, threshold).
# The silence duration (how long we deem a silent period to be, in seconds) is
# technically a hyperparameter we need to find out. The threshold, applied to
# the normalized signal, is also a hyperparameter.
audio_classes = {'Dog' : (0.1, 0.01), 'Impact' : (0.1, 0.01), 'Speech' : (0.1, 0.03)}

fs = 48000 # Sample rate, hardcoded
# Unpack the tuple directly instead of binding it to a name that shadows the
# builtin `vars` for the rest of the kernel session.
for cls, (sil_dur, thresh) in audio_classes.items():
    class_audio_dir = os.path.join(original_audio_dir, cls)
    tqdm.write("Now processing : {}".format(class_audio_dir))
    for concat_track in tqdm(os.listdir(class_audio_dir)):
        if not concat_track.endswith('.wav'):
            continue

        concat_track_fp = os.path.join(class_audio_dir, concat_track)
        audio_data, _ = librosa.load(concat_track_fp, sr=fs, mono=False, dtype=np.float32)

        # A mono file is loaded as a 1-D array; promote it to (1, samples) so
        # that the loop below iterates over channels rather than over samples.
        audio_data = np.atleast_2d(audio_data)

        # Here I choose to normalize each channel of the audio track, so that the mean will be
        # nearer to 0 (but not directly 0 if the audio is not balanced around 0)
        normed_channels = np.array([local_norming(channel) for channel in audio_data])

        # Remove the silence from the audio
        result_remove_silence = remove_silence(normed_channels,
                                               segment_duration=sil_dur,
                                               threshold=thresh)

        # Write them to output file (soundfile expects (samples, channels), hence the transpose)
        os.makedirs(os.path.join(final_dir, cls), exist_ok=True)
        no_silence_file_dir = os.path.join(final_dir, cls, concat_track)
        sf.write(no_silence_file_dir, result_remove_silence.T, samplerate=fs)

Now processing : ./data/Dataset_concatenated_tracks/Dog


100%|██████████| 7/7 [00:10<00:00,  1.45s/it]


Now processing : ./data/Dataset_concatenated_tracks/Impact


100%|██████████| 7/7 [00:10<00:00,  1.45s/it]


Now processing : ./data/Dataset_concatenated_tracks/Speech


100%|██████████| 7/7 [00:08<00:00,  1.20s/it]
