In [None]:
import soundfile as sf 
import librosa
import numpy as np 
import pandas as pd 
import os
from tqdm import tqdm

### Segmenting Audio Files

The core recordings are separated into three classes: Dog, Speech, and Impact.

Each recording is segmented into 200 ms frames with a 100 ms hop (i.e. consecutive frames overlap by 100 ms).

Each frame is saved as its own file, named "class_azimuth_idx.wav".

In [None]:
# Segment every raw recording into fixed-length, overlapping frames and write
# each frame to its own WAV file named "<class>_<azimuth>_<idx>.wav".
class_types = ['Dog', 'Impact' , 'Speech']

# Manual settings (loop-invariant, so defined once for all classes)
fs = 48000                 # sample rate used for both loading and writing
frame_len = int(0.2 * fs)  # 200ms
hop_len = int(0.1 * fs)    # 100ms
dB_increase = 0            # optional gain (dB) applied before framing; 0 disables it

for ct in class_types:

    # raw data directory
    full_data_dir = '../dataset/data/Dataset_concatenated_tracks/{}/'.format(ct)
    # cleaned, output data directory
    output_data_dir = '../dataset/cleaned_data_upvolume/{}'.format(ct.lower())

    # create dirs
    os.makedirs(output_data_dir, exist_ok=True)

    # loop through raw data dir (sorted so that runs are deterministic)
    for file in sorted(os.listdir(full_data_dir)):
        if not file.endswith('.wav'):
            continue
        fullfn = os.path.join(full_data_dir, file)

        # The azimuth is taken from the third underscore-separated field of
        # the file name.
        # NOTE(review): assumes every raw file follows that naming scheme, and
        # that at most one file per class shares an azimuth -- otherwise the
        # output names below collide. Confirm against the dataset.
        name_parts = file.split('_')
        azimuth = name_parts[2]

        # load audio, resampled to `fs`, keeping all channels
        audio, _ = librosa.load(fullfn, sr=fs, mono=False, dtype=np.float32)

        # librosa.util.frame raises if the signal is shorter than one frame;
        # skip such files instead of aborting the whole run.
        if audio.shape[-1] < frame_len:
            print("Skipping {}: shorter than one frame ({} < {} samples)".format(
                fullfn, audio.shape[-1], frame_len))
            continue

        # convert the dB increase to a linear scale and apply it (if enabled)
        if dB_increase:
            audio *= np.power(10.0, dB_increase / 20.0)

        # Segment the audio input into overlapping frames; trailing samples
        # that do not fill a complete frame are dropped by librosa.
        frames = librosa.util.frame(audio, frame_length=frame_len, hop_length=hop_len)

        # Transpose into (n_segments, timebins, channels)
        # (for single-channel input this is (n_segments, timebins))
        frames = frames.T
        for idx, frame in enumerate(tqdm(frames, desc=file)):
            final_fn = "{}_{}_{}.wav".format(ct.lower(), azimuth, idx+1)
            final_fp = os.path.join(output_data_dir, final_fn)
            # write with the same rate the audio was loaded at
            sf.write(final_fp, frame, samplerate=fs)

### Checking SALSA-Lite feature dimensions/parameters



In [None]:
# Sanity check: convert the frequency limits (Hz) used below into FFT bin
# indices for the chosen sample rate and FFT size, and print them.
fs = 48000               # sample rate (Hz)
n_fft = 1024             # FFT size
n_bins = n_fft // 2 + 1  # number of one-sided FFT bins


def _hz_to_bin(freq_hz):
    """Return the index of the FFT bin at or just below ``freq_hz``."""
    return int(np.floor(freq_hz * n_fft / float(fs)))


# Band limits in Hz; the upper limit is clamped to the Nyquist frequency.
fmin_doa = 50
fmax_doa = np.min((2000, fs // 2))

# With n_fft=1024 and fs=48000: 50 Hz -> bin 1, 2000 Hz -> bin 42.
lower_bin = np.max((1, _hz_to_bin(fmin_doa)))  # never go below bin 1 (skip bin 0)
upper_bin = _hz_to_bin(fmax_doa)

print(fmax_doa, fmin_doa, upper_bin, lower_bin)

fmax = 9000  # Hz
cutoff_bin = _hz_to_bin(fmax)  # 9000 Hz -> bin 192 for these settings
print(cutoff_bin)