In [4]:
import soundfile as sf 
import librosa
import numpy as np 
import pandas as pd 
import os
from tqdm import tqdm

### Segmenting Audio Files

The core audio files are separated into three classes: Dog, Impact, and Speech.

Each file is segmented into 200 ms chunks with a 100 ms hop, so consecutive chunks overlap by 100 ms.

Each chunk is saved as `{class}_{azimuth}_{idx}.wav`, where `class` is the lower-cased class name and `idx` starts at 1.

In [5]:
# Cut every raw recording of every class into fixed-length, overlapping chunks
# and write each chunk to its own WAV file named "<class>_<azimuth>_<idx>.wav".
class_types = ['Dog', 'Impact', 'Speech']

# Manual settings -- identical for every class, so they are defined once here
fs = 48000                 # sample rate (Hz) used for BOTH loading and writing
frame_len = int(0.2 * fs)  # 200ms chunk length, in samples
hop_len = int(0.1 * fs)    # 100ms hop, in samples (=> 100ms overlap)

for ct in class_types:

    # raw data directory
    full_data_dir = '../dataset/data/Dataset_concatenated_tracks/{}/'.format(ct)
    # cleaned, output data directory
    output_data_dir = '../dataset/cleaned_data/{}'.format(ct.lower())

    # create dirs
    os.makedirs(output_data_dir, exist_ok=True)

    # loop through raw data dir; os.listdir returns entries in arbitrary order,
    # so sort them to make the processing order reproducible
    for file in sorted(os.listdir(full_data_dir)):
        if not file.endswith('.wav'):
            continue
        fullfn = os.path.join(full_data_dir, file)

        # The azimuth is the third underscore-separated field of the file name.
        # Strip the extension first so that a name whose third field is also
        # its last one does not carry ".wav" into the output file name.
        name_parts = os.path.splitext(file)[0].split('_')
        if len(name_parts) < 3:
            raise ValueError(
                "Cannot extract azimuth from '{}': expected at least three "
                "'_'-separated fields in the file name".format(fullfn)
            )
        azimuth = name_parts[2]

        # load audio at the target rate, keeping all channels
        audio, _ = librosa.load(fullfn, sr=fs, mono=False, dtype=np.float32)

        # A recording shorter than one frame cannot produce any chunk; skip it
        # (with a visible message) instead of aborting the whole run.
        if audio.shape[-1] < frame_len:
            print("Skipping '{}': shorter than one {}-sample frame".format(fullfn, frame_len))
            continue

        # Segment the audio input into overlapping frames
        frames = librosa.util.frame(audio, frame_length=frame_len, hop_length=hop_len)

        # Transpose into (n_segments, timebins, channels)
        frames = frames.T
        for idx, frame in enumerate(tqdm(frames, desc=file), start=1):
            final_fn = "{}_{}_{}.wav".format(ct.lower(), azimuth, idx)
            final_fp = os.path.join(output_data_dir, final_fn)
            # write with the same rate the audio was loaded at
            sf.write(final_fp, frame, samplerate=fs)

100%|██████████| 5983/5983 [00:26<00:00, 225.74it/s]
100%|██████████| 5983/5983 [00:28<00:00, 208.23it/s]
100%|██████████| 5983/5983 [00:19<00:00, 309.56it/s]
100%|██████████| 5983/5983 [00:15<00:00, 379.09it/s]
100%|██████████| 5983/5983 [00:15<00:00, 393.54it/s]
100%|██████████| 5983/5983 [00:15<00:00, 379.35it/s]
100%|██████████| 6679/6679 [00:17<00:00, 371.72it/s]
100%|██████████| 6679/6679 [00:20<00:00, 328.50it/s]
100%|██████████| 6679/6679 [00:42<00:00, 155.33it/s]
100%|██████████| 6679/6679 [00:41<00:00, 159.25it/s]
100%|██████████| 6679/6679 [00:46<00:00, 142.78it/s]
100%|██████████| 6679/6679 [00:49<00:00, 135.60it/s]
100%|██████████| 6013/6013 [00:41<00:00, 145.74it/s]
100%|██████████| 6013/6013 [00:41<00:00, 145.93it/s]
100%|██████████| 6013/6013 [00:38<00:00, 156.73it/s]
100%|██████████| 6013/6013 [00:38<00:00, 156.93it/s]
100%|██████████| 6013/6013 [00:39<00:00, 153.12it/s]
100%|██████████| 6013/6013 [00:40<00:00, 148.85it/s]


### Checking SALSA-Lite feature dimensions/parameters



In [6]:
# Sampling / FFT configuration
fs = 48000
n_fft = 1024
n_bins = n_fft // 2 + 1  # size of the one-sided spectrum

# Frequency band (Hz) whose bin limits are checked below
fmin_doa = 50
fmax_doa = 2000
fmax_doa = np.min((fmax_doa, fs // 2))  # cap the upper limit at fs / 2


def _freq_to_bin(freq_hz):
    """Return the index of the FFT bin at or just below ``freq_hz``."""
    return int(np.floor(freq_hz * n_fft / float(fs)))


# Band limits expressed as bin indices; the lower one is clamped to >= 1
upper_bin = _freq_to_bin(fmax_doa)
lower_bin = np.max((1, _freq_to_bin(fmin_doa)))

print(fmax_doa, fmin_doa, upper_bin, lower_bin)

# Bin index corresponding to the overall upper frequency limit
fmax = 9000  # Hz
cutoff_bin = _freq_to_bin(fmax)
print(cutoff_bin)

2000 50 42 1
192
