In [1]:
from pathlib import Path
root_dir = Path('.').resolve().parent
import sys
sys.path.append(root_dir)

%load_ext autoreload
%autoreload 2

In [25]:
import torch
from scipy.io import wavfile
from scipy import signal
import numpy as np
import glob
import random

In [14]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))# nperseg = 320
    noverlap = int(round(step_size * sample_rate / 1e3))# noverlap = 160
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)
def convert(filename: str)->torch.FloatTensor:
    """
    Convert wav file to feature
    """
    sample_rate, samples = wavfile.read(filename)
    print(samples.shape)
    freqs, times, spectrogram = log_specgram(samples, sample_rate)
    print(spectrogram.shape)
    return torch.tensor(spectrogram)
convert(root_dir/ 'datasets'/ 'speech_commands_v1'/ 'bed'/ '1aed7c6d_nohash_0.wav')

(16000,)
(99, 161)


tensor([[ -3.7267,  -3.4231,  -0.3736,  ..., -11.8554, -10.8581, -11.5257],
        [ -5.6856,  -4.1784,  -2.0797,  ..., -11.4320, -11.4188, -12.1216],
        [ -6.1031,  -5.6279,   0.2300,  ..., -11.5894, -11.5558, -13.0355],
        ...,
        [ -9.1891,  -5.4730,  -2.6923,  ..., -11.7991, -13.1026, -12.1093],
        [ -9.6067,  -4.5401,  -2.6894,  ..., -12.9995, -11.7861, -13.6048],
        [ -9.0516,  -4.6354,  -1.5045,  ..., -10.9258, -17.5748, -16.7621]])

In [8]:
data_root = root_dir / 'datasets' / 'speech_commands_v1'
def get_list(filename):
    with open(filename) as f:
        file_list = f.readlines()
    return [s.strip() for s in file_list]
test_list = get_list(data_root / 'testing_list.txt')
dev_list = get_list(data_root / 'validation_list.txt')

In [22]:
def get_train_list():
    train_list = []
    for s in data_root.glob('*/*.wav'):
        s = str(s)
        name = s.split('/')[-2:]
        if ('/'.join(name) not in (dev_list + test_list)) and name[0]!='_background_noise_':
            train_list.append('/'.join(name))
    print(len(train_list))
    print(train_list[:5])
    return train_list
train_list = get_train_list()

51088
['eight/1b88bf70_nohash_0.wav', 'eight/b12bef84_nohash_1.wav', 'eight/05b2db80_nohash_1.wav', 'eight/3852fca2_nohash_0.wav', 'eight/5ac04a92_nohash_0.wav']


In [23]:
with open(data_root / 'train_list.txt', 'w') as f:
    f.write('\n'.join(train_list))

In [24]:
print(len(train_list))
print(len(dev_list))
print(len(test_list))

51088
6798
6835


In [27]:
test_fixture_dir = root_dir / 'test' / 'fixtures'
def gen_sample(file_list, name, num=5):
    to_write = random.sample(file_list, len(file_list))[:num]
    with open(test_fixture_dir / name, 'w') as f:
        f.write('\n'.join(to_write))
gen_sample(train_list, 'train.txt')
gen_sample(dev_list, 'dev.txt')
gen_sample(test_list, 'test.txt')