# Speech command.
**Main goal:**</br>
Train a deep learning model that can convert speech to commands in order to play 2048.

## Imports

In [19]:
import os
import sys

import librosa
import numpy as np
import matplotlib.pyplot as plt

from random import random, choice
from IPython.display import Audio

import torch
import torchaudio
torchaudio.set_audio_backend("soundfile")

from torchaudio.transforms import Resample
from torch.nn import ConstantPad1d

from torch.utils.data import Dataset, DataLoader
from torch.nn import Sequential
from torch.nn import (
    Conv1d,
    ReLU,
    MaxPool1d,
    Dropout,
    Linear,
    Flatten
)
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

from pytorch_model_summary import summary

In [20]:
sys.path.append('..')
from utils.s3_class import S3Functions
s3_funcs = S3Functions('jdgallegoq-pinacle')

In [21]:
# print torch versions
print('Torch version: ', torch.__version__)
print('Torchaudio version: ', torchaudio.__version__)

Torch version:  2.0.1
Torchaudio version:  2.0.2


In [22]:
# set seeds
seed = 20240515
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x12265b6b0>

In [23]:
# warm up loading some audio files
key='speech-command/dataset_commands-200904-131537/right/00b01445_nohash_0.wav'
waveform, sample_rate = librosa.load(
    s3_funcs.read_audio(key),
    sr=16000
)

In [24]:
# print waveform and sample rate
waveform.shape, sample_rate

((14336,), 16000)

In [25]:
# duration of audio file
duration = len(waveform)/sample_rate
duration

0.896

In [26]:
# plot audio using matplotlib
#fig = plt.figure(figsize=(14, 8))

#ax = fig.add_subplot(211)
#ax.set_title('Raw wave of 00b01445_nohash_0.wav')
#ax.set_xlabel('time')
#ax.set_ylabel('Amplitude')
#ax.plot(np.linspace(0, duration, sample_rate), waveform)

#plt.show()

In [27]:
# play audio
Audio(waveform, rate=sample_rate)

In [28]:
# resampling audio
prev_sampling_rate = sample_rate
sample_rate = 8000

waveform = librosa.resample(
    y=waveform,
    orig_sr=prev_sampling_rate,
    target_sr=sample_rate
)
waveform.shape

(7168,)

In [29]:
# play audio
Audio(waveform, rate=sample_rate)

In [30]:
# define all files to read
filepaths = s3_funcs.list_bucket_objects(prefix='speech-command/dataset_commands-200904-131537/')

In [31]:
filepaths[0].split('/')[-2]

'down'

In [32]:
# using torchaudio
waveform, sample_rate = torchaudio.load(
    s3_funcs.read_audio(key)
)

In [33]:
waveform.shape, sample_rate

(torch.Size([1, 14336]), 16000)

## Load data (usin custom DataLoader)

In [34]:
# dict to convert categories to numerical labels
classes2labels = {
    'up': 0,
    'down': 1,
    'left': 2,
    'right': 3
}
# dict to convert numerical labels to classes
labels2classes = {v:k for k, v in classes2labels.items()}

In [35]:
class SpeechCommandsDataset(Dataset):
    def __init__(self,):
        # get all audio filepaths
        self.filepaths = s3_funcs.list_bucket_objects(prefix='speech-command/dataset_commands-200904-131537/')
    
    def __len__(self,):
        return len(self.filepaths)
    
    def __getitem__(self, idx):
        # load audiofile and the respect class
        waveform, sample_rate = torchaudio.load(
            s3_funcs.read_audio(self.filepaths[idx])
        )
        class_name = self.filepaths[idx].split('/')[-2]

        # trim or pad the audiofile if required
        if waveform.shape[1] > sample_rate:
            waveform = waveform[:, :sample_rate]
        elif waveform.shape[1] < sample_rate:
            pad_len = sample_rate - waveform.shape[1]
            waveform = ConstantPad1d((0, pad_len), 0)(waveform)
            
        # resample saudio
        waveform = Resample(orig_freq=sample_rate, new_freq=8000)(waveform)

        # normalize data
        waveform = waveform - waveform.min()
        waveform = waveform / waveform.max()

        # label encode the target variable
        label = classes2labels[class_name]

        return waveform, label

In [36]:
# custom dataset instance
dataset = SpeechCommandsDataset()
# create dataloader
data_loader = DataLoader(
    dataset,
    batch_size=128,
    num_workers=4,
    shuffle=True
)

In [37]:
# sanity checks
for batch_x, batch_y in data_loader:
    break

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "<string>", line 1, in <module>
  File "<string>", line 1, in <module>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/homebrew/Caskroom/miniforge/base/envs/transformers/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
  File "/opt/homebrew/Caskroom/miniforge/base/envs/transformers/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
  File "/opt/homebrew/Caskroom/miniforge/base/envs/transformers/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
        exitcode = _main(fd, parent_sentinel)exitcode = _main(fd, parent_sentinel)

  File "/opt/homebrew/Caskroom/miniforge/base/envs/transformers/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
  File "/opt/homebrew/Caskroom/miniforge/base/envs/transformers/lib/python3.8/multiprocessing/spawn.py", line

RuntimeError: DataLoader worker (pid(s) 44801) exited unexpectedly