# Speech command.
**Main goal:**<br>
Train a deep learning model that can convert speech to commands in order to play 2048.

## Imports

In [1]:
import os
import sys
from random import random, choice
from random import seed as seed_python_rng

import librosa
import numpy as np
import matplotlib.pyplot as plt

from IPython.display import Audio

import torch
import torchaudio
torchaudio.set_audio_backend("soundfile")

from torchaudio.transforms import Resample
from torch.nn import ConstantPad1d

from torch.utils.data import Dataset, DataLoader
from torch.nn import Sequential
from torch.nn import (
    Conv1d,
    ReLU,
    MaxPool1d,
    Dropout,
    Linear,
    Flatten
)
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

from pytorch_model_summary import summary

In [2]:
# make the parent directory importable so the project-local `utils` package resolves;
# this must run before the import on the next line
sys.path.append('..')
from utils.s3_class import S3Functions
# shared S3 helper used by the data-loading cells below
# NOTE(review): the argument looks like a hard-coded bucket name - confirm, and
# consider reading it from configuration instead
s3_funcs = S3Functions('jdgallegoq-pinacle')

In [3]:
# report the library versions this notebook was executed with
for label, module in (('Torch version: ', torch), ('Torchaudio version: ', torchaudio)):
    print(label, module.__version__)

Torch version:  1.12.1+cu116
Torchaudio version:  0.12.1+cu116


In [4]:
# set seeds for every RNG the notebook draws from, so that a fresh
# Restart & Run All reproduces the same shuffling and sample picks
seed = 20240515
# stdlib `random` backs the `choice` call used in the sanity checks
seed_python_rng(seed)
np.random.seed(seed)
# kept as the last expression so the cell still displays the torch Generator
torch.manual_seed(seed)

<torch._C.Generator at 0x7f3e613194f0>

In [5]:
# warm-up: fetch one clip through the S3 helper and decode it with librosa at 16 kHz
key = 'speech-command/dataset_commands-200904-131537/right/00b01445_nohash_0.wav'
audio_source = s3_funcs.read_audio(key)
waveform, sample_rate = librosa.load(audio_source, sr=16000)

In [6]:
# inspect the decoded audio: shape of the sample array and its sampling rate (Hz)
waveform.shape, sample_rate

((14336,), 16000)

In [7]:
# clip length in seconds = number of samples / samples per second
num_samples = waveform.shape[0]
duration = num_samples / sample_rate
duration

0.896

In [8]:
# plot audio using matplotlib
# NOTE(review): this cell is disabled. As written, the x-axis would have
# `sample_rate` (16000) points while `waveform` holds 14336 samples, so
# `ax.plot` would fail on the length mismatch. If the plot is re-enabled,
# build the axis with `np.linspace(0, duration, len(waveform))`.
#fig = plt.figure(figsize=(14, 8))

#ax = fig.add_subplot(211)
#ax.set_title('Raw wave of 00b01445_nohash_0.wav')
#ax.set_xlabel('time')
#ax.set_ylabel('Amplitude')
#ax.plot(np.linspace(0, duration, sample_rate), waveform)

#plt.show()

In [9]:
# render an inline audio player for the clip at its current sampling rate
Audio(waveform, rate=sample_rate)

In [10]:
# downsample the clip to 8 kHz, remembering the rate it was loaded at
prev_sampling_rate, sample_rate = sample_rate, 8000

waveform = librosa.resample(y=waveform, orig_sr=prev_sampling_rate, target_sr=sample_rate)
waveform.shape

(7168,)

In [11]:
# listen to the clip again, now at the resampled rate
Audio(waveform, rate=sample_rate)

In [12]:
# collect the keys returned for the dataset prefix
# (presumably one per audio file - verify against S3Functions.list_bucket_objects)
filepaths = s3_funcs.list_bucket_objects(prefix='speech-command/dataset_commands-200904-131537/')

In [13]:
# the parent folder in a key is the spoken command, i.e. the class name
filepaths[0].split('/')[-2]

'down'

In [14]:
# decode the same clip with torchaudio instead of librosa
audio_source = s3_funcs.read_audio(key)
waveform, sample_rate = torchaudio.load(audio_source)

In [15]:
# torchaudio yields a 2-D tensor (here 1 x 14336) plus the sampling rate it read
waveform.shape, sample_rate

(torch.Size([1, 14336]), 16000)

## Load data (using a custom Dataset and DataLoader)

In [16]:
# class name -> integer label, numbered in declaration order
classes2labels = {
    name: label
    for label, name in enumerate(('up', 'down', 'left', 'right'))
}
# integer label -> class name (inverse lookup, used to decode predictions)
labels2classes = dict(zip(classes2labels.values(), classes2labels.keys()))

In [17]:
class SpeechCommandsDataset(Dataset):
    """Speech-command clips fetched through ``s3_funcs``, served as fixed-length waveforms.

    Each item is a ``(waveform, label)`` pair:

    * ``waveform`` - the clip trimmed or zero-padded to ``sample_rate`` samples
      (one second at the rate it was loaded with), resampled to
      ``target_sample_rate`` and min-max scaled to ``[0, 1]``.
    * ``label`` - integer id looked up in ``classes2labels`` from the name of
      the clip's parent folder.

    Args:
        prefix: key prefix passed to ``s3_funcs.list_bucket_objects``.
        target_sample_rate: sampling rate (Hz) every clip is resampled to.
    """

    def __init__(
        self,
        prefix='speech-command/dataset_commands-200904-131537/',
        target_sample_rate=8000,
    ):
        self.target_sample_rate = target_sample_rate
        # get all audio filepaths
        self.filepaths = s3_funcs.list_bucket_objects(prefix=prefix)
        # one Resample transform per source rate, built lazily and reused so
        # the resampling kernel is not rebuilt for every single item
        self._resamplers = {}

    def __len__(self):
        return len(self.filepaths)

    @staticmethod
    def _fix_length(waveform, num_samples):
        """Trim or right-pad ``waveform`` with zeros to ``num_samples`` along dim 1."""
        current = waveform.shape[1]
        if current > num_samples:
            return waveform[:, :num_samples]
        if current < num_samples:
            return ConstantPad1d((0, num_samples - current), 0)(waveform)
        return waveform

    @staticmethod
    def _min_max_normalize(waveform):
        """Shift ``waveform`` to a zero minimum and scale it to a unit maximum.

        A constant clip (e.g. pure silence) has zero range. It is returned as
        all zeros rather than divided by zero, which would fill the tensor
        with NaNs and propagate them into the loss.
        """
        shifted = waveform - waveform.min()
        peak = shifted.max()
        if peak > 0:
            shifted = shifted / peak
        return shifted

    def _resample(self, waveform, sample_rate):
        """Resample ``waveform`` from ``sample_rate`` to ``self.target_sample_rate``."""
        resampler = self._resamplers.get(sample_rate)
        if resampler is None:
            resampler = Resample(orig_freq=sample_rate, new_freq=self.target_sample_rate)
            self._resamplers[sample_rate] = resampler
        return resampler(waveform)

    def __getitem__(self, idx):
        """Load, length-normalise, resample and scale the clip at ``idx``.

        Returns:
            tuple: ``(waveform, label)`` as described in the class docstring.

        Raises:
            KeyError: if the clip's parent folder is not a key of ``classes2labels``.
        """
        filepath = self.filepaths[idx]
        # load the audio file and its class (the name of the parent folder)
        waveform, sample_rate = torchaudio.load(s3_funcs.read_audio(filepath))
        class_name = filepath.split('/')[-2]

        # trim or pad to exactly `sample_rate` samples, i.e. one second
        waveform = self._fix_length(waveform, sample_rate)
        # resample audio
        waveform = self._resample(waveform, sample_rate)
        # normalize data
        waveform = self._min_max_normalize(waveform)

        # label encode the target variable
        label = classes2labels[class_name]

        return waveform, label

In [18]:
# batching options for the loader: shuffled batches of 128 clips, 4 worker processes
loader_options = dict(batch_size=128, num_workers=4, shuffle=True)

# instantiate the custom dataset and wrap it in a DataLoader
dataset = SpeechCommandsDataset()
data_loader = DataLoader(dataset, **loader_options)

In [19]:
# sanity check: pull a single batch from the loader.
# next(iter(...)) raises StopIteration right here if the loader is empty,
# instead of silently leaving batch_x / batch_y undefined for the cells below
batch_x, batch_y = next(iter(data_loader))

In [20]:
# expected: inputs of shape (batch, 1, 8000) and one label per clip
batch_x.shape, batch_y.shape

(torch.Size([128, 1, 8000]), torch.Size([128]))

In [21]:
# pick one random clip from the batch and confirm its shape and value range
idx = choice(range(batch_y.shape[0]))
waveform = batch_x[idx]
label = batch_y[idx]

print(waveform.shape)
print(waveform.min(), waveform.max())

torch.Size([1, 8000])
tensor(0.) tensor(1.)


In [22]:
# listen to the selected clip (the dataset resamples every clip to 8 kHz)
Audio(waveform, rate=8000)

In [23]:
# show the raw label tensor, then decode it back to its command name
print(label)
class_name = labels2classes[label.item()]
print(class_name)

tensor(1)
down


Sanity check passed :)

# Model Architecture

In [26]:
def _conv_stage(in_channels, out_channels=64, kernel_size=13, pool=4):
    """Return one feature-extraction stage as a flat list: Conv1d -> ReLU -> MaxPool1d."""
    return [
        Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size),
        ReLU(),
        MaxPool1d(pool),
    ]


# The stages are unpacked into one flat Sequential, so layer indices (and
# therefore state_dict keys) are 0..19 exactly as if every layer were listed.
model = Sequential(
    # four convolutional stages; only the first sees the single-channel waveform
    *_conv_stage(1),
    *_conv_stage(64),
    *_conv_stage(64),
    *_conv_stage(64),
    # flatten (64 channels x 27 steps = 1728 features) to pass to the dense head
    Flatten(),
    # first dense layer
    Linear(1728, 256),
    ReLU(),
    Dropout(0.3),
    # second dense layer
    Linear(256, 32),
    ReLU(),
    Dropout(0.3),
    # output layer: one logit per command
    Linear(32, 4)
)

In [27]:
# see model summary: per-layer output shapes for a single-clip batch, plus parameter counts
print(summary(model, batch_x[:1]))

-----------------------------------------------------------------------
      Layer (type)        Output Shape         Param #     Tr. Param #
          Conv1d-1       [1, 64, 7988]             896             896
            ReLU-2       [1, 64, 7988]               0               0
       MaxPool1d-3       [1, 64, 1997]               0               0
          Conv1d-4       [1, 64, 1985]          53,312          53,312
            ReLU-5       [1, 64, 1985]               0               0
       MaxPool1d-6        [1, 64, 496]               0               0
          Conv1d-7        [1, 64, 484]          53,312          53,312
            ReLU-8        [1, 64, 484]               0               0
       MaxPool1d-9        [1, 64, 121]               0               0
         Conv1d-10        [1, 64, 109]          53,312          53,312
           ReLU-11        [1, 64, 109]               0               0
      MaxPool1d-12         [1, 64, 27]               0               0
     

In [29]:
# smoke-test the untrained model on a single audio clip
model.eval()

first_clip = batch_x[:1]
output = model(first_clip)
print(output.shape)
print(output)

torch.Size([1, 4])
tensor([[ 0.1204, -0.1821, -0.0523, -0.1348]], grad_fn=<AddmmBackward0>)


# Model Training

In [30]:
# place the model's parameters on the GPU
gpu = torch.device('cuda')
model = model.to(gpu)

In [31]:
# loss: multi-class cross-entropy on the model's logits; optimizer: Adam
criterion = CrossEntropyLoss()
optimizer = Adam(params=model.parameters(), lr=1e-3)

In [32]:
# set model on train phase (enables dropout)
model.train()

# run on whichever device the model already lives on
device = next(model.parameters()).device

epochs = 10
for epoch in range(epochs):
    # running loss total and number of batches seen in this epoch
    epoch_loss = cnt = 0
    # loop through data
    # (batch_x / batch_y are kept as names: later cells reuse the last batch)
    for batch_x, batch_y in data_loader:
        # move data to the model's device with the dtypes the model / loss expect
        batch_x = batch_x.to(device, dtype=torch.float32)
        batch_y = batch_y.to(device, dtype=torch.long)
        # clear the gradients accumulated by the previous batch
        optimizer.zero_grad()
        # forward pass: the model already returns (batch, n_classes) logits.
        # No squeeze() here: it would drop the batch dimension when the last
        # batch holds a single clip and break the loss call.
        outputs = model(batch_x)
        # get loss
        loss = criterion(outputs, batch_y)
        # backward pass
        loss.backward()
        # update the weights from the gradients
        optimizer.step()
        # sum loss and get count
        # .item to access value
        epoch_loss += loss.item()
        cnt += 1

    # take average loss over all batches (guarded against an empty loader)
    epoch_loss /= max(cnt, 1)
    # print loss
    print("Training loss for epoch {} is {:.5f}".format(epoch+1, epoch_loss))


Training loss for epoch 1 is 0.80760
Training loss for epoch 2 is 0.00012
Training loss for epoch 3 is 0.00000
Training loss for epoch 4 is 0.02813
Training loss for epoch 5 is 0.00000
Training loss for epoch 6 is 0.00000
Training loss for epoch 7 is 0.00000
Training loss for epoch 8 is 0.00000
Training loss for epoch 9 is 0.00000
Training loss for epoch 10 is 0.00000


# Inference

In [33]:
model.eval()

# NOTE: batch_x is the last batch of the training loop, so this checks the
# pipeline end to end; it does not measure generalization
device = next(model.parameters()).device
sample = batch_x[:1].to(device)

# no gradients are needed for inference
with torch.no_grad():
    output = model(sample).cpu()
print(output.shape)

# output logits
print(output)

# softmax turns the logits into class probabilities; the predicted class is
# the most probable one
probs = torch.softmax(output, dim=1)
pred = probs.argmax(dim=1)
print(pred)

# convert label to class
labels2classes[pred.item()]

torch.Size([1, 4])
tensor([[-255.9628,  358.3331,   12.6525, -177.7383]])
tensor([1])


'down'

In [34]:
# listen to the clip that was just classified
clip = batch_x[0, 0].detach().cpu().numpy()
Audio(clip, rate=8000)

In [37]:
# print actual label
print(batch_y[:1], labels2classes[batch_y[:1].cpu().numpy()[0]])
# save model weights (state_dict only)
model_path = '../module/models/saved_model.pth'
# create the target folder first: saving fails if it does not exist yet
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

tensor([1], device='cuda:0') down
