In [None]:
#FeaturesDict:'audio', 'label'

In [1]:
import torch
import torchaudio 
import torch.nn as nn
import torch.nn.functional as F
import pandas
import tqdm
from torch.nn.utils.rnn import pad_sequence
from nnAudio.features.mel import MelSpectrogram
import matplotlib.pyplot as plt
from IPython.display import Audio

# The 35 Speech Commands words in alphabetical order; a word's position in
# this tuple is the integer class index used as the training target.
_SPEECH_COMMAND_WORDS = (
    'backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight',
    'five', 'follow', 'forward', 'four', 'go', 'happy', 'house',
    'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on',
    'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three',
    'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero',
)

# Lookup table: dataset label string -> integer class index (0..34).
str2int = {word: index for index, word in enumerate(_SPEECH_COMMAND_WORDS)}

#create dict from dataset_label to int

In [2]:
# Device on which the mel front-end, the network and every batch are placed.
device = 'cpu'

In [3]:
#process data before put it in dataloader
#source from https://github.com/KinWaiCheuk/pytorch_template/blob/610a207aab988818f35af36d991dcaaaa9fa1ffe/utils/text_processing.py#L108

def data_processing(data):
    """Collate raw dataset samples into a single padded batch.

    Every element of ``data`` is indexed positionally: item ``[0]`` is the
    waveform tensor (its leading dimension is squeezed away) and item ``[2]``
    is the label string, which is mapped to a class index through ``str2int``.

    Returns a dict with two entries:
      * ``'waveforms'`` - the waveforms padded to the longest one in the batch
        and stacked along a new leading (batch) dimension;
      * ``'labels'`` - a float tensor with one row per sample that holds the
        class index, i.e. shape ``(batch, 1)``.
    """
    # One pass over the samples: (1, audio_len) -> (audio_len,) for the audio,
    # and str -> int -> one-element tensor for the label.
    converted = [
        (sample[0].squeeze(0), torch.Tensor([str2int[sample[2]]]))
        for sample in data
    ]
    audio_list = [pair[0] for pair in converted]
    label_list = [pair[1] for pair in converted]

    output_batch = {
        # Raw waveforms rather than spectrograms; the key layout is kept so the
        # batch works with the existing training function.
        'waveforms': nn.utils.rnn.pad_sequence(audio_list, batch_first=True),
        'labels': nn.utils.rnn.pad_sequence(label_list, batch_first=True),
    }
    return output_batch

#after data processing, return a dict with 'waveforms' [2-D tensor: (batch, padded_audio_len)] and 'labels' [2-D tensor: (batch, 1)]

In [4]:
# Mel-spectrogram front-end (nnAudio layer): the training loop feeds it the
# padded waveform batch and passes its output on to the network.
mel_layer = MelSpectrogram(sr=16000, 
                           n_fft=2048,
                           win_length=None,
                           n_mels=100, 
                           hop_length=512,
                           window='hann',
                           center=True,
                           pad_mode='reflect',
                           power=2.0,
                           htk=False,
                           fmin=0.0,
                           fmax=None,
                           norm=1,
                           trainable_mel=False,
                           trainable_STFT=False,
                           verbose=True,)
# sr: sampling rate, i.e. how many samples make up one second of audio
# n_mels: number of mel bands (frequency rows) in the output spectrogram
# hop_length: how far the analysis window moves between consecutive frames
# trainable_mel / trainable_STFT are False: the filters are not meant to be learned
mel_layer.to(device)

STFT kernels created, time used = 0.1259 seconds
STFT filter created, time used = 0.0019 seconds
Mel filter created, time used = 0.0020 seconds


MelSpectrogram(
  Mel filter banks size = (100, 1025), trainable_mel=False
  (stft): STFT(n_fft=2048, Fourier Kernel size=(1025, 1, 2048), iSTFT=False, trainable=False)
)

In [5]:
# Number of clips per mini-batch; used by both the training and the test DataLoader.
batch_size = 8

In [6]:
# Both splits are read from the same local copy of Speech Commands v0.02
# (download=False, so the data must already be present under './').
_speech_commands_args = dict(
    url='speech_commands_v0.02',
    folder_in_archive='SpeechCommands',
    download=False,
)
trainset = torchaudio.datasets.SPEECHCOMMANDS('./', subset='training', **_speech_commands_args)
testset = torchaudio.datasets.SPEECHCOMMANDS('./', subset='testing', **_speech_commands_args)
# Each dataset item is a tuple:
#   (waveform [0], sample_rate [1], label [2], speaker_id, utterance_number)

In [7]:
# Training batches are drawn in random order. Without shuffling, the clips are
# served in the dataset's own listing order (presumably grouped by word, since
# the files live in one folder per word - verify against torchaudio), which
# would feed SGD long runs of a single class.
# `data_processing` is passed directly as the collate function; wrapping it in
# a lambda adds nothing and makes the loader harder to use with worker processes.
trainloader = torch.utils.data.DataLoader(trainset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=data_processing)

# The test loader keeps the dataset order so that evaluation is reproducible.
testloader = torch.utils.data.DataLoader(testset,
                              batch_size=batch_size,
                              collate_fn=data_processing)

In [8]:
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """LeNet-style CNN mapping a mel spectrogram to scores for 35 classes.

    Input:  float tensor of shape (batch, 1, n_mels, frames).
    Output: raw class scores (logits) of shape (batch, 35).
    """

    # Spatial size of the feature map that `fc1` is sized for. It is what the
    # two conv/pool stages produce for a (100, 32) spectrogram:
    # 100 -> 96 -> 48 -> 44 -> 22 and 32 -> 28 -> 14 -> 10 -> 5.
    _FEATURE_MAP_SIZE = (22, 5)

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Input width must equal channels * height * width of the flattened
        # feature map: 16 * 22 * 5.
        self.fc1 = nn.Linear(16 * 22 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 35)

    def forward(self, x):
        """Return class logits of shape (batch, 35) for a 4-D spectrogram batch."""
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Batches are padded to their own longest clip, so the number of frames
        # (and hence the feature-map width) can vary between batches. Pool to
        # the size `fc1` expects; this is an identity operation whenever the
        # map is already 22 x 5, so existing results and checkpoints are
        # unaffected.
        x = F.adaptive_max_pool2d(x, self._FEATURE_MAP_SIZE)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Instantiate the network; its weights are untrained at this point.
net = Net()

# print(f'before flatten x.shape = {x.shape}')
# print(f'x.shape = {x.shape}')

In [9]:
# Place the model's parameters on the configured device.
net = net.to(device)

In [10]:
# Random dummy batch of shape (4, 1, 229, 50) placed on `device`.
# NOTE(review): this tensor is not referenced by any later cell in view -
# presumably a leftover shape probe for Net; confirm before relying on it.
x = torch.randn(4, 1, 229,50).to(device)

In [11]:
import torch.optim as optim

# Loss function: cross-entropy, the usual choice for a classification problem
# (nn.MSELoss would be the counterpart for a regression problem).
criterion = nn.CrossEntropyLoss()
# Optimizer: SGD with momentum over all parameters of `net`.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# The two objects above are the loss function and the optimizer used by the
# training loop.


In [12]:
#for name, i in net.named_parameters():
    #print(f"{name}")
    
#list out the parameters that can be trained

In [13]:
# Train the network for two passes over the training data and report the mean
# loss of every epoch so that convergence can be checked.
net.train()  # explicit training mode, in case the model was switched to eval earlier
for epoch in range(2):
    Loss = 0        # running sum of the per-batch losses of this epoch
    n_batches = 0   # number of batches seen in this epoch

    for batch in tqdm.tqdm(trainloader):
        # waveform batch -> mel spectrograms (3-D), then add a channel axis
        # because the conv layers need a 4-D input.
        mel_output_batch = mel_layer(batch['waveforms'].to(device))
        mel_output_batch4 = mel_output_batch.unsqueeze(1)

        # batch['labels'] is (batch, 1) float; CrossEntropyLoss needs a 1-D
        # tensor of integer class indices.
        targets = batch['labels'].to(device).squeeze(1).long()

        optimizer.zero_grad()
        outputs = net(mel_output_batch4)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        Loss += loss.item()
        n_batches += 1

    # max(..., 1) keeps the report safe if the loader happens to be empty.
    print(f'epoch {epoch + 1}: mean training loss = {Loss / max(n_batches, 1):.4f}')
        
        
#inputs have to be 4-dimension tensors
#outputs is the labels, required to be 1 dimension

#Train the network in the training data

100%|████████████████████████████████████████████████████████████████| 10606/10606 [01:34<00:00, 111.94it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 10606/10606 [01:32<00:00, 114.80it/s]


In [114]:
# Save only the learned parameters (the state_dict) of the trained model to PATH.
PATH = './speechcommands_net.pt'
torch.save(net.state_dict(), PATH)

# To reuse the model later from an empty notebook:
#   1. define the network class (Net)
#   2. net = Net()
#   3. net.load_state_dict(torch.load(PATH))

In [None]:
import matplotlib.pyplot as plt
# Pull a single batch from the test loader as a sanity check.
dataiter = iter(testloader)
# Use the built-in next(): the iterator's .next() method is not available in
# current PyTorch. The loader yields the dict built by data_processing, not an
# (inputs, labels) tuple, so the entries are read by key.
batch = next(dataiter)
# Same preprocessing as in training: waveforms -> mel spectrograms -> add the
# channel axis, giving the 4-D tensor the network expects.
inputs = mel_layer(batch['waveforms'].to(device)).unsqueeze(1)
# (batch, 1) float -> (batch,) integer class indices.
labels = batch['labels'].to(device).squeeze(1).long()
print(f'{(inputs.shape)}')

#iterate testloader

In [None]:
# Rebuild the architecture on the configured device and switch it to inference
# mode before restoring the weights.
net = Net().to(device)
net.eval()
# map_location makes the checkpoint loadable regardless of the device the
# tensors were on when they were saved.
net.load_state_dict(torch.load(PATH, map_location=device))

#load back saved model


In [None]:
# Feed the sample test batch through the reloaded model.
# NOTE(review): `inputs` has to be the same kind of 4-D spectrogram batch the
# network is given in the training loop - confirm the cell that builds
# `inputs` produces that.
outputs = net(inputs)

In [None]:
# Accuracy of the network on the test set.
correct = 0  # number of clips whose predicted class equals the label
total = 0    # number of clips evaluated

net.to(device)
net.eval()  # inference mode
with torch.no_grad():  # no gradients are needed for evaluation
    # The loader yields the dict built by data_processing, so the entries are
    # read by key instead of being unpacked as an (inputs, labels) tuple.
    for batch in tqdm.tqdm(testloader):
        # Same preprocessing as in training: waveforms -> mel spectrograms ->
        # add the channel axis to obtain a 4-D input.
        inputs = mel_layer(batch['waveforms'].to(device)).unsqueeze(1)
        # (batch, 1) float -> (batch,) integer class indices.
        labels = batch['labels'].to(device).squeeze(1).long()

        outputs = net(inputs)
        # torch.max over dim 1 returns (max values, their indices); the indices
        # are the predicted classes.
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Avoid a ZeroDivisionError when the test loader is empty.
accuracy = 100 * correct / total if total else 0.0
print('Accuracy: %d %%' % accuracy)

#calculate the accuracy of the network on the testset