In [25]:
import torch
import torchaudio 
import torch.nn as nn
import torch.nn.functional as F
import pandas
import tqdm
from torch.nn.utils.rnn import pad_sequence
from nnAudio.features.mel import MelSpectrogram
import matplotlib.pyplot as plt
from IPython.display import Audio
from torch import Tensor
from pytorch_lightning.core.lightning import LightningModule

str2int = \
{'backward': 0,
 'bed': 1,
 'bird': 2,
 'cat': 3,
 'dog': 4,
 'down': 5,
 'eight': 6,
 'five': 7,
 'follow': 8,
 'forward': 9,
 'four': 10,
 'go': 11,
 'happy': 12,
 'house': 13,
 'learn': 14,
 'left': 15,
 'marvin': 16,
 'nine': 17,
 'no': 18,
 'off': 19,
 'on': 20,
 'one': 21,
 'right': 22,
 'seven': 23,
 'sheila': 24,
 'six': 25,
 'stop': 26,
 'three': 27,
 'tree': 28,
 'two': 29,
 'up': 30,
 'visual': 31,
 'wow': 32,
 'yes': 33,
 'zero': 34
}

In [26]:
#process data before put it in dataloader
#source from https://github.com/KinWaiCheuk/pytorch_template/blob/610a207aab988818f35af36d991dcaaaa9fa1ffe/utils/text_processing.py#L108

def data_processing(data):
    waveforms = []
    labels = []
    
    for batch in data:
        # batch[0].shape = (1, audio_len) tensor
        waveforms.append(batch[0].squeeze(0)) #after squeeze => (audio_len) tensor # remove batch dim
        # batch[2] = string
        # str2int = dict
        # str2int[batch[2]] = int
        #torch.Tensor([str2int[batch[2]]]) = tensor
        label = torch.Tensor([str2int[batch[2]]]) # batch[2] is the label key #str --> int --> tensor
        ## print(f"{label=}")
        labels.append(label)
        
    
        
    waveform_padded = nn.utils.rnn.pad_sequence(waveforms, batch_first=True)  
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    
    output_batch = {'waveforms': waveform_padded, # it is waveforms instead of spectrograms, this tiny hack can make the code work with existing training function
             'labels': labels,
             }
    return output_batch

#after data processing, return a dict with waveforms [torch.Size: 3 dimensions] and labels

In [27]:
device = 'cuda:0'
mel_layer = MelSpectrogram(sr=16000, 
                           n_fft=480,
                           win_length=None,
                           n_mels=40, 
                           hop_length=160,
                           window='hann',
                           center=True,
                           pad_mode='reflect',
                           power=2.0,
                           htk=False,
                           fmin=0.0,
                           fmax=None,
                           norm=1,
                           trainable_mel=False,
                           trainable_STFT=False,
                           verbose=True,)
#sr: sampling rate, 1s take how many datapoint
#n_fft: window size for the STFT
#n_mels: the more , the better spectrogram
#hop_length: distance of window move
#refer to number of sample
mel_layer.to(device)

STFT kernels created, time used = 0.0207 seconds
STFT filter created, time used = 0.0019 seconds
Mel filter created, time used = 0.0020 seconds


MelSpectrogram(
  Mel filter banks size = (40, 241), trainable_mel=False
  (stft): STFT(n_fft=480, Fourier Kernel size=(241, 1, 480), iSTFT=False, trainable=False)
)

In [31]:
batch_size = 100
trainset = torchaudio.datasets.SPEECHCOMMANDS('/workspace/projectA',url='speech_commands_v0.02',folder_in_archive='SpeechCommands',download = False, subset = 'training')
validset = torchaudio.datasets.SPEECHCOMMANDS('/workspace/projectA',url='speech_commands_v0.02',folder_in_archive='SpeechCommands',download = False, subset = 'validation')
testset = torchaudio.datasets.SPEECHCOMMANDS('/workspace/projectA',url='speech_commands_v0.02',folder_in_archive='SpeechCommands',download = False, subset = 'testing')

In [32]:
trainloader = torch.utils.data.DataLoader(trainset,                                
                              collate_fn=lambda x: data_processing(x),
                                         batch_size=batch_size,shuffle=True)
validloader = torch.utils.data.DataLoader(validset,                               
                              collate_fn=lambda x: data_processing(x),
                                         batch_size=batch_size)

testloader = torch.utils.data.DataLoader(testset,   
                              collate_fn=lambda x: data_processing(x),
                                        batch_size=batch_size)

In [33]:
class SubSpectralNorm(nn.Module):
    def __init__(self, C, S, eps=1e-5):
        super(SubSpectralNorm, self).__init__()
        self.S = S
        self.eps = eps
        self.bn = nn.BatchNorm2d(C*S)

    def forward(self, x):
        # x: input features with shape {N, C, F, T}
        # S: number of sub-bands
        N, C, F, T = x.size()
        x = x.view(N, C * self.S, F // self.S, T)

        x = self.bn(x)

        return x.view(N, C, F, T)

In [34]:
class BroadcastedBlock(nn.Module):
    def __init__(
            self,
            planes: int,
            dilation=1,
            stride=1,
            temp_pad=(0, 1),
    ) -> None:
        super(BroadcastedBlock, self).__init__()

        self.freq_dw_conv = nn.Conv2d(planes, planes, kernel_size=(3, 1), padding=(1, 0), groups=planes,
                                      dilation=dilation,
                                      stride=stride, bias=False)
        self.ssn1 = SubSpectralNorm(planes, 5)
        self.temp_dw_conv = nn.Conv2d(planes, planes, kernel_size=(1, 3), padding=temp_pad, groups=planes,
                                      dilation=dilation, stride=stride, bias=False)
        self.bn = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.channel_drop = nn.Dropout2d(p=0.1)
        self.swish = nn.SiLU()
        self.conv1x1 = nn.Conv2d(planes, planes, kernel_size=(1, 1), bias=False)

    def forward(self, x: Tensor) -> Tensor:
        identity = x

        # f2
        ##########################
        out = self.freq_dw_conv(x)
        out = self.ssn1(out)
        ##########################

        auxilary = out
        out = out.mean(2, keepdim=True)  # frequency average pooling

        # f1
        ############################
        out = self.temp_dw_conv(out)
        out = self.bn(out)
        out = self.swish(out)
        out = self.conv1x1(out)
        out = self.channel_drop(out)
        ############################

        out = out + identity + auxilary
        out = self.relu(out)

        return out

In [35]:
class TransitionBlock(nn.Module):

    def __init__(
            self,
            inplanes: int,
            planes: int,
            dilation=1,
            stride=1,
            temp_pad=(0, 1),
    ) -> None:
        super(TransitionBlock, self).__init__()

        self.freq_dw_conv = nn.Conv2d(planes, planes, kernel_size=(3, 1), padding=(1, 0), groups=planes,
                                      stride=stride,
                                      dilation=dilation, bias=False)
        self.ssn = SubSpectralNorm(planes, 5)
        self.temp_dw_conv = nn.Conv2d(planes, planes, kernel_size=(1, 3), padding=temp_pad, groups=planes,
                                      dilation=dilation, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.channel_drop = nn.Dropout2d(p=0.5)
        self.swish = nn.SiLU()
        self.conv1x1_1 = nn.Conv2d(inplanes, planes, kernel_size=(1, 1), bias=False)
        self.conv1x1_2 = nn.Conv2d(planes, planes, kernel_size=(1, 1), bias=False)

    def forward(self, x: Tensor) -> Tensor:
        # f2
        #############################
        out = self.conv1x1_1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.freq_dw_conv(out)
        out = self.ssn(out)
        #############################
        auxilary = out
        out = out.mean(2, keepdim=True)  # frequency average pooling

        # f1
        #############################
        out = self.temp_dw_conv(out)
        out = self.bn2(out)
        out = self.swish(out)
        out = self.conv1x1_2(out)
        out = self.channel_drop(out)
        #############################

        out = auxilary + out
        out = self.relu(out)

        return out


In [36]:
class BCResNet(torch.nn.Module):
    def __init__(self):
        super(BCResNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, 5, stride=(2, 1), padding=(2, 2))
        self.block1_1 = TransitionBlock(16, 8)
        self.block1_2 = BroadcastedBlock(8)

        self.block2_1 = TransitionBlock(8, 12, stride=(2, 1), dilation=(1, 2), temp_pad=(0, 2))
        self.block2_2 = BroadcastedBlock(12, dilation=(1, 2), temp_pad=(0, 2))

        self.block3_1 = TransitionBlock(12, 16, stride=(2, 1), dilation=(1, 4), temp_pad=(0, 4))
        self.block3_2 = BroadcastedBlock(16, dilation=(1, 4), temp_pad=(0, 4))
        self.block3_3 = BroadcastedBlock(16, dilation=(1, 4), temp_pad=(0, 4))
        self.block3_4 = BroadcastedBlock(16, dilation=(1, 4), temp_pad=(0, 4))

        self.block4_1 = TransitionBlock(16, 20, dilation=(1, 8), temp_pad=(0, 8))
        self.block4_2 = BroadcastedBlock(20, dilation=(1, 8), temp_pad=(0, 8))
        self.block4_3 = BroadcastedBlock(20, dilation=(1, 8), temp_pad=(0, 8))
        self.block4_4 = BroadcastedBlock(20, dilation=(1, 8), temp_pad=(0, 8))

        self.conv2 = nn.Conv2d(20, 20, 5, groups=20, padding=(0, 2))
        self.conv3 = nn.Conv2d(20, 32, 1, bias=False)
        self.conv4 = nn.Conv2d(32, 12, 1, bias=False)

    def forward(self, x):

        print('INPUT SHAPE:', x.shape)
        out = self.conv1(x)

        print('BLOCK1 INPUT SHAPE:', out.shape)
        out = self.block1_1(out)
        out = self.block1_2(out)

        print('BLOCK2 INPUT SHAPE:', out.shape)
        out = self.block2_1(out)
        out = self.block2_2(out)

        print('BLOCK3 INPUT SHAPE:', out.shape)
        out = self.block3_1(out)
        out = self.block3_2(out)
        out = self.block3_3(out)
        out = self.block3_4(out)

        print('BLOCK4 INPUT SHAPE:', out.shape)
        out = self.block4_1(out)
        out = self.block4_2(out)
        out = self.block4_3(out)
        out = self.block4_4(out)

        print('Conv2 INPUT SHAPE:', out.shape)
        out = self.conv2(out)

        print('Conv3 INPUT SHAPE:', out.shape)
        out = self.conv3(out)
        out = out.mean(-1, keepdim=True)

        print('Conv4 INPUT SHAPE:', out.shape)
        out = self.conv4(out)

        print('OUTPUT SHAPE:', out.shape)
        return out

In [37]:
bcresnet = BCResNet()

In [38]:
bcresnet = bcresnet.to(device)

In [39]:
x = torch.randn(4, 1, 229,50).to(device)

In [40]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(bcresnet.parameters(), lr=0.001, momentum=0.9)

In [42]:
for epoch in range(200):
    Loss = 0
    
    for batch in tqdm.tqdm(trainloader):
        mel_output_batch = mel_layer(batch['waveforms'].to(device)) #3-dimension
        mel_output_batch4 = mel_output_batch.unsqueeze(1)  #4-dimension
        
        #print(f'{mel_output_batch4}')
        
        optimizer.zero_grad()
        outputs = bcresnet(mel_output_batch4)
        outputs = outputs.squeeze(2).squeeze(2)
        
        ##print(f"{batch['labels'].shape=}")
        ##print(f'{outputs.shape=}')
        
        #batch['labels'] =2 dimension [batch, 1]
        #batch['labels'].squeeze(1) = 1 dimension [batch]
        loss = criterion(outputs, batch['labels'].to(device).squeeze(1).long()) 
        
        loss.backward()
        optimizer.step()
        
        

  0%|                                                                 | 0/849 [00:00<?, ?it/s]

INPUT SHAPE: torch.Size([100, 1, 40, 101])
BLOCK1 INPUT SHAPE: torch.Size([100, 16, 20, 101])
BLOCK2 INPUT SHAPE: torch.Size([100, 8, 20, 101])
BLOCK3 INPUT SHAPE: torch.Size([100, 12, 10, 101])
BLOCK4 INPUT SHAPE: torch.Size([100, 16, 5, 101])
Conv2 INPUT SHAPE: torch.Size([100, 20, 5, 101])
Conv3 INPUT SHAPE: torch.Size([100, 20, 1, 101])
Conv4 INPUT SHAPE: torch.Size([100, 32, 1, 1])
OUTPUT SHAPE: torch.Size([100, 12, 1, 1])





RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR

/pytorch/aten/src/ATen/native/cuda/Loss.cu:247: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:247: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:247: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:247: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:247: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [4,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:247: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [5,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/sr

In [None]:
PATH = './BC_ResNet.pt'
torch.save(bcresnet.state_dict(), PATH)