In [1]:
# Libraries related to PyTorch
import torch
from torch import Tensor
import torchaudio 
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import WeightedRandomSampler,DataLoader

# Libraries related to PyTorch Lightning
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

# Libraries related to hydra
import hydra
from hydra.utils import to_absolute_path
from omegaconf import DictConfig, OmegaConf

# custom packages
from dataset.speechcommands import SPEECHCOMMANDS_12C #for 12 classes KWS task
import models as Model 

from dataloading_util import data_processing
from datetime import datetime



In [3]:
hydra.initialize("conf")

hydra.initialize()

In [15]:
# To set mel trainable, put "model.spec_args.trainable_mel=True" into the overrides list
# This is equalivant nnAudio.features.mel.MelSpectrogram(trainable_mel=True) at line 50 of models/nnAudio_model.py 

# To set mel trainable xxx
# To both trainable xxx

#

cfg = hydra.compose("KWS_config", overrides=["model.spec_args.trainable_mel=True"])



# Setting up dataset

In [21]:
cfg.dataset.train

{'root': '${data_root}', 'url': 'speech_commands_v0.02', 'folder_in_archive': 'SpeechCommands', 'download': '${download}', 'subset': 'training'}

In [7]:
data_root= './' # Download the data here

cfg.data_root = to_absolute_path(cfg.data_root) # convert relative path to absolute path

batch_size = cfg.batch_size

trainset = SPEECHCOMMANDS_12C(root=data_root,
                              'speech_commands_v0.02') # set up/download train set
validset = SPEECHCOMMANDS_12C(**cfg.dataset.val)
testset = SPEECHCOMMANDS_12C(**cfg.dataset.test)

basename='speech_commands_v0.02.tar.gz'


Loading training set: 100%|███████████| 84843/84843 [00:28<00:00, 3007.55it/s]


basename='speech_commands_v0.02.tar.gz'


Loading validation set: 100%|███████████| 9981/9981 [00:05<00:00, 1791.90it/s]


basename='speech_commands_test_set_v0.02.tar.gz'


Loading testing set: 0it [00:00, ?it/s]


In [16]:
    # for class weighting, rebalancing silence(10th class) and unknown(11th class) in training set
    class_weights = [1,1,1,1,1,1,1,1,1,1,4.6,1/17]
    sample_weights = [0] * len(trainset)
    #create a list as per length of trainset

    for idx, (data,rate,label,speaker_id, _) in enumerate(trainset):
        class_weight = class_weights[label]
        sample_weights[idx] = class_weight
    #apply sample_weights in each data base on their label class in class_weight
    #ref: https://www.youtube.com/watch?v=4JFVhJyTZ44&t=518s
    sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights),replacement=True)
            
        
    trainloader = DataLoader(trainset,                                
                                  collate_fn=lambda x: data_processing(x),
                                             **cfg.dataloader.train,sampler=sampler)

    validloader = DataLoader(validset,                               
                                  collate_fn=lambda x: data_processing(x),
                                             **cfg.dataloader.val)
    
    testloader = DataLoader(testset,   
                                  collate_fn=lambda x: data_processing(x),
                                            **cfg.dataloader.test)     

In [17]:
cfg.model.model_type

'BCResNet_nnAudio'

# Set up model

# Put the model code here

In [18]:
cfg.model.args.input_dim = cfg.model.spec_args.n_mels *101 
train_setting=cfg.model.spec_args.trainable_mel
n_mel=cfg.model.spec_args.n_mels
stft = cfg.model.spec_args.trainable_STFT

# nnAudio is integrated into the model at line 50 of models/nnAudio_model.py 
net = getattr(Model, cfg.model.model_type)(cfg.model)

print(type(net))

STFT kernels created, time used = 0.1901 seconds
STFT filter created, time used = 0.0049 seconds
Mel filter created, time used = 0.0049 seconds
<class 'models.nnAudio_model.BCResNet_nnAudio'>


# Training

In [19]:
   
    
    lr_monitor = LearningRateMonitor(logging_interval='step')
    checkpoint_callback = ModelCheckpoint(**cfg.checkpoint,
                                          auto_insert_metric_name=False) #save checkpoint
    
    callbacks = [checkpoint_callback, lr_monitor]

    trainer = Trainer(**cfg.trainer, callbacks=callbacks)
    
    trainer.fit(net, trainloader, validloader)
    trainer.test(net, testloader)
    #added validloader, in order to reach validation_step



GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

   | Name      | Type             | Params
------------------------------------------------
0  | conv1     | Conv2d           | 416   
1  | block1_1  | TransitionBlock  | 352   
2  | block1_2  | BroadcastedBlock | 208   
3  | block2_1  | TransitionBlock  | 480   
4  | block2_2  | BroadcastedBlock | 360   
5  | block3_1  | TransitionBlock  | 768   
6  | block3_2  | BroadcastedBlock | 544   
7  | block3_3  | BroadcastedBlock | 544   
8  | block3_4  | BroadcastedBlock | 544   
9  | block4_1  | TransitionBlock  | 1.1 K 
10 | block4_2  | BroadcastedBlock | 760   
11 | block4_3  | BroadcastedBlock | 760   
12 | block4_4  | BroadcastedBlock | 760   
13 | conv2     | Conv2d           | 520   
14 | conv3     | Conv2d           | 640   
15 | conv4     | Conv2d           | 384   
16 | mel_layer | MelSpectrogram   | 9.6 K 
17 | criterion | Cr

Validation sanity check:   0%|                          | 0/5 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                              

  rank_zero_warn(


Epoch 1:  89%|████████▉ | 856/958 [00:50<00:06, 16.81it/s, loss=1.92, v_num=1]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|                                     | 0/102 [00:00<?, ?it/s][A
Epoch 1:  90%|████████▉ | 858/958 [00:52<00:06, 16.26it/s, loss=1.92, v_num=1][A
Epoch 1:  90%|█████████ | 863/958 [00:52<00:05, 16.32it/s, loss=1.92, v_num=1][A
Epoch 1:  91%|█████████ | 871/958 [00:52<00:05, 16.44it/s, loss=1.92, v_num=1][A
Epoch 1:  92%|█████████▏| 879/958 [00:53<00:04, 16.56it/s, loss=1.92, v_num=1][A
Epoch 1:  93%|█████████▎| 887/958 [00:53<00:04, 16.68it/s, loss=1.92, v_num=1][A
Epoch 1:  93%|█████████▎| 895/958 [00:53<00:03, 16.80it/s, loss=1.92, v_num=1][A
Epoch 1:  94%|█████████▍| 903/958 [00:53<00:03, 16.91it/s, loss=1.92, v_num=1][A
Epoch 1:  95%|█████████▌| 911/958 [00:53<00:02, 17.03it/s, loss=1.92, v_num=1][A
Epoch 1:  96%|█████████▌| 920/958 [00:53<00:02, 17.16it/s, loss=1.92, v_num=1][A
Epoch 1:  97%|█████████▋| 929/958 [00:53<00:01, 17.30it/s, loss=1.9

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


MisconfigurationException: Total length of `Dataloader` across ranks is zero. Please make sure that it returns at least 1 batch.