# Speech command prediction with federated learning

This is an example of federated learning for audio data.
The objective of this model is to predict speech commands correctly.

I borrowed almost all of the code from this repository. Thanks a lot!  
https://github.com/tugstugi/pytorch-speech-commands.git  
We skip a few steps, such as WeightedRandomSampler and lr_scheduler, for simplicity.

The federated learning part is taken from the PySyft tutorial.  
https://github.com/OpenMined/PySyft/blob/master/examples/tutorials/Part%2006%20-%20Federated%20Learning%20on%20MNIST%20using%20a%20CNN.ipynb

You can learn:
1. How to handle audio datasets
2. How to apply federated learning concepts to audio datasets


In [None]:
# fist let's do setup for jupyter note book

# ignore warnings 
import warnings
warnings.filterwarnings('ignore')

repository
# some jupyter specific settings
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# import dependencies.
# mostly torch relatd
import torch
from torchvision.transforms import Compose
from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
import math
import time
from tqdm import *
import os
import librosa
import numpy as np
import random
import shutil

In [None]:
# Next, we need to set the default tensor type to torch.cuda.FloatTensor.
# Without this you get a type error (as of 9/1/2020).
torch.set_default_tensor_type(torch.cuda.FloatTensor)

# Let's define the tutorial's objective here

We're training a model which takes an audio wav file as input and outputs the index of the speech command.

- Input: wav audio file
- Output: index of speech commands

So it's a classification problem.

The full task has 12 classes to predict:  
unknown, silence, yes, no, up, down, left, right, on, off, stop, go  
(The next cell restricts training to a six-class subset: unknown, silence, yes, no, left, right.)

In [None]:
# let's define classes here 
# CLASSES = 'unknown, silence, yes, no, up, down, left, right, on, off, stop, go'.split(', ')
# use subset of classes
CLASSES = 'unknown, silence, yes, no, left, right'.split(', ')

# Let's prepare the datasets

We download the speech commands dataset from here  
http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz

In [None]:
# prepare datasets

# create directory if not exist
# we put data on datasets directory
if os.path.isdir('./datasets') is False:
    try:
        os.mkdir('./datasets')
    except OSError:
        print ("Creation of the directory datasets failed")

if os.path.isdir('./datasets/speech_commands') is True:
    print("datasets seems to exists.")
else :
    # download data
    ! wget -O datasets/speech_commands_v0.01.tar.gz http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
    
    # create directory
    os.mkdir('./datasets/speech_commands')
    
    # create audio directory
    if os.path.isdir('./datasets/speech_commands/audio') is False:
        try:
            os.mkdir('./datasets/speech_commands/audio')
        except OSError:
            print ("Creation of the directory datasets/speech_commands/audio failed")
        

    # untar files.
    ! tar -xzf datasets/speech_commands_v0.01.tar.gz -C datasets/speech_commands/audio      

In [None]:
# Once the dataset is downloaded,
# it can be split into training, validation and test sets.
# The split is driven by the list files shipped with the dataset
# (validation_list.txt and testing_list.txt).

# move files
def move_files(src_folder, to_folder, list_file):
    """Move every file named in ``list_file`` from ``src_folder`` to ``to_folder``.

    Args:
        src_folder: Directory that currently holds the files.
        to_folder: Destination root. The relative sub-directory of each entry
            is recreated below it (missing parents are created as needed).
        list_file: Text file with one path per line, relative to ``src_folder``.

    Raises:
        OSError: If the list file cannot be read or a listed file cannot be moved.
    """
    with open(list_file) as f:
        for line in f:
            rel_path = line.strip()
            if not rel_path:
                # A blank entry would resolve to ``src_folder`` itself and
                # move the entire source tree, so skip it.
                continue
            dest = os.path.join(to_folder, os.path.dirname(rel_path))
            os.makedirs(dest, exist_ok=True)
            shutil.move(os.path.join(src_folder, rel_path), dest)

In [None]:
# move files
def prepare_dataset():
    """Split the extracted audio folder into train / valid / test folders.

    Files named in the testing and validation list files are moved out of the
    audio folder; whatever remains is renamed to become the training folder.
    """
    audio_folder = "datasets/speech_commands/audio"
    train_folder = "datasets/speech_commands/train"
    valid_folder = "datasets/speech_commands/valid"
    test_folder = "datasets/speech_commands/test"

    # Make sure both held-out destinations exist before anything is moved.
    for folder in (valid_folder, test_folder):
        if not os.path.isdir(folder):
            os.mkdir(folder)

    # (destination, list file) pairs; the test split is moved first.
    held_out = (
        (test_folder, "datasets/speech_commands/audio/testing_list.txt"),
        (valid_folder, "datasets/speech_commands/audio/validation_list.txt"),
    )
    for destination, list_path in held_out:
        move_files(audio_folder, destination, list_path)

    # Everything left in the audio folder is training data.
    os.rename(audio_folder, train_folder)

In [None]:
# create datasets
if os.path.isdir('./datasets/speech_commands/train') is False:
    prepare_dataset()


# Check datasets

Now we have the datasets.
Let's check one of the samples.

In [None]:
# seems like category is in the paths.
# input data is wav audio file and output is index of "right"
import IPython.display
example_path = "datasets/speech_commands/train/right/9f4098cb_nohash_0.wav"

IPython.display.Audio(example_path)

In [None]:
# here we define functions to process audio.
# basically: convert the raw audio into an STFT, convert the STFT into a mel spectrogram, and apply some augmentation.
# the point is that, since we treat audio like images in this tutorial, every clip must be exactly 1 second long,
# i.e. all audio clips must have exactly the same duration.

In [None]:
# this just returns True or False randomly
def should_apply_transform(prob=0.5):
    """Return True with probability ``prob``; used to apply augmentations randomly."""
    draw = random.random()
    return draw < prob

In [None]:
# change amplitude for data augmentation
class ChangeAmplitude(object):
    """Randomly rescale the amplitude of the audio in ``data['samples']``."""

    def __init__(self, amplitude_range=(0.7, 1.1)):
        # (low, high) bounds of the random gain factor.
        self.amplitude_range = amplitude_range

    def __call__(self, data):
        """Scale ``data['samples']`` by a random gain about half of the time.

        The dict is updated in place and returned.
        """
        if should_apply_transform():
            samples = data['samples']
            gain = random.uniform(*self.amplitude_range)
            data['samples'] = samples * gain
        return data

In [None]:
# change speed and pitch for data augmentation
class ChangeSpeedAndPitchAudio(object):
    """Change the speed of an audio. This transform also changes the pitch of the audio."""

    def __init__(self, max_scale=0.2):
        # The random speed scale is drawn from [-max_scale, max_scale].
        self.max_scale = max_scale

    def __call__(self, data):
        """Resample ``data['samples']`` by linear interpolation at a random speed.

        Applied about half of the time. The number of samples changes, so a
        length-fixing transform is expected to run afterwards. The dict is
        updated in place and returned.
        """
        if not should_apply_transform():
            return data

        samples = data['samples']
        n_samples = len(samples)
        scale = random.uniform(-self.max_scale, self.max_scale)
        # Step between interpolation points: < 1 slows the audio down
        # (more output samples), > 1 speeds it up (fewer output samples).
        speed_fac = 1.0 / (1 + scale)
        new_positions = np.arange(0, n_samples, speed_fac)
        data['samples'] = np.interp(new_positions, np.arange(n_samples), samples).astype(np.float32)
        return data

In [None]:
# function to fix the audio length
# Because our architecture is CNN-based rather than RNN-based, all input data must have exactly the same shape.
class FixAudioLength(object):
    """Pad with zeros or truncate ``data['samples']`` to a fixed duration."""

    def __init__(self, time=1):
        # Target duration; multiplied by the sample rate to get a sample count.
        self.time = time

    def __call__(self, data):
        """Force ``data['samples']`` to ``int(time * sample_rate)`` samples.

        Longer clips keep their leading samples, shorter clips are zero-padded
        at the end, and clips of the right length are left untouched.
        """
        clip = data['samples']
        target = int(self.time * data['sample_rate'])
        shortfall = target - len(clip)
        if shortfall < 0:
            data['samples'] = clip[:target]
        elif shortfall > 0:
            data['samples'] = np.pad(clip, (0, shortfall), "constant")
        return data

In [None]:
# convert raw audio samples into an STFT
class ToSTFT(object):
    """Applies on an audio the short time fourier transform."""

    def __init__(self, n_fft=2048, hop_length=512):
        self.n_fft = n_fft
        self.hop_length = hop_length

    def __call__(self, data):
        """Compute the STFT of ``data['samples']`` and record its parameters.

        Adds ``'n_fft'``, ``'hop_length'``, ``'stft'`` and ``'stft_shape'`` to
        the dict, which is updated in place and returned. ``'stft_shape'`` is
        stored so later transforms can restore the original frame count.
        """
        data['n_fft'] = self.n_fft
        data['hop_length'] = self.hop_length
        data['stft'] = librosa.stft(data['samples'], n_fft=self.n_fft, hop_length=self.hop_length)
        data['stft_shape'] = data['stft'].shape
        return data

In [None]:
class StretchAudioOnSTFT(object):
    """Stretches an audio on the frequency domain."""

    def __init__(self, max_scale=0.2):
        # The stretch rate is drawn from [1 - max_scale, 1 + max_scale].
        self.max_scale = max_scale

    def __call__(self, data):
        """Time-stretch ``data['stft']`` with a phase vocoder about half of the time.

        The number of STFT frames changes, so a dimension-fixing transform is
        expected to run afterwards. The dict is updated in place and returned.
        """
        if not should_apply_transform():
            return data

        stft = data['stft']
        hop_length = data['hop_length']
        scale = random.uniform(-self.max_scale, self.max_scale)
        # ``rate`` is passed by keyword: recent librosa releases make it
        # keyword-only, and the top-level name avoids the ``librosa.core`` alias.
        data['stft'] = librosa.phase_vocoder(stft, rate=1 + scale, hop_length=hop_length)
        return data

In [None]:
class TimeshiftAudioOnSTFT(object):
    """Shift an STFT along the time axis by zero-padding one side and cropping the other."""

    def __init__(self, max_shift=8):
        # Largest shift, in STFT frames, in either direction.
        self.max_shift = max_shift

    def __call__(self, data):
        """Shift ``data['stft']`` by a random number of frames about half of the time.

        The frame count is unchanged. The dict is updated in place and returned.
        """
        if should_apply_transform():
            spec = data['stft']
            shift = random.randint(-self.max_shift, self.max_shift)
            left_pad = max(0, -shift)   # frames of silence added in front
            right_pad = max(0, shift)   # frames of silence added at the end
            padded = np.pad(spec, ((0, 0), (left_pad, right_pad)), "constant")
            if left_pad:
                # Padded in front: drop the same number of trailing frames.
                data['stft'] = padded[:, :-left_pad]
            else:
                # Padded at the end (or not at all): drop the leading frames.
                data['stft'] = padded[:, right_pad:]
        return data

In [None]:
class FixSTFTDimension(object):
    """Restore the STFT's original number of time frames after stretching or shifting."""

    def __call__(self, data):
        """Crop or zero-pad ``data['stft']`` along axis 1 to ``data['stft_shape'][1]`` frames."""
        spec = data['stft']
        current_frames = spec.shape[1]
        wanted_frames = data['stft_shape'][1]
        missing = wanted_frames - current_frames
        if missing < 0:
            spec = spec[:, :wanted_frames]
        elif missing > 0:
            spec = np.pad(spec, ((0, 0), (0, missing)), "constant")

        data['stft'] = spec
        return data


In [None]:
# here we define the data augmentation pipeline

data_aug_transform = Compose([
    ChangeAmplitude(), 
    ChangeSpeedAndPitchAudio(), 
    FixAudioLength(), 
    ToSTFT(), 
    StretchAudioOnSTFT(), 
    TimeshiftAudioOnSTFT(), 
    FixSTFTDimension()])


# Background Noise Augmentation
Adding background noise on the fly is a good way to make an audio model generalize better.    
Later we send the dataset to remote machines (workers) with the federate() command, and it seems to work.  
But I haven't tested adding noise augmentation in a real remote training setting.  

What if the noise file does not exist on the remote machine?   

In [None]:
# here we define a way to add background noise
class BackgroundNoiseDataset(Dataset):
    """Dataset for silence / background noise.

    All ``.wav`` files directly inside ``folder`` are loaded, concatenated and
    cut into consecutive clips of ``sample_length`` seconds; each clip is one
    dataset item.
    """

    def __init__(self, folder, transform=None, sample_rate=16000, sample_length=1):
        """Load and slice the background-noise recordings.

        Args:
            folder: Directory containing the noise ``.wav`` files.
            transform: Optional callable applied to each item dict.
            sample_rate: Rate the files are resampled to when loaded.
            sample_length: Clip length in seconds.

        Raises:
            ValueError: If ``folder`` contains no ``.wav`` files.
        """
        # Sorted so the clip order does not depend on the file system's listing order.
        audio_files = sorted(
            d for d in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, d)) and d.endswith('.wav'))
        if not audio_files:
            raise ValueError("no .wav background noise files found in %r" % (folder,))

        # ``sr`` is passed by keyword: recent librosa releases make it keyword-only.
        samples = [librosa.load(os.path.join(folder, f), sr=sample_rate)[0] for f in audio_files]

        samples = np.hstack(samples)
        c = int(sample_rate * sample_length)   # samples per clip
        r = len(samples) // c                  # number of complete clips
        # Drop the incomplete tail and view the rest as (clips, samples per clip).
        self.samples = samples[:r*c].reshape(-1, c)
        self.sample_rate = sample_rate
        self.classes = CLASSES
        self.transform = transform
        self.path = folder

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        """Return the item dict for clip ``index``, after the optional transform."""
        # NOTE(review): target 1 presumably stands for 'silence', which is at
        # index 1 of CLASSES in this notebook -- confirm if CLASSES is reordered.
        data = {'samples': self.samples[index], 'sample_rate': self.sample_rate, 'target': 1, 'path': self.path}

        if self.transform is not None:
            data = self.transform(data)

        return data

In [None]:
# we pick background noise randomly. so use dataset class
background_noise_dir = "./datasets/speech_commands/train/_background_noise_"
bg_dataset = BackgroundNoiseDataset(background_noise_dir, data_aug_transform)

In [None]:
# function to add background noise on datasets
class AddBackgroundNoiseOnSTFT(Dataset):
    """Blend a randomly chosen background-noise STFT into the input STFT."""

    def __init__(self, bg_dataset, max_percentage=0.45):
        # Source of noise items; each item must provide an 'stft' entry.
        self.bg_dataset = bg_dataset
        # Upper bound of the random noise share in the mix.
        self.max_percentage = max_percentage

    def __call__(self, data):
        """Mix noise into ``data['stft']`` about half of the time.

        The dict is updated in place and returned.
        """
        if should_apply_transform():
            noise_stft = random.choice(self.bg_dataset)['stft']
            share = random.uniform(0, self.max_percentage)
            signal_part = data['stft'] * (1 - share)
            noise_part = noise_stft * share
            data['stft'] = signal_part + noise_part
        return data

In [None]:
# create a function
add_bg_noise = AddBackgroundNoiseOnSTFT(bg_dataset)

# MelSpectrogram

In this tutorial we use the mel spectrogram as the input format.    
Since our data is still in STFT format so far, this is the time to convert the STFT into a mel spectrogram.  

The mel spectrogram is one of the best practices for handling audio data.  
This blog post explains the mel spectrogram well.  
https://towardsdatascience.com/getting-to-know-the-mel-spectrogram-31bca3e2d9d0  


In [None]:
# function to convert data from STFT into MelSpectrogram

class ToMelSpectrogramFromSTFT(object):
    """Creates the mel spectrogram from the short time fourier transform of a file.

    The result has ``n_mels`` rows and one column per STFT frame.
    """

    def __init__(self, n_mels=32):
        self.n_mels = n_mels

    def __call__(self, data):
        """Store the dB-scaled mel spectrogram in ``data['mel_spectrogram']``.

        Reads ``'stft'``, ``'sample_rate'`` and ``'n_fft'`` from the dict, which
        is updated in place and returned.
        """
        stft = data['stft']
        sample_rate = data['sample_rate']
        n_fft = data['n_fft']
        # Keyword arguments: recent librosa releases make these keyword-only.
        mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=self.n_mels)
        # Project the power spectrum (|STFT|^2) onto the mel filter bank.
        s = np.dot(mel_basis, np.abs(stft)**2.0)
        data['mel_spectrogram'] = librosa.power_to_db(s, ref=np.max)
        return data

In [None]:
class DeleteSTFT(object):
    """Drop the complex-valued STFT once the mel spectrogram exists (PyTorch doesn't like complex numbers)."""

    def __call__(self, data):
        """Remove the ``'stft'`` entry from ``data`` and return the same dict."""
        data.pop('stft')
        return data

In [None]:
class ToTensor(object):
    """Copy one entry of the data dict into a float tensor stored under another key."""

    def __init__(self, np_name, tensor_name, normalize=None):
        self.np_name = np_name          # key of the source array
        self.tensor_name = tensor_name  # key the tensor is stored under
        self.normalize = normalize      # optional (mean, std) pair

    def __call__(self, data):
        """Convert ``data[np_name]`` to a ``torch.FloatTensor``, optionally normalized.

        With ``normalize=(mean, std)`` the tensor becomes ``(tensor - mean) / std``.
        The dict is updated in place and returned.
        """
        tensor = torch.FloatTensor(data[self.np_name])
        if self.normalize is not None:
            mean, std = self.normalize
            # In-place, like the augmented assignments -= and /=.
            tensor.sub_(mean).div_(std)
        data[self.tensor_name] = tensor
        return data

In [None]:
# make a couple of functions to one

# set the feature count of mel spectrogram as 32.
n_mels = 32

train_feature_transform = Compose([
    ToMelSpectrogramFromSTFT(n_mels=n_mels), 
    DeleteSTFT(), 
    ToTensor('mel_spectrogram', 'input')])


# Dataset Class

So far we have:
- downloaded the data,
- split the data into training and validation sets,
- defined data augmentation,
- defined the background-noise data augmentation,
- defined functions to convert raw audio to STFT format,
- defined functions to convert STFT format into mel spectrogram format.

Let's use the above to define the dataset class.

In [None]:
# function to load audio.
class LoadAudio(object):
    """Loads an audio into a numpy array."""

    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate

    def __call__(self, data):
        """Fill ``data['samples']`` and ``data['sample_rate']`` from ``data['path']``.

        An empty path stands for a silence sample and yields ``sample_rate``
        float32 zeros instead of reading a file. The dict is updated in place
        and returned.
        """
        path = data['path']
        if path:
            # ``sr`` is passed by keyword: recent librosa releases make it keyword-only.
            samples, sample_rate = librosa.load(path, sr=self.sample_rate)
        else:
            # silence
            sample_rate = self.sample_rate
            samples = np.zeros(sample_rate, dtype=np.float32)
        data['samples'] = samples
        data['sample_rate'] = sample_rate
        return data

In [None]:
# datasets class
# you can try subset of entire datasets with use_rate because distributing datasets to remote machines take time...

from random import shuffle

class SpeechCommandsDataset(Dataset):
    """Google speech commands dataset.

    Each sub-directory of ``folder`` whose name does not start with ``_`` is one
    spoken word. Words listed in ``classes`` get their index in that list as
    target; every other word is mapped to index 0 (the 'unknown' entry in this
    notebook's CLASSES). Artificial silence samples are appended on top.
    See for more information: https://www.kaggle.com/c/tensorflow-speech-recognition-challenge
    """

    def __init__(self, folder, transform=None, classes=CLASSES, silence_percentage=0.1, use_rate=1.0):
        """Index the audio files below ``folder``.

        Args:
            folder: Root directory with one sub-directory per spoken word.
            transform: Optional callable applied to each item dict; it must
                produce an ``'input'`` entry.
            classes: Class names; must contain ``'silence'``.
            silence_percentage: Number of silence samples to add, as a fraction
                of the number of real samples kept.
            use_rate: Fraction of the shuffled real samples to keep.
        """
        all_classes = [d for d in os.listdir(folder) if os.path.isdir(os.path.join(folder, d)) and not d.startswith('_')]

        class_to_idx = {name: idx for idx, name in enumerate(classes)}
        for c in all_classes:
            # Words outside ``classes`` all share target 0.
            class_to_idx.setdefault(c, 0)

        data = []
        for c in all_classes:
            d = os.path.join(folder, c)
            target = class_to_idx[c]
            data.extend((os.path.join(d, f), target) for f in os.listdir(d))

        # Shuffle before truncating so a subset still covers every class.
        shuffle(data)
        if use_rate != 1.0:
            sample_count = int(len(data) * use_rate)
            data = data[:sample_count]

        # Add silence: an empty path marks a silence sample. These entries are
        # appended after the shuffle, so they all sit at the end of the list.
        target = class_to_idx['silence']
        data += [('', target)] * int(len(data) * silence_percentage)

        self.classes = classes
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input, target)`` for the sample at ``index``."""
        path, target = self.data[index]
        data = {'path': path, 'target': target}

        if self.transform is not None:
            data = self.transform(data)

        return data['input'], target

    def make_weights_for_balanced_classes(self):
        """Return one sampling weight per sample, inversely proportional to its class size.

        adopted from https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/3
        """
        nclasses = len(self.classes)
        count = np.zeros(nclasses)
        for _, target in self.data:
            count[target] += 1

        total = float(count.sum())
        # Classes without any sample keep weight 0 instead of dividing by zero.
        weight_per_class = np.zeros(nclasses)
        present = count > 0
        weight_per_class[present] = total / count[present]

        weight = np.zeros(len(self))
        for idx, (_, target) in enumerate(self.data):
            weight[idx] = weight_per_class[target]
        return weight

In [None]:
# finally we define dataset here

# specify the percent of entire datasets
# use_rate = 0.2
use_rate = 1.0

train_dataset_dir = "./datasets/speech_commands/train"
train_dataset = SpeechCommandsDataset(train_dataset_dir,
                                Compose([LoadAudio(),
                                         data_aug_transform,
                                         add_bg_noise,
                                         train_feature_transform]), use_rate=use_rate)

In [None]:
# check data count
len(train_dataset)

In [None]:
# this transform creates the mel spectrogram directly from the audio, skipping data augmentation
# it is used for the validation data
class ToMelSpectrogram(object):
    """Creates the mel spectrogram from an audio.

    The result has ``n_mels`` rows and one column per analysis frame.
    """

    def __init__(self, n_mels=32):
        self.n_mels = n_mels

    def __call__(self, data):
        """Store the dB-scaled mel spectrogram of ``data['samples']`` in ``data['mel_spectrogram']``.

        The dict is updated in place and returned.
        """
        samples = data['samples']
        sample_rate = data['sample_rate']
        # ``y`` is passed by keyword: recent librosa releases make it keyword-only.
        s = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=self.n_mels)
        data['mel_spectrogram'] = librosa.power_to_db(s, ref=np.max)
        return data

In [None]:
valid_feature_transform = Compose([
    ToMelSpectrogram(n_mels=n_mels), 
    ToTensor('mel_spectrogram', 'input')])

In [None]:
# define validation datasets

valid_dataset_dir = "./datasets/speech_commands/valid"
valid_dataset = SpeechCommandsDataset(valid_dataset_dir,
                                Compose([LoadAudio(),
                                         FixAudioLength(),
                                         valid_feature_transform]))


# Dataloader
Finally, we can apply federated learning here.  
The validation dataset is handled normally, but we use the PySyft library to split the training dataset across 2 machines (workers) called Bob and Alice.

In [None]:
# define dataloaders

# batch size is 64
batch_size = 64

# we define training dataloader later right after importing PySyft, library for privacy preserving deep learning

# define validation dataloader, which is just normal dataloader
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# Setup for federated learning
In this tutorial we train our model with federated learning.
To do that, we
- import syft
- create 2 machines (workers) called bob and alice
- split the training dataset and send the parts to bob and alice

Note: In a real business situation, each machine would already hold its own data.

In [None]:
import syft as sy  # <-- NEW: import the Pysyft library
hook = sy.TorchHook(torch)  # <-- NEW: hook PyTorch ie add extra functionalities to support Federated Learning
bob = sy.VirtualWorker(hook, id="bob")  # <-- NEW: define remote worker bob
alice = sy.VirtualWorker(hook, id="alice")  # <-- NEW: and alice

In [None]:
# Define the federated dataloader.
# Distributing the dataset to the workers takes time; be patient.
# batch_size is passed explicitly so the loader matches the batch size the
# training progress bar is scaled with. shuffle=True matters because the
# dataset appends all silence samples at the end of its list.
federated_train_loader = sy.FederatedDataLoader(
    train_dataset.federate((bob, alice)),
    batch_size=batch_size,
    shuffle=True,
)

# Model
We define the architecture here.  
We use ResNet34 in this tutorial.  
ResNet is one of the most popular architectures for image problems.

In [None]:
# define conv block
def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with padding 1."""
    conv = nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )
    return conv

In [None]:
# define res block
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions and a shortcut connection."""

    # Ratio of the block's output channels to ``planes``.
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        """Create the block.

        Args:
            inplanes: Number of input channels.
            planes: Number of output channels.
            stride: Stride of the first convolution.
            downsample: Optional module applied to the input on the shortcut path.
        """
        super().__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        # Main path: conv -> bn -> relu -> conv -> bn.
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))

        # Shortcut path: identity unless a downsample module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        main += shortcut
        return self.relu(main)

In [None]:
# define ResNet
class ResNet(nn.Module):
    """ResNet backbone: a stem, four residual stages and a linear classifier."""

    def __init__(self, block, layers, num_classes=1000, in_channels=3):
        """Build the network.

        Args:
            block: Residual block class; it must expose ``expansion`` and accept
                ``(inplanes, planes, stride, downsample)``.
            layers: Number of blocks in each of the four stages.
            num_classes: Size of the classifier output.
            in_channels: Number of channels of the input.
        """
        super().__init__()
        # Channel count fed into the next stage; updated by _make_layer.
        self.inplanes = 64

        # Stem.
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Residual stages.
        self.layer1 = self._make_layer(block, planes=64, blocks=layers[0])
        self.layer2 = self._make_layer(block, planes=128, blocks=layers[1], stride=2)
        self.layer3 = self._make_layer(block, planes=256, blocks=layers[2], stride=2)
        self.layer4 = self._make_layer(block, planes=512, blocks=layers[3], stride=2)

        # Head. The 1x1 average pool leaves the feature map unchanged.
        self.avgpool = nn.AvgPool2d(1, stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        self._init_weights()

    def _init_weights(self):
        """Initialize conv weights from N(0, sqrt(2/n)) and batch-norm to weight 1, bias 0."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan_out = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan_out))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` residual blocks; only the first one may change stride or width."""
        out_planes = planes * block.expansion

        shortcut = None
        if not (stride == 1 and self.inplanes == out_planes):
            # The shortcut must match the main path's resolution and channel count.
            shortcut = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

        stack = [block(self.inplanes, planes, stride, shortcut)]
        self.inplanes = out_planes
        stack.extend(block(self.inplanes, planes) for _ in range(1, blocks))

        return nn.Sequential(*stack)

    def forward(self, x):
        """Run the stem, the four stages and the classifier on ``x``."""
        feature_stages = (
            self.conv1, self.bn1, self.relu, self.maxpool,
            self.layer1, self.layer2, self.layer3, self.layer4,
            self.avgpool,
        )
        for stage in feature_stages:
            x = stage(x)

        # Flatten everything but the batch dimension before the classifier.
        x = x.view(x.shape[0], -1)
        return self.fc(x)

In [None]:
# define resnet34
def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.

    Args:
        pretrained (bool): If True, loads the ImageNet ResNet-34 weights from
            download.pytorch.org into the model.
        **kwargs: Forwarded to ``ResNet`` (e.g. ``num_classes``, ``in_channels``).

    Returns:
        The constructed ``ResNet`` instance.
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        # Imported here because the notebook's import cell never brings
        # ``model_zoo`` into scope; without it this branch raised NameError.
        from torch.utils import model_zoo
        # NOTE(review): the ImageNet checkpoint presumably only fits the default
        # num_classes / in_channels -- confirm before combining with overrides.
        model.load_state_dict(model_zoo.load_url('https://download.pytorch.org/models/resnet34-333f7ec4.pth'))
    return model

In [None]:
# create model 
model = resnet34(num_classes=len(CLASSES), in_channels=1)

In [None]:
# device setting.
# please use a gpu; this example is too heavy for cpu training.

# if use_gpu:
#     device = torch.device("cuda")
# else:
#     device = torch.device("cpu")

device = torch.device("cuda")

In [None]:
# move model to gpu if you use gpu
model = model.to(device)

In [None]:
# the loss function is the standard cross-entropy loss
criterion = torch.nn.CrossEntropyLoss()

In [None]:

from syft.federated.floptimizer import Optims

# define optimizer
# adamw seems to be better
learning_rate = 1e-4
# weight_decay = 1e-2

# from syft.federated.floptimizer import Optims
# workers = ['bob', 'alice']
# optims = Optims(workers, optim=torch.optim.SGD(params=model.parameters(), lr=learning_rate ))

from syft.federated.floptimizer import Optims
workers = ['bob', 'alice']
optims = Optims(workers, optim=torch.optim.AdamW(params=model.parameters(), lr=learning_rate ))

In [None]:
start_timestamp = int(time.time()*1000)
start_epoch = 0
# max_epochs = 30
max_epochs = 5
best_accuracy = 0
best_loss = 1e100
global_step = 0

In [None]:
# # if you want to fine-tune model, make finetune True
# finetune = False

# if finetune is True:
#     # load saved weights
#     weight_path = "./checkpoints/best-acc-speech-commands-checkpoint-basic1.pth"
#     state = torch.load(
#         weight_path, 
#         map_location=torch.device("cpu"))
#     _ = model.load_state_dict(state['state_dict'])

# Training loop

In [None]:

full_name = "speech_command_with_fl"

def train(epoch):
    """Run one training epoch over ``federated_train_loader`` and print its metrics.

    Relies on notebook globals: ``model``, ``optims``, ``criterion``, ``device``,
    ``batch_size``, ``federated_train_loader`` and ``global_step``.

    Args:
        epoch: Epoch index; only used when printing the metrics.
    """
    global global_step

    # print("epoch %3d with lr=%.02e" % (epoch, get_lr()))
    phase = 'train'
    
    model.train()  # Set model to training mode

    running_loss = 0.0  # sum of the per-batch losses
    it = 0              # number of batches processed
    correct = 0         # number of correct predictions
    total = 0           # number of samples seen

    # pbar = tqdm(train_dataloader, unit="audios", unit_scale=train_dataloader.batch_size)
    
    # Iterate over the federated loader instead of a plain DataLoader.
    # NOTE(review): unit_scale uses the global batch_size -- confirm the
    # federated loader was built with the same batch size.
    pbar = tqdm(federated_train_loader, unit="audios", unit_scale=batch_size)
    for batch in pbar:
        
        inputs = batch[0]
        targets = batch[1]
        
        # Pick the optimizer registered for the worker that holds this batch.
        _optimizer = optims.get_optim(inputs.location.id)
        
        # reset grad
        _optimizer.zero_grad()
        
        # Send the model to the worker holding the data (inputs.location).
        model.send(inputs.location)
        
        # Insert a channel axis at position 1 before feeding the network.
        inputs = torch.unsqueeze(inputs, 1)
        inputs = inputs.to(device)
        targets = targets.to(device)

        # forward
        outputs = model(inputs)
        
        # get loss
        loss = criterion(outputs, targets)
        
        # backward and step
        loss.backward()
        _optimizer.step()
        
        # Get the model back from the worker.
        model.get() # <-- NEW: get the model back
        
        # Get the loss back so .item() can be called on it below.
        loss = loss.get() # <-- NEW: get the loss back

        # statistics
        it += 1
        global_step += 1
    
        # running_loss += loss.data[0]
        running_loss += loss.item()
        
        # Index of the highest score along dim 1 = predicted class.
        pred = outputs.max(1, keepdim=True)[1]
        
        # Count the correct predictions; the count is fetched with .get()
        # before it is converted to a Python number.
        _correct = pred.eq(targets.view_as(pred)).sum()
        correct += _correct.get().item()
        total += targets.shape[0]
        
        # Update the progress bar with the running averages.
        pbar.set_postfix({
            'loss': "%.05f" % (running_loss / it),
            'acc': "%.02f%%" % (100*correct/total)
        })
    
    # NOTE(review): both divisions raise ZeroDivisionError if the loader yields no batch.
    accuracy = correct/total
    epoch_loss = running_loss / it
    print('%s/accuracy' % phase, 100*accuracy, epoch)
    print('%s/epoch_loss' % phase, epoch_loss, epoch)
    

In [None]:
def valid(epoch):
    """Evaluate the model on ``valid_dataloader`` and write checkpoints.

    Prints accuracy and mean loss, then saves a "last" checkpoint, plus
    "best-acc" / "best-loss" checkpoints and full-model files whenever the
    corresponding metric improves. Relies on notebook globals: ``model``,
    ``criterion``, ``device``, ``optims``, ``bob``, ``valid_dataloader``,
    ``full_name`` and ``start_timestamp``.

    Args:
        epoch: Epoch index; printed with the metrics and stored in the checkpoint.

    Returns:
        The mean validation loss over all batches.
    """
    global best_accuracy, best_loss, global_step

    phase = 'valid'
    model.eval()  # Set model to evaluate mode

    running_loss = 0.0
    it = 0
    correct = 0
    total = 0

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for batch in valid_dataloader:
            # Insert a channel axis at position 1 before feeding the network.
            inputs = torch.unsqueeze(batch[0], 1).to(device)
            targets = batch[1].to(device)

            # forward
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # statistics
            it += 1
            global_step += 1
            running_loss += loss.item()
            pred = outputs.max(1, keepdim=True)[1]
            correct += pred.eq(targets.view_as(pred)).sum().item()
            total += targets.size(0)

    accuracy = correct/total
    epoch_loss = running_loss / it

    print('%s/accuracy' % phase, 100*accuracy, epoch)
    print('%s/epoch_loss' % phase, epoch_loss, epoch)

    checkpoint = {
        'epoch': epoch,
        'step': global_step,
        'state_dict': model.state_dict(),
        'loss': epoch_loss,
        'accuracy': accuracy,
        'optimizer': optims.get_optim(bob.id).state_dict(),
    }

    # Make sure the target directory exists no matter which cell ran before.
    os.makedirs('checkpoints', exist_ok=True)

    # The file names now match the metric that triggered the save; they were
    # swapped before (accuracy improvements were written to "best-loss").
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(checkpoint, 'checkpoints/best-acc-speech-commands-checkpoint-%s.pth' % full_name)
        torch.save(model, '%d-%s-best-acc.pth' % (start_timestamp, full_name))
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(checkpoint, 'checkpoints/best-loss-speech-commands-checkpoint-%s.pth' % full_name)
        torch.save(model, '%d-%s-best-loss.pth' % (start_timestamp, full_name))

    torch.save(checkpoint, 'checkpoints/last-speech-commands-checkpoint.pth')
    del checkpoint  # reduce memory

    return epoch_loss

In [None]:

start_epoch = 0

# valid() writes its checkpoints into ./checkpoints, so the directory must exist
# before the first epoch. A failure here raises instead of printing a message
# that referenced an undefined name.
os.makedirs('./checkpoints', exist_ok=True)

since = time.time()
for epoch in range(start_epoch, max_epochs):

    train(epoch)
    epoch_loss = valid(epoch)

    # Report the wall-clock time spent so far after every epoch.
    time_elapsed = time.time() - since
    time_str = 'total time elapsed: {:.0f}h {:.0f}m {:.0f}s '.format(time_elapsed // 3600, time_elapsed % 3600 // 60, time_elapsed % 60)
    print(time_str)

print("finished")



In [None]:
raise Exception("stop")

# Evaluation

In [None]:
# set the default tensor type back to torch.FloatTensor
torch.set_default_tensor_type(torch.FloatTensor)

In [None]:
# First, let's see a data sample

import IPython.display

example_path = "./datasets/speech_commands/train/right/9f4098cb_nohash_0.wav"
IPython.display.Audio(example_path)

In [None]:
# define model again
model2 = resnet34(num_classes=len(CLASSES), in_channels=1)

In [None]:
# load saved weights
# weight_path2 = "./checkpoints/best-acc-speech-commands-checkpoint-basic1.pth"
weight_path2 = "./checkpoints/best-acc-speech-commands-checkpoint-speech_command_with_fl.pth"

state2 = torch.load(
    weight_path2, 
    map_location=torch.device("cpu"))

In [None]:
_ = model2.load_state_dict(state2['state_dict'])

In [None]:
# load audio
# these are exactly the steps used to load the validation dataset

# our load-audio function needs this format
sample_data = {
    'path': example_path,
}

# load audio 
_load_audio = LoadAudio()
_audio_sample = _load_audio(sample_data)
len(_audio_sample["samples"])

In [None]:
# fix duration
_fixAudioLength = FixAudioLength()
_fixed_sample = _fixAudioLength(_audio_sample)

In [None]:
# apply validation transform
_processed_input = valid_feature_transform(_fixed_sample)

In [None]:
# adjust shape
_processed_input = _processed_input['input'][None][None]
_processed_input.shape

In [None]:
# get prediction
_ = model2.eval()
_eval_output = model2(_processed_input)
_eval_pred = _eval_output.max(1, keepdim=True)[1]
_eval_pred

In [None]:
# Map the predicted class index back to its label. CLASSES holds only six
# entries in this notebook, so a hard-coded index such as 7 raises IndexError.
CLASSES[_eval_pred.item()]