In [1]:
#!/usr/bin/env python
"""Train a CNN for Google speech commands."""

__author__ = 'Yuan Xu, Erdene-Ochir Tuguldur'

import argparse
import time

from tqdm import *

import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.sampler import WeightedRandomSampler

import torchvision
from torchvision.transforms import *

from tensorboardX import SummaryWriter

import models
from datasets import *
from transforms import *
from mixup import *

In [7]:
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
import torch.nn.functional as F

In [8]:
class BaseRNN(nn.Module):
    r"""
    Applies a multi-layer RNN to an input sequence.
    Note:
        Do not use this class directly, use one of the sub classes.
    Args:
        vocab_size (int): size of the vocabulary
        max_len (int): maximum allowed length for the sequence to be processed
        hidden_size (int): number of features in the hidden state `h`
        input_dropout_p (float): dropout probability for the input sequence
        dropout_p (float): dropout probability for the output sequence
        n_layers (int): number of recurrent layers
        rnn_cell (str): type of RNN cell (Eg. 'LSTM' , 'GRU')

    Inputs: ``*args``, ``**kwargs``
        - ``*args``: variable length argument list.
        - ``**kwargs``: arbitrary keyword arguments.

    Attributes:
        SYM_MASK: masking symbol
        SYM_EOS: end-of-sequence symbol
    """
    SYM_MASK = "MASK"
    SYM_EOS = "EOS"

    def __init__(self, vocab_size, max_len, hidden_size, input_dropout_p, dropout_p, n_layers, rnn_cell):
        super(BaseRNN, self).__init__()
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.input_dropout_p = input_dropout_p
        self.input_dropout = nn.Dropout(p=input_dropout_p)
        if rnn_cell.lower() == 'lstm':
            self.rnn_cell = nn.LSTM
        elif rnn_cell.lower() == 'gru':
            self.rnn_cell = nn.GRU
        else:
            raise ValueError("Unsupported RNN Cell: {0}".format(rnn_cell))

        self.dropout_p = dropout_p

    def forward(self, *args, **kwargs):
        raise NotImplementedError()

In [9]:
class EncoderRNN(BaseRNN):
    r"""
    Applies a multi-layer RNN to an input sequence.

    Args:
        vocab_size (int): size of the vocabulary
        max_len (int): a maximum allowed length for the sequence to be processed
        hidden_size (int): the number of features in the hidden state `h`
        input_dropout_p (float, optional): dropout probability for the input sequence (default: 0)
        dropout_p (float, optional): dropout probability for the output sequence (default: 0)
        n_layers (int, optional): number of recurrent layers (default: 1)
        bidirectional (bool, optional): if True, becomes a bidirectional encodr (defulat False)
        rnn_cell (str, optional): type of RNN cell (default: gru)
        variable_lengths (bool, optional): if use variable length RNN (default: False)
        embedding (torch.Tensor, optional): Pre-trained embedding.  The size of the tensor has to match
            the size of the embedding parameter: (vocab_size, hidden_size).  The embedding layer would be initialized
            with the tensor if provided (default: None).
        update_embedding (bool, optional): If the embedding should be updated during training (default: False).

    Inputs: inputs, input_lengths
        - **inputs**: list of sequences, whose length is the batch size and within which each sequence is a list of token IDs.
        - **input_lengths** (list of int, optional): list that contains the lengths of sequences
            in the mini-batch, it must be provided when using variable length RNN (default: `None`)

    Outputs: output, hidden
        - **output** (batch, seq_len, hidden_size): tensor containing the encoded features of the input sequence
        - **hidden** (num_layers * num_directions, batch, hidden_size): tensor containing the features in the hidden state `h`

    Examples::

         >>> encoder = EncoderRNN(input_vocab, max_seq_length, hidden_size)
         >>> output, hidden = encoder(input)

    """

    def __init__(self, feature_size, hidden_size,
                 input_dropout_p=0, dropout_p=0,
                 n_layers=6, bidirectional=False, rnn_cell='gru', variable_lengths=False):
        super(EncoderRNN, self).__init__(0, 0, hidden_size,
                input_dropout_p, dropout_p, n_layers, rnn_cell)

        self.variable_lengths = variable_lengths
        

        """
        Copied from https://github.com/SeanNaren/deepspeech.pytorch/blob/master/model.py
        Copyright (c) 2017 Sean Naren
        MIT License
        """
        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True)
        )

        feature_size = math.ceil((feature_size - 11 + 1 + (5*2)) / 2)
        feature_size = math.ceil(feature_size - 11 + 1 + (5*2))
        feature_size *= 32

        self.rnn = self.rnn_cell(feature_size, hidden_size, n_layers,
                                 batch_first=True, bidirectional=bidirectional, dropout=dropout_p)

    def forward(self, input_var, input_lengths=None):
        """
        Applies a multi-layer RNN to an input sequence.

        Args:
            input_var (batch, seq_len): tensor containing the features of the input sequence.
            input_lengths (list of int, optional): A list that contains the lengths of sequences
              in the mini-batch

        Returns: output, hidden
            - **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence
            - **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h
        """
        
        input_var = input_var.unsqueeze(1)
        x = self.conv(input_var)

        # BxCxTxD => BxCxDxT
        x = x.transpose(1, 2)
        x = x.contiguous()
        sizes = x.size()
        x = x.view(sizes[0], sizes[1], sizes[2] * sizes[3])

        if self.training:
            self.rnn.flatten_parameters()

        output, hidden = self.rnn(x)

        return output, hidden

In [10]:
class DecoderRNN(BaseRNN):
    r"""
    Provides functionality for decoding in a seq2seq framework, with an option for attention.
    Args:
        vocab_size (int): size of the vocabulary
        max_len (int): a maximum allowed length for the sequence to be processed
        hidden_size (int): the number of features in the hidden state `h`
        sos_id (int): index of the start of sentence symbol
        eos_id (int): index of the end of sentence symbol
        n_layers (int, optional): number of recurrent layers (default: 1)
        rnn_cell (str, optional): type of RNN cell (default: gru)
        bidirectional (bool, optional): if the encoder is bidirectional (default False)
        input_dropout_p (float, optional): dropout probability for the input sequence (default: 0)
        dropout_p (float, optional): dropout probability for the output sequence (default: 0)
        use_attention(bool, optional): flag indication whether to use attention mechanism or not (default: false)
    Attributes:
        KEY_ATTN_SCORE (str): key used to indicate attention weights in `ret_dict`
        KEY_LENGTH (str): key used to indicate a list representing lengths of output sequences in `ret_dict`
        KEY_SEQUENCE (str): key used to indicate a list of sequences in `ret_dict`
    Inputs: inputs, encoder_hidden, encoder_outputs, function, teacher_forcing_ratio
        - **inputs** (batch, seq_len, input_size): list of sequences, whose length is the batch size and within which
          each sequence is a list of token IDs.  It is used for teacher forcing when provided. (default `None`)
        - **encoder_hidden** (num_layers * num_directions, batch_size, hidden_size): tensor containing the features in the
          hidden state `h` of encoder. Used as the initial hidden state of the decoder. (default `None`)
        - **encoder_outputs** (batch, seq_len, hidden_size): tensor with containing the outputs of the encoder.
          Used for attention mechanism (default is `None`).
        - **function** (torch.nn.Module): A function used to generate symbols from RNN hidden state
          (default is `torch.nn.functional.log_softmax`).
        - **teacher_forcing_ratio** (float): The probability that teacher forcing will be used. A random number is
          drawn uniformly from 0-1 for every decoding token, and if the sample is smaller than the given value,
          teacher forcing would be used (default is 0).
    Outputs: decoder_outputs, decoder_hidden, ret_dict
        - **decoder_outputs** (seq_len, batch, vocab_size): list of tensors with size (batch_size, vocab_size) containing
          the outputs of the decoding function.
        - **decoder_hidden** (num_layers * num_directions, batch, hidden_size): tensor containing the last hidden
          state of the decoder.
        - **ret_dict**: dictionary containing additional information as follows {*KEY_LENGTH* : list of integers
          representing lengths of output sequences, *KEY_SEQUENCE* : list of sequences, where each sequence is a list of
          predicted token IDs }.
    """

    KEY_ATTN_SCORE = 'attention_score'
    KEY_LENGTH = 'length'
    KEY_SEQUENCE = 'sequence'

    def __init__(self, vocab_size, max_len, hidden_size,
            sos_id, eos_id,
            n_layers=1, rnn_cell='gru', bidirectional=False,
            input_dropout_p=0, dropout_p=0, use_attention=False):
        super(DecoderRNN, self).__init__(vocab_size, max_len, hidden_size,
                input_dropout_p, dropout_p,
                n_layers, rnn_cell)

        self.bidirectional_encoder = bidirectional
        self.rnn = self.rnn_cell(hidden_size, hidden_size, n_layers, batch_first=True, dropout=dropout_p)

        self.output_size = vocab_size
        self.max_length = max_len
        self.use_attention = use_attention
        self.eos_id = eos_id
        self.sos_id = sos_id

        self.init_input = None

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        if use_attention:
            self.attention = Attention(self.hidden_size)

        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward_step(self, input_var, hidden, encoder_outputs, function):
        batch_size = input_var.size(0)
        output_size = input_var.size(1)
        embedded = self.embedding(input_var)
        embedded = self.input_dropout(embedded)

        if self.training:
            self.rnn.flatten_parameters()

        output, hidden = self.rnn(embedded, hidden)

        attn = None
        if self.use_attention:
            output, attn = self.attention(output, encoder_outputs)

        predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size)), dim=1).view(batch_size, output_size, -1)
        return predicted_softmax, hidden, attn

    def forward(self, inputs=None, encoder_hidden=None, encoder_outputs=None,
                    function=F.log_softmax, teacher_forcing_ratio=0):
        ret_dict = dict()
        if self.use_attention:
            ret_dict[DecoderRNN.KEY_ATTN_SCORE] = list()

        inputs, batch_size, max_length = self._validate_args(inputs, encoder_hidden, encoder_outputs,
                                                             function, teacher_forcing_ratio)
        decoder_hidden = self._init_state(encoder_hidden)

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

        decoder_outputs = []
        sequence_symbols = []
        lengths = np.array([max_length] * batch_size)

        def decode(step, step_output, step_attn):
            decoder_outputs.append(step_output)
            if self.use_attention:
                ret_dict[DecoderRNN.KEY_ATTN_SCORE].append(step_attn)
            symbols = decoder_outputs[-1].topk(1)[1]
            sequence_symbols.append(symbols)

            eos_batches = symbols.data.eq(self.eos_id)
            if eos_batches.dim() > 0:
                eos_batches = eos_batches.cpu().view(-1).numpy()
                update_idx = ((lengths > step) & eos_batches) != 0
                lengths[update_idx] = len(sequence_symbols)
            return symbols

        # Manual unrolling is used to support random teacher forcing.
        # If teacher_forcing_ratio is True or False instead of a probability, the unrolling can be done in graph
        if use_teacher_forcing:
            decoder_input = inputs[:, :-1]
            decoder_output, decoder_hidden, attn = self.forward_step(decoder_input, decoder_hidden, encoder_outputs,
                                                                     function=function)

            for di in range(decoder_output.size(1)):
                step_output = decoder_output[:, di, :]
                if attn is not None:
                    step_attn = attn[:, di, :]
                else:
                    step_attn = None
                decode(di, step_output, step_attn)
        else:
            decoder_input = inputs[:, 0].unsqueeze(1)
            for di in range(max_length):
                decoder_output, decoder_hidden, step_attn = self.forward_step(decoder_input, decoder_hidden, encoder_outputs,
                                                                         function=function)
                step_output = decoder_output.squeeze(1)
                symbols = decode(di, step_output, step_attn)
                decoder_input = symbols

        ret_dict[DecoderRNN.KEY_SEQUENCE] = sequence_symbols
        ret_dict[DecoderRNN.KEY_LENGTH] = lengths.tolist()

        return decoder_outputs, decoder_hidden, ret_dict

    def _init_state(self, encoder_hidden):
        """ Initialize the encoder hidden state. """
        if encoder_hidden is None:
            return None
        if isinstance(encoder_hidden, tuple):
            encoder_hidden = tuple([self._cat_directions(h) for h in encoder_hidden])
        else:
            encoder_hidden = self._cat_directions(encoder_hidden)
        return encoder_hidden

    def _cat_directions(self, h):
        """ If the encoder is bidirectional, do the following transformation.
            (#directions * #layers, #batch, hidden_size) -> (#layers, #batch, #directions * hidden_size)
        """
        if self.bidirectional_encoder:
            h = torch.cat([h[0:h.size(0):2], h[1:h.size(0):2]], 2)
        return h

    def _validate_args(self, inputs, encoder_hidden, encoder_outputs, function, teacher_forcing_ratio):
        if self.use_attention:
            if encoder_outputs is None:
                raise ValueError("Argument encoder_outputs cannot be None when attention is used.")

        # inference batch size
        if inputs is None and encoder_hidden is None:
            batch_size = 1
        else:
            if inputs is not None:
                batch_size = inputs.size(0)
            else:
                if self.rnn_cell is nn.LSTM:
                    batch_size = encoder_hidden[0].size(1)
                elif self.rnn_cell is nn.GRU:
                    batch_size = encoder_hidden.size(1)

        # set default input and max decoding length
        if inputs is None:
            if teacher_forcing_ratio > 0:
                raise ValueError("Teacher forcing has to be disabled (set 0) when no inputs is provided.")
            inputs = torch.LongTensor([self.sos_id] * batch_size).view(batch_size, 1)
            if torch.cuda.is_available():
                inputs = inputs.cuda()
            max_length = self.max_length
        else:
            max_length = inputs.size(1) - 1 # minus the start of sequence symbol

        return inputs, batch_size, max_length


In [12]:
class Attention(nn.Module):
    r"""
    Applies an attention mechanism on the output features from the decoder.

    .. math::
            \begin{array}{ll}
            x = context*output \\
            attn = exp(x_i) / sum_j exp(x_j) \\
            output = \tanh(w * (attn * context) + b * output)
            \end{array}

    Args:
        dim(int): The number of expected features in the output

    Inputs: output, context
        - **output** (batch, output_len, dimensions): tensor containing the output features from the decoder.
        - **context** (batch, input_len, dimensions): tensor containing features of the encoded input sequence.

    Outputs: output, attn
        - **output** (batch, output_len, dimensions): tensor containing the attended output features from the decoder.
        - **attn** (batch, output_len, input_len): tensor containing attention weights.

    Attributes:
        linear_out (torch.nn.Linear): applies a linear transformation to the incoming data: :math:`y = Ax + b`.
        mask (torch.Tensor, optional): applies a :math:`-inf` to the indices specified in the `Tensor`.

    Examples::

         >>> attention = seq2seq.models.Attention(256)
         >>> context = Variable(torch.randn(5, 3, 256))
         >>> output = Variable(torch.randn(5, 5, 256))
         >>> output, attn = attention(output, context)

    """
    def __init__(self, dim):
        super(Attention, self).__init__()
        self.linear_out = nn.Linear(dim*2, dim)
        self.mask = None

    def set_mask(self, mask):
        """
        Sets indices to be masked

        Args:
            mask (torch.Tensor): tensor containing indices to be masked
        """
        self.mask = mask

    def forward(self, output, context):
        batch_size = output.size(0)
        hidden_size = output.size(2)
        input_size = context.size(1)
        # (batch, out_len, dim) * (batch, in_len, dim) -> (batch, out_len, in_len)
        attn = torch.bmm(output, context.transpose(1, 2))
        if self.mask is not None:
            attn.data.masked_fill_(self.mask, -float('inf'))
        attn = F.softmax(attn.view(-1, input_size), dim=1).view(batch_size, -1, input_size)

        # (batch, out_len, in_len) * (batch, in_len, dim) -> (batch, out_len, dim)
        mix = torch.bmm(attn, context)

        # concat -> (batch, out_len, 2*dim)
        combined = torch.cat((mix, output), dim=2)
        # output -> (batch, out_len, dim)
        output = torch.tanh(self.linear_out(combined.view(-1, 2 * hidden_size))).view(batch_size, -1, hidden_size)

        return output, attn

In [13]:
class Seq2seq(nn.Module):
    """ Standard sequence-to-sequence architecture with configurable encoder
    and decoder.

    Args:
        encoder (EncoderRNN): object of EncoderRNN
        decoder (DecoderRNN): object of DecoderRNN
        decode_function (func, optional): function to generate symbols from output hidden states (default: F.log_softmax)

    Inputs: input_variable, input_lengths, target_variable, teacher_forcing_ratio
        - **input_variable** (list, option): list of sequences, whose length is the batch size and within which
          each sequence is a list of token IDs. This information is forwarded to the encoder.
        - **input_lengths** (list of int, optional): A list that contains the lengths of sequences
            in the mini-batch, it must be provided when using variable length RNN (default: `None`)
        - **target_variable** (list, optional): list of sequences, whose length is the batch size and within which
          each sequence is a list of token IDs. This information is forwarded to the decoder.
        - **teacher_forcing_ratio** (int, optional): The probability that teacher forcing will be used. A random number
          is drawn uniformly from 0-1 for every decoding token, and if the sample is smaller than the given value,
          teacher forcing would be used (default is 0)

    Outputs: decoder_outputs, decoder_hidden, ret_dict
        - **decoder_outputs** (batch): batch-length list of tensors with size (max_length, hidden_size) containing the
          outputs of the decoder.
        - **decoder_hidden** (num_layers * num_directions, batch, hidden_size): tensor containing the last hidden
          state of the decoder.
        - **ret_dict**: dictionary containing additional information as follows {*KEY_LENGTH* : list of integers
          representing lengths of output sequences, *KEY_SEQUENCE* : list of sequences, where each sequence is a list of
          predicted token IDs, *KEY_INPUT* : target outputs if provided for decoding, *KEY_ATTN_SCORE* : list of
          sequences, where each list is of attention weights }.

    """

    def __init__(self, encoder, decoder, decode_function=F.log_softmax):
        super(Seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.decode_function = decode_function

    def flatten_parameters(self):
        self.encoder.rnn.flatten_parameters()
        self.decoder.rnn.flatten_parameters()

    def forward(self, input_variable, input_lengths=None, target_variable=None,
                teacher_forcing_ratio=0):

        encoder_outputs, encoder_hidden = self.encoder(input_variable, input_lengths)

        result = self.decoder(inputs=target_variable,
                              encoder_hidden=encoder_hidden,
                              encoder_outputs=encoder_outputs,
                              function=self.decode_function,
                              teacher_forcing_ratio=teacher_forcing_ratio)
        return result[0]

In [17]:
feature_size = 40
dropout = 0.2
layer_size = 3
hidden_size = 512
max_len = 80

enc = EncoderRNN(feature_size, hidden_size,
                     input_dropout_p=dropout, dropout_p=dropout,
                     n_layers=layer_size, bidirectional=True, rnn_cell='gru', variable_lengths=False)

dec = DecoderRNN(len(CLASSES), max_len, hidden_size * (2 if True else 1), None, None,
                     n_layers=layer_size, rnn_cell='gru', bidirectional=True,
                     input_dropout_p=dropout, dropout_p=dropout, use_attention=True)

model = Seq2seq(enc, dec)

In [19]:
#model = ResNet(BasicBlock, [2, 2, 2], num_classes=len(CLASSES), in_channels=1)
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

34261292

In [7]:
class Arg():
    def __init__(self):
        self.train_dataset="/home/cilab/LabMembers/DJ/sr_dataset/speech_command/train"
        self.valid_dataset="/home/cilab/LabMembers/DJ/sr_dataset/speech_command/valid"
        self.background_noise="/home/cilab/LabMembers/DJ/sr_dataset/speech_command/train/_background_noise_"
        self.comment=""
        self.batch_size=64
        self.dataload_workers_nums=6
        self.weight_decay=1e-2
        self.optim='sgd'
        self.learning_rate=0.01
        self.lr_scheduler='plateau'
        self.lr_scheduler_patience=5
        self.lr_scheduler_step_size=50
        self.lr_scheduler_gamma=0.1
        self.max_epochs=70
        self.resume=None
        self.model="resnet18"
        self.input="mel40"
        self.mixup=True
args = Arg()

In [13]:
use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    torch.backends.cudnn.benchmark = True

n_mels = 32
if args.input == 'mel40':
    n_mels = 40

data_aug_transform = Compose([ChangeAmplitude(), ChangeSpeedAndPitchAudio(), FixAudioLength(), ToSTFT(), StretchAudioOnSTFT(), TimeshiftAudioOnSTFT(), FixSTFTDimension()])
bg_dataset = BackgroundNoiseDataset(args.background_noise, data_aug_transform)
add_bg_noise = AddBackgroundNoiseOnSTFT(bg_dataset)
train_feature_transform = Compose([ToMelSpectrogramFromSTFT(n_mels=n_mels), DeleteSTFT(), ToTensor('mel_spectrogram', 'input')])
train_dataset = SpeechCommandsDataset(args.train_dataset,
                                Compose([LoadAudio(),
                                         data_aug_transform,
                                         add_bg_noise,
                                         train_feature_transform]))

valid_feature_transform = Compose([ToMelSpectrogram(n_mels=n_mels), ToTensor('mel_spectrogram', 'input')])
valid_dataset = SpeechCommandsDataset(args.valid_dataset,
                                Compose([LoadAudio(),
                                         FixAudioLength(),
                                         valid_feature_transform]))

weights = train_dataset.make_weights_for_balanced_classes()
sampler = WeightedRandomSampler(weights, len(weights))
train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, sampler=sampler,
                              pin_memory=use_gpu, num_workers=args.dataload_workers_nums)
valid_dataloader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False,
                              pin_memory=use_gpu, num_workers=args.dataload_workers_nums)

use_gpu True


In [14]:
# a name used to save checkpoints etc.
full_name = '%s_%s_%s_bs%d_lr%.1e_wd%.1e' % (args.model, args.optim, args.lr_scheduler, args.batch_size, args.learning_rate, args.weight_decay)
if args.comment:
    full_name = '%s_%s' % (full_name, args.comment)

#model = models.create_model(model_name=args.model, num_classes=len(CLASSES), in_channels=1)
model = ResNet(BasicBlock, [2, 2, 2], num_classes=len(CLASSES), in_channels=1)
print(model)
if use_gpu:
    model = torch.nn.DataParallel(model).cuda()

criterion = torch.nn.CrossEntropyLoss()

if args.optim == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=args.weight_decay)
else:
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

start_timestamp = int(time.time()*1000)
start_epoch = 0
best_accuracy = 0
best_loss = 1e100
global_step = 0

if args.resume:
    print("resuming a checkpoint '%s'" % args.resume)
    checkpoint = torch.load(args.resume)
    model.load_state_dict(checkpoint['state_dict'])
    model.float()
    optimizer.load_state_dict(checkpoint['optimizer'])

    best_accuracy = checkpoint.get('accuracy', best_accuracy)
    best_loss = checkpoint.get('loss', best_loss)
    start_epoch = checkpoint.get('epoch', start_epoch)
    global_step = checkpoint.get('step', global_step)

    del checkpoint  # reduce memory

if args.lr_scheduler == 'plateau':
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=args.lr_scheduler_patience, factor=args.lr_scheduler_gamma)
else:
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_scheduler_step_size, gamma=args.lr_scheduler_gamma, last_epoch=start_epoch-1)

ResNet(
  (conv1): Conv2d(1, 12, kernel_size=(3, 3), stride=(1, 1), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=(4, 3), stride=1, padding=0, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(12, 45, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv2d(12, 45, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
   

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

212499

In [16]:
def get_lr():
    return optimizer.param_groups[0]['lr']

writer = SummaryWriter(comment=('_speech_commands_' + full_name))

def train(epoch):
    global global_step

    print("epoch %3d with lr=%.02e" % (epoch, get_lr()))
    phase = 'train'
    writer.add_scalar('%s/learning_rate' % phase,  get_lr(), epoch)

    model.train()  # Set model to training mode

    running_loss = 0.0
    it = 0
    correct = 0
    total = 0

    pbar = tqdm(train_dataloader, unit="audios", unit_scale=train_dataloader.batch_size)
    for batch in pbar:
        inputs = batch['input']
        inputs = torch.unsqueeze(inputs, 1)
        targets = batch['target']

        if args.mixup:
            inputs, targets = mixup(inputs, targets, num_classes=len(CLASSES))

        inputs = Variable(inputs, requires_grad=True)
        targets = Variable(targets, requires_grad=False)
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda(async=True)

        # forward/backward
        outputs = model(inputs)
        if args.mixup:
            loss = mixup_cross_entropy_loss(outputs, targets)
        else:
            loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # statistics
        it += 1
        global_step += 1
        running_loss += loss.item()
        pred = outputs.data.max(1, keepdim=True)[1]
        if args.mixup:
            targets = batch['target']
            targets = Variable(targets, requires_grad=False).cuda(async=True)
        correct += pred.eq(targets.data.view_as(pred)).sum()
        total += targets.size(0)

        writer.add_scalar('%s/loss' % phase, loss.item(), global_step)

        # update the progress bar
        pbar.set_postfix({
            'loss': "%.05f" % (running_loss / it),
            'acc': "%.02f%%" % (100*correct/total)
        })

    accuracy = correct/total
    epoch_loss = running_loss / it
    writer.add_scalar('%s/accuracy' % phase, 100*accuracy, epoch)
    writer.add_scalar('%s/epoch_loss' % phase, epoch_loss, epoch)

In [17]:
def valid(epoch):
    global best_accuracy, best_loss, global_step

    phase = 'valid'
    model.eval()  # Set model to evaluate mode

    running_loss = 0.0
    it = 0
    correct = 0
    total = 0

    pbar = tqdm(valid_dataloader, unit="audios", unit_scale=valid_dataloader.batch_size)
    for batch in pbar:
        inputs = batch['input']
        inputs = torch.unsqueeze(inputs, 1)
        targets = batch['target']

        inputs = Variable(inputs, volatile = True)
        targets = Variable(targets, requires_grad=False)

        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda(async=True)

        # forward
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # statistics
        it += 1
        global_step += 1
        running_loss += loss.item()
        pred = outputs.data.max(1, keepdim=True)[1]
        correct += pred.eq(targets.data.view_as(pred)).sum()
        total += targets.size(0)

        writer.add_scalar('%s/loss' % phase, loss.item(), global_step)

        # update the progress bar
        pbar.set_postfix({
            'loss': "%.05f" % (running_loss / it),
            'acc': "%.02f%%" % (100*correct/total)
        })

    accuracy = 100*correct/total
    epoch_loss = running_loss / it
    writer.add_scalar('%s/accuracy' % phase, accuracy, epoch)
    writer.add_scalar('%s/epoch_loss' % phase, epoch_loss, epoch)
    checkpoint = {
        'epoch': epoch,
        'step': global_step,
        'state_dict': model.state_dict(),
        'loss': epoch_loss,
        'accuracy': accuracy,
        'optimizer' : optimizer.state_dict(),
    }
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(checkpoint, 'checkpoints/best-acc-resnet18-%s.pth' % full_name)
        torch.save(model, '%d-%s-best-loss.pth' % (start_timestamp, full_name))
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(checkpoint, 'checkpoints/best-loss-resnet18-%s.pth' % full_name)
        torch.save(model, '%d-%s-best-acc.pth' % (start_timestamp, full_name))
    torch.save(model, './res18.pth')
    #torch.save(checkpoint, 'checkpoints/Resnet18.pth')
    del checkpoint  # reduce memory

    return epoch_loss

In [18]:
print("training %s for Google speech commands..." % args.model)
since = time.time()
for epoch in range(start_epoch, args.max_epochs):
    if args.lr_scheduler == 'step':
        lr_scheduler.step()

    train(epoch)
    epoch_loss = valid(epoch)

    if args.lr_scheduler == 'plateau':
        print(type(epoch_loss))
        lr_scheduler.step(metrics=epoch_loss)

    time_elapsed = time.time() - since
    time_str = 'total time elapsed: {:.0f}h {:.0f}m {:.0f}s '.format(time_elapsed // 3600, time_elapsed % 3600 // 60, time_elapsed % 60)
    print("%s, best accuracy: %.02f%%, best loss %f" % (time_str, best_accuracy, best_loss))
    with open('./train_res18.log', 'a+') as f:
        f.write("%s, epoch: %s, best accuracy: %.02f%%, best loss %f\n" % (time_str, epoch,best_accuracy, best_loss))
print("finished")

  0%|          | 0/56256 [00:00<?, ?audios/s]

training resnet18 for Google speech commands...
epoch   0 with lr=1.00e-02


100%|██████████| 56256/56256 [02:52<00:00, 326.35audios/s, loss=2.04352, acc=27.00%]
100%|██████████| 7488/7488 [00:06<00:00, 1162.87audios/s, loss=1.01833, acc=63.00%]
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 59s , best accuracy: 63.00%, best loss 1.018326
epoch   1 with lr=1.00e-02


100%|██████████| 56256/56256 [02:36<00:00, 359.45audios/s, loss=1.65780, acc=41.00%]
100%|██████████| 7488/7488 [00:06<00:00, 1152.37audios/s, loss=0.75405, acc=78.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 42s , best accuracy: 78.00%, best loss 0.754048
epoch   2 with lr=1.00e-02


100%|██████████| 56256/56256 [02:34<00:00, 363.99audios/s, loss=1.59521, acc=42.00%]
100%|██████████| 7488/7488 [00:06<00:00, 1072.13audios/s, loss=1.39435, acc=55.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 8m 24s , best accuracy: 78.00%, best loss 0.754048
epoch   3 with lr=1.00e-02


100%|██████████| 56256/56256 [02:36<00:00, 359.81audios/s, loss=1.57956, acc=43.00%]
100%|██████████| 7488/7488 [00:06<00:00, 1157.94audios/s, loss=1.13632, acc=65.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 11m 6s , best accuracy: 78.00%, best loss 0.754048
epoch   4 with lr=1.00e-02


100%|██████████| 56256/56256 [02:37<00:00, 357.48audios/s, loss=1.55939, acc=43.00%]
100%|██████████| 7488/7488 [00:06<00:00, 1086.84audios/s, loss=1.65950, acc=46.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 13m 51s , best accuracy: 78.00%, best loss 0.754048
epoch   5 with lr=1.00e-02


100%|██████████| 56256/56256 [02:37<00:00, 357.76audios/s, loss=1.55636, acc=44.00%]
100%|██████████| 7488/7488 [00:06<00:00, 1158.50audios/s, loss=0.69755, acc=82.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 16m 34s , best accuracy: 82.00%, best loss 0.697550
epoch   6 with lr=1.00e-02


100%|██████████| 56256/56256 [02:37<00:00, 356.97audios/s, loss=1.53871, acc=44.00%]
100%|██████████| 7488/7488 [00:06<00:00, 1193.29audios/s, loss=1.43511, acc=50.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 19m 18s , best accuracy: 82.00%, best loss 0.697550
epoch   7 with lr=1.00e-02


100%|██████████| 56256/56256 [02:36<00:00, 358.74audios/s, loss=1.53517, acc=44.00%]
100%|██████████| 7488/7488 [00:06<00:00, 1190.53audios/s, loss=0.92779, acc=70.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 22m 1s , best accuracy: 82.00%, best loss 0.697550
epoch   8 with lr=1.00e-02


100%|██████████| 56256/56256 [02:39<00:00, 353.00audios/s, loss=1.53722, acc=44.00%]
100%|██████████| 7488/7488 [00:06<00:00, 1096.28audios/s, loss=0.75551, acc=79.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 24m 48s , best accuracy: 82.00%, best loss 0.697550
epoch   9 with lr=1.00e-02


100%|██████████| 56256/56256 [01:30<00:00, 623.15audios/s, loss=1.53174, acc=44.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1725.71audios/s, loss=0.53771, acc=86.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 26m 22s , best accuracy: 86.00%, best loss 0.537706
epoch  10 with lr=1.00e-02


100%|██████████| 56256/56256 [01:25<00:00, 657.49audios/s, loss=1.52925, acc=44.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1723.87audios/s, loss=0.76026, acc=79.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 27m 52s , best accuracy: 86.00%, best loss 0.537706
epoch  11 with lr=1.00e-02


100%|██████████| 56256/56256 [01:58<00:00, 476.40audios/s, loss=1.52635, acc=45.00%]
100%|██████████| 7488/7488 [00:05<00:00, 1419.59audios/s, loss=0.93481, acc=69.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 29m 56s , best accuracy: 86.00%, best loss 0.537706
epoch  12 with lr=1.00e-02


100%|██████████| 56256/56256 [01:23<00:00, 676.10audios/s, loss=1.51975, acc=45.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1670.57audios/s, loss=0.69655, acc=77.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 31m 23s , best accuracy: 86.00%, best loss 0.537706
epoch  13 with lr=1.00e-02


100%|██████████| 56256/56256 [01:24<00:00, 669.41audios/s, loss=1.52349, acc=44.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1654.11audios/s, loss=0.66903, acc=80.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 32m 52s , best accuracy: 86.00%, best loss 0.537706
epoch  14 with lr=1.00e-02


100%|██████████| 56256/56256 [01:24<00:00, 665.46audios/s, loss=1.53275, acc=44.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1743.83audios/s, loss=1.31305, acc=56.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 34m 21s , best accuracy: 86.00%, best loss 0.537706
epoch  15 with lr=1.00e-02


100%|██████████| 56256/56256 [01:25<00:00, 661.60audios/s, loss=1.52363, acc=44.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1662.25audios/s, loss=0.85620, acc=74.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 35m 50s , best accuracy: 86.00%, best loss 0.537706
epoch  16 with lr=1.00e-03


100%|██████████| 56256/56256 [01:24<00:00, 669.71audios/s, loss=1.38431, acc=48.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1778.32audios/s, loss=0.42115, acc=88.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 37m 19s , best accuracy: 88.00%, best loss 0.421149
epoch  17 with lr=1.00e-03


100%|██████████| 56256/56256 [01:25<00:00, 657.59audios/s, loss=1.34799, acc=48.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1721.70audios/s, loss=0.44888, acc=89.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 38m 49s , best accuracy: 89.00%, best loss 0.421149
epoch  18 with lr=1.00e-03


100%|██████████| 56256/56256 [01:23<00:00, 673.34audios/s, loss=1.33398, acc=48.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1754.66audios/s, loss=0.35310, acc=91.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 40m 17s , best accuracy: 91.00%, best loss 0.353100
epoch  19 with lr=1.00e-03


100%|██████████| 56256/56256 [01:23<00:00, 671.83audios/s, loss=1.32435, acc=49.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1689.68audios/s, loss=0.42009, acc=88.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 41m 45s , best accuracy: 91.00%, best loss 0.353100
epoch  20 with lr=1.00e-03


100%|██████████| 56256/56256 [01:22<00:00, 681.23audios/s, loss=1.31996, acc=49.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1691.64audios/s, loss=0.33120, acc=92.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 43m 12s , best accuracy: 92.00%, best loss 0.331199
epoch  21 with lr=1.00e-03


100%|██████████| 56256/56256 [01:25<00:00, 660.97audios/s, loss=1.30978, acc=49.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1728.62audios/s, loss=0.42631, acc=89.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 44m 41s , best accuracy: 92.00%, best loss 0.331199
epoch  22 with lr=1.00e-03


100%|██████████| 56256/56256 [01:25<00:00, 657.47audios/s, loss=1.32014, acc=48.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1754.15audios/s, loss=0.28769, acc=93.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 46m 11s , best accuracy: 93.00%, best loss 0.287688
epoch  23 with lr=1.00e-03


100%|██████████| 56256/56256 [01:23<00:00, 677.30audios/s, loss=1.31315, acc=49.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1702.63audios/s, loss=0.29181, acc=93.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 47m 39s , best accuracy: 93.00%, best loss 0.287688
epoch  24 with lr=1.00e-03


100%|██████████| 56256/56256 [01:23<00:00, 670.80audios/s, loss=1.30341, acc=49.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1509.50audios/s, loss=0.41337, acc=89.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 49m 8s , best accuracy: 93.00%, best loss 0.287688
epoch  25 with lr=1.00e-03


100%|██████████| 56256/56256 [01:23<00:00, 671.92audios/s, loss=1.30761, acc=49.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1742.51audios/s, loss=0.39451, acc=89.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 50m 36s , best accuracy: 93.00%, best loss 0.287688
epoch  26 with lr=1.00e-03


100%|██████████| 56256/56256 [01:22<00:00, 678.13audios/s, loss=1.29883, acc=49.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1661.24audios/s, loss=0.42636, acc=89.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 52m 3s , best accuracy: 93.00%, best loss 0.287688
epoch  27 with lr=1.00e-03


100%|██████████| 56256/56256 [01:23<00:00, 675.65audios/s, loss=1.30011, acc=49.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1684.49audios/s, loss=0.35392, acc=91.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 53m 31s , best accuracy: 93.00%, best loss 0.287688
epoch  28 with lr=1.00e-03


100%|██████████| 56256/56256 [01:22<00:00, 677.87audios/s, loss=1.29159, acc=49.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1681.94audios/s, loss=0.35021, acc=92.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 54m 58s , best accuracy: 93.00%, best loss 0.287688
epoch  29 with lr=1.00e-04


100%|██████████| 56256/56256 [01:24<00:00, 663.99audios/s, loss=1.24621, acc=49.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1661.10audios/s, loss=0.28853, acc=93.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 56m 28s , best accuracy: 93.00%, best loss 0.287688
epoch  30 with lr=1.00e-04


100%|██████████| 56256/56256 [01:25<00:00, 659.88audios/s, loss=1.22335, acc=50.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1759.50audios/s, loss=0.27962, acc=93.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 57m 57s , best accuracy: 93.00%, best loss 0.279624
epoch  31 with lr=1.00e-04


100%|██████████| 56256/56256 [01:25<00:00, 656.24audios/s, loss=1.21779, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1775.35audios/s, loss=0.27557, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 59m 27s , best accuracy: 94.00%, best loss 0.275569
epoch  32 with lr=1.00e-04


100%|██████████| 56256/56256 [01:22<00:00, 678.97audios/s, loss=1.20928, acc=50.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1708.50audios/s, loss=0.26231, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 0m 54s , best accuracy: 94.00%, best loss 0.262310
epoch  33 with lr=1.00e-04


100%|██████████| 56256/56256 [01:22<00:00, 679.85audios/s, loss=1.20756, acc=50.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1700.37audios/s, loss=0.31575, acc=93.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 2m 22s , best accuracy: 94.00%, best loss 0.262310
epoch  34 with lr=1.00e-04


100%|██████████| 56256/56256 [01:24<00:00, 665.27audios/s, loss=1.20463, acc=50.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1744.01audios/s, loss=0.27235, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 3m 50s , best accuracy: 94.00%, best loss 0.262310
epoch  35 with lr=1.00e-04


100%|██████████| 56256/56256 [01:23<00:00, 674.67audios/s, loss=1.19860, acc=50.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1789.46audios/s, loss=0.26294, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 5m 18s , best accuracy: 94.00%, best loss 0.262310
epoch  36 with lr=1.00e-04


100%|██████████| 56256/56256 [01:23<00:00, 672.61audios/s, loss=1.19890, acc=50.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1693.79audios/s, loss=0.28673, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 6m 46s , best accuracy: 94.00%, best loss 0.262310
epoch  37 with lr=1.00e-04


100%|██████████| 56256/56256 [01:22<00:00, 679.43audios/s, loss=1.19683, acc=50.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1690.88audios/s, loss=0.27033, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 8m 13s , best accuracy: 94.00%, best loss 0.262310
epoch  38 with lr=1.00e-04


100%|██████████| 56256/56256 [01:22<00:00, 678.73audios/s, loss=1.19191, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1665.72audios/s, loss=0.27086, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 9m 41s , best accuracy: 94.00%, best loss 0.262310
epoch  39 with lr=1.00e-05


100%|██████████| 56256/56256 [01:22<00:00, 682.77audios/s, loss=1.18095, acc=50.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1666.54audios/s, loss=0.26159, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 11m 8s , best accuracy: 94.00%, best loss 0.261585
epoch  40 with lr=1.00e-05


100%|██████████| 56256/56256 [01:23<00:00, 676.57audios/s, loss=1.17899, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1552.16audios/s, loss=0.24389, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 12m 36s , best accuracy: 95.00%, best loss 0.243893
epoch  41 with lr=1.00e-05


100%|██████████| 56256/56256 [01:25<00:00, 654.15audios/s, loss=1.18075, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1778.43audios/s, loss=0.24863, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 14m 6s , best accuracy: 95.00%, best loss 0.243893
epoch  42 with lr=1.00e-05


100%|██████████| 56256/56256 [01:26<00:00, 650.18audios/s, loss=1.17917, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1720.21audios/s, loss=0.24400, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 15m 37s , best accuracy: 95.00%, best loss 0.243893
epoch  43 with lr=1.00e-05


100%|██████████| 56256/56256 [01:25<00:00, 657.01audios/s, loss=1.17453, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1699.16audios/s, loss=0.25274, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 17m 7s , best accuracy: 95.00%, best loss 0.243893
epoch  44 with lr=1.00e-05


100%|██████████| 56256/56256 [01:24<00:00, 668.97audios/s, loss=1.18000, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1704.17audios/s, loss=0.24929, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 18m 36s , best accuracy: 95.00%, best loss 0.243893
epoch  45 with lr=1.00e-05


100%|██████████| 56256/56256 [01:24<00:00, 668.15audios/s, loss=1.18097, acc=50.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1685.58audios/s, loss=0.26353, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 20m 4s , best accuracy: 95.00%, best loss 0.243893
epoch  46 with lr=1.00e-05


100%|██████████| 56256/56256 [01:24<00:00, 662.59audios/s, loss=1.17275, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1577.24audios/s, loss=0.24966, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 21m 34s , best accuracy: 95.00%, best loss 0.243893
epoch  47 with lr=1.00e-06


100%|██████████| 56256/56256 [01:22<00:00, 682.35audios/s, loss=1.17827, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1689.75audios/s, loss=0.24093, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 23m 1s , best accuracy: 95.00%, best loss 0.240934
epoch  48 with lr=1.00e-06


100%|██████████| 56256/56256 [01:23<00:00, 675.73audios/s, loss=1.17517, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1681.56audios/s, loss=0.25007, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 24m 29s , best accuracy: 95.00%, best loss 0.240934
epoch  49 with lr=1.00e-06


100%|██████████| 56256/56256 [01:24<00:00, 666.49audios/s, loss=1.17806, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1719.46audios/s, loss=0.25075, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 25m 57s , best accuracy: 95.00%, best loss 0.240934
epoch  50 with lr=1.00e-06


100%|██████████| 56256/56256 [01:22<00:00, 680.79audios/s, loss=1.18004, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1693.72audios/s, loss=0.24340, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 27m 24s , best accuracy: 95.00%, best loss 0.240934
epoch  51 with lr=1.00e-06


100%|██████████| 56256/56256 [01:23<00:00, 672.43audios/s, loss=1.17776, acc=50.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1758.23audios/s, loss=0.24216, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 28m 52s , best accuracy: 95.00%, best loss 0.240934
epoch  52 with lr=1.00e-06


100%|██████████| 56256/56256 [01:22<00:00, 678.22audios/s, loss=1.17721, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1729.80audios/s, loss=0.24149, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 30m 20s , best accuracy: 95.00%, best loss 0.240934
epoch  53 with lr=1.00e-06


100%|██████████| 56256/56256 [01:22<00:00, 678.68audios/s, loss=1.17313, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1750.33audios/s, loss=0.26964, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 31m 47s , best accuracy: 95.00%, best loss 0.240934
epoch  54 with lr=1.00e-07


100%|██████████| 56256/56256 [01:23<00:00, 674.34audios/s, loss=1.17191, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1746.02audios/s, loss=0.27740, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 33m 15s , best accuracy: 95.00%, best loss 0.240934
epoch  55 with lr=1.00e-07


100%|██████████| 56256/56256 [01:23<00:00, 674.36audios/s, loss=1.17679, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1709.41audios/s, loss=0.22443, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 34m 43s , best accuracy: 95.00%, best loss 0.224434
epoch  56 with lr=1.00e-07


100%|██████████| 56256/56256 [01:23<00:00, 677.37audios/s, loss=1.17594, acc=50.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1773.22audios/s, loss=0.26107, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 36m 10s , best accuracy: 95.00%, best loss 0.224434
epoch  57 with lr=1.00e-07


100%|██████████| 56256/56256 [01:23<00:00, 676.15audios/s, loss=1.17682, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1663.60audios/s, loss=0.24672, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 37m 38s , best accuracy: 95.00%, best loss 0.224434
epoch  58 with lr=1.00e-07


100%|██████████| 56256/56256 [01:23<00:00, 676.49audios/s, loss=1.17091, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1691.92audios/s, loss=0.28404, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 39m 5s , best accuracy: 95.00%, best loss 0.224434
epoch  59 with lr=1.00e-07


100%|██████████| 56256/56256 [01:23<00:00, 676.43audios/s, loss=1.17327, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1756.19audios/s, loss=0.23221, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 40m 33s , best accuracy: 95.00%, best loss 0.224434
epoch  60 with lr=1.00e-07


100%|██████████| 56256/56256 [01:22<00:00, 679.43audios/s, loss=1.17898, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1829.75audios/s, loss=0.25639, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 41m 60s , best accuracy: 95.00%, best loss 0.224434
epoch  61 with lr=1.00e-07


100%|██████████| 56256/56256 [01:23<00:00, 674.97audios/s, loss=1.17577, acc=50.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1681.50audios/s, loss=0.25425, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 43m 27s , best accuracy: 95.00%, best loss 0.224434
epoch  62 with lr=1.00e-08


100%|██████████| 56256/56256 [01:22<00:00, 682.31audios/s, loss=1.17673, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1732.33audios/s, loss=0.25892, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 44m 54s , best accuracy: 95.00%, best loss 0.224434
epoch  63 with lr=1.00e-08


100%|██████████| 56256/56256 [01:22<00:00, 680.41audios/s, loss=1.17576, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1816.91audios/s, loss=0.24432, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 46m 21s , best accuracy: 95.00%, best loss 0.224434
epoch  64 with lr=1.00e-08


100%|██████████| 56256/56256 [01:22<00:00, 680.11audios/s, loss=1.17981, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1744.37audios/s, loss=0.24525, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 47m 48s , best accuracy: 95.00%, best loss 0.224434
epoch  65 with lr=1.00e-08


100%|██████████| 56256/56256 [01:22<00:00, 678.25audios/s, loss=1.17803, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1730.41audios/s, loss=0.25848, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 49m 15s , best accuracy: 95.00%, best loss 0.224434
epoch  66 with lr=1.00e-08


100%|██████████| 56256/56256 [01:23<00:00, 676.48audios/s, loss=1.17477, acc=50.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1667.75audios/s, loss=0.23957, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 50m 43s , best accuracy: 95.00%, best loss 0.224434
epoch  67 with lr=1.00e-08


100%|██████████| 56256/56256 [01:22<00:00, 680.39audios/s, loss=1.17670, acc=50.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1694.78audios/s, loss=0.25098, acc=94.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 52m 10s , best accuracy: 95.00%, best loss 0.224434
epoch  68 with lr=1.00e-08


100%|██████████| 56256/56256 [01:23<00:00, 676.65audios/s, loss=1.17717, acc=51.00%] 
100%|██████████| 7488/7488 [00:04<00:00, 1751.32audios/s, loss=0.24909, acc=95.00%]
  0%|          | 0/56256 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 1h 53m 38s , best accuracy: 95.00%, best loss 0.224434
epoch  69 with lr=1.00e-08


100%|██████████| 56256/56256 [01:22<00:00, 683.59audios/s, loss=1.17757, acc=51.00%]
100%|██████████| 7488/7488 [00:04<00:00, 1664.34audios/s, loss=0.23782, acc=95.00%]


<class 'float'>
total time elapsed: 1h 55m 5s , best accuracy: 95.00%, best loss 0.224434
finished
