In [1]:
#!/usr/bin/env python
"""Train a CNN for Google speech commands."""

__author__ = 'Yuan Xu, Erdene-Ochir Tuguldur'

import argparse
import time

from tqdm import *

import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.sampler import WeightedRandomSampler

import torchvision
from torchvision.transforms import *

from tensorboardX import SummaryWriter

import models
from datasets import *
from transforms import *
from mixup import *

In [2]:
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo

In [3]:
class VGG(nn.Module):

    def __init__(self, features, num_classes=1000, init_weights=True):
        super(VGG, self).__init__()
        self.features = features
        self.classifier = nn.Sequential(
            nn.Linear(512 * 1 * 1, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()


def make_layers(cfg, batch_norm=False, in_channels = 3):
    layers = []
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2, padding=1)]
        elif v == 'L':
            layers += [nn.AdaptiveMaxPool2d((1,1))]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=3)
            if batch_norm:
                layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)


cfg = {
    'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'L'],
}


In [4]:
model = VGG(make_layers(cfg['E'], batch_norm=True, in_channels=1), num_classes=len(CLASSES))
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

38982352

In [5]:
class Arg():
    def __init__(self):
        self.train_dataset="/home/cilab/LabMembers/YS/Speech/health1/experiments/train"
        self.valid_dataset="/home/cilab/LabMembers/YS/Speech/health1/experiments/valid"
        self.background_noise="/home/cilab/LabMembers/YS/Speech/_background_noise_"
        self.comment=""
        self.batch_size=64
        self.dataload_workers_nums=6
        self.weight_decay=1e-2
        self.optim='sgd'
        self.learning_rate=0.01
        self.lr_scheduler='plateau'
        self.lr_scheduler_patience=5
        self.lr_scheduler_step_size=50
        self.lr_scheduler_gamma=0.1
        self.max_epochs=70
        self.resume=None
        self.model="vgg19_bn-nomixup"
        self.input="mel40"
        self.mixup=False
args = Arg()

In [6]:
use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    torch.backends.cudnn.benchmark = True

n_mels = 32
if args.input == 'mel40':
    n_mels = 40

data_aug_transform = Compose([ChangeAmplitude(), ChangeSpeedAndPitchAudio(), FixAudioLength()])
bg_dataset = BackgroundNoiseDataset(args.background_noise, data_aug_transform)
add_bg_noise = AddBackgroundNoiseOnSTFT(bg_dataset)
train_feature_transform = Compose([ToMfcc(n_mels=n_mels), ToTensor('mfcc', 'input')])
train_dataset = SpeechCommandsDataset(args.train_dataset,
                                Compose([LoadAudio(),
                                         data_aug_transform,
                                         train_feature_transform]), silence_percentage=0)

valid_feature_transform = Compose([ToMfcc(n_mels=n_mels), ToTensor('mfcc', 'input')])
valid_dataset = SpeechCommandsDataset(args.valid_dataset,
                                Compose([LoadAudio(),
                                         data_aug_transform,
                                         valid_feature_transform]), silence_percentage=0)

weights = train_dataset.make_weights_for_balanced_classes()
sampler = WeightedRandomSampler(weights, len(weights))
train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, sampler=sampler,
                              pin_memory=use_gpu, num_workers=args.dataload_workers_nums)
valid_dataloader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False,
                              pin_memory=use_gpu, num_workers=args.dataload_workers_nums)

use_gpu True
all :  ['갑자기', '마그네슘', '진통제', '타이레놀', '바이러스', '내시경', '비타민', '고혈압', '단백질', '스트레스', '카페인', '다이어트', '부작용', '에너지', '아스피린']
all :  ['갑자기', '마그네슘', '진통제', '타이레놀', '바이러스', '내시경', '비타민', '고혈압', '단백질', '스트레스', '카페인', '다이어트', '부작용', '에너지', '아스피린']


  weight_per_class = N / count


In [7]:
# a name used to save checkpoints etc.
full_name = '%s_%s_%s_bs%d_lr%.1e_wd%.1e' % (args.model, args.optim, args.lr_scheduler, args.batch_size, args.learning_rate, args.weight_decay)
if args.comment:
    full_name = '%s_%s' % (full_name, args.comment)

model = VGG(make_layers(cfg['E'], batch_norm=True, in_channels=1), num_classes=len(CLASSES))
#model = ResNet(BasicBlock, [2, 2, 2], num_classes=len(CLASSES), in_channels=1)
print(model)
if use_gpu:
    model = torch.nn.DataParallel(model).cuda()

criterion = torch.nn.CrossEntropyLoss()

if args.optim == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=args.weight_decay)
else:
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

start_timestamp = int(time.time()*1000)
start_epoch = 0
best_accuracy = 0
best_loss = 1e100
global_step = 0

if args.resume:
    print("resuming a checkpoint '%s'" % args.resume)
    checkpoint = torch.load(args.resume)
    model.load_state_dict(checkpoint['state_dict'])
    model.float()
    optimizer.load_state_dict(checkpoint['optimizer'])

    best_accuracy = checkpoint.get('accuracy', best_accuracy)
    best_loss = checkpoint.get('loss', best_loss)
    start_epoch = checkpoint.get('epoch', start_epoch)
    global_step = checkpoint.get('step', global_step)

    del checkpoint  # reduce memory

if args.lr_scheduler == 'plateau':
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=args.lr_scheduler_patience, factor=args.lr_scheduler_gamma)
else:
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_scheduler_step_size, gamma=args.lr_scheduler_gamma, last_epoch=start_epoch-1)

VGG(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(3, 3))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(3, 3))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=1, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(3, 3))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(3, 3))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=1, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256

In [8]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

38982352

In [9]:
def get_lr():
    return optimizer.param_groups[0]['lr']

writer = SummaryWriter(comment=('_speech_commands_' + full_name))

def train(epoch):
    global global_step

    print("epoch %3d with lr=%.02e" % (epoch, get_lr()))
    phase = 'train'
    writer.add_scalar('%s/learning_rate' % phase,  get_lr(), epoch)

    model.train()  # Set model to training mode

    running_loss = 0.0
    it = 0
    correct = 0
    total = 0

    pbar = tqdm(train_dataloader, unit="audios", unit_scale=train_dataloader.batch_size)
    for batch in pbar:
        inputs = batch['input']
        inputs = torch.unsqueeze(inputs, 1)
        targets = batch['target']
        #print(inputs.shape)
        if args.mixup:
            inputs, targets = mixup(inputs, targets, num_classes=len(CLASSES))

        inputs = Variable(inputs, requires_grad=True)
        targets = Variable(targets, requires_grad=False)
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda(async=True)

        # forward/backward
        outputs = model(inputs)
        if args.mixup:
            loss = mixup_cross_entropy_loss(outputs, targets)
        else:
            loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # statistics
        it += 1
        global_step += 1
        running_loss += loss.item()
        pred = outputs.data.max(1, keepdim=True)[1]
        if args.mixup:
            targets = batch['target']
            targets = Variable(targets, requires_grad=False).cuda(async=True)
        correct += pred.eq(targets.data.view_as(pred)).sum()
        total += targets.size(0)

        writer.add_scalar('%s/loss' % phase, loss.item(), global_step)

        # update the progress bar
        pbar.set_postfix({
            'loss': "%.05f" % (running_loss / it),
            'acc': "%.02f%%" % (100*correct/total)
        })

    accuracy = correct/total
    epoch_loss = running_loss / it
    writer.add_scalar('%s/accuracy' % phase, 100*accuracy, epoch)
    writer.add_scalar('%s/epoch_loss' % phase, epoch_loss, epoch)

In [10]:
def valid(epoch):
    global best_accuracy, best_loss, global_step

    phase = 'valid'
    model.eval()  # Set model to evaluate mode

    running_loss = 0.0
    it = 0
    correct = 0
    total = 0

    pbar = tqdm(valid_dataloader, unit="audios", unit_scale=valid_dataloader.batch_size)
    for batch in pbar:
        inputs = batch['input']
        inputs = torch.unsqueeze(inputs, 1)
        targets = batch['target']

        inputs = Variable(inputs, volatile = True)
        targets = Variable(targets, requires_grad=False)

        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda(async=True)

        # forward
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # statistics
        it += 1
        global_step += 1
        running_loss += loss.item()
        pred = outputs.data.max(1, keepdim=True)[1]
        correct += pred.eq(targets.data.view_as(pred)).sum()
        total += targets.size(0)

        writer.add_scalar('%s/loss' % phase, loss.item(), global_step)

        # update the progress bar
        pbar.set_postfix({
            'loss': "%.05f" % (running_loss / it),
            'acc': "%.02f%%" % (100*correct/total)
        })

    accuracy = 100*correct/total
    epoch_loss = running_loss / it
    writer.add_scalar('%s/accuracy' % phase, accuracy, epoch)
    writer.add_scalar('%s/epoch_loss' % phase, epoch_loss, epoch)
    checkpoint = {
        'epoch': epoch,
        'step': global_step,
        'state_dict': model.state_dict(),
        'loss': epoch_loss,
        'accuracy': accuracy,
        'optimizer' : optimizer.state_dict(),
    }
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(checkpoint, 'checkpoints/best-acc-resnet18-%s.pth' % full_name)
        torch.save(model, '%d-%s-best-loss.pth' % (start_timestamp, full_name))
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(checkpoint, 'checkpoints/best-loss-resnet18-%s.pth' % full_name)
        torch.save(model, '%d-%s-best-acc.pth' % (start_timestamp, full_name))
    torch.save(model, './res18.pth')
    #torch.save(checkpoint, 'checkpoints/Resnet18.pth')
    del checkpoint  # reduce memory

    return epoch_loss

In [11]:
print("training %s for Google speech commands..." % args.model)
since = time.time()
for epoch in range(start_epoch, args.max_epochs):
    if args.lr_scheduler == 'step':
        lr_scheduler.step()

    train(epoch)
    epoch_loss = valid(epoch)

    if args.lr_scheduler == 'plateau':
        print(type(epoch_loss))
        lr_scheduler.step(metrics=epoch_loss)

    time_elapsed = time.time() - since
    time_str = 'total time elapsed: {:.0f}h {:.0f}m {:.0f}s '.format(time_elapsed // 3600, time_elapsed % 3600 // 60, time_elapsed % 60)
    print("%s, best accuracy: %.02f%%, best loss %f" % (time_str, best_accuracy, best_loss))
    with open('./train_res18.log', 'a+') as f:
        f.write("%s, epoch: %s, best accuracy: %.02f%%, best loss %f\n" % (time_str, epoch,best_accuracy, best_loss))
print("finished")

  0%|          | 0/448 [00:00<?, ?audios/s]

training vgg19_bn-nomixup for Google speech commands...
epoch   0 with lr=1.00e-02


100%|██████████| 448/448 [00:16<00:00, 26.38audios/s, loss=3.84304, acc=5.00%]
100%|██████████| 128/128 [00:01<00:00, 74.32audios/s, loss=3.05999, acc=5.00%]
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 0m 20s , best accuracy: 5.00%, best loss 3.059989
epoch   1 with lr=1.00e-02


100%|██████████| 448/448 [00:02<00:00, 149.82audios/s, loss=2.90005, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 89.53audios/s, loss=4.02304, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 0m 24s , best accuracy: 5.00%, best loss 3.059989
epoch   2 with lr=1.00e-02


100%|██████████| 448/448 [00:03<00:00, 122.56audios/s, loss=2.80043, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 87.55audios/s, loss=2.93063, acc=2.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 0m 31s , best accuracy: 5.00%, best loss 2.930629
epoch   3 with lr=1.00e-02


100%|██████████| 448/448 [00:02<00:00, 159.63audios/s, loss=2.76311, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 78.90audios/s, loss=3.23423, acc=10.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 0m 36s , best accuracy: 10.00%, best loss 2.930629
epoch   4 with lr=1.00e-02


100%|██████████| 448/448 [00:03<00:00, 123.88audios/s, loss=2.81264, acc=4.00%]
100%|██████████| 128/128 [00:01<00:00, 91.36audios/s, loss=2.95958, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 0m 42s , best accuracy: 10.00%, best loss 2.930629
epoch   5 with lr=1.00e-02


100%|██████████| 448/448 [00:03<00:00, 136.93audios/s, loss=2.75518, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 78.29audios/s, loss=2.75634, acc=11.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 0m 49s , best accuracy: 11.00%, best loss 2.756341
epoch   6 with lr=1.00e-02


100%|██████████| 448/448 [00:03<00:00, 124.31audios/s, loss=2.75221, acc=5.00%]
100%|██████████| 128/128 [00:01<00:00, 91.71audios/s, loss=2.66969, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 0m 55s , best accuracy: 11.00%, best loss 2.669687
epoch   7 with lr=1.00e-02


100%|██████████| 448/448 [00:02<00:00, 154.97audios/s, loss=2.74584, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 77.04audios/s, loss=2.67770, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 0m 60s , best accuracy: 11.00%, best loss 2.669687
epoch   8 with lr=1.00e-02


100%|██████████| 448/448 [00:03<00:00, 140.99audios/s, loss=2.72478, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 89.94audios/s, loss=2.66742, acc=2.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 6s , best accuracy: 11.00%, best loss 2.667422
epoch   9 with lr=1.00e-02


100%|██████████| 448/448 [00:03<00:00, 137.70audios/s, loss=2.74404, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 84.51audios/s, loss=2.77552, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 11s , best accuracy: 11.00%, best loss 2.667422
epoch  10 with lr=1.00e-02


100%|██████████| 448/448 [00:02<00:00, 156.07audios/s, loss=2.70951, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 82.12audios/s, loss=2.87136, acc=10.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 16s , best accuracy: 11.00%, best loss 2.667422
epoch  11 with lr=1.00e-02


100%|██████████| 448/448 [00:03<00:00, 141.11audios/s, loss=2.72246, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 91.81audios/s, loss=2.79223, acc=22.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 22s , best accuracy: 22.00%, best loss 2.667422
epoch  12 with lr=1.00e-02


100%|██████████| 448/448 [00:03<00:00, 138.14audios/s, loss=2.70862, acc=5.00%]
100%|██████████| 128/128 [00:01<00:00, 81.09audios/s, loss=2.75655, acc=14.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 27s , best accuracy: 22.00%, best loss 2.667422
epoch  13 with lr=1.00e-02


100%|██████████| 448/448 [00:02<00:00, 156.33audios/s, loss=2.70539, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 81.58audios/s, loss=2.73665, acc=22.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 32s , best accuracy: 22.00%, best loss 2.667422
epoch  14 with lr=1.00e-02


100%|██████████| 448/448 [00:03<00:00, 145.80audios/s, loss=2.70405, acc=5.00%]
100%|██████████| 128/128 [00:01<00:00, 91.04audios/s, loss=2.71294, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 37s , best accuracy: 22.00%, best loss 2.667422
epoch  15 with lr=1.00e-03


100%|██████████| 448/448 [00:03<00:00, 122.45audios/s, loss=2.70444, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 89.12audios/s, loss=2.72983, acc=10.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 42s , best accuracy: 22.00%, best loss 2.667422
epoch  16 with lr=1.00e-03


100%|██████████| 448/448 [00:03<00:00, 138.13audios/s, loss=2.69110, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 83.08audios/s, loss=2.74802, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 47s , best accuracy: 22.00%, best loss 2.667422
epoch  17 with lr=1.00e-03


100%|██████████| 448/448 [00:02<00:00, 155.34audios/s, loss=2.69707, acc=9.00%] 
100%|██████████| 128/128 [00:01<00:00, 82.76audios/s, loss=2.77510, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 52s , best accuracy: 22.00%, best loss 2.667422
epoch  18 with lr=1.00e-03


100%|██████████| 448/448 [00:03<00:00, 145.91audios/s, loss=2.69385, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 88.76audios/s, loss=2.77305, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 1m 57s , best accuracy: 22.00%, best loss 2.667422
epoch  19 with lr=1.00e-03


100%|██████████| 448/448 [00:03<00:00, 131.60audios/s, loss=2.69991, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 84.76audios/s, loss=2.75819, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 2s , best accuracy: 22.00%, best loss 2.667422
epoch  20 with lr=1.00e-03


100%|██████████| 448/448 [00:03<00:00, 126.19audios/s, loss=2.68398, acc=9.00%] 
100%|██████████| 128/128 [00:01<00:00, 90.06audios/s, loss=2.78326, acc=10.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 7s , best accuracy: 22.00%, best loss 2.667422
epoch  21 with lr=1.00e-04


100%|██████████| 448/448 [00:02<00:00, 150.84audios/s, loss=2.66526, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 80.65audios/s, loss=2.76596, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 12s , best accuracy: 22.00%, best loss 2.667422
epoch  22 with lr=1.00e-04


100%|██████████| 448/448 [00:02<00:00, 158.00audios/s, loss=2.68405, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 82.43audios/s, loss=2.77799, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 17s , best accuracy: 22.00%, best loss 2.667422
epoch  23 with lr=1.00e-04


100%|██████████| 448/448 [00:03<00:00, 137.34audios/s, loss=2.69492, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 91.09audios/s, loss=2.80520, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 21s , best accuracy: 22.00%, best loss 2.667422
epoch  24 with lr=1.00e-04


100%|██████████| 448/448 [00:03<00:00, 119.40audios/s, loss=2.67683, acc=9.00%]
100%|██████████| 128/128 [00:01<00:00, 89.86audios/s, loss=2.78619, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 27s , best accuracy: 22.00%, best loss 2.667422
epoch  25 with lr=1.00e-04


100%|██████████| 448/448 [00:03<00:00, 142.27audios/s, loss=2.70863, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 79.58audios/s, loss=2.78356, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 32s , best accuracy: 22.00%, best loss 2.667422
epoch  26 with lr=1.00e-04


100%|██████████| 448/448 [00:02<00:00, 157.11audios/s, loss=2.69228, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 82.19audios/s, loss=2.76088, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 37s , best accuracy: 22.00%, best loss 2.667422
epoch  27 with lr=1.00e-05


100%|██████████| 448/448 [00:03<00:00, 143.82audios/s, loss=2.67831, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 93.05audios/s, loss=2.79575, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 41s , best accuracy: 22.00%, best loss 2.667422
epoch  28 with lr=1.00e-05


100%|██████████| 448/448 [00:03<00:00, 123.32audios/s, loss=2.67655, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 93.24audios/s, loss=2.76400, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 47s , best accuracy: 22.00%, best loss 2.667422
epoch  29 with lr=1.00e-05


100%|██████████| 448/448 [00:03<00:00, 139.39audios/s, loss=2.68381, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 80.93audios/s, loss=2.79654, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 52s , best accuracy: 22.00%, best loss 2.667422
epoch  30 with lr=1.00e-05


100%|██████████| 448/448 [00:02<00:00, 164.20audios/s, loss=2.70669, acc=9.00%]
100%|██████████| 128/128 [00:01<00:00, 78.95audios/s, loss=2.77157, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 2m 56s , best accuracy: 22.00%, best loss 2.667422
epoch  31 with lr=1.00e-05


100%|██████████| 448/448 [00:03<00:00, 148.26audios/s, loss=2.69171, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 91.21audios/s, loss=2.81313, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 1s , best accuracy: 22.00%, best loss 2.667422
epoch  32 with lr=1.00e-05


100%|██████████| 448/448 [00:03<00:00, 121.50audios/s, loss=2.69336, acc=7.00%] 
100%|██████████| 128/128 [00:01<00:00, 91.91audios/s, loss=2.80104, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 7s , best accuracy: 22.00%, best loss 2.667422
epoch  33 with lr=1.00e-06


100%|██████████| 448/448 [00:03<00:00, 138.98audios/s, loss=2.68057, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 81.00audios/s, loss=2.78610, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 12s , best accuracy: 22.00%, best loss 2.667422
epoch  34 with lr=1.00e-06


100%|██████████| 448/448 [00:02<00:00, 163.42audios/s, loss=2.69134, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 78.92audios/s, loss=2.80816, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 16s , best accuracy: 22.00%, best loss 2.667422
epoch  35 with lr=1.00e-06


100%|██████████| 448/448 [00:03<00:00, 133.65audios/s, loss=2.67267, acc=9.00%] 
100%|██████████| 128/128 [00:01<00:00, 90.03audios/s, loss=2.77174, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 21s , best accuracy: 22.00%, best loss 2.667422
epoch  36 with lr=1.00e-06


100%|██████████| 448/448 [00:03<00:00, 117.70audios/s, loss=2.68891, acc=11.00%]
100%|██████████| 128/128 [00:01<00:00, 86.81audios/s, loss=2.79042, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 27s , best accuracy: 22.00%, best loss 2.667422
epoch  37 with lr=1.00e-06


100%|██████████| 448/448 [00:03<00:00, 138.36audios/s, loss=2.68573, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 80.50audios/s, loss=2.79577, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 32s , best accuracy: 22.00%, best loss 2.667422
epoch  38 with lr=1.00e-06


100%|██████████| 448/448 [00:02<00:00, 150.51audios/s, loss=2.70079, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 80.48audios/s, loss=2.81823, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 37s , best accuracy: 22.00%, best loss 2.667422
epoch  39 with lr=1.00e-07


100%|██████████| 448/448 [00:03<00:00, 145.16audios/s, loss=2.72229, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 90.62audios/s, loss=2.80164, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 42s , best accuracy: 22.00%, best loss 2.667422
epoch  40 with lr=1.00e-07


100%|██████████| 448/448 [00:03<00:00, 122.34audios/s, loss=2.69535, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 89.59audios/s, loss=2.79784, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 47s , best accuracy: 22.00%, best loss 2.667422
epoch  41 with lr=1.00e-07


100%|██████████| 448/448 [00:03<00:00, 137.91audios/s, loss=2.69183, acc=9.00%]
100%|██████████| 128/128 [00:01<00:00, 82.20audios/s, loss=2.78324, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 52s , best accuracy: 22.00%, best loss 2.667422
epoch  42 with lr=1.00e-07


100%|██████████| 448/448 [00:02<00:00, 162.73audios/s, loss=2.68605, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 80.13audios/s, loss=2.78459, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 3m 57s , best accuracy: 22.00%, best loss 2.667422
epoch  43 with lr=1.00e-07


100%|██████████| 448/448 [00:03<00:00, 143.39audios/s, loss=2.66213, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 91.18audios/s, loss=2.76844, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 2s , best accuracy: 22.00%, best loss 2.667422
epoch  44 with lr=1.00e-07


100%|██████████| 448/448 [00:03<00:00, 127.15audios/s, loss=2.67839, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 85.29audios/s, loss=2.78303, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 7s , best accuracy: 22.00%, best loss 2.667422
epoch  45 with lr=1.00e-08


100%|██████████| 448/448 [00:03<00:00, 137.31audios/s, loss=2.69515, acc=9.00%]
100%|██████████| 128/128 [00:01<00:00, 96.99audios/s, loss=2.78259, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 12s , best accuracy: 22.00%, best loss 2.667422
epoch  46 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 164.69audios/s, loss=2.67754, acc=13.00%]
100%|██████████| 128/128 [00:01<00:00, 94.18audios/s, loss=2.77810, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 16s , best accuracy: 22.00%, best loss 2.667422
epoch  47 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 163.10audios/s, loss=2.69380, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 99.09audios/s, loss=2.79155, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 21s , best accuracy: 22.00%, best loss 2.667422
epoch  48 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 165.47audios/s, loss=2.68736, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 96.52audios/s, loss=2.79003, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 25s , best accuracy: 22.00%, best loss 2.667422
epoch  49 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 161.53audios/s, loss=2.66670, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 96.11audios/s, loss=2.78476, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 29s , best accuracy: 22.00%, best loss 2.667422
epoch  50 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 161.04audios/s, loss=2.69481, acc=8.00%] 
100%|██████████| 128/128 [00:01<00:00, 96.89audios/s, loss=2.76724, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 34s , best accuracy: 22.00%, best loss 2.667422
epoch  51 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 168.16audios/s, loss=2.67766, acc=11.00%]
100%|██████████| 128/128 [00:01<00:00, 94.03audios/s, loss=2.77070, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 38s , best accuracy: 22.00%, best loss 2.667422
epoch  52 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 163.28audios/s, loss=2.69162, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 98.10audios/s, loss=2.78799, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 42s , best accuracy: 22.00%, best loss 2.667422
epoch  53 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 167.13audios/s, loss=2.67644, acc=11.00%]
100%|██████████| 128/128 [00:01<00:00, 93.40audios/s, loss=2.78612, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 47s , best accuracy: 22.00%, best loss 2.667422
epoch  54 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 163.53audios/s, loss=2.69028, acc=9.00%]
100%|██████████| 128/128 [00:01<00:00, 95.96audios/s, loss=2.79077, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 51s , best accuracy: 22.00%, best loss 2.667422
epoch  55 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 161.47audios/s, loss=2.66335, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 96.05audios/s, loss=2.80511, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 55s , best accuracy: 22.00%, best loss 2.667422
epoch  56 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 164.97audios/s, loss=2.69769, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 95.52audios/s, loss=2.79196, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 4m 60s , best accuracy: 22.00%, best loss 2.667422
epoch  57 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 165.40audios/s, loss=2.71090, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 96.16audios/s, loss=2.79579, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 4s , best accuracy: 22.00%, best loss 2.667422
epoch  58 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 166.01audios/s, loss=2.68138, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 96.51audios/s, loss=2.79703, acc=10.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 8s , best accuracy: 22.00%, best loss 2.667422
epoch  59 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 159.46audios/s, loss=2.69177, acc=8.00%] 
100%|██████████| 128/128 [00:01<00:00, 95.41audios/s, loss=2.79224, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 13s , best accuracy: 22.00%, best loss 2.667422
epoch  60 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 158.45audios/s, loss=2.69182, acc=9.00%] 
100%|██████████| 128/128 [00:01<00:00, 97.57audios/s, loss=2.80890, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 17s , best accuracy: 22.00%, best loss 2.667422
epoch  61 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 159.35audios/s, loss=2.68394, acc=9.00%] 
100%|██████████| 128/128 [00:01<00:00, 99.14audios/s, loss=2.76296, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 21s , best accuracy: 22.00%, best loss 2.667422
epoch  62 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 162.19audios/s, loss=2.69087, acc=9.00%]
100%|██████████| 128/128 [00:01<00:00, 96.48audios/s, loss=2.80565, acc=5.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 26s , best accuracy: 22.00%, best loss 2.667422
epoch  63 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 164.68audios/s, loss=2.69573, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 96.29audios/s, loss=2.78892, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 30s , best accuracy: 22.00%, best loss 2.667422
epoch  64 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 166.67audios/s, loss=2.69338, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 94.18audios/s, loss=2.79845, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 34s , best accuracy: 22.00%, best loss 2.667422
epoch  65 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 167.14audios/s, loss=2.69177, acc=10.00%]
100%|██████████| 128/128 [00:01<00:00, 97.97audios/s, loss=2.80558, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 39s , best accuracy: 22.00%, best loss 2.667422
epoch  66 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 169.76audios/s, loss=2.68718, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 96.29audios/s, loss=2.78902, acc=8.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 43s , best accuracy: 22.00%, best loss 2.667422
epoch  67 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 163.66audios/s, loss=2.67176, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 96.14audios/s, loss=2.80507, acc=10.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 47s , best accuracy: 22.00%, best loss 2.667422
epoch  68 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 163.73audios/s, loss=2.70047, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 95.16audios/s, loss=2.79487, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

<class 'float'>
total time elapsed: 0h 5m 52s , best accuracy: 22.00%, best loss 2.667422
epoch  69 with lr=1.00e-08


100%|██████████| 448/448 [00:02<00:00, 164.92audios/s, loss=2.68866, acc=9.00%] 
100%|██████████| 128/128 [00:01<00:00, 93.02audios/s, loss=2.80366, acc=4.00%]


<class 'float'>
total time elapsed: 0h 5m 56s , best accuracy: 22.00%, best loss 2.667422
finished
