In [15]:
#!/usr/bin/env python
"""Train a CNN for Google speech commands."""

__author__ = 'Yuan Xu, Erdene-Ochir Tuguldur'

import argparse
import time

from tqdm import *

import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.sampler import WeightedRandomSampler

import torchvision
from torchvision.transforms import *

from tensorboardX import SummaryWriter
import torch.nn.functional as F

import models
from datasets import *
from transforms import *
from mixup import *

In [16]:
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo

In [17]:
class SpeechResModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        n_labels = config["n_labels"]
        n_maps = config["n_feature_maps"]
        self.conv0 = nn.Conv2d(1, n_maps, (3, 3), padding=(1, 1), bias=False)
        if "res_pool" in config:
            self.pool = nn.AvgPool2d(config["res_pool"])

        self.n_layers = n_layers = config["n_layers"]
        dilation = config["use_dilation"]
        if dilation:
            self.convs = [nn.Conv2d(n_maps, n_maps, (3, 3), padding=int(2**(i // 3)), dilation=int(2**(i // 3)),
                bias=False) for i in range(n_layers)]
        else:
            self.convs = [nn.Conv2d(n_maps, n_maps, (3, 3), padding=1, dilation=1,
                bias=False) for _ in range(n_layers)]
        for i, conv in enumerate(self.convs):
            self.add_module("bn{}".format(i + 1), nn.BatchNorm2d(n_maps, affine=False))
            self.add_module("conv{}".format(i + 1), conv)
        self.output = nn.Linear(n_maps, n_labels)

    def forward(self, x):
        x = x.unsqueeze(1)
        for i in range(self.n_layers + 1):
            y = F.relu(getattr(self, "conv{}".format(i))(x))
            if i == 0:
                if hasattr(self, "pool"):
                    y = self.pool(y)
                old_x = y
            if i > 0 and i % 2 == 0:
                x = y + old_x
                old_x = x
            else:
                x = y
            if i > 0:
                x = getattr(self, "bn{}".format(i))(x)
        x = x.view(x.size(0), x.size(1), -1) # shape: (batch, feats, o3)
        x = torch.mean(x, 2)
        return self.output(x)

In [18]:
from enum import Enum
class ConfigType(Enum):
    CNN_TRAD_POOL2 = "cnn-trad-pool2" # default full model (TF variant)
    CNN_ONE_STRIDE1 = "cnn-one-stride1" # default compact model (TF variant)
    CNN_ONE_FPOOL3 = "cnn-one-fpool3"
    CNN_ONE_FSTRIDE4 = "cnn-one-fstride4"
    CNN_ONE_FSTRIDE8 = "cnn-one-fstride8"
    CNN_TPOOL2 = "cnn-tpool2"
    CNN_TPOOL3 = "cnn-tpool3"
    CNN_TSTRIDE2 = "cnn-tstride2"
    CNN_TSTRIDE4 = "cnn-tstride4"
    CNN_TSTRIDE8 = "cnn-tstride8"
    RES15 = "res15"
    RES26 = "res26"
    RES8 = "res8"
    RES15_NARROW = "res15-narrow"
    RES8_NARROW = "res8-narrow"
    RES26_NARROW = "res26-narrow"

_configs = {
    ConfigType.CNN_TRAD_POOL2.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=64,
        n_feature_maps2=64, conv1_size=(20, 8), conv2_size=(10, 4), conv1_pool=(2, 2), conv1_stride=(1, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), tf_variant=True),
    ConfigType.CNN_ONE_STRIDE1.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=186,
        conv1_size=(101, 8), conv1_pool=(1, 1), conv1_stride=(1, 1), dnn1_size=128, dnn2_size=128, tf_variant=True),
    ConfigType.CNN_TSTRIDE2.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=78,
        n_feature_maps2=78, conv1_size=(16, 8), conv2_size=(9, 4), conv1_pool=(1, 3), conv1_stride=(2, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_TSTRIDE4.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=100,
        n_feature_maps2=78, conv1_size=(16, 8), conv2_size=(5, 4), conv1_pool=(1, 3), conv1_stride=(4, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_TSTRIDE8.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=126,
        n_feature_maps2=78, conv1_size=(16, 8), conv2_size=(5, 4), conv1_pool=(1, 3), conv1_stride=(8, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_TPOOL2.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=64,
        n_feature_maps2=64, conv1_size=(21, 8), conv2_size=(6, 4), conv1_pool=(2, 3), conv1_stride=(1, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_TPOOL3.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=len(CLASSES), n_feature_maps1=64,
        n_feature_maps2=64, conv1_size=(20, 8), conv2_size=(10, 4), conv1_pool=(3, 3), conv1_stride=(1, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), dnn1_size=128),
    ConfigType.CNN_ONE_FPOOL3.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=54,
        conv1_size=(101, 8), conv1_pool=(1, 3), conv1_stride=(1, 1), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_ONE_FSTRIDE4.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=186,
        conv1_size=(101, 8), conv1_pool=(1, 1), conv1_stride=(1, 4), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_ONE_FSTRIDE8.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=336,
        conv1_size=(101, 8), conv1_pool=(1, 1), conv1_stride=(1, 8), dnn1_size=128),
    ConfigType.RES15.value: dict(n_labels=12, use_dilation=True, n_layers=13, n_feature_maps=45),
    ConfigType.RES8.value: dict(n_labels=12, n_layers=6, n_feature_maps=45, res_pool=(4, 3), use_dilation=False),
    ConfigType.RES26.value: dict(n_labels=12, n_layers=24, n_feature_maps=45, res_pool=(2, 2), use_dilation=False),
    ConfigType.RES15_NARROW.value: dict(n_labels=12, use_dilation=True, n_layers=13, n_feature_maps=19),
    ConfigType.RES8_NARROW.value: dict(n_labels=12, n_layers=6, n_feature_maps=19, res_pool=(4, 3), use_dilation=False),
    ConfigType.RES26_NARROW.value: dict(n_labels=12, n_layers=24, n_feature_maps=19, res_pool=(2, 2), use_dilation=False)
}

In [20]:
_configs["res15"]

{'n_labels': 12, 'use_dilation': True, 'n_layers': 13, 'n_feature_maps': 45}

In [19]:
model = SpeechResModel(_configs["res15"])
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

KeyError: 'n_feature_maps1'

In [3]:
def truncated_normal(tensor, std_dev=0.01):
    tensor.zero_()
    tensor.normal_(std=std_dev)
    while torch.sum(torch.abs(tensor) > 2 * std_dev) > 0:
        t = tensor[torch.abs(tensor) > 2 * std_dev]
        t.zero_()
        tensor[torch.abs(tensor) > 2 * std_dev] = torch.normal(t, std=std_dev)

class KWS(nn.Module):
    def __init__(self, config):
        super().__init__()
        n_labels = config["n_labels"]
        n_featmaps1 = config["n_feature_maps1"]

        conv1_size = config["conv1_size"] # (time, frequency)
        conv1_pool = config["conv1_pool"]
        conv1_stride = tuple(config["conv1_stride"])
        dropout_prob = config["dropout_prob"]
        width = config["width"]
        height = config["height"]
        self.conv1 = nn.Conv2d(1, n_featmaps1, conv1_size, stride=conv1_stride, padding=(6,0))
        tf_variant = config.get("tf_variant")
        self.tf_variant = tf_variant
        if tf_variant:
            truncated_normal(self.conv1.weight.data)
            self.conv1.bias.data.zero_()
        self.pool1 = nn.MaxPool2d(conv1_pool)

        x = Variable(torch.zeros(1, 1, height, width), volatile=True)
        x = self.pool1(self.conv1(x))
        conv_net_size = x.view(1, -1).size(1)
        last_size = conv_net_size

        if "conv2_size" in config:
            conv2_size = config["conv2_size"]
            conv2_pool = config["conv2_pool"]
            conv2_stride = tuple(config["conv2_stride"])
            n_featmaps2 = config["n_feature_maps2"]
            self.conv2 = nn.Conv2d(n_featmaps1, n_featmaps2, conv2_size, stride=conv2_stride, padding=(4,0))
            if tf_variant:
                truncated_normal(self.conv2.weight.data)
                self.conv2.bias.data.zero_()
            self.pool2 = nn.AdaptiveMaxPool2d((1,1))
            x = self.pool2(self.conv2(x))
            conv_net_size = x.view(1, -1).size(1)
            last_size = conv_net_size
        if not tf_variant:
            self.lin = nn.Linear(conv_net_size, 32)

        if "dnn1_size" in config:
            dnn1_size = config["dnn1_size"]
            last_size = dnn1_size
            if tf_variant:
                self.dnn1 = nn.Linear(conv_net_size, dnn1_size)
                truncated_normal(self.dnn1.weight.data)
                self.dnn1.bias.data.zero_()
            else:
                self.dnn1 = nn.Linear(32, dnn1_size)
            if "dnn2_size" in config:
                dnn2_size = config["dnn2_size"]
                last_size = dnn2_size
                self.dnn2 = nn.Linear(dnn1_size, dnn2_size)
                if tf_variant:
                    truncated_normal(self.dnn2.weight.data)
                    self.dnn2.bias.data.zero_()
        self.output = nn.Linear(last_size, n_labels)
        if tf_variant:
            truncated_normal(self.output.weight.data)
            self.output.bias.data.zero_()
        self.dropout = nn.Dropout(dropout_prob)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = F.relu(self.conv1(x)) # shape: (batch, channels, i1, o1)
        x = self.dropout(x)
        x = self.pool1(x)
        if hasattr(self, "conv2"):
            x = F.relu(self.conv2(x)) # shape: (batch, o1, i2, o2)
            x = self.dropout(x)
            x = self.pool2(x)
        x = x.view(x.size(0), -1) # shape: (batch, o3)
        if hasattr(self, "lin"):
            x = self.lin(x)
        if hasattr(self, "dnn1"):
            x = self.dnn1(x)
            if not self.tf_variant:
                x = F.relu(x)
            x = self.dropout(x)
        if hasattr(self, "dnn2"):
            x = self.dnn2(x)
            x = self.dropout(x)
        return self.softmax(self.output(x))

In [4]:
from enum import Enum
class ConfigType(Enum):
    CNN_TRAD_POOL2 = "cnn-trad-pool2" # default full model (TF variant)
    CNN_ONE_STRIDE1 = "cnn-one-stride1" # default compact model (TF variant)
    CNN_ONE_FPOOL3 = "cnn-one-fpool3"
    CNN_ONE_FSTRIDE4 = "cnn-one-fstride4"
    CNN_ONE_FSTRIDE8 = "cnn-one-fstride8"
    CNN_TPOOL2 = "cnn-tpool2"
    CNN_TPOOL3 = "cnn-tpool3"
    CNN_TSTRIDE2 = "cnn-tstride2"
    CNN_TSTRIDE4 = "cnn-tstride4"
    CNN_TSTRIDE8 = "cnn-tstride8"
    RES15 = "res15"
    RES26 = "res26"
    RES8 = "res8"
    RES15_NARROW = "res15-narrow"
    RES8_NARROW = "res8-narrow"
    RES26_NARROW = "res26-narrow"

_configs = {
    ConfigType.CNN_TRAD_POOL2.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=64,
        n_feature_maps2=64, conv1_size=(20, 8), conv2_size=(10, 4), conv1_pool=(2, 2), conv1_stride=(1, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), tf_variant=True),
    ConfigType.CNN_ONE_STRIDE1.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=186,
        conv1_size=(101, 8), conv1_pool=(1, 1), conv1_stride=(1, 1), dnn1_size=128, dnn2_size=128, tf_variant=True),
    ConfigType.CNN_TSTRIDE2.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=78,
        n_feature_maps2=78, conv1_size=(16, 8), conv2_size=(9, 4), conv1_pool=(1, 3), conv1_stride=(2, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_TSTRIDE4.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=100,
        n_feature_maps2=78, conv1_size=(16, 8), conv2_size=(5, 4), conv1_pool=(1, 3), conv1_stride=(4, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_TSTRIDE8.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=126,
        n_feature_maps2=78, conv1_size=(16, 8), conv2_size=(5, 4), conv1_pool=(1, 3), conv1_stride=(8, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_TPOOL2.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=64,
        n_feature_maps2=64, conv1_size=(21, 8), conv2_size=(6, 4), conv1_pool=(2, 3), conv1_stride=(1, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_TPOOL3.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=len(CLASSES), n_feature_maps1=64,
        n_feature_maps2=64, conv1_size=(20, 8), conv2_size=(10, 4), conv1_pool=(3, 3), conv1_stride=(1, 1),
        conv2_stride=(1, 1), conv2_pool=(1, 1), dnn1_size=128),
    ConfigType.CNN_ONE_FPOOL3.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=54,
        conv1_size=(101, 8), conv1_pool=(1, 3), conv1_stride=(1, 1), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_ONE_FSTRIDE4.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=186,
        conv1_size=(101, 8), conv1_pool=(1, 1), conv1_stride=(1, 4), dnn1_size=128, dnn2_size=128),
    ConfigType.CNN_ONE_FSTRIDE8.value: dict(dropout_prob=0.5, height=101, width=40, n_labels=4, n_feature_maps1=336,
        conv1_size=(101, 8), conv1_pool=(1, 1), conv1_stride=(1, 8), dnn1_size=128),
    ConfigType.RES15.value: dict(n_labels=12, use_dilation=True, n_layers=13, n_feature_maps=45),
    ConfigType.RES8.value: dict(n_labels=12, n_layers=6, n_feature_maps=45, res_pool=(4, 3), use_dilation=False),
    ConfigType.RES26.value: dict(n_labels=12, n_layers=24, n_feature_maps=45, res_pool=(2, 2), use_dilation=False),
    ConfigType.RES15_NARROW.value: dict(n_labels=12, use_dilation=True, n_layers=13, n_feature_maps=19),
    ConfigType.RES8_NARROW.value: dict(n_labels=12, n_layers=6, n_feature_maps=19, res_pool=(4, 3), use_dilation=False),
    ConfigType.RES26_NARROW.value: dict(n_labels=12, n_layers=24, n_feature_maps=19, res_pool=(2, 2), use_dilation=False)
}

In [5]:
_configs["cnn-tpool3"]

{'dropout_prob': 0.5,
 'height': 101,
 'width': 40,
 'n_labels': 16,
 'n_feature_maps1': 64,
 'n_feature_maps2': 64,
 'conv1_size': (20, 8),
 'conv2_size': (10, 4),
 'conv1_pool': (3, 3),
 'conv1_stride': (1, 1),
 'conv2_stride': (1, 1),
 'conv2_pool': (1, 1),
 'dnn1_size': 128}

In [6]:
model = KWS(_configs["cnn-tpool3"])
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)



182576

In [7]:
model

KWS(
  (conv1): Conv2d(1, 64, kernel_size=(20, 8), stride=(1, 1), padding=(6, 0))
  (pool1): MaxPool2d(kernel_size=(3, 3), stride=(3, 3), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(64, 64, kernel_size=(10, 4), stride=(1, 1), padding=(4, 0))
  (pool2): AdaptiveMaxPool2d(output_size=(1, 1))
  (lin): Linear(in_features=64, out_features=32, bias=True)
  (dnn1): Linear(in_features=32, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=16, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (softmax): Softmax(dim=1)
)

In [8]:
class Arg():
    def __init__(self):
        self.train_dataset="/home/cilab/LabMembers/YS/Speech/life/experiments/train"
        self.valid_dataset="/home/cilab/LabMembers/YS/Speech/life/experiments/valid"
        self.background_noise="/home/cilab/LabMembers/YS/Speech/_background_noise_"
        self.comment=""
        self.batch_size=64
        self.dataload_workers_nums=6
        self.weight_decay=1e-2
        self.optim='adam'
        self.learning_rate=0.0001
        self.lr_scheduler='step'
        self.lr_scheduler_patience=5
        self.lr_scheduler_step_size=50
        self.lr_scheduler_gamma=0.1
        self.max_epochs=70
        self.resume=None
        self.model="kws-mfcc40"
        self.input="mel40"
        self.mixup=True
args = Arg()

In [9]:
use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)
if use_gpu:
    torch.backends.cudnn.benchmark = True

n_mels = 32
if args.input == 'mel40':
    n_mels = 40

data_aug_transform = Compose([ChangeAmplitude(), ChangeSpeedAndPitchAudio(), FixAudioLength()])
bg_dataset = BackgroundNoiseDataset(args.background_noise, data_aug_transform)
add_bg_noise = AddBackgroundNoiseOnSTFT(bg_dataset)
train_feature_transform = Compose([ToMfcc(n_mels=n_mels), ToTensor('mfcc', 'input')])
train_dataset = SpeechCommandsDataset(args.train_dataset,
                                Compose([LoadAudio(),
                                         data_aug_transform,
                                         train_feature_transform]), silence_percentage=0)

valid_feature_transform = Compose([ToMfcc(n_mels=n_mels), ToTensor('mfcc', 'input')])
valid_dataset = SpeechCommandsDataset(args.valid_dataset,
                                Compose([LoadAudio(),
                                         data_aug_transform,
                                         valid_feature_transform]), silence_percentage=0)

weights = train_dataset.make_weights_for_balanced_classes()
sampler = WeightedRandomSampler(weights, len(weights))
train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, sampler=sampler,
                              pin_memory=use_gpu, num_workers=args.dataload_workers_nums)
valid_dataloader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False,
                              pin_memory=use_gpu, num_workers=args.dataload_workers_nums)

use_gpu True
all :  ['미세먼지', '유튜브', '마이크', '캘린더', '메시지', '이야기', '네이버', '카카오', '커피숍', '인터넷', '이미지', '서비스', '카메라', '내비게이션', '이메일']
all :  ['미세먼지', '유튜브', '마이크', '캘린더', '메시지', '이야기', '네이버', '카카오', '커피숍', '인터넷', '이미지', '서비스', '카메라', '내비게이션', '이메일']


  weight_per_class = N / count


In [10]:
# a name used to save checkpoints etc.
full_name = '%s_%s_%s_bs%d_lr%.1e_wd%.1e' % (args.model, args.optim, args.lr_scheduler, args.batch_size, args.learning_rate, args.weight_decay)
if args.comment:
    full_name = '%s_%s' % (full_name, args.comment)

#model = models.create_model(model_name=args.model, num_classes=len(CLASSES), in_channels=1)
model = KWS(_configs["cnn-tpool3"])
print(model)
if use_gpu:
    model = torch.nn.DataParallel(model).cuda()

criterion = torch.nn.CrossEntropyLoss()

if args.optim == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
else:
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

start_timestamp = int(time.time()*1000)
start_epoch = 0
best_accuracy = 0
best_loss = 1e100
global_step = 0

if args.resume:
    print("resuming a checkpoint '%s'" % args.resume)
    checkpoint = torch.load(args.resume)
    model.load_state_dict(checkpoint['state_dict'])
    model.float()
    optimizer.load_state_dict(checkpoint['optimizer'])

    best_accuracy = checkpoint.get('accuracy', best_accuracy)
    best_loss = checkpoint.get('loss', best_loss)
    start_epoch = checkpoint.get('epoch', start_epoch)
    global_step = checkpoint.get('step', global_step)

    del checkpoint  # reduce memory

if args.lr_scheduler == 'plateau':
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=args.lr_scheduler_patience, factor=args.lr_scheduler_gamma)
else:
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_scheduler_step_size, gamma=args.lr_scheduler_gamma, last_epoch=start_epoch-1)



KWS(
  (conv1): Conv2d(1, 64, kernel_size=(20, 8), stride=(1, 1), padding=(6, 0))
  (pool1): MaxPool2d(kernel_size=(3, 3), stride=(3, 3), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(64, 64, kernel_size=(10, 4), stride=(1, 1), padding=(4, 0))
  (pool2): AdaptiveMaxPool2d(output_size=(1, 1))
  (lin): Linear(in_features=64, out_features=32, bias=True)
  (dnn1): Linear(in_features=32, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=16, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (softmax): Softmax(dim=1)
)


In [11]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

182576

In [12]:
def get_lr():
    return optimizer.param_groups[0]['lr']

writer = SummaryWriter(comment=('_speech_commands_' + full_name))

def train(epoch):
    global global_step

    print("epoch %3d with lr=%.02e" % (epoch, get_lr()))
    phase = 'train'
    writer.add_scalar('%s/learning_rate' % phase,  get_lr(), epoch)

    model.train()  # Set model to training mode

    running_loss = 0.0
    it = 0
    correct = 0
    total = 0

    pbar = tqdm(train_dataloader, unit="audios", unit_scale=train_dataloader.batch_size)
    for batch in pbar:
        inputs = batch['input']
        inputs = torch.unsqueeze(inputs, 1)
        targets = batch['target']
        #print(inputs.shape)
        if args.mixup:
            inputs, targets = mixup(inputs, targets, num_classes=len(CLASSES))

        inputs = Variable(inputs, requires_grad=True)
        targets = Variable(targets, requires_grad=False)
        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda(async=True)

        # forward/backward
        outputs = model(inputs)
        #print(outputs.shape)
        if args.mixup:
            loss = mixup_cross_entropy_loss(outputs, targets)
        else:
            loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # statistics
        it += 1
        global_step += 1
        running_loss += loss.item()
        pred = outputs.data.max(1, keepdim=True)[1]
        if args.mixup:
            targets = batch['target']
            targets = Variable(targets, requires_grad=False).cuda(async=True)
        correct += pred.eq(targets.data.view_as(pred)).sum()
        total += targets.size(0)

        writer.add_scalar('%s/loss' % phase, loss.item(), global_step)

        # update the progress bar
        pbar.set_postfix({
            'loss': "%.05f" % (running_loss / it),
            'acc': "%.02f%%" % (100*correct/total)
        })

    accuracy = correct/total
    epoch_loss = running_loss / it
    writer.add_scalar('%s/accuracy' % phase, 100*accuracy, epoch)
    writer.add_scalar('%s/epoch_loss' % phase, epoch_loss, epoch)

In [13]:
def valid(epoch):
    global best_accuracy, best_loss, global_step

    phase = 'valid'
    model.eval()  # Set model to evaluate mode

    running_loss = 0.0
    it = 0
    correct = 0
    total = 0

    pbar = tqdm(valid_dataloader, unit="audios", unit_scale=valid_dataloader.batch_size)
    for batch in pbar:
        inputs = batch['input']
        inputs = torch.unsqueeze(inputs, 1)
        targets = batch['target']

        inputs = Variable(inputs, volatile = True)
        targets = Variable(targets, requires_grad=False)

        if use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda(async=True)

        # forward
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # statistics
        it += 1
        global_step += 1
        running_loss += loss.item()
        pred = outputs.data.max(1, keepdim=True)[1]
        correct += pred.eq(targets.data.view_as(pred)).sum()
        total += targets.size(0)

        writer.add_scalar('%s/loss' % phase, loss.item(), global_step)

        # update the progress bar
        pbar.set_postfix({
            'loss': "%.05f" % (running_loss / it),
            'acc': "%.02f%%" % (100*correct/total)
        })

    accuracy = 100*correct/total
    epoch_loss = running_loss / it
    writer.add_scalar('%s/accuracy' % phase, accuracy, epoch)
    writer.add_scalar('%s/epoch_loss' % phase, epoch_loss, epoch)
    checkpoint = {
        'epoch': epoch,
        'step': global_step,
        'state_dict': model.state_dict(),
        'loss': epoch_loss,
        'accuracy': accuracy,
        'optimizer' : optimizer.state_dict(),
    }
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(checkpoint, 'checkpoints/best-acc-resnet18-%s.pth' % full_name)
        torch.save(model, '%d-%s-best-loss.pth' % (start_timestamp, full_name))
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(checkpoint, 'checkpoints/best-loss-resnet18-%s.pth' % full_name)
        torch.save(model, '%d-%s-best-acc.pth' % (start_timestamp, full_name))
    torch.save(model, './res18.pth')
    #torch.save(checkpoint, 'checkpoints/Resnet18.pth')
    del checkpoint  # reduce memory

    return epoch_loss

In [14]:
print("training %s for Google speech commands..." % args.model)
since = time.time()
for epoch in range(start_epoch, args.max_epochs):
    if args.lr_scheduler == 'step':
        lr_scheduler.step()

    train(epoch)
    epoch_loss = valid(epoch)

    if args.lr_scheduler == 'plateau':
        print(type(epoch_loss))
        lr_scheduler.step(metrics=epoch_loss)

    time_elapsed = time.time() - since
    time_str = 'total time elapsed: {:.0f}h {:.0f}m {:.0f}s '.format(time_elapsed // 3600, time_elapsed % 3600 // 60, time_elapsed % 60)
    print("%s, best accuracy: %.02f%%, best loss %f" % (time_str, best_accuracy, best_loss))
    with open('./train_res18.log', 'a+') as f:
        f.write("%s, epoch: %s, best accuracy: %.02f%%, best loss %f\n" % (time_str, epoch,best_accuracy, best_loss))
print("finished")

  0%|          | 0/448 [00:00<?, ?audios/s]

training kws-mfcc40 for Google speech commands...
epoch   0 with lr=1.00e-04


100%|██████████| 448/448 [00:12<00:00, 35.37audios/s, loss=2.78736, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 124.69audios/s, loss=2.78670, acc=16.00%]
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 14s , best accuracy: 16.00%, best loss 2.786695
epoch   1 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 225.43audios/s, loss=2.79710, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 114.44audios/s, loss=2.78766, acc=13.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 17s , best accuracy: 16.00%, best loss 2.786695
epoch   2 with lr=1.00e-04


100%|██████████| 448/448 [00:02<00:00, 198.16audios/s, loss=2.80158, acc=5.00%]
100%|██████████| 128/128 [00:01<00:00, 111.38audios/s, loss=2.79526, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 20s , best accuracy: 16.00%, best loss 2.786695
epoch   3 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 231.77audios/s, loss=2.80781, acc=5.00%]
100%|██████████| 128/128 [00:01<00:00, 113.55audios/s, loss=2.80003, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 23s , best accuracy: 16.00%, best loss 2.786695
epoch   4 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 284.08audios/s, loss=2.78629, acc=5.00%]
100%|██████████| 128/128 [00:01<00:00, 102.90audios/s, loss=2.80338, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 26s , best accuracy: 16.00%, best loss 2.786695
epoch   5 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 269.71audios/s, loss=2.80307, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 96.41audios/s, loss=2.80375, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 29s , best accuracy: 16.00%, best loss 2.786695
epoch   6 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 247.23audios/s, loss=2.79107, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 118.15audios/s, loss=2.80480, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 32s , best accuracy: 16.00%, best loss 2.786695
epoch   7 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 265.55audios/s, loss=2.79422, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 109.74audios/s, loss=2.81679, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 35s , best accuracy: 16.00%, best loss 2.786695
epoch   8 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 288.52audios/s, loss=2.81018, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 107.36audios/s, loss=2.82914, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 38s , best accuracy: 16.00%, best loss 2.786695
epoch   9 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 232.79audios/s, loss=2.79412, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 116.37audios/s, loss=2.83241, acc=1.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 41s , best accuracy: 16.00%, best loss 2.786695
epoch  10 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 254.04audios/s, loss=2.80222, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 96.86audios/s, loss=2.83263, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 44s , best accuracy: 16.00%, best loss 2.786695
epoch  11 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 254.24audios/s, loss=2.80009, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 126.67audios/s, loss=2.84058, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 47s , best accuracy: 16.00%, best loss 2.786695
epoch  12 with lr=1.00e-04


100%|██████████| 448/448 [00:02<00:00, 209.53audios/s, loss=2.79693, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 125.82audios/s, loss=2.84027, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 50s , best accuracy: 16.00%, best loss 2.786695
epoch  13 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 281.50audios/s, loss=2.82069, acc=5.00%]
100%|██████████| 128/128 [00:01<00:00, 114.04audios/s, loss=2.83101, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 53s , best accuracy: 16.00%, best loss 2.786695
epoch  14 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 228.92audios/s, loss=2.78536, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 103.28audios/s, loss=2.83636, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 56s , best accuracy: 16.00%, best loss 2.786695
epoch  15 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 243.18audios/s, loss=2.79864, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 109.13audios/s, loss=2.82617, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 0m 59s , best accuracy: 16.00%, best loss 2.786695
epoch  16 with lr=1.00e-04


100%|██████████| 448/448 [00:02<00:00, 220.35audios/s, loss=2.80218, acc=6.00%]
100%|██████████| 128/128 [00:01<00:00, 119.55audios/s, loss=2.82360, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 2s , best accuracy: 16.00%, best loss 2.786695
epoch  17 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 274.36audios/s, loss=2.80166, acc=5.00%]
100%|██████████| 128/128 [00:01<00:00, 98.61audios/s, loss=2.82388, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 5s , best accuracy: 16.00%, best loss 2.786695
epoch  18 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 311.50audios/s, loss=2.79178, acc=8.00%]
100%|██████████| 128/128 [00:01<00:00, 125.42audios/s, loss=2.84402, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 7s , best accuracy: 16.00%, best loss 2.786695
epoch  19 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 383.65audios/s, loss=2.80596, acc=7.00%]
100%|██████████| 128/128 [00:01<00:00, 125.15audios/s, loss=2.84687, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 10s , best accuracy: 16.00%, best loss 2.786695
epoch  20 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 382.46audios/s, loss=2.79454, acc=8.00%]
100%|██████████| 128/128 [00:00<00:00, 142.65audios/s, loss=2.83910, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 12s , best accuracy: 16.00%, best loss 2.786695
epoch  21 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 357.81audios/s, loss=2.80321, acc=6.00%]
100%|██████████| 128/128 [00:00<00:00, 143.33audios/s, loss=2.82886, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 14s , best accuracy: 16.00%, best loss 2.786695
epoch  22 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 368.58audios/s, loss=2.79254, acc=7.00%]
100%|██████████| 128/128 [00:00<00:00, 144.55audios/s, loss=2.82022, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 16s , best accuracy: 16.00%, best loss 2.786695
epoch  23 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 387.82audios/s, loss=2.80728, acc=6.00%]
100%|██████████| 128/128 [00:00<00:00, 135.04audios/s, loss=2.82790, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 18s , best accuracy: 16.00%, best loss 2.786695
epoch  24 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 378.61audios/s, loss=2.78877, acc=6.00%]
100%|██████████| 128/128 [00:00<00:00, 138.02audios/s, loss=2.85176, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 20s , best accuracy: 16.00%, best loss 2.786695
epoch  25 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 385.29audios/s, loss=2.81426, acc=5.00%]
100%|██████████| 128/128 [00:00<00:00, 139.18audios/s, loss=2.85265, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 22s , best accuracy: 16.00%, best loss 2.786695
epoch  26 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 367.15audios/s, loss=2.78189, acc=7.00%]
100%|██████████| 128/128 [00:00<00:00, 134.11audios/s, loss=2.84883, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 25s , best accuracy: 16.00%, best loss 2.786695
epoch  27 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 401.02audios/s, loss=2.78907, acc=8.00%]
100%|██████████| 128/128 [00:00<00:00, 143.65audios/s, loss=2.84513, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 27s , best accuracy: 16.00%, best loss 2.786695
epoch  28 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 343.00audios/s, loss=2.79832, acc=7.00%]
100%|██████████| 128/128 [00:00<00:00, 140.87audios/s, loss=2.85291, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 29s , best accuracy: 16.00%, best loss 2.786695
epoch  29 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 354.70audios/s, loss=2.79983, acc=6.00%]
100%|██████████| 128/128 [00:00<00:00, 143.74audios/s, loss=2.85556, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 31s , best accuracy: 16.00%, best loss 2.786695
epoch  30 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 336.59audios/s, loss=2.78021, acc=9.00%] 
100%|██████████| 128/128 [00:00<00:00, 144.57audios/s, loss=2.85141, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 33s , best accuracy: 16.00%, best loss 2.786695
epoch  31 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 345.07audios/s, loss=2.80651, acc=6.00%]
100%|██████████| 128/128 [00:00<00:00, 140.41audios/s, loss=2.83347, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 36s , best accuracy: 16.00%, best loss 2.786695
epoch  32 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 355.69audios/s, loss=2.78618, acc=10.00%]
100%|██████████| 128/128 [00:00<00:00, 140.62audios/s, loss=2.81975, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 38s , best accuracy: 16.00%, best loss 2.786695
epoch  33 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 361.67audios/s, loss=2.78382, acc=7.00%]
100%|██████████| 128/128 [00:00<00:00, 135.93audios/s, loss=2.82061, acc=4.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 40s , best accuracy: 16.00%, best loss 2.786695
epoch  34 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 355.08audios/s, loss=2.78192, acc=8.00%]
100%|██████████| 128/128 [00:00<00:00, 137.66audios/s, loss=2.82619, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 42s , best accuracy: 16.00%, best loss 2.786695
epoch  35 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 352.50audios/s, loss=2.78662, acc=7.00%]
100%|██████████| 128/128 [00:00<00:00, 136.43audios/s, loss=2.81555, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 44s , best accuracy: 16.00%, best loss 2.786695
epoch  36 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 343.93audios/s, loss=2.78527, acc=7.00%]
100%|██████████| 128/128 [00:00<00:00, 129.24audios/s, loss=2.81170, acc=9.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 47s , best accuracy: 16.00%, best loss 2.786695
epoch  37 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 396.59audios/s, loss=2.78156, acc=9.00%]
100%|██████████| 128/128 [00:00<00:00, 135.93audios/s, loss=2.80890, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 49s , best accuracy: 16.00%, best loss 2.786695
epoch  38 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 370.84audios/s, loss=2.79494, acc=5.00%]
100%|██████████| 128/128 [00:00<00:00, 147.72audios/s, loss=2.81490, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 51s , best accuracy: 16.00%, best loss 2.786695
epoch  39 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 360.89audios/s, loss=2.80478, acc=5.00%]
100%|██████████| 128/128 [00:00<00:00, 130.47audios/s, loss=2.82611, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 53s , best accuracy: 16.00%, best loss 2.786695
epoch  40 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 346.47audios/s, loss=2.77984, acc=9.00%]
100%|██████████| 128/128 [00:00<00:00, 134.76audios/s, loss=2.82489, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 55s , best accuracy: 16.00%, best loss 2.786695
epoch  41 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 362.56audios/s, loss=2.78908, acc=9.00%]
100%|██████████| 128/128 [00:00<00:00, 139.43audios/s, loss=2.82076, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 58s , best accuracy: 16.00%, best loss 2.786695
epoch  42 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 372.03audios/s, loss=2.78218, acc=10.00%]
100%|██████████| 128/128 [00:00<00:00, 135.34audios/s, loss=2.81391, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 1m 60s , best accuracy: 16.00%, best loss 2.786695
epoch  43 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 361.04audios/s, loss=2.78322, acc=9.00%]
100%|██████████| 128/128 [00:00<00:00, 137.79audios/s, loss=2.81606, acc=3.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 2s , best accuracy: 16.00%, best loss 2.786695
epoch  44 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 351.18audios/s, loss=2.78302, acc=8.00%]
100%|██████████| 128/128 [00:00<00:00, 143.56audios/s, loss=2.81350, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 4s , best accuracy: 16.00%, best loss 2.786695
epoch  45 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 350.76audios/s, loss=2.79009, acc=6.00%]
100%|██████████| 128/128 [00:00<00:00, 138.39audios/s, loss=2.82689, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 6s , best accuracy: 16.00%, best loss 2.786695
epoch  46 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 370.09audios/s, loss=2.76504, acc=11.00%]
100%|██████████| 128/128 [00:00<00:00, 143.83audios/s, loss=2.82354, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 9s , best accuracy: 16.00%, best loss 2.786695
epoch  47 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 351.41audios/s, loss=2.78228, acc=8.00%]
100%|██████████| 128/128 [00:00<00:00, 133.00audios/s, loss=2.81517, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 11s , best accuracy: 16.00%, best loss 2.786695
epoch  48 with lr=1.00e-04


100%|██████████| 448/448 [00:01<00:00, 354.17audios/s, loss=2.78426, acc=7.00%]
100%|██████████| 128/128 [00:00<00:00, 143.28audios/s, loss=2.82241, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 13s , best accuracy: 16.00%, best loss 2.786695
epoch  49 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 366.01audios/s, loss=2.77642, acc=10.00%]
100%|██████████| 128/128 [00:00<00:00, 143.80audios/s, loss=2.82158, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 15s , best accuracy: 16.00%, best loss 2.786695
epoch  50 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 348.21audios/s, loss=2.78555, acc=7.00%]
100%|██████████| 128/128 [00:00<00:00, 142.29audios/s, loss=2.81803, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 17s , best accuracy: 16.00%, best loss 2.786695
epoch  51 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 360.55audios/s, loss=2.77044, acc=9.00%]
100%|██████████| 128/128 [00:00<00:00, 137.18audios/s, loss=2.81518, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 20s , best accuracy: 16.00%, best loss 2.786695
epoch  52 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 348.05audios/s, loss=2.76728, acc=9.00%]
100%|██████████| 128/128 [00:00<00:00, 144.69audios/s, loss=2.81684, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 22s , best accuracy: 16.00%, best loss 2.786695
epoch  53 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 343.35audios/s, loss=2.78330, acc=8.00%]
100%|██████████| 128/128 [00:00<00:00, 140.21audios/s, loss=2.81246, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 24s , best accuracy: 16.00%, best loss 2.786695
epoch  54 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 355.81audios/s, loss=2.76641, acc=9.00%]
100%|██████████| 128/128 [00:00<00:00, 140.81audios/s, loss=2.81210, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 26s , best accuracy: 16.00%, best loss 2.786695
epoch  55 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 413.12audios/s, loss=2.76710, acc=11.00%]
100%|██████████| 128/128 [00:00<00:00, 135.88audios/s, loss=2.81299, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 28s , best accuracy: 16.00%, best loss 2.786695
epoch  56 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 417.74audios/s, loss=2.76518, acc=9.00%]
100%|██████████| 128/128 [00:00<00:00, 143.57audios/s, loss=2.81611, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 30s , best accuracy: 16.00%, best loss 2.786695
epoch  57 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 403.24audios/s, loss=2.76841, acc=10.00%]
100%|██████████| 128/128 [00:00<00:00, 130.51audios/s, loss=2.81483, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 32s , best accuracy: 16.00%, best loss 2.786695
epoch  58 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 399.31audios/s, loss=2.78063, acc=7.00%]
100%|██████████| 128/128 [00:00<00:00, 132.86audios/s, loss=2.81123, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 34s , best accuracy: 16.00%, best loss 2.786695
epoch  59 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 409.62audios/s, loss=2.75567, acc=9.00%]
100%|██████████| 128/128 [00:00<00:00, 136.59audios/s, loss=2.81004, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 37s , best accuracy: 16.00%, best loss 2.786695
epoch  60 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 417.51audios/s, loss=2.76212, acc=10.00%]
100%|██████████| 128/128 [00:00<00:00, 135.11audios/s, loss=2.81515, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 39s , best accuracy: 16.00%, best loss 2.786695
epoch  61 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 393.50audios/s, loss=2.75981, acc=9.00%]
100%|██████████| 128/128 [00:00<00:00, 134.15audios/s, loss=2.81088, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 41s , best accuracy: 16.00%, best loss 2.786695
epoch  62 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 396.40audios/s, loss=2.75662, acc=10.00%]
100%|██████████| 128/128 [00:00<00:00, 128.14audios/s, loss=2.81054, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 43s , best accuracy: 16.00%, best loss 2.786695
epoch  63 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 369.35audios/s, loss=2.76261, acc=8.00%]
100%|██████████| 128/128 [00:00<00:00, 138.06audios/s, loss=2.81200, acc=7.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 45s , best accuracy: 16.00%, best loss 2.786695
epoch  64 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 394.71audios/s, loss=2.73055, acc=12.00%]
100%|██████████| 128/128 [00:00<00:00, 133.03audios/s, loss=2.81064, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 47s , best accuracy: 16.00%, best loss 2.786695
epoch  65 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 406.75audios/s, loss=2.76427, acc=8.00%]
100%|██████████| 128/128 [00:00<00:00, 128.35audios/s, loss=2.81107, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 49s , best accuracy: 16.00%, best loss 2.786695
epoch  66 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 403.57audios/s, loss=2.76381, acc=10.00%]
100%|██████████| 128/128 [00:00<00:00, 133.50audios/s, loss=2.80952, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 51s , best accuracy: 16.00%, best loss 2.786695
epoch  67 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 402.23audios/s, loss=2.77038, acc=8.00%]
100%|██████████| 128/128 [00:00<00:00, 132.51audios/s, loss=2.80762, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 53s , best accuracy: 16.00%, best loss 2.786695
epoch  68 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 410.85audios/s, loss=2.75406, acc=11.00%]
100%|██████████| 128/128 [00:00<00:00, 133.82audios/s, loss=2.80544, acc=6.00%]
  0%|          | 0/448 [00:00<?, ?audios/s]

total time elapsed: 0h 2m 55s , best accuracy: 16.00%, best loss 2.786695
epoch  69 with lr=1.00e-05


100%|██████████| 448/448 [00:01<00:00, 402.26audios/s, loss=2.73187, acc=11.00%]
100%|██████████| 128/128 [00:00<00:00, 139.96audios/s, loss=2.79897, acc=7.00%]

total time elapsed: 0h 2m 58s , best accuracy: 16.00%, best loss 2.786695
finished



