### Weighted-loss training on AISHELL-2 and the 2018 dataset (per-sample loss weights 0.1 and 0.9)

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '5,6,7'
import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy import signal
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms import Compose
from torch.nn import Parameter
from torch.autograd import Variable
import tensorboardX
from tqdm import tqdm

import matplotlib.pyplot as plt

In [2]:
import warnings
# NOTE: this silences *every* warning category for the rest of the session,
# not only the scipy FutureWarning that motivated it.
warnings.filterwarnings('ignore') # scipy throws future warnings on fft (known bug)

In [3]:
import resource
# Raise the soft limit on open file descriptors to 2048; the hard limit
# (rlimit[1]) is passed back unchanged.
# Presumably needed because of the many DataLoader workers -- TODO confirm.
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))

In [4]:
!ulimit -n

2048


In [5]:
class IdentificationDataset(Dataset):
    """Speaker-identification dataset mixing the two sources listed in ``united_data.csv``.

    Rows with ``phase == 1`` are wav files under ``path[0]`` whose spectrogram is
    computed on the fly; every other row is read as a precomputed ``.npy``
    feature matrix under ``path[1]``.  Each item is ``(label, spec, mask)``,
    where ``mask`` is the per-sample loss weight: 0.1 for ``phase == 1`` and
    0.9 otherwise.

    Args:
        path: two-element sequence ``[wav_root, feature_root]``.
        train: when true, every item is cut (or extended by concatenation) to
            exactly ``SEGMENT_FRAMES`` time frames; otherwise the full-length
            matrix is returned for both sources.
        transform: optional two-element sequence of callables; element 0 is
            applied to ``phase == 1`` items and element 1 to the others.
    """

    # number of time frames of every training segment
    SEGMENT_FRAMES = 300

    def __init__(self, path, train, transform=None):
        #iden_split_path = os.path.join(path, 'iden_split.txt')
        split = pd.read_csv('united_data.csv', sep=',')
        split = split.loc[split.duration > 1]                 # drop utterances of 1 s or less
        split = split.sample(frac=1).reset_index(drop=True)   # shuffle; index becomes 0..n-1

        self.split = split
        self.dataset = split['file']
        self.phase = split['phase']
        self.path = path
        self.train = train
        self.transform = transform

    def __len__(self):
        """Number of utterances kept after the duration filter."""
        return len(self.dataset)

    def spectrum(self, wav):
        """Read the wav file at ``wav`` and return its two-sided magnitude spectrogram.

        Frames are 25 ms long with a 10 ms hop (Hamming window); the FFT size is
        the next power of two that is >= the frame length, and because the
        spectrogram is two-sided the result has ``nfft`` rows.
        """
        audio_path = wav
        # read .wav
        rate, samples = wavfile.read(audio_path)
        ## parameters
        window = 'hamming'
        # window width and step size
        Tw = 25 # ms
        Ts = 10 # ms
        # frame duration (samples)
        Nw = int(rate * Tw * 1e-3)
        # overlapped duration (samples): consecutive frames share Tw - Ts ms
        Ns = int(rate * (Tw - Ts) * 1e-3)
        # 2 ** to the next pow of 2 of (Nw - 1)
        nfft = 2 ** (Nw - 1).bit_length()
        pre_emphasis = 0.97

        # preemphasis filter
        samples = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

        # removes DC component of the signal and add a small dither
        samples = signal.lfilter([1, -1], [1, -0.99], samples)
        dither = np.random.uniform(-1, 1, samples.shape)
        spow = np.std(samples)
        samples = samples + 1e-6 * spow * dither
        _, _, spec = signal.spectrogram(samples, rate, window, Nw, Ns, nfft,
                                        mode='magnitude', return_onesided=False)
        spec *= rate / 10

        return spec

    def _load_features(self, phase, track_path):
        """Return the (Freq, Time) matrix of one utterance from the source selected by ``phase``."""
        if phase == 1:
            return self.spectrum(os.path.join(self.path[0], track_path))
        # precomputed features live next to the wav name, with an .npy extension
        return np.load(os.path.join(self.path[1], track_path[:-3] + 'npy'))

    def _fixed_length_segment(self, samples, label, phase):
        """Return a random ``SEGMENT_FRAMES``-wide window of ``samples``.

        Utterances that are too short are first extended by appending randomly
        chosen utterances that carry the same label.
        """
        n_frames = self.SEGMENT_FRAMES
        if samples.shape[1] < n_frames:
            # NOTE(review): candidates are selected by label only and loaded from the
            # current item's source; this assumes labels are not shared between the
            # two sources -- TODO confirm.
            candidates = self.split.loc[self.split.label == label].file.reset_index(drop=True)
            while samples.shape[1] < n_frames:
                pick = candidates.loc[np.random.randint(0, len(candidates))]
                samples = np.hstack((samples, self._load_features(phase, pick)))

        slack = samples.shape[1] - n_frames
        if slack == 0:
            return samples
        # randint's upper bound is exclusive, hence +1 so the last window can be drawn
        start = np.random.randint(0, slack + 1)
        return samples[:, start:start + n_frames]

    def __getitem__(self, idx):
        """Return ``(label, spec, mask)`` for row ``idx`` of the shuffled split."""
        phase = self.phase[idx]
        # e.g. 'gaoqiong/201709121820301375220201.wav', 'wav/C0001/IC0001W0001.wav'
        track_path = self.dataset[idx]
        # the index was reset in __init__, so row idx holds this file's label
        label = int(self.split['label'][idx])

        from_wav = phase == 1
        mask = 0.1 if from_wav else 0.9

        samples = self._load_features(phase, track_path)
        if self.train:
            spec = self._fixed_length_segment(samples, label, phase)
        else:
            # evaluation uses the full-length utterance for both sources
            spec = samples

        if self.transform:
            spec = self.transform[0 if from_wav else 1](spec)

        return label, spec, mask

In [6]:
class ToTensor(object):
    """Turn a 2-D (Freq, Time) spectrogram array into a float32 tensor of shape (1, Freq, Time)."""

    def __call__(self, spec):
        # unpacking also rejects anything that is not exactly two-dimensional
        n_freq, n_time = spec.shape

        # prepend the channel axis expected by the convolutional front end,
        # then cast down to float32
        with_channel = spec.reshape(1, n_freq, n_time)
        as_float32 = with_channel.astype(np.float32)

        return torch.from_numpy(as_float32)
    
class Normalize(object):
    """Per-utterance mean/variance normalisation of a (Freq, Time) spectrogram.

    Every frequency row is shifted to zero mean and scaled to unit standard
    deviation over time.  The statistics are computed per spectrogram, not per
    batch.  Any number of frequency bins is accepted, and rows that are constant
    over time come out as zeros instead of NaN/inf.
    """

    def __call__(self, spec):
        # keepdims keeps a (Freq, 1) column so broadcasting works for any Freq
        mu = spec.mean(axis=1, keepdims=True)
        sigma = spec.std(axis=1, keepdims=True)
        # a constant row has sigma == 0; dividing by 1 leaves its centred zeros intact
        sigma = np.where(sigma == 0, 1.0, sigma)

        return (spec - mu) / sigma

In [7]:
#clr
import math
from bisect import bisect_right,bisect_left

import torch
import numpy as np
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.optimizer import Optimizer

class CyclicCosAnnealingLR(_LRScheduler):
    """Cosine annealing that restarts at every milestone.

    Inside each cycle (the span between two consecutive milestones, the first
    cycle starting at epoch 0) the learning rate follows half a cosine from the
    base lr down towards ``eta_min``; at a milestone it jumps back to the base
    lr.  From the last milestone onwards it stays at ``eta_min``.

    Args:
        optimizer: wrapped optimizer.
        milestones: non-empty, increasing sequence of epoch indices that end a cycle.
        eta_min: lower bound of the learning rate.
        last_epoch: index of the last epoch, forwarded to ``_LRScheduler``.

    Raises:
        ValueError: if ``milestones`` is empty or not sorted in increasing order.
    """

    def __init__(self, optimizer,milestones, eta_min=0, last_epoch=-1):
        if len(milestones) == 0:
            raise ValueError('Milestones should be a non-empty list of'
                             ' increasing integers. Got {}'.format(milestones))
        if not list(milestones) == sorted(milestones):
            raise ValueError('Milestones should be a list of'
                             ' increasing integers. Got {}'.format(milestones))
        self.eta_min = eta_min
        self.milestones=milestones
        super(CyclicCosAnnealingLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return one learning rate per param group for ``self.last_epoch``."""
        if self.last_epoch >= self.milestones[-1]:
            return [self.eta_min for base_lr in self.base_lrs]

        # index of the first milestone strictly greater than the current epoch
        idx = bisect_right(self.milestones,self.last_epoch)

        left_barrier = 0 if idx==0 else self.milestones[idx-1]
        right_barrier = self.milestones[idx]

        width = right_barrier - left_barrier
        curr_pos = self.last_epoch- left_barrier

        return [self.eta_min + (base_lr - self.eta_min) *
               (1 + math.cos(math.pi * curr_pos/ width)) / 2
                for base_lr in self.base_lrs]


class CyclicLinearLR(_LRScheduler):
    """Linear decay that restarts at every milestone.

    Inside each cycle (the span between two consecutive milestones, the first
    cycle starting at epoch 0) the learning rate falls linearly from the base
    lr towards ``eta_min``; at a milestone it jumps back to the base lr.  From
    the last milestone onwards it stays at ``eta_min``.

    Args:
        optimizer: wrapped optimizer.
        milestones: non-empty, increasing sequence of epoch indices that end a cycle.
        eta_min: lower bound of the learning rate.
        last_epoch: index of the last epoch, forwarded to ``_LRScheduler``.

    Raises:
        ValueError: if ``milestones`` is empty or not sorted in increasing order.
    """

    def __init__(self, optimizer,milestones, eta_min=0, last_epoch=-1):
        if len(milestones) == 0:
            raise ValueError('Milestones should be a non-empty list of'
                             ' increasing integers. Got {}'.format(milestones))
        if not list(milestones) == sorted(milestones):
            raise ValueError('Milestones should be a list of'
                             ' increasing integers. Got {}'.format(milestones))
        self.eta_min = eta_min
        self.milestones=milestones
        super(CyclicLinearLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return one learning rate per param group for ``self.last_epoch``."""
        if self.last_epoch >= self.milestones[-1]:
            return [self.eta_min for base_lr in self.base_lrs]

        # index of the first milestone strictly greater than the current epoch
        idx = bisect_right(self.milestones,self.last_epoch)

        left_barrier = 0 if idx==0 else self.milestones[idx-1]
        right_barrier = self.milestones[idx]

        width = right_barrier - left_barrier
        curr_pos = self.last_epoch- left_barrier

        return [self.eta_min + (base_lr - self.eta_min) *
               (1. - 1.0*curr_pos/ width)
                for base_lr in self.base_lrs]

In [8]:
def myphi(x,m):
    """Approximate ``cos(m * x)`` with its Taylor polynomial truncated after the 10th-order term.

    Only even powers appear, so the result is an even function of ``x``,
    as the cosine is.
    """
    x = x * m
    return 1-x**2/math.factorial(2)+x**4/math.factorial(4)-x**6/math.factorial(6) + \
            x**8/math.factorial(8) - x**10/math.factorial(10)

class AngleLinear(nn.Module):
    """Angular-margin classification layer (the structure matches an A-Softmax head).

    ``forward`` returns a 2-tuple ``(cos_theta, phi_theta)``; each element has
    shape (B, out_features) and is scaled by the L2 norm of the corresponding
    input row.  ``cos_theta`` is the cosine between an input row and each
    (renormalised) weight column; ``phi_theta`` is the margin-adjusted version.

    Args:
        in_features: length F of every input row.
        out_features: number of classes.
        m: angular margin; also the index into ``mlambda``, so it must be 0..5.
        phiflag: when true, use the piecewise expression built from ``cos(m*theta)``;
            otherwise use the clamped polynomial ``myphi``.
    """

    def __init__(self, in_features, out_features, m = 4, phiflag=True):
        super(AngleLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(in_features, out_features))
        # uniform init, then rescale the columns through the renorm / multiply pair
        init = self.weight.data
        init.uniform_(-1, 1)
        init.renorm_(2, 1, 1e-5)
        init.mul_(1e5)
        self.phiflag = phiflag
        self.m = m
        # mlambda[k] expresses cos(k * theta) as a polynomial in c = cos(theta)
        self.mlambda = [
            lambda c: c**0,
            lambda c: c**1,
            lambda c: 2*c**2-1,
            lambda c: 4*c**3-3*c,
            lambda c: 8*c**4-8*c**2+1,
            lambda c: 16*c**5-20*c**3+5*c
        ]

    def forward(self, input):
        feats = input                                        # (B, F)
        # same renorm / multiply pair as in __init__, applied out of place
        unit_w = self.weight.renorm(2, 1, 1e-5).mul(1e5)     # (F, Classnum)
        feat_norm = feats.pow(2).sum(1).pow(0.5)             # (B,)
        w_norm = unit_w.pow(2).sum(0).pow(0.5)               # (Classnum,)
        feat_col = feat_norm.view(-1, 1)

        cos_theta = feats.mm(unit_w)                         # (B, Classnum)
        cos_theta = cos_theta / feat_col / w_norm.view(1, -1)
        cos_theta = cos_theta.clamp(-1, 1)

        if not self.phiflag:
            phi_theta = myphi(cos_theta.acos(), self.m)
            phi_theta = phi_theta.clamp(-1*self.m, 1)
        else:
            cos_m_theta = self.mlambda[self.m](cos_theta)
            # theta only selects the branch index k, so it is taken without gradient
            theta = cos_theta.data.acos()
            k = (self.m*theta/3.14159265).floor()
            minus_one = k*0.0 - 1
            phi_theta = (minus_one**k) * cos_m_theta - 2*k

        # both outputs are rescaled by the input norm; the result is a tuple of
        # two (B, Classnum) tensors, not a single stacked tensor
        return (cos_theta * feat_col, phi_theta * feat_col)


class AngleLoss(nn.Module):
    """Weighted angular-margin loss for the ``(cos_theta, phi_theta)`` pair of ``AngleLinear``.

    The target-class logit is a blend of ``cos_theta`` and ``phi_theta``::

        logit = cos_theta + (phi_theta - cos_theta) / (1 + lamb)

    ``lamb`` is annealed from ``LambdaMax`` towards ``LambdaMin`` as
    ``max(LambdaMin, LambdaMax / (1 + 0.1 * it))``, where ``it`` counts the
    calls to ``forward`` (every call, not only training steps).

    Args:
        gamma: exponent of the ``(1 - pt) ** gamma`` modulating factor;
            0 gives a plain log-softmax loss.
    """

    def __init__(self, gamma=0):
        super(AngleLoss, self).__init__()
        self.gamma   = gamma
        self.it = 0
        self.LambdaMin = 5.0
        self.LambdaMax = 1500.0
        self.lamb = 1500.0

    def forward(self, input, target, mask):
        """Return the mean of the per-sample losses, each multiplied by ``mask``.

        Args:
            input: tuple ``(cos_theta, phi_theta)``, both of shape (B, Classnum).
            target: integer class indices, B elements.
            mask: per-sample loss weights multiplied into the (B,) loss vector.
        """
        self.it += 1
        cos_theta, phi_theta = input
        target = target.view(-1, 1)  # (B, 1)

        # float one-hot selector of the target class; blending arithmetically avoids
        # indexing with a uint8 mask, which newer PyTorch releases deprecate
        one_hot = torch.zeros_like(cos_theta)
        one_hot.scatter_(1, target.data.view(-1, 1), 1)

        self.lamb = max(self.LambdaMin, self.LambdaMax / (1 + 0.1 * self.it))
        # non-target logits stay cos_theta; the target logit moves towards phi_theta
        output = (cos_theta
                  - one_hot * cos_theta / (1 + self.lamb)
                  + one_hot * phi_theta / (1 + self.lamb))

        # explicit class dimension: the implicit choice is deprecated
        logpt = F.log_softmax(output, dim=1)
        logpt = logpt.gather(1, target).view(-1)
        # the modulating factor is treated as a constant (no gradient through pt)
        pt = logpt.detach().exp()

        loss = -1 * (1 - pt) ** self.gamma * logpt

        # weighted loss
        loss = loss * mask

        return loss.mean()

In [9]:
import torch.nn as nn
import torch.utils.model_zoo as model_zoo


# Public names of this cell's ResNet code. `__all__` only affects
# `from module import *`, so it has no effect while the code lives in a notebook.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


# Checkpoint URLs downloaded by the resnetXX(pretrained=True) factories below.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


def conv1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 ``nn.Conv2d`` (no padding) with the given stride."""
    conv_kwargs = dict(kernel_size=1, stride=stride, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


class BasicBlock(nn.Module):
    """Residual block of two 3x3 convolutions with an optional shortcut projection.

    Args:
        inplanes: number of input channels.
        planes: number of channels of both convolutions (``expansion`` is 1).
        stride: stride of the first convolution.
        downsample: optional module applied to the input before the residual
            addition; when ``None`` the input itself is added.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        # sub-modules are registered in the original order: conv1, bn1, relu, conv2, bn2
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # main branch: conv-bn-relu followed by conv-bn
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # shortcut branch, evaluated after the main branch
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions with an optional shortcut projection.

    Args:
        inplanes: number of input channels.
        planes: width of the first two convolutions; the block outputs
            ``planes * expansion`` channels.
        stride: stride of the 3x3 convolution.
        downsample: optional module applied to the input before the residual
            addition; when ``None`` the input itself is added.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * self.expansion
        # sub-modules are registered in the original order
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, widened)
        self.bn3 = nn.BatchNorm2d(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # main branch: two conv-bn-relu stages, then conv-bn
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # shortcut branch, evaluated after the main branch
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class ResNet(nn.Module):
    """ResNet trunk for single-channel inputs with an embedding layer and an angular-margin head.

    The stem convolution takes 1 input channel.  After global average pooling,
    ``fc1`` maps the features to 512 dimensions and ``fc2`` (an ``AngleLinear``)
    turns that vector into the ``(cos_theta, phi_theta)`` pair.

    Args:
        block: residual block class (``BasicBlock`` or ``Bottleneck``).
        layers: four integers, the number of blocks in each stage.
        num_classes: output size of the ``AngleLinear`` head.
        zero_init_residual: when true, zero the weight of the last BatchNorm of
            every residual block so each block starts out as an identity.
    """

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False):
        super().__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc1 = nn.Linear(512 * block.expansion, 512)
        self.fc2 = AngleLinear(512, num_classes)

        # re-initialise every convolution and batch-norm layer
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

        # Zero-initialize the last BN in each residual branch, so that the residual
        # branch starts with zeros and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for module in self.modules():
                if isinstance(module, Bottleneck):
                    nn.init.constant_(module.bn3.weight, 0)
                elif isinstance(module, BasicBlock):
                    nn.init.constant_(module.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` residual blocks; only the first one gets ``stride`` and a projection."""
        out_planes = planes * block.expansion
        needs_projection = stride != 1 or self.inplanes != out_planes

        downsample = None
        if needs_projection:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, out_planes, stride),
                nn.BatchNorm2d(out_planes),
            )

        stages = [block(self.inplanes, planes, stride, downsample)]
        # every following block consumes the widened channel count
        self.inplanes = out_planes
        for _ in range(1, blocks):
            stages.append(block(self.inplanes, planes))

        return nn.Sequential(*stages)

    def forward(self, x):
        """Return ``(x1, x2)``: the ``fc1`` output and the ``AngleLinear`` output tuple."""
        trunk = (self.conv1, self.bn1, self.relu, self.maxpool,
                 self.layer1, self.layer2, self.layer3, self.layer4,
                 self.avgpool)
        for stage in trunk:
            x = stage(x)

        flat = x.view(x.size(0), -1)
        x1 = self.fc1(flat)
        x2 = self.fc2(x1)

        return x1, x2


def _load_compatible_pretrained(model, arch):
    """Copy into ``model`` every tensor of the ``arch`` checkpoint whose name and shape match.

    The ResNet built in this notebook has a 1-channel stem convolution and an
    ``fc1``/``fc2`` head, so its state dict cannot be an exact match for an
    unmodified ImageNet checkpoint and a plain strict ``load_state_dict`` would
    raise.  Only the compatible tensors are therefore taken over; all other
    parameters keep their freshly initialised values.
    """
    checkpoint = model_zoo.load_url(model_urls[arch])
    own_state = model.state_dict()
    compatible = {name: tensor for name, tensor in checkpoint.items()
                  if name in own_state and tensor.shape == own_state[name].shape}
    own_state.update(compatible)
    model.load_state_dict(own_state)


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, initialises every layer that is compatible
            with the ImageNet checkpoint from that checkpoint
        **kwargs: forwarded to ``ResNet`` (e.g. ``num_classes``)
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        _load_compatible_pretrained(model, 'resnet18')
    return model


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, initialises every layer that is compatible
            with the ImageNet checkpoint from that checkpoint
        **kwargs: forwarded to ``ResNet`` (e.g. ``num_classes``)
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        _load_compatible_pretrained(model, 'resnet34')
    return model


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, initialises every layer that is compatible
            with the ImageNet checkpoint from that checkpoint
        **kwargs: forwarded to ``ResNet`` (e.g. ``num_classes``)
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        _load_compatible_pretrained(model, 'resnet50')
    return model


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, initialises every layer that is compatible
            with the ImageNet checkpoint from that checkpoint
        **kwargs: forwarded to ``ResNet`` (e.g. ``num_classes``)
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        _load_compatible_pretrained(model, 'resnet101')
    return model


def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.
    Args:
        pretrained (bool): If True, initialises every layer that is compatible
            with the ImageNet checkpoint from that checkpoint
        **kwargs: forwarded to ``ResNet`` (e.g. ``num_classes``)
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        _load_compatible_pretrained(model, 'resnet152')
    return model

In [10]:
# [wav root used for phase == 1 rows, root of the precomputed .npy features used for all other rows]
DATASET_PATH = ['/data/hktxt/AISHELL-2/iOS/data/', '/data/hktxt/2018-vad2-features/']
# TensorBoard events and the per-epoch model snapshots are both written here
LOG_PATH = '/data/hktxt/e/CN/logs/trans_2018_10'
EPOCH_NUM = 10

torch.backends.cudnn.deterministic = True
# batch size
B = 96

WEIGHT_DECAY = 5e-4
# LR_INIT is the value handed to the SGD constructor, but adjust_learning_rate()
# overwrites the optimizer lr with a value derived from LR at the start of every epoch.
LR_INIT = 1e-2
LR = 1e-1
LR_LAST = 1e-4
# lr scheduler parameter
# (per-epoch decay factor that takes LR_INIT to LR_LAST over EPOCH_NUM epochs;
#  its only consumer, the StepLR scheduler, is commented out)
gamma = 10 ** (np.log10(LR_LAST / LR_INIT) / (EPOCH_NUM - 1))
MOMENTUM = 0.9
#DEVICE = "5,6,7"
# "cuda:0" is the first of the GPUs made visible through CUDA_VISIBLE_DEVICES
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# number of DataLoader worker processes
NUM_WORKERS = 40
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)

In [11]:
def adjust_learning_rate(optimizer, epoch, base_lr=None):
    """Step-decay schedule: set the lr of every param group from the epoch index.

    The learning rate is ``base_lr`` up to and including epoch 20, then it is
    multiplied by 0.1 (epochs 21-40), 0.01 (epochs 41-60) and 0.001 (later).

    Args:
        optimizer: optimizer whose ``param_groups`` are updated in place.
        epoch: current epoch index.
        base_lr: learning rate of the first stage; defaults to the global ``LR``.
    """
    if base_lr is None:
        base_lr = LR

    if epoch <= 20:
        lr = base_lr
    elif epoch <= 40:
        lr = base_lr * 0.1
    elif epoch <= 60:
        lr = base_lr * 0.01
    else:
        lr = base_lr * 0.001
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [12]:
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first param group (``None`` if it has none)."""
    groups = iter(optimizer.param_groups)
    try:
        first_group = next(groups)
    except StopIteration:
        return None
    return first_group['lr']

In [13]:
# Build a ResNet-50 from scratch (pretrained=False); the snapshot-restoring code
# further down in this cell is disabled.
net = resnet50(pretrained=False, num_classes=2365)#1991+374 -- presumably the class counts of the two corpora; TODO confirm
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    net = nn.DataParallel(net)
    
#net.to(DEVICE);
# The triple-quoted block below is snapshot-loading code kept as an inert string
# literal; as the last expression of the cell it is echoed as the cell output.
"""
model_dict = net.state_dict()
pretrained_dict = torch.load('/data/hktxt/Condadev/voxpy/CN/logs/Res34_ori_1s_1/model_snapshot_30.pkl')
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
if pretrained_dict:
    model_dict.update(pretrained_dict)
    net.load_state_dict(model_dict)
    print('sucessed')
else:
    print('failed')
"""

Let's use 3 GPUs!


"\nmodel_dict = net.state_dict()\npretrained_dict = torch.load('/data/hktxt/Condadev/voxpy/CN/logs/Res34_ori_1s_1/model_snapshot_30.pkl')\npretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}\nif pretrained_dict:\n    model_dict.update(pretrained_dict)\n    net.load_state_dict(model_dict)\n    print('sucessed')\nelse:\n    print('failed')\n"

In [14]:
#net.module.fc1.weight

In [15]:
#net.module.fc2 = AngleLinear(net.module.fc2.in_features, 374)

In [16]:
# transforms1 is applied to phase == 1 items (spectrograms computed from wav):
# per-utterance normalisation followed by tensor conversion.
transforms1 = Compose([
    Normalize(),
    ToTensor()
])
# transforms2 is applied to the precomputed .npy features: tensor conversion only
# (presumably they are already normalised -- TODO confirm).
transforms2 = Compose([
    ToTensor()
])
# IdentificationDataset indexes this list with 0 for phase == 1 and 1 otherwise
transforms = [transforms1, transforms2]
net.to(DEVICE);

# The dataset already shuffles its rows once at construction; shuffle=True
# additionally reshuffles the order every epoch.
trainset = IdentificationDataset(DATASET_PATH, train=True, transform=transforms)
trainsetloader = torch.utils.data.DataLoader(trainset, batch_size=B, num_workers=NUM_WORKERS, shuffle=True)

#testset = IdentificationDataset(DATASET_PATH, train=False, transform=transforms)
#testsetloader = torch.utils.data.DataLoader(testset, batch_size=1, num_workers=NUM_WORKERS)

criterion = AngleLoss()
# The optimizer is created with LR_INIT, but the training loop calls
# adjust_learning_rate() at the start of every epoch, which replaces that value.
optimizer = optim.SGD(net.parameters(), LR_INIT, MOMENTUM, weight_decay=WEIGHT_DECAY)
#lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)
#lr_scheduler = CyclicCosAnnealingLR(optimizer,milestones=[30,80],eta_min=1e-6) #clr

In [17]:
trainset[1][1].shape

torch.Size([1, 512, 300])

In [18]:
trainset[1][0]

920

In [19]:
len(trainsetloader)

11626

In [20]:
# Main training loop: EPOCH_NUM passes over trainsetloader with the step-decay
# learning rate, logging to TensorBoard at every step and saving a snapshot of
# the model after every epoch.
train_start = time.time()
steps_per_epoch = len(trainsetloader)
for epoch_num in range(EPOCH_NUM):
    # the lr is set by hand once per epoch (no lr_scheduler object is in use)
    adjust_learning_rate(optimizer, epoch_num)

    # train
    print('Epoch {}/{}'.format(epoch_num+1, EPOCH_NUM))
    net.train()

    # tqdm wraps the loader itself (not the enumerate object) so that it knows
    # the number of batches and can show a progress bar with an ETA
    for iter_num, (labels, specs, mask) in enumerate(tqdm(trainsetloader)):
        optimizer.zero_grad()
        labels, specs, mask = labels.to(DEVICE), specs.to(DEVICE), mask.float().to(DEVICE)
        # the network returns (embedding, (cos_theta, phi_theta)); only the head output is needed
        _, scores = net(specs)
        loss = criterion(scores, labels, mask)
        loss.backward()
        optimizer.step()

        # TBoard
        step_num = epoch_num * steps_per_epoch + iter_num
        TBoard.add_scalar('gMetrics/train_loss', loss.item(), step_num)
        TBoard.add_scalar('gMetrics/lr', get_lr(optimizer), step_num)

    # save model every epoch
    torch.save(net.state_dict(), os.path.join(LOG_PATH, 'model_snapshot_{}.pkl'.format(epoch_num+1)))

train_end = time.time() - train_start
print('Training complete in {:.0f}m {:.0f}s'.format(
    train_end // 60, train_end % 60))

TBoard.close()
# loss of the very last batch only, not an epoch average
print('loss @ the end: {}'.format(round(loss.item(), 3)))

Epoch 1/10


11626it [6:07:45,  1.81it/s]


Epoch 2/10


11626it [3:43:19,  1.84it/s]


Epoch 3/10


11626it [3:28:00,  1.76it/s]


Epoch 4/10


11626it [3:27:35,  1.82it/s]


Epoch 5/10


11626it [3:26:56,  1.78it/s]


Epoch 6/10


11626it [3:25:55,  1.76it/s]


Epoch 7/10


11626it [3:35:25,  1.76it/s]


Epoch 8/10


11626it [3:31:01,  1.84it/s]


Epoch 9/10


11626it [3:27:06,  1.81it/s]


Epoch 10/10


11626it [3:25:54,  1.83it/s]


Training complete in 2259m 54s
loss @ the end: 0.796


In [21]:
net.module.fc2.weight

Parameter containing:
tensor([[ 0.0152, -0.0038,  0.0016,  ..., -0.0184,  0.0031,  0.0195],
        [ 0.0090, -0.0167, -0.0012,  ...,  0.0231,  0.0340, -0.0253],
        [ 0.0036,  0.0048,  0.0088,  ..., -0.0071, -0.0191,  0.0023],
        ...,
        [-0.0189, -0.0048, -0.0014,  ..., -0.0034,  0.0057, -0.0007],
        [ 0.0156, -0.0060,  0.0034,  ...,  0.0352, -0.0168,  0.0143],
        [-0.0031,  0.0017, -0.0050,  ..., -0.0121, -0.0018,  0.0260]],
       device='cuda:0', requires_grad=True)