In [1]:
import argparse
import os, sys
import shutil
import time

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models
from utils import convert_secs2time, time_string, time_file_str
# from models import print_log
import models
import random
import numpy as np
from collections import OrderedDict

# Collect the lowercase, non-dunder callables exposed by the project-local
# `models` package, sorted alphabetically.
model_names = sorted(
    attr_name
    for attr_name, attr_value in vars(models).items()
    if attr_name.islower()
    and not attr_name.startswith("__")
    and callable(attr_value)
)

# Bare expression so the notebook renders the list as the cell output.
model_names

['alexnet',
 'caffe_cifar',
 'preresnet110',
 'preresnet20',
 'preresnet32',
 'preresnet44',
 'preresnet56',
 'resnet101',
 'resnet101_small',
 'resnet110',
 'resnet152',
 'resnet152_small',
 'resnet18',
 'resnet18_small',
 'resnet20',
 'resnet32',
 'resnet34',
 'resnet34_small',
 'resnet44',
 'resnet50',
 'resnet50_small',
 'resnet56',
 'vgg11',
 'vgg11_bn',
 'vgg13',
 'vgg13_bn',
 'vgg16',
 'vgg16_bn',
 'vgg19',
 'vgg19_bn']

In [3]:
# Environment report: prints the Python, PyTorch and cuDNN versions and the
# CUDA devices PyTorch can see. Lines starting with `!` are IPython shell
# escapes, so this cell only runs inside a notebook / IPython session.
!nvidia-smi

# NOTE(review): `sys` is already imported in the first cell; this import is redundant.
import sys
print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION')
from subprocess import call
# call(["nvcc", "--version"]) does not work
! nvcc --version
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
# The return code of this call is discarded.
call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print('Active CUDA Device: GPU', torch.cuda.current_device())

# These two lines repeat the device count / current device already printed above.
print ('Available devices ', torch.cuda.device_count())
print ('Current cuda device ', torch.cuda.current_device())

Wed Oct  7 18:22:22 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.126.02   Driver Version: 418.126.02   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   34C    P0    42W / 300W |     11MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   34C    P0    40W / 300W |     11MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   

In [4]:
from dotmap import DotMap

# Notebook replacement for the argparse namespace of the command-line script.
# NOTE: reading an attribute that was never assigned on a DotMap yields a new
# empty DotMap instead of raising AttributeError, so a misspelled option name
# is not reported - double-check names when adding options.
args = DotMap()

# NOTE(review): machine-specific absolute path; adjust for your environment.
args.data = '/home/hongky/datasets/imagenet'
args.save_dir = '0710_resnet101/resnet101-rate-0.7/'
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(args.save_dir, exist_ok=True)

args.arch = 'resnet101'
args.workers = 8
args.epochs = 10
args.start_epoch = 0
args.batch_size = 800
args.lr = 0.1
args.momentum = 0.9
args.weight_decay = 1e-4
args.print_freq = 200
args.resume='0510_resnet101/resnet101-rate-0.7/best.resnet101.2020-10-05-6841.pth.tar'

args.evaluate = False 
args.use_pretrain = True

# Equivalent command line of the original script:
# python pruning_train.py 
# -a resnet101 
# --save_dir ./snapshots/resnet101-rate-0.7 
# --rate 0.7 
# --layer_begin 0 
# --layer_end 309 
# --layer_inter 3  
# /path/to/Imagenet2012

# compress-rate
args.rate = 0.7
# Parameter indices layer_begin, layer_begin + layer_inter, ..., layer_end
# receive `rate`; see Mask.init_rate.
args.layer_begin = 0
args.layer_end = 309
args.layer_inter = 3

args.epoch_prune=1       # prune every `epoch_prune` epochs
args.skip_downsample=1   # 1 = leave the per-architecture skip list unpruned
args.use_sparse=False 
args.sparse=''           # checkpoint path read by import_sparse when use_sparse is set
args.lr_adjust=30        # learning rate is multiplied by 0.1 every `lr_adjust` epochs


args.use_cuda = torch.cuda.is_available()
args.prefix = time_file_str()

print(args)

DotMap(data='/home/hongky/datasets/imagenet', save_dir='0710_resnet101/resnet101-rate-0.7/', arch='resnet101', workers=8, epochs=10, start_epoch=0, batch_size=800, lr=0.1, momentum=0.9, weight_decay=0.0001, print_freq=200, resume='0510_resnet101/resnet101-rate-0.7/best.resnet101.2020-10-05-6841.pth.tar', evaluate=False, use_pretrain=True, rate=0.7, layer_begin=0, layer_end=309, layer_inter=3, epoch_prune=1, skip_downsample=1, use_sparse=False, sparse='', lr_adjust=30, use_cuda=True, prefix='2020-10-07-9786')


In [5]:


class Mask:
    """Builds and applies 0/1 pruning masks to a model's parameters.

    Parameters are addressed by their position in ``model.parameters()``.
    Expected call order: ``init_length()`` once, then ``init_mask(rate)``
    followed by ``do_mask()`` every time the model should be pruned.

    ``init_rate`` and ``if_zero`` read the notebook-level ``args`` object
    (``arch``, ``layer_begin``, ``layer_end``, ``layer_inter``,
    ``skip_downsample``).
    """

    # arch -> (exclusive upper bound of the masked parameter indices,
    #          parameter indices left unpruned when args.skip_downsample == 1)
    _RESNET_LAYOUT = {
        'resnet18': (60, (21, 36, 51)),
        'resnet34': (108, (27, 54, 93)),
        'resnet50': (159, (12, 42, 81, 138)),
        'resnet101': (312, (12, 42, 81, 291)),
        'resnet152': (465, (12, 42, 117, 444)),
    }

    def __init__(self, model):
        self.model_size = {}      # parameter index -> torch.Size
        self.model_length = {}    # parameter index -> number of elements
        self.compress_rate = {}   # parameter index -> rate passed to the codebook builders
        self.mat = {}             # parameter index -> flat mask tensor
        self.model = model
        self.mask_index = []      # parameter indices that get masked

    def get_codebook(self, weight_torch, compress_rate, length):
        """Return a flat 0/1 magnitude mask for ``weight_torch``.

        The threshold is the ``int(length * (1 - compress_rate))``-th smallest
        absolute value; entries whose magnitude reaches it are marked 1, all
        others 0.

        Args:
            weight_torch: tensor holding ``length`` elements.
            compress_rate: fraction used to place the threshold (see above).
            length: number of elements in ``weight_torch``.

        Returns:
            A new NumPy array of ``length`` zeros/ones with the weights' dtype.
            ``weight_torch`` itself is left untouched.
        """
        weight_np = weight_torch.detach().view(length).cpu().numpy()
        # np.abs allocates a fresh array; for CPU tensors `weight_np` shares
        # memory with the model weights, so it must never be written to.
        weight_abs = np.abs(weight_np)

        cut = max(int(length * (1 - compress_rate)), 0)
        if cut >= length:
            # Threshold would lie past the largest weight: nothing is kept.
            codebook = np.zeros_like(weight_np)
        else:
            threshold = np.sort(weight_abs)[cut]
            codebook = (weight_abs >= threshold).astype(weight_np.dtype)

        print("codebook done")
        return codebook

    def get_filter_codebook(self, weight_torch, compress_rate, length):
        """Return a flat 0/1 mask that zeroes whole filters with the smallest L2 norm.

        For a 4-D weight, ``int(num_filters * (1 - compress_rate))`` filters
        (slices along dim 0) with the lowest L2 norm are zeroed. Any other rank
        yields an all-ones mask.

        Args:
            weight_torch: weight tensor.
            compress_rate: fraction of filters that stays unmasked.
            length: number of elements of the returned mask.

        Returns:
            NumPy float array of ``length`` zeros/ones.
        """
        codebook = np.ones(length)
        if len(weight_torch.size()) != 4:
            return codebook

        num_filters = weight_torch.size()[0]
        filter_pruned_num = int(num_filters * (1 - compress_rate))
        norm2_np = torch.norm(weight_torch.view(num_filters, -1), 2, 1).cpu().numpy()
        kernel_length = weight_torch.size()[1] * weight_torch.size()[2] * weight_torch.size()[3]
        # argsort is ascending, so the leading entries are the weakest filters.
        for filter_idx in norm2_np.argsort()[:filter_pruned_num]:
            start = filter_idx * kernel_length
            codebook[start: start + kernel_length] = 0

        print("filter codebook done")
        return codebook

    def convert2tensor(self, x):
        """Convert an array-like mask to a ``torch.FloatTensor``."""
        return torch.FloatTensor(x)

    def init_length(self):
        """Record the shape and element count of every model parameter."""
        for index, item in enumerate(self.model.parameters()):
            self.model_size[index] = item.size()
            self.model_length[index] = item.numel()

    def init_rate(self, layer_rate):
        """Fill ``compress_rate`` and ``mask_index`` for the architecture in ``args.arch``.

        Args:
            layer_rate: rate assigned to the prunable layers.

        Raises:
            ValueError: ``args.arch`` contains "resnet" but has no entry in
                ``_RESNET_LAYOUT``.
        """
        if 'vgg' in args.arch:
            cfg_5x = [24, 22, 41, 51, 108, 89, 111, 184, 276, 228, 512, 512, 512]
            # cfg_official = [64, 64, 128, 128, 256, 256, 256, 512, 512, 512, 512, 512, 512]
            # cfg = [32, 64, 128, 128, 256, 256, 256, 256, 256, 256, 256, 256, 256]
            # init_mask() calls this before every pruning pass; start from an
            # empty list so indices are not appended again on each call.
            self.mask_index = []
            cfg_index = 0
            pre_cfg = True  # manual toggle: True takes per-layer rates from cfg_5x
            for index, (name, param) in enumerate(self.model.named_parameters()):
                self.compress_rate[index] = 1
                if len(param.size()) == 4:
                    print(param.size())
                    if not pre_cfg:
                        self.compress_rate[index] = layer_rate
                        self.mask_index.append(index)
                        print(name, "self.mask_index", self.mask_index)
                    else:
                        # NOTE(review): with this rate get_filter_codebook zeroes
                        # cfg_5x[cfg_index] filters of the layer - confirm that
                        # cfg_5x is meant as "filters to prune", not "to keep".
                        self.compress_rate[index] = 1 - cfg_5x[cfg_index] / param.size()[0]
                        self.mask_index.append(index)
                        print(name, "self.mask_index", self.mask_index, cfg_index, cfg_5x[cfg_index], param.size()[0],
                              )
                        cfg_index += 1
        elif "resnet" in args.arch:
            if args.arch not in self._RESNET_LAYOUT:
                raise ValueError(
                    "no pruning layout defined for architecture '{}'; supported: {}".format(
                        args.arch, sorted(self._RESNET_LAYOUT)))
            last_index, skip_list = self._RESNET_LAYOUT[args.arch]

            for index, item in enumerate(self.model.parameters()):
                self.compress_rate[index] = 1
            for key in range(args.layer_begin, args.layer_end + 1, args.layer_inter):
                self.compress_rate[key] = layer_rate
            self.mask_index = list(range(0, last_index, 3))
            # skip downsample layer
            if args.skip_downsample == 1:
                for x in skip_list:
                    self.compress_rate[x] = 1
                    self.mask_index.remove(x)
                    print(self.mask_index)

    def init_mask(self, layer_rate):
        """Recompute the filter mask of every parameter listed in ``mask_index``.

        Each mask is stored in ``self.mat`` on the same device as its parameter.
        """
        self.init_rate(layer_rate)
        masked = set(self.mask_index)
        for index, item in enumerate(self.model.parameters()):
            if index in masked:
                codebook = self.get_filter_codebook(item.data, self.compress_rate[index],
                                                    self.model_length[index])
                # Follow the parameter's device so do_mask() can multiply directly.
                self.mat[index] = self.convert2tensor(codebook).to(item.device)
        print("mask Ready")

    def do_mask(self):
        """Multiply every masked parameter by its stored mask."""
        masked = set(self.mask_index)
        for index, item in enumerate(self.model.parameters()):
            if index in masked:
                flat = item.data.view(self.model_length[index])
                item.data = (flat * self.mat[index]).view(self.model_size[index])
        print("mask Done")

    def if_zero(self):
        """Print nonzero / zero element counts for the parameters selected by
        ``args.layer_begin`` / ``args.layer_end`` / ``args.layer_inter``."""
        inspected = range(args.layer_begin, args.layer_end + 1, args.layer_inter)
        for index, item in enumerate(self.model.parameters()):
            #            if(index in self.mask_index):
            if index in inspected:
                b = item.data.view(self.model_length[index]).cpu().numpy()
                nonzero = np.count_nonzero(b)

                print("layer: %d, number of nonzero weight is %d, zero is %d" % (
                    index, nonzero, len(b) - nonzero))

In [6]:

def import_sparse(model):
    """Load the checkpoint at ``args.sparse`` into ``model`` and return the model.

    The checkpoint must contain a ``'state_dict'`` entry. A leading
    ``module.`` on a key is removed before loading; keys without that prefix
    are used unchanged.

    Args:
        model: module whose ``load_state_dict`` receives the renamed weights.

    Returns:
        The same ``model`` object, with the weights loaded.
    """
    # NOTE: torch.load unpickles the file - only point args.sparse at trusted
    # checkpoints. Loading onto the CPU first is enough because
    # load_state_dict copies the values into the model's own tensors.
    checkpoint = torch.load(args.sparse, map_location='cpu')
    prefix = 'module.'
    new_state_dict = OrderedDict()
    for k, v in checkpoint['state_dict'].items():
        # Strip the prefix only when present; slicing blindly would mangle
        # keys saved from an unwrapped model.
        name = k[len(prefix):] if k.startswith(prefix) else k
        new_state_dict[name] = v
    model.load_state_dict(new_state_dict)
    print("sparse_model_loaded")
    return model



def validate(val_loader, model, criterion, log):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 precision.

    Args:
        val_loader: iterable yielding ``(input, target)`` batches.
        model: network to evaluate; it is switched to eval mode.
        criterion: loss function called as ``criterion(output, target)``.
        log: writable file-like object passed to ``print_log``.

    Returns:
        Running average of Prec@1 over all batches (``top1.avg``).
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    # `Variable(..., volatile=True)` no longer disables autograd (the flag has
    # no effect since PyTorch 0.4), so use no_grad to avoid building graphs
    # during evaluation.
    with torch.no_grad():
        for i, (images, target) in enumerate(val_loader):
            # Only the target is moved explicitly; the input is passed as
            # loaded, exactly as before - presumably the DataParallel wrapper
            # scatters it to the GPUs (TODO confirm for unwrapped models).
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(prec1.item(), images.size(0))
            top5.update(prec5.item(), images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                print_log('Test: [{0}/{1}]\t'
                          'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                          'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                          'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                    i, len(val_loader), batch_time=batch_time, loss=losses,
                    top1=top1, top5=top5), log)

    print_log(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f} Error@1 {error1:.3f}'.format(top1=top1, top5=top5,
                                                                                           error1=100 - top1.avg), log)

    return top1.avg


def save_checkpoint(state, is_best, filename, bestname):
    """Serialize ``state`` to ``filename``; when ``is_best`` is truthy also
    copy that file to ``bestname``."""
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, bestname)


def print_log(print_string, log):
    """Echo ``print_string`` to stdout and append it, newline-terminated, to
    the file-like ``log``, flushing immediately."""
    rendered = "{:}".format(print_string)
    print(rendered)
    log.write(rendered + '\n')
    log.flush()


class AverageMeter(object):
    """Keeps the most recent value together with a running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, mean, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch):
    """Step decay: set every param group's lr to ``args.lr`` multiplied by
    0.1 once for each completed block of ``args.lr_adjust`` epochs."""
    completed_steps = epoch // args.lr_adjust
    decayed_lr = args.lr * (0.1 ** completed_steps)
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr


def accuracy(output, target, topk=(1,)):
    """Compute precision@k, in percent, for each k in ``topk``.

    Args:
        output: score tensor; the top-k is taken along dim 1.
        target: tensor of class indices, one per row of ``output``.
        topk: iterable of k values.

    Returns:
        List with one single-element tensor per k, in the order of ``topk``.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # reshape instead of view: `correct` can inherit the non-contiguous
        # layout of the transposed `pred`, and view(-1) raises on such tensors
        # in newer PyTorch releases.
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res



In [7]:

def train(train_loader, model, criterion, optimizer, epoch, log):
    """Run one training epoch over ``train_loader``.

    For each batch: forward pass, loss, top-1/top-5 bookkeeping, then one
    optimizer step. A progress line goes through ``print_log`` every
    ``args.print_freq`` batches. Nothing is returned.
    """
    batch_time, data_time = AverageMeter(), AverageMeter()
    losses, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()

    # switch to train mode
    model.train()

    num_batches = len(train_loader)
    tick = time.time()
    for step, (images, target) in enumerate(train_loader):
        # time spent waiting for the data loader
        data_time.update(time.time() - tick)

        target = target.cuda(non_blocking=True)
        samples = images.size(0)

        # forward pass
        output = model(images)
        loss = criterion(output, target)

        # bookkeeping on detached values
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.item(), samples)
        top1.update(prec1.item(), samples)
        top5.update(prec5.item(), samples)

        # backward pass and SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # wall-clock time for the whole batch
        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % args.print_freq != 0:
            continue
        progress = ('Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5 {top5.val:.3f} ({top5.avg:.3f})').format(
            epoch, step, num_batches, batch_time=batch_time,
            data_time=data_time, loss=losses, top1=top1, top5=top5)
        print_log(progress, log)


In [8]:

best_prec1 = 0

# Make sure the output directory exists, then open this run's log file in it.
if not os.path.isdir(args.save_dir):
    os.makedirs(args.save_dir)
log_path = os.path.join(args.save_dir, '{}.{}.log'.format(args.arch, args.prefix))
log = open(log_path, 'w')

# version information
for version_line in (
    "PyThon  version : {}".format(sys.version.replace('\n', ' ')),
    "PyTorch version : {}".format(torch.__version__),
    "cuDNN   version : {}".format(torch.backends.cudnn.version()),
    "Vision  version : {}".format(torchvision.__version__),
):
    print_log(version_line, log)


PyThon  version : 3.6.10 |Anaconda, Inc.| (default, May  8 2020, 02:54:21)  [GCC 7.3.0]
PyTorch version : 1.5.0
cuDNN   version : 7603
Vision  version : 0.6.0a0+82fd1c8


In [9]:
# create model
print_log("=> creating model '{}'".format(args.arch), log)
# Honour the configured flag: it is logged below as "Use Pre-Trained" but the
# factory call used to hard-code pretrained=True regardless of its value.
model = models.__dict__[args.arch](pretrained=bool(args.use_pretrain))
if args.use_sparse:
    model = import_sparse(model)
print_log("=> Model : {}".format(model), log)
print_log("=> parameter : {}".format(args), log)
print_log("Compress Rate: {}".format(args.rate), log)
print_log("Layer Begin: {}".format(args.layer_begin), log)
print_log("Layer End: {}".format(args.layer_end), log)
print_log("Layer Inter: {}".format(args.layer_inter), log)
print_log("Epoch prune: {}".format(args.epoch_prune), log)
print_log("Skip downsample : {}".format(args.skip_downsample), log)
print_log("Workers         : {}".format(args.workers), log)
print_log("Learning-Rate   : {}".format(args.lr), log)
print_log("Use Pre-Trained : {}".format(args.use_pretrain), log)
print_log("lr adjust : {}".format(args.lr_adjust), log)

# alexnet / vgg: only the `features` sub-module is wrapped in DataParallel;
# every other architecture is wrapped as a whole.
if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
    model.features = torch.nn.DataParallel(model.features)
    model.cuda()
else:
    model = torch.nn.DataParallel(model).cuda()
    
print('Model:: ', model)

=> creating model 'resnet101'
=> Model : ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d

Model::  DataParallel(
  (module): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequentia

In [10]:
# Loss function and Nesterov-momentum SGD over all model parameters.
criterion = nn.CrossEntropyLoss().cuda()

optimizer = torch.optim.SGD(
    model.parameters(),
    lr=args.lr,
    momentum=args.momentum,
    weight_decay=args.weight_decay,
    nesterov=True,
)

# Optional resume: restore epoch counter, best score, weights and optimizer
# state from the checkpoint file named by args.resume.
if args.resume and os.path.isfile(args.resume):
    print_log("=> loading checkpoint '{}'".format(args.resume), log)
    checkpoint = torch.load(args.resume)
    args.start_epoch = checkpoint['epoch']
    best_prec1 = checkpoint['best_prec1']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print_log("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']), log)
elif args.resume:
    # A path was configured but no file exists there: report it and carry on.
    print_log("=> no checkpoint found at '{}'".format(args.resume), log)

cudnn.benchmark = True

=> loading checkpoint '0510_resnet101/resnet101-rate-0.7/best.resnet101.2020-10-05-6841.pth.tar'
=> loaded checkpoint '0510_resnet101/resnet101-rate-0.7/best.resnet101.2020-10-05-6841.pth.tar' (epoch 10)


In [11]:
# Data loading: ImageFolder datasets under <args.data>/train and <args.data>/val.
traindir = os.path.join(args.data, 'train')
valdir = os.path.join(args.data, 'val')
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# Training pipeline: random 224 crop + horizontal flip augmentation.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])
train_dataset = datasets.ImageFolder(traindir, train_transform)

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=args.workers,
    pin_memory=True,
    sampler=None,
)

# Validation pipeline: deterministic resize to 256 followed by a 224 centre crop.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
val_dataset = datasets.ImageFolder(valdir, val_transform)

val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.workers,
    pin_memory=True,
)

In [12]:
# Checkpoint paths for this run; both are written by save_checkpoint in the
# training loop (bestname only when a new best score is reached).
filename = os.path.join(args.save_dir, 'checkpoint.{:}.{:}.pth.tar'.format(args.arch, args.prefix))
bestname = os.path.join(args.save_dir, 'best.{:}.{:}.pth.tar'.format(args.arch, args.prefix))

# `m` is reused by the training-loop cell, so its name must stay stable.
m = Mask(model)

# Record every parameter's shape and element count before any mask is built.
m.init_length()
print("-" * 10 + "one epoch begin" + "-" * 10)
print("the compression rate now is {:}".format(args.rate))

# Baseline: accuracy of the model as loaded, before the mask is applied.
val_acc_1 = validate(val_loader, model, criterion, log)

print(">>>>> Accuracy_origin_model: {:}".format(val_acc_1))

# Mask(model) above already stored this same object; the assignment is a no-op here.
m.model = model

# Build the filter masks for args.rate and zero the selected filters in place.
m.init_mask(args.rate)
# m.if_zero()
m.do_mask()
model = m.model
# m.if_zero()
if args.use_cuda:
    model = model.cuda()
# Accuracy right after masking, before any further training.
val_acc_2 = validate(val_loader, model, criterion, log)
print(">>>>> Accuracy_masked_model: {:}".format(val_acc_2))

----------one epoch begin----------
the compression rate now is 0.7




Test: [0/63]	Time 59.624 (59.624)	Loss 0.8042 (0.8042)	Prec@1 79.875 (79.875)	Prec@5 93.125 (93.125)
 * Prec@1 64.704 Prec@5 86.688 Error@1 35.296
>>>>> Accuracy_origin_model: 64.704
[0, 3, 6, 9, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99, 102, 105, 108, 111, 114, 117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153, 156, 159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192, 195, 198, 201, 204, 207, 210, 213, 216, 219, 222, 225, 228, 231, 234, 237, 240, 243, 246, 249, 252, 255, 258, 261, 264, 267, 270, 273, 276, 279, 282, 285, 288, 291, 294, 297, 300, 303, 306, 309]
[0, 3, 6, 9, 15, 18, 21, 24, 27, 30, 33, 36, 39, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99, 102, 105, 108, 111, 114, 117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153, 156, 159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192, 195, 198, 201, 204, 207, 210, 213, 216, 219, 222, 225, 228

In [18]:
# Main loop: per epoch, train -> validate -> (re)prune -> validate pruned -> checkpoint.
start_time = time.time()
epoch_time = AverageMeter()
print(args.start_epoch, args.epochs)
# NOTE(review): this overrides the epoch count set in the configuration cell.
# The resume step sets args.start_epoch from the checkpoint (10 in the recorded
# run), which equals the configured 10 epochs, so without this override the loop
# body would not execute. Consider moving the value into the configuration cell.
args.epochs=20
for epoch in range(args.start_epoch, args.epochs):
    adjust_learning_rate(optimizer, epoch)

    # Remaining-time estimate: duration of the most recent epoch (0 before the
    # first one finishes) multiplied by the number of epochs still to run.
    need_hour, need_mins, need_secs = convert_secs2time(epoch_time.val * (args.epochs - epoch))
    need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(need_hour, need_mins, need_secs)
    print_log(
        ' [{:s}] :: {:3d}/{:3d} ----- [{:s}] {:s}'.format(
            args.arch, epoch, args.epochs, time_string(), need_time),
        log)

    
    # 1. train for one epoch
    train(train_loader, model, criterion, optimizer, epoch, log)
    

    # 2. evaluate on validation set
    val_acc_1 = validate(val_loader, model, criterion, log)
    print('\n\n>>>>> Accuracy_model: ', val_acc_1)
    
    # 3. prune trained model - filter again after 1 training epoch
    # Runs every args.epoch_prune epochs and always on the final epoch.
    if (epoch % args.epoch_prune == 0 or epoch == args.epochs - 1):
        #        if (random.randint(1,args.epoch_prune)==1 or epoch == args.epochs-1):
        m.model = model
        # if_zero() prints per-layer zero counts before and after re-masking.
        m.if_zero()
        m.init_mask(args.rate)
        m.do_mask()
        m.if_zero()
        model = m.model
        if args.use_cuda:
            model = model.cuda()

    # 4. validate pruned model
    val_acc_2 = validate(val_loader, model, criterion, log)
    print('\n\n>>>>> Accuracy_pruned: ', val_acc_2)
    
    
    # remember best prec@1 and save checkpoint
    # The best score is tracked on the pruned model's accuracy (val_acc_2).
    is_best = val_acc_2 > best_prec1
    best_prec1 = max(val_acc_2, best_prec1)
    save_checkpoint({
        'epoch': epoch + 1,
        'arch': args.arch,
        'state_dict': model.state_dict(),
        'best_prec1': best_prec1,
        'optimizer': optimizer.state_dict(),
    }, is_best, filename, bestname)
    # measure elapsed time
    epoch_time.update(time.time() - start_time)
    start_time = time.time()
# NOTE(review): the log file is never closed; print_log flushes after every
# write, but the handle stays open until the kernel exits.
#log.close()

10 10
 [resnet101] ::  10/ 20 ----- [[2020-10-07 07:35:10]] [Need: 00:00:00]
Epoch: [10][0/1602]	Time 141.141 (141.141)	Data 135.446 (135.446)	Loss 1.5330 (1.5330)	Prec@1 64.250 (64.250)	Prec@5 84.125 (84.125)
Epoch: [10][200/1602]	Time 87.180 (14.090)	Data 86.601 (13.473)	Loss 1.6086 (1.5876)	Prec@1 60.375 (62.748)	Prec@5 83.500 (83.956)
Epoch: [10][400/1602]	Time 75.300 (13.808)	Data 74.720 (13.208)	Loss 1.3817 (1.5877)	Prec@1 65.500 (62.688)	Prec@5 87.750 (83.928)
Epoch: [10][600/1602]	Time 43.601 (13.474)	Data 43.018 (12.878)	Loss 1.6986 (1.5955)	Prec@1 59.750 (62.520)	Prec@5 82.500 (83.807)
Epoch: [10][800/1602]	Time 32.541 (13.156)	Data 31.957 (12.564)	Loss 1.6389 (1.6011)	Prec@1 62.500 (62.447)	Prec@5 83.125 (83.692)
Epoch: [10][1000/1602]	Time 18.822 (11.463)	Data 18.259 (10.872)	Loss 1.6665 (1.6042)	Prec@1 59.750 (62.389)	Prec@5 82.500 (83.623)




Epoch: [10][1200/1602]	Time 16.929 (10.204)	Data 16.348 (9.614)	Loss 1.6058 (1.6071)	Prec@1 63.000 (62.326)	Prec@5 82.125 (83.578)
Epoch: [10][1400/1602]	Time 13.106 (9.406)	Data 12.533 (8.816)	Loss 1.5680 (1.6088)	Prec@1 63.375 (62.286)	Prec@5 84.125 (83.544)
Epoch: [10][1600/1602]	Time 10.820 (8.870)	Data 10.246 (8.280)	Loss 1.6769 (1.6099)	Prec@1 60.625 (62.254)	Prec@5 81.250 (83.533)




Test: [0/63]	Time 36.474 (36.474)	Loss 0.8207 (0.8207)	Prec@1 78.625 (78.625)	Prec@5 93.500 (93.500)
 * Prec@1 62.930 Prec@5 85.536 Error@1 37.070


>>>>> Accuracy_model:  62.93
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of nonzero weight is 2880, zero is 1216
layer: 6, number of nonzero weight is 26325, zero is 10539
layer: 9, number of nonzero weight is 13774, zero is 2610
layer: 12, number of nonzero weight is 16384, zero is 0
layer: 15, number of nonzero weight is 11520, zero is 4864
layer: 18, number of nonzero weight is 26253, zero is 10611
layer: 21, number of nonzero weight is 14204, zero is 2180
layer: 24, number of nonzero weight is 11520, zero is 4864
layer: 27, number of nonzero weight is 27135, zero is 9729
layer: 30, number of nonzero weight is 15168, zero is 1216
layer: 33, number of nonzero weight is 23296, zero is 9472
layer: 36, number of nonzero weight is 121698, zero is 25758
layer: 39, number of nonzero weight is 55695, zero is 9841
l

filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebo

Test: [0/63]	Time 13.935 (13.935)	Loss 0.8210 (0.8210)	Prec@1 78.500 (78.500)	Prec@5 93.750 (93.750)
 * Prec@1 62.960 Prec@5 85.526 Error@1 37.040


>>>>> Accuracy_pruned:  62.96
 [resnet101] ::  11/ 20 ----- [[2020-10-07 11:37:01]] [Need: 36:16:43]
Epoch: [11][0/1602]	Time 53.038 (53.038)	Data 52.403 (52.403)	Loss 1.6127 (1.6127)	Prec@1 62.000 (62.000)	Prec@5 82.875 (82.875)
Epoch: [11][200/1602]	Time 16.365 (4.602)	Data 15.781 (4.012)	Loss 1.7565 (1.5709)	Prec@1 60.250 (62.973)	Prec@5 81.500 (84.075)
Epoch: [11][400/1602]	Time 26.087 (4.381)	Data 25.500 (3.791)	Loss 1.4293 (1.5846)	Prec@1 66.500 (62.686)	Prec@5 86.875 (83.890)
Epoch: [11][600/1602]	Time 27.644 (4.232)	Data 27.064 (3.642)	Loss 1.7185 (1.5925)	Prec@1 59.875 (62.540)	Prec@5 81.250 (83.785)
Epoch: [11][800/1602]	Time 20.880 (4.251)	Data 20.297 (3.661)	Loss 1.5523 (1.5970)	Prec@1 62.000 (62.507)	Prec@5 84.250 (83.705)
Epoch: [11][1000/1602]	Time 24.937 (4.180)	Data 24.334 (3.589)	Loss 1.6302 (1.6006)	Prec@1 62.875 (62.424



Epoch: [11][1600/1602]	Time 19.384 (4.517)	Data 18.809 (3.927)	Loss 1.7218 (1.6070)	Prec@1 61.125 (62.307)	Prec@5 82.250 (83.571)
Test: [0/63]	Time 31.456 (31.456)	Loss 0.8120 (0.8120)	Prec@1 78.000 (78.000)	Prec@5 94.000 (94.000)
 * Prec@1 63.318 Prec@5 85.776 Error@1 36.682


>>>>> Accuracy_model:  63.318
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of nonzero weight is 2880, zero is 1216
layer: 6, number of nonzero weight is 26325, zero is 10539
layer: 9, number of nonzero weight is 13773, zero is 2611
layer: 12, number of nonzero weight is 16384, zero is 0
layer: 15, number of nonzero weight is 11520, zero is 4864
layer: 18, number of nonzero weight is 26253, zero is 10611
layer: 21, number of nonzero weight is 14204, zero is 2180
layer: 24, number of nonzero weight is 11520, zero is 4864
layer: 27, number of nonzero weight is 27128, zero is 9736
layer: 30, number of nonzero weight is 15168, zero is 1216
layer: 33, number of nonzero weight is 23296, zer

filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebo

Epoch: [12][800/1602]	Time 33.524 (3.909)	Data 32.943 (3.310)	Loss 1.5727 (1.5908)	Prec@1 62.125 (62.600)	Prec@5 84.125 (83.828)
Epoch: [12][1000/1602]	Time 32.844 (4.194)	Data 32.262 (3.596)	Loss 1.5635 (1.5952)	Prec@1 62.375 (62.528)	Prec@5 83.000 (83.754)
Epoch: [12][1200/1602]	Time 41.190 (4.456)	Data 40.603 (3.859)	Loss 1.7432 (1.5995)	Prec@1 58.750 (62.417)	Prec@5 81.500 (83.690)
Epoch: [12][1400/1602]	Time 30.832 (4.620)	Data 30.268 (4.025)	Loss 1.5620 (1.6016)	Prec@1 64.125 (62.389)	Prec@5 84.500 (83.660)




Epoch: [12][1600/1602]	Time 34.392 (4.658)	Data 33.816 (4.063)	Loss 1.5854 (1.6050)	Prec@1 62.000 (62.328)	Prec@5 84.875 (83.618)
Test: [0/63]	Time 31.752 (31.752)	Loss 0.7587 (0.7587)	Prec@1 80.375 (80.375)	Prec@5 94.000 (94.000)
 * Prec@1 64.360 Prec@5 86.596 Error@1 35.640


>>>>> Accuracy_model:  64.36
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of nonzero weight is 2880, zero is 1216
layer: 6, number of nonzero weight is 26317, zero is 10547
layer: 9, number of nonzero weight is 13772, zero is 2612
layer: 12, number of nonzero weight is 16384, zero is 0
layer: 15, number of nonzero weight is 11520, zero is 4864
layer: 18, number of nonzero weight is 26253, zero is 10611
layer: 21, number of nonzero weight is 14167, zero is 2217
layer: 24, number of nonzero weight is 11520, zero is 4864
layer: 27, number of nonzero weight is 27020, zero is 9844
layer: 30, number of nonzero weight is 15168, zero is 1216
layer: 33, number of nonzero weight is 23296, zero

filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebo



Epoch: [13][200/1602]	Time 23.876 (4.444)	Data 23.253 (3.851)	Loss 1.6730 (1.5659)	Prec@1 59.875 (63.179)	Prec@5 82.750 (84.083)
Epoch: [13][400/1602]	Time 17.940 (4.131)	Data 17.377 (3.539)	Loss 1.4916 (1.5791)	Prec@1 64.375 (62.909)	Prec@5 85.125 (83.948)
Epoch: [13][600/1602]	Time 26.608 (4.354)	Data 26.021 (3.764)	Loss 1.6165 (1.5843)	Prec@1 63.250 (62.808)	Prec@5 83.250 (83.873)
Epoch: [13][800/1602]	Time 13.682 (4.548)	Data 13.119 (3.958)	Loss 1.4913 (1.5906)	Prec@1 64.375 (62.705)	Prec@5 84.125 (83.787)
Epoch: [13][1000/1602]	Time 17.905 (4.606)	Data 17.314 (4.016)	Loss 1.5192 (1.5943)	Prec@1 63.375 (62.613)	Prec@5 85.000 (83.762)
Epoch: [13][1200/1602]	Time 16.063 (4.692)	Data 15.479 (4.102)	Loss 1.6475 (1.5980)	Prec@1 63.625 (62.535)	Prec@5 82.125 (83.720)
Epoch: [13][1400/1602]	Time 10.068 (4.632)	Data 9.451 (4.042)	Loss 1.7837 (1.6004)	Prec@1 58.125 (62.483)	Prec@5 81.000 (83.692)
Epoch: [13][1600/1602]	Time 3.241 (4.576)	Data 2.686 (3.986)	Loss 1.5682 (1.6023)	Prec@1 61.625

filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebo

Epoch: [14][800/1602]	Time 32.207 (5.169)	Data 31.623 (4.577)	Loss 1.3856 (1.5899)	Prec@1 66.625 (62.587)	Prec@5 87.000 (83.832)
Epoch: [14][1000/1602]	Time 26.957 (5.175)	Data 26.375 (4.585)	Loss 1.5926 (1.5909)	Prec@1 63.000 (62.585)	Prec@5 83.875 (83.838)
Epoch: [14][1200/1602]	Time 26.585 (5.282)	Data 26.012 (4.693)	Loss 1.5864 (1.5941)	Prec@1 62.750 (62.527)	Prec@5 84.125 (83.776)




Epoch: [14][1400/1602]	Time 21.080 (5.299)	Data 20.497 (4.710)	Loss 1.5770 (1.5980)	Prec@1 64.000 (62.439)	Prec@5 84.625 (83.729)
Epoch: [14][1600/1602]	Time 28.930 (5.329)	Data 28.357 (4.739)	Loss 1.6214 (1.6013)	Prec@1 63.500 (62.361)	Prec@5 84.250 (83.692)
Test: [0/63]	Time 30.085 (30.085)	Loss 0.8313 (0.8313)	Prec@1 78.125 (78.125)	Prec@5 93.250 (93.250)
 * Prec@1 63.202 Prec@5 85.764 Error@1 36.798


>>>>> Accuracy_model:  63.202
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of nonzero weight is 2880, zero is 1216
layer: 6, number of nonzero weight is 26200, zero is 10664
layer: 9, number of nonzero weight is 13751, zero is 2633
layer: 12, number of nonzero weight is 16384, zero is 0
layer: 15, number of nonzero weight is 11520, zero is 4864
layer: 18, number of nonzero weight is 26253, zero is 10611
layer: 21, number of nonzero weight is 14093, zero is 2291
layer: 24, number of nonzero weight is 11520, zero is 4864
layer: 27, number of nonzero weight i

filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
mask Ready
mask Done
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of nonzero weight is 2880, zero is 1216
layer: 6, number of nonzero weight is 25749, zero is 11115
layer: 9, number of nonzero weight is 11394, zero is 4990
layer: 12, nu



Epoch: [15][600/1602]	Time 18.527 (5.421)	Data 17.943 (4.830)	Loss 1.5719 (1.5815)	Prec@1 64.000 (62.747)	Prec@5 85.500 (83.921)
Epoch: [15][800/1602]	Time 17.522 (5.095)	Data 16.932 (4.504)	Loss 1.5474 (1.5858)	Prec@1 61.875 (62.656)	Prec@5 84.000 (83.866)
Epoch: [15][1000/1602]	Time 14.725 (4.871)	Data 14.115 (4.281)	Loss 1.5950 (1.5907)	Prec@1 64.750 (62.590)	Prec@5 83.250 (83.813)
Epoch: [15][1200/1602]	Time 16.857 (4.693)	Data 16.274 (4.103)	Loss 1.5934 (1.5924)	Prec@1 62.250 (62.571)	Prec@5 83.375 (83.805)
Epoch: [15][1400/1602]	Time 7.401 (4.549)	Data 6.821 (3.959)	Loss 1.7751 (1.5963)	Prec@1 61.375 (62.496)	Prec@5 80.875 (83.736)
Epoch: [15][1600/1602]	Time 11.263 (4.499)	Data 10.691 (3.909)	Loss 1.5732 (1.5983)	Prec@1 61.250 (62.451)	Prec@5 84.750 (83.708)
Test: [0/63]	Time 28.337 (28.337)	Loss 0.8746 (0.8746)	Prec@1 78.250 (78.250)	Prec@5 91.750 (91.750)
 * Prec@1 62.870 Prec@5 85.244 Error@1 37.130


>>>>> Accuracy_model:  62.87
layer: 0, number of nonzero weight is 6615, ze

filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
mask Ready
mask Done
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of

Epoch: [16][1000/1602]	Time 18.615 (4.388)	Data 17.955 (3.787)	Loss 1.6181 (1.5906)	Prec@1 62.125 (62.637)	Prec@5 81.875 (83.784)
Epoch: [16][1200/1602]	Time 20.492 (4.668)	Data 19.905 (4.068)	Loss 1.6330 (1.5939)	Prec@1 62.000 (62.579)	Prec@5 83.750 (83.732)




Epoch: [16][1400/1602]	Time 15.038 (4.928)	Data 14.387 (4.328)	Loss 1.5526 (1.5948)	Prec@1 65.375 (62.568)	Prec@5 84.500 (83.718)
Epoch: [16][1600/1602]	Time 12.235 (4.956)	Data 11.644 (4.357)	Loss 1.7463 (1.5969)	Prec@1 59.250 (62.520)	Prec@5 81.125 (83.683)
Test: [0/63]	Time 32.584 (32.584)	Loss 0.8097 (0.8097)	Prec@1 78.875 (78.875)	Prec@5 95.000 (95.000)
 * Prec@1 64.732 Prec@5 86.690 Error@1 35.268


>>>>> Accuracy_model:  64.732
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of nonzero weight is 2880, zero is 1216
layer: 6, number of nonzero weight is 25749, zero is 11115
layer: 9, number of nonzero weight is 13638, zero is 2746
layer: 12, number of nonzero weight is 16384, zero is 0
layer: 15, number of nonzero weight is 11520, zero is 4864
layer: 18, number of nonzero weight is 26244, zero is 10620
layer: 21, number of nonzero weight is 13977, zero is 2407
layer: 24, number of nonzero weight is 11520, zero is 4864
layer: 27, number of nonzero weight i

filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
mask Ready
mask Done
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of nonzero weight is 2880, zero is 1216
layer: 6, number of nonzero weight is 25749, zero is 11115
layer: 9, number of nonzero weight is 11412, zero is 4972
layer: 12, number of nonzero weight is 16384, zero is 0
layer: 15, number of



Epoch: [17][1200/1602]	Time 18.832 (4.756)	Data 18.162 (4.147)	Loss 1.7096 (1.5897)	Prec@1 59.750 (62.675)	Prec@5 81.125 (83.873)
Epoch: [17][1400/1602]	Time 14.180 (4.682)	Data 13.584 (4.072)	Loss 1.5832 (1.5920)	Prec@1 62.000 (62.632)	Prec@5 85.250 (83.835)
Epoch: [17][1600/1602]	Time 18.465 (4.723)	Data 17.807 (4.112)	Loss 1.5863 (1.5949)	Prec@1 62.125 (62.555)	Prec@5 85.125 (83.783)
Test: [0/63]	Time 35.987 (35.987)	Loss 0.7111 (0.7111)	Prec@1 81.875 (81.875)	Prec@5 95.125 (95.125)
 * Prec@1 65.326 Prec@5 86.906 Error@1 34.674


>>>>> Accuracy_model:  65.326
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of nonzero weight is 2880, zero is 1216
layer: 6, number of nonzero weight is 25749, zero is 11115
layer: 9, number of nonzero weight is 13628, zero is 2756
layer: 12, number of nonzero weight is 16384, zero is 0
layer: 15, number of nonzero weight is 11520, zero is 4864
layer: 18, number of nonzero weight is 26244, zero is 10620
layer: 21, number of nonz

filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebo



Epoch: [18][600/1602]	Time 17.078 (4.083)	Data 16.398 (3.470)	Loss 1.4985 (1.5774)	Prec@1 64.750 (62.905)	Prec@5 85.250 (84.032)
Epoch: [18][800/1602]	Time 19.250 (4.039)	Data 18.656 (3.425)	Loss 1.6469 (1.5818)	Prec@1 63.625 (62.838)	Prec@5 82.875 (83.975)
Epoch: [18][1000/1602]	Time 22.508 (4.045)	Data 21.830 (3.432)	Loss 1.7343 (1.5845)	Prec@1 59.125 (62.769)	Prec@5 82.625 (83.937)
Epoch: [18][1200/1602]	Time 26.940 (4.078)	Data 26.333 (3.466)	Loss 1.5797 (1.5879)	Prec@1 62.625 (62.697)	Prec@5 84.250 (83.879)
Epoch: [18][1400/1602]	Time 34.659 (4.132)	Data 33.975 (3.520)	Loss 1.6334 (1.5916)	Prec@1 63.750 (62.615)	Prec@5 82.375 (83.815)
Epoch: [18][1600/1602]	Time 15.795 (4.094)	Data 15.200 (3.483)	Loss 1.6106 (1.5930)	Prec@1 62.625 (62.594)	Prec@5 83.750 (83.790)
Test: [0/63]	Time 34.458 (34.458)	Loss 0.7399 (0.7399)	Prec@1 82.125 (82.125)	Prec@5 93.375 (93.375)
 * Prec@1 64.552 Prec@5 86.542 Error@1 35.448


>>>>> Accuracy_model:  64.552
layer: 0, number of nonzero weight is 6615,

filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
mask Ready
mask Done
layer: 0, number of nonzero weight

Epoch: [19][1000/1602]	Time 8.593 (3.072)	Data 8.001 (2.457)	Loss 1.6919 (1.5834)	Prec@1 61.125 (62.718)	Prec@5 81.750 (83.923)
Epoch: [19][1200/1602]	Time 8.766 (3.121)	Data 8.075 (2.506)	Loss 1.5748 (1.5856)	Prec@1 62.375 (62.672)	Prec@5 84.500 (83.896)




Epoch: [19][1400/1602]	Time 3.193 (3.145)	Data 2.605 (2.530)	Loss 1.5429 (1.5895)	Prec@1 61.875 (62.611)	Prec@5 84.625 (83.849)
Epoch: [19][1600/1602]	Time 1.955 (3.124)	Data 1.290 (2.510)	Loss 1.6276 (1.5926)	Prec@1 61.375 (62.552)	Prec@5 84.250 (83.806)
Test: [0/63]	Time 30.115 (30.115)	Loss 0.7353 (0.7353)	Prec@1 80.250 (80.250)	Prec@5 94.500 (94.500)
 * Prec@1 62.950 Prec@5 85.584 Error@1 37.050


>>>>> Accuracy_model:  62.95
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of nonzero weight is 2880, zero is 1216
layer: 6, number of nonzero weight is 25749, zero is 11115
layer: 9, number of nonzero weight is 13564, zero is 2820
layer: 12, number of nonzero weight is 16384, zero is 0
layer: 15, number of nonzero weight is 11520, zero is 4864
layer: 18, number of nonzero weight is 26244, zero is 10620
layer: 21, number of nonzero weight is 13844, zero is 2540
layer: 24, number of nonzero weight is 11520, zero is 4864
layer: 27, number of nonzero weight is 254

filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
filter codebook done
mask Ready
mask Done
layer: 0, number of nonzero weight is 6615, zero is 2793
layer: 3, number of nonzero weight is 2880, zero is 1216
layer: 6, number of nonzero weight is 25749, zero is 11115
layer: 9, number of nonzero w

In [5]:
# Print a snapshot of every GPU's state (temperature, power draw, memory
# usage, utilisation) using the NVIDIA command-line tool. The leading `!`
# is IPython's shell escape, so the command's output lands in this cell.
# NOTE(review): cell In [3] already runs the same command, and the output
# saved under this cell carries an earlier timestamp than In [3]'s output,
# which suggests the cells were executed out of order -- confirm with a
# Restart & Run All and drop this cell if the second snapshot is not needed.
!nvidia-smi


Wed Oct  7 17:33:11 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.126.02   Driver Version: 418.126.02   CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   36C    P0    58W / 300W |  21124MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   34C    P0    40W / 300W |     11MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   