## Import torch and model

In [1]:
import os

# Expose only GPU 0 to this process. This cell runs before the cell that
# imports torch, so the restriction is already set when torch is loaded.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

In [3]:
import sys
# Add the sibling "../common" directory to the module search path. The path is
# relative, so it resolves against the notebook's working directory.
# Presumably this is where model_generator.py and net.py live - confirm.
sys.path.append("../common")

# Project-local helpers used throughout the notebook: ModelGenerator builds the
# network and the replacement blocks, Net wraps a generated model.
from model_generator import ModelGenerator
from net import Net

## Set hyper params

In [4]:
# --- Data loading ---------------------------------------------------------
batch_size = 64                  # mini-batch size for the train and test loaders

# --- Options forwarded to ModelGenerator ----------------------------------
dropout_on, batchnorm_on = True, True

# --- Learning-rate schedule -----------------------------------------------
# StepLR step size; the schedulers below are stepped once per epoch.
scheduler_step_size = 20

# --- Stage 1: sequential (block-wise) recasting ----------------------------
num_epoch_recasting = 60
lr_recasting = 1e-3

# --- Stage 2: fine-tuning of the whole student -----------------------------
num_epoch_fine_tune = 100
lr_fine_tune = 1e-3

In [5]:
# Generator used to build the base network and the replacement blocks.
model_gen = ModelGenerator(dropout=dropout_on, batchnorm=batchnorm_on)
model_gen.CifarDensenetConfig(k=12, num_layers=100, cifar=100)

# Which blocks get recast, by block index:
#   0        -> conv layer
#   1, 3, 5  -> dense blocks
#   2, 4     -> transition blocks
recasting_block_indices = [1, 3, 5]
target_block_type = 'ResidualBlock'

# Compression ratio: multiplier applied to the filter counts of the recast
# blocks (a value of 1 leaves them unchanged).
compression_ratio = 1

# Checkpoint file paths
pretrained_model = './cifar100_densenet100_pretrained.pth'
compressed_model = './cifar100_densenet100_to_convenet.pth'

## Load dataset

In [6]:
# Per-channel normalisation shared by the evaluation and training pipelines.
# NOTE(review): these values look like the commonly quoted CIFAR-10 statistics;
# kept unchanged because the pre-trained weights presumably expect them - confirm.
_normalize = transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                                  std=(0.2023, 0.1994, 0.2010))

# Evaluation pipeline: tensor conversion followed by normalisation.
transform = transforms.Compose([transforms.ToTensor(), _normalize])

# Training pipeline: random horizontal flip and RandomCrop(32, 4) augmentation
# ahead of the same conversion and normalisation.
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, 4),
    transforms.ToTensor(),
    _normalize,
])

# CIFAR-100 splits (downloaded into ./data when missing).
trainset = torchvision.datasets.CIFAR100(
    root='./data', train=True, download=True, transform=transform_train)
testset = torchvision.datasets.CIFAR100(
    root='./data', train=False, download=True, transform=transform)

# Only the training loader shuffles.
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2)


Files already downloaded and verified
Files already downloaded and verified


## Load pre-trained model (teacher network)

In [7]:
# Build the network configured in model_gen and wrap it as the teacher.
model = model_gen.GetCifarDensenet()
teacher = Net(model)

# Restore the pre-trained weights from the checkpoint file.
state = torch.load(pretrained_model)
teacher.LoadFromStateDict(state)

teacher.Gpu()   # presumably moves the network to the GPU; inputs are sent with .cuda() below

# Baseline: top-1 accuracy of the teacher on the test split.
correct = 0
total = 0
teacher.TestMode()
for data in testloader:
    images, labels = data
    outputs = teacher(Variable(images.cuda()))
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    # int(...) keeps `correct` a plain Python integer whether .sum() hands back
    # a number or a one-element tensor, so the percentage below is ordinary
    # float arithmetic rather than (possibly truncating) tensor arithmetic.
    correct += int((predicted == labels.cuda()).sum())

print('Accuracy of the network on the 10000 test images: %4.2f %%' % (100.0 * correct / total))

Accuracy of the network on the 10000 test images: 95.20 %


## Define student network

In [8]:
# The student starts from the same generated architecture and the same
# pre-trained checkpoint as the teacher; its blocks are replaced later in the
# sequential-recasting cell.
model = model_gen.GetCifarDensenet()
student = Net(model)

# Reload the checkpoint from disk and load it into the student.
state = torch.load(pretrained_model)
student.LoadFromStateDict(state)

student.Gpu()

In [9]:
# Sanity check: top-1 test accuracy of the student before any block is recast.
correct = 0
total = 0
student.TestMode()
for data in testloader:
    images, labels = data
    outputs = student(Variable(images.cuda()))
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    # int(...) keeps `correct` a plain Python integer whether .sum() hands back
    # a number or a one-element tensor, so the percentage below is ordinary
    # float arithmetic rather than (possibly truncating) tensor arithmetic.
    correct += int((predicted == labels.cuda()).sum())

print('Accuracy of the network on the 10000 test images: %4.2f %%' % (100.0 * correct / total))

Accuracy of the network on the 10000 test images: 95.20 %


## Sequential recasting

In [10]:
# Sequential recasting: for every index in recasting_block_indices, replace
# that block of the student with a `target_block_type` block, rebuild the
# following block so its input size matches, and train the student to
# reproduce the teacher's output (MSE) with both networks called with
# next_block = block_idx + 2.

# define MSE loss
MSE = nn.MSELoss()

teacher.TestMode()

for block_idx in recasting_block_indices:

    ################################################    Recasting process ######################################################
    # current block recasting

    config = student.GetBlockConfig(block_idx)

    config[2] = round(config[2] * compression_ratio)    # apply compression ratio

    # Handling corner case: bottleneck block recasting
    # (a config with five entries is treated as a bottleneck block).
    is_bottleneck = len(config) == 5
    if is_bottleneck:
        mid_feature = config[4]
        # We reduce the output dimension of the bottleneck block: the output
        # dimension of the new block is the same as the output dimension of
        # the 3x3 conv in the bottleneck block.
        config[4] = round(mid_feature * compression_ratio)

    new_block = model_gen.GenNewBlock([target_block_type, config])
    source_block_type = config[0]    # remembered only for the log message below

    student.Recasting(block_idx, new_block)

    # next block recasting

    config = student.GetBlockConfig(block_idx + 1)

    config[1] = round(config[1] * compression_ratio)    # apply compression ratio

    # Handling corner case: bottleneck block recasting
    if is_bottleneck:
        # Change next input dim to output dim of target block
        config[1] = round(mid_feature * compression_ratio)

    new_block = model_gen.GenNewBlock([config[0], config])
    student.Recasting(block_idx + 1, new_block)

    ################################################    Recasting process end ##################################################

    # Called again because two blocks of the student were just replaced.
    student.Gpu()

    # Only the parameters returned by GetCurrParams(block_idx) are optimised.
    params = student.GetCurrParams(block_idx)

    optimizer = optim.Adam(params, lr = lr_recasting)
    scheduler = lr_scheduler.StepLR(optimizer, step_size = scheduler_step_size)

    print('\nBlock %d recasting is done (%s -> %s).' %(block_idx, source_block_type, target_block_type))
    print('Training start\n')
    for epoch in range(num_epoch_recasting):  # loop over the dataset multiple times

        running_loss = 0.0
        # NOTE(review): the scheduler is stepped at the start of the epoch,
        # which is the convention of older PyTorch releases. Newer releases
        # expect scheduler.step() after optimizer.step(). Left unchanged to
        # keep the learning-rate schedule as it was - confirm target version.
        scheduler.step()

        student.TrainMode()

        for i, data in enumerate(trainloader, 0):
            # Labels are not needed here: the loss only compares the student
            # with the teacher, so they are not copied to the GPU.
            inputs, _ = data

            # wrap the inputs in Variable
            inputs = Variable(inputs.cuda())

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            corrects = teacher(inputs, next_block= block_idx + 2)
            outputs = student(inputs, next_block = block_idx + 2)

            # Copy the teacher output's data so no gradient flows into the teacher.
            targets = Variable(corrects.data.clone())

            loss = MSE(outputs, targets)
            loss.backward()
            optimizer.step()

            # Incremental mean of the per-batch losses seen so far this epoch.
            running_loss = (running_loss * i + loss.cpu().data.numpy()) / (i+1)

        # Full-network test accuracy after this epoch.
        correct = 0
        total = 0
        student.TestMode()
        for data in testloader:
            images, labels = data
            outputs = student(Variable(images.cuda()))
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            # int(...) keeps `correct` a plain Python integer whether .sum()
            # hands back a number or a one-element tensor.
            correct += int((predicted == labels.cuda()).sum())

        test_acc = 100.0 * correct / total

        print('(%d/%d) epoch end, loss: %3.6f, Test Acc: %4.2f %%' %(epoch + 1, num_epoch_recasting, running_loss, test_acc))


print('\nSequential recasting is finished')


Block 1 recasting is done (DenseBlock -> ResidualBlock).
Training start

(1/60) epoch end, loss: 0.016885, Test Acc: 92.28 %
(2/60) epoch end, loss: 0.007277, Test Acc: 93.51 %
(3/60) epoch end, loss: 0.005723, Test Acc: 94.07 %
(4/60) epoch end, loss: 0.004950, Test Acc: 94.23 %
(5/60) epoch end, loss: 0.004424, Test Acc: 94.55 %
(6/60) epoch end, loss: 0.004003, Test Acc: 94.28 %
(7/60) epoch end, loss: 0.003642, Test Acc: 94.59 %
(8/60) epoch end, loss: 0.003345, Test Acc: 94.79 %
(9/60) epoch end, loss: 0.003108, Test Acc: 94.43 %
(10/60) epoch end, loss: 0.002938, Test Acc: 94.79 %
(11/60) epoch end, loss: 0.002801, Test Acc: 94.81 %
(12/60) epoch end, loss: 0.002684, Test Acc: 94.73 %
(13/60) epoch end, loss: 0.002588, Test Acc: 94.80 %
(14/60) epoch end, loss: 0.002501, Test Acc: 94.66 %
(15/60) epoch end, loss: 0.002417, Test Acc: 94.68 %
(16/60) epoch end, loss: 0.002348, Test Acc: 94.91 %
(17/60) epoch end, loss: 0.002290, Test Acc: 94.82 %
(18/60) epoch end, loss: 0.002234,

(32/60) epoch end, loss: 1.181832, Test Acc: 94.82 %
(33/60) epoch end, loss: 1.174728, Test Acc: 94.77 %
(34/60) epoch end, loss: 1.160874, Test Acc: 94.72 %
(35/60) epoch end, loss: 1.154601, Test Acc: 94.74 %
(36/60) epoch end, loss: 1.144249, Test Acc: 94.81 %
(37/60) epoch end, loss: 1.140575, Test Acc: 94.89 %
(38/60) epoch end, loss: 1.128954, Test Acc: 94.89 %
(39/60) epoch end, loss: 1.122969, Test Acc: 94.89 %
(40/60) epoch end, loss: 1.115632, Test Acc: 94.87 %
(41/60) epoch end, loss: 1.079099, Test Acc: 94.91 %
(42/60) epoch end, loss: 1.065370, Test Acc: 94.97 %
(43/60) epoch end, loss: 1.070482, Test Acc: 94.83 %
(44/60) epoch end, loss: 1.065442, Test Acc: 94.86 %
(45/60) epoch end, loss: 1.069893, Test Acc: 94.98 %
(46/60) epoch end, loss: 1.066901, Test Acc: 94.90 %
(47/60) epoch end, loss: 1.066294, Test Acc: 94.93 %
(48/60) epoch end, loss: 1.067490, Test Acc: 94.98 %
(49/60) epoch end, loss: 1.057463, Test Acc: 94.85 %
(50/60) epoch end, loss: 1.061201, Test Acc: 9

## Fine-tuning (KD + Cross-entropy)

In [11]:
# Fine-tuning of the whole student. The loss is the sum of
#   - an MSE term between the student's and the teacher's network outputs, and
#   - the cross-entropy between the student's outputs and the labels.
# `optim` and `lr_scheduler` come from the notebook's import cell.

# define loss functions
MSE = nn.MSELoss()
criterion = nn.CrossEntropyLoss()

# Optimise everything returned by student.GetTotalParams().
optimizer = optim.Adam(student.GetTotalParams(), lr = lr_fine_tune)
scheduler = lr_scheduler.StepLR(optimizer, step_size = scheduler_step_size)
teacher.TestMode()
student.Gpu()

print('Fine tuning start\n')

for epoch in range(num_epoch_fine_tune):  # loop over the dataset multiple times

    running_loss = 0.0
    # NOTE(review): the scheduler is stepped at the start of the epoch, which
    # is the convention of older PyTorch releases. Newer releases expect
    # scheduler.step() after optimizer.step(). Left unchanged to keep the
    # learning-rate schedule as it was - confirm target version.
    scheduler.step()
    student.TrainMode()
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data

        # wrap them in Variable
        inputs = Variable(inputs.cuda())
        labels = Variable(labels.cuda())

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        corrects = teacher(inputs)
        outputs = student(inputs)

        # Copy the teacher output's data so no gradient flows into the teacher.
        targets = Variable(corrects.data.clone())
        loss_KD = MSE(outputs, targets)
        loss_CE = criterion(outputs, labels)

        loss = loss_KD + loss_CE

        loss.backward()
        optimizer.step()

        # Incremental mean of the per-batch losses seen so far this epoch.
        running_loss = (running_loss * i + loss.cpu().data.numpy()) / (i+1)

    # Test accuracy after this epoch.
    correct = 0
    total = 0
    student.TestMode()
    for data in testloader:
        images, labels = data
        outputs = student(Variable(images.cuda()))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        # int(...) keeps `correct` a plain Python integer whether .sum() hands
        # back a number or a one-element tensor.
        correct += int((predicted == labels.cuda()).sum())

    print('(%d/%d) epoch end, loss: %3.6f, Test Acc: %4.2f %%' %(epoch + 1, num_epoch_fine_tune, running_loss, 100.0 * correct / total))

print('\nFine tuning is finished')

Fine tuning start

(1/100) epoch end, loss: 1.663972, Test Acc: 92.00 %
(2/100) epoch end, loss: 1.680449, Test Acc: 92.37 %
(3/100) epoch end, loss: 1.642483, Test Acc: 93.03 %
(4/100) epoch end, loss: 1.603471, Test Acc: 93.37 %
(5/100) epoch end, loss: 1.589487, Test Acc: 89.87 %
(6/100) epoch end, loss: 1.569978, Test Acc: 93.04 %
(7/100) epoch end, loss: 1.542852, Test Acc: 93.18 %
(8/100) epoch end, loss: 1.523194, Test Acc: 93.15 %
(9/100) epoch end, loss: 1.511104, Test Acc: 93.60 %
(10/100) epoch end, loss: 1.472328, Test Acc: 93.62 %
(11/100) epoch end, loss: 1.448380, Test Acc: 93.96 %
(12/100) epoch end, loss: 1.446963, Test Acc: 93.81 %
(13/100) epoch end, loss: 1.442419, Test Acc: 93.97 %
(14/100) epoch end, loss: 1.404044, Test Acc: 92.75 %
(15/100) epoch end, loss: 1.395079, Test Acc: 93.67 %
(16/100) epoch end, loss: 1.383039, Test Acc: 93.75 %
(17/100) epoch end, loss: 1.351895, Test Acc: 93.62 %
(18/100) epoch end, loss: 1.344397, Test Acc: 93.30 %
(19/100) epoch end

In [12]:
# Show the block-by-block layer listing of the student after recasting and
# fine-tuning (the recast blocks appear as 'ResidualBlock' in the output).
student.PrintBlocksDetail()

[[Conv2d(3, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
  'ConvBlock'],
 [Conv2d(24, 216, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
  BatchNorm2d(216, eps=1e-05, momentum=0.1, affine=True),
  ReLU(inplace),
  Conv2d(216, 216, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
  BatchNorm2d(216, eps=1e-05, momentum=0.1, affine=True),
  [Conv2d(24, 216, kernel_size=(1, 1), stride=(1, 1), bias=False),
   BatchNorm2d(216, eps=1e-05, momentum=0.1, affine=True)],
  ReLU(inplace),
  'ResidualBlock'],
 [Conv2d(216, 108, kernel_size=(1, 1), stride=(1, 1)), 'ConvBlock'],
 AvgPool2d(kernel_size=2, stride=2, padding=0, ceil_mode=False, count_include_pad=True),
 [Conv2d(108, 300, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
  BatchNorm2d(300, eps=1e-05, momentum=0.1, affine=True),
  ReLU(inplace),
  Conv2d(300, 300, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False),
  BatchNorm2d(300, eps=1e-05, momentum=0.1, affine=True