# Training SqueezeNet on ImageNet

### References
* [Paper](https://arxiv.org/pdf/1602.07360.pdf)
* [PyTorch (reference) implementation](https://github.com/pytorch/vision/blob/master/torchvision/models/squeezenet.py)
* [Training ImageNet with PyTorch](https://github.com/pytorch/examples/tree/master/imagenet)
* [Python3 Profiling](https://docs.python.org/3/library/profile.html)
* [Issue with the pretrained models](https://github.com/DeepScale/SqueezeNet/issues/34)
* [SqueezeNet Neural Style](https://github.com/lizeng614/SqueezeNet-Neural-Style-Pytorch)
* [Converters](https://github.com/ysh329/deep-learning-model-convertor)

In [1]:
import os

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torch.nn.init as init

# Just some functions to average stuff, and save the model
from utils_pytorch import *

# Training parameters
# -- SGD optimizer settings
learning_rate = 0.04
momentum = 0.9
weight_decay = 1e-4
# -- number of passes over the training set and samples per batch
epochs = 2
batch_size = 64
# -- data-loader worker count and progress-print interval (in batches)
workers = 4
print_freq = 100
# -- dataset root; the 'train' and 'val' sub-folders are joined onto it later
#IMAGENET_PATH ='/mnt/eulbh-nas01/qa_analitics/Apical_CNN_training_data/ImageNet/ILSVRC/Data/DET'
IMAGENET_PATH = '/home/leoara01/work/IMAGENET/ILSVRC/Data/CLS-LOC'

### Fire Module
![title](FireModule.png)

In [2]:
class Fire(nn.Module):
    """Fire module: a 1x1 "squeeze" convolution feeding two parallel
    "expand" convolutions (1x1 and 3x3) whose outputs are concatenated.

    Args:
        inplanes: number of input channels.
        squeeze_planes: output channels of the squeeze convolution.
        expand1x1_planes: output channels of the 1x1 expand branch.
        expand3x3_planes: output channels of the 3x3 expand branch.

    The output has ``expand1x1_planes + expand3x3_planes`` channels. The 3x3
    branch uses ``padding=1`` so both branches keep the same spatial size and
    can be concatenated along the channel dimension.
    """

    def __init__(self, inplanes, squeeze_planes,
                 expand1x1_planes, expand3x3_planes):
        super(Fire, self).__init__()
        self.inplanes = inplanes

        # Squeeze stage: 1x1 convolution + ReLU
        self.squeeze = nn.Conv2d(inplanes, squeeze_planes, 1)
        self.squeeze_activation = nn.ReLU(inplace=True)

        # Expand stage, 1x1 branch
        self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, 1)
        self.expand1x1_activation = nn.ReLU(inplace=True)

        # Expand stage, 3x3 branch (padded to preserve height/width)
        self.expand3x3 = nn.Conv2d(
            squeeze_planes, expand3x3_planes, 3, padding=1)
        self.expand3x3_activation = nn.ReLU(inplace=True)

    def forward(self, x):
        """Squeeze ``x``, run both expand branches and join them on dim 1."""
        squeezed = self.squeeze_activation(self.squeeze(x))
        branch1x1 = self.expand1x1_activation(self.expand1x1(squeezed))
        branch3x3 = self.expand3x3_activation(self.expand3x3(squeezed))
        # Concatenate results
        return torch.cat((branch1x1, branch3x3), 1)

### Architecture v1.1
![title](SqueezeNetArch.png)

In [3]:
class SqueezeNet(nn.Module):
    """SqueezeNet (v1.1 layout) image classifier built from Fire modules.

    Args:
        num_classes: number of output classes; also the channel count of the
            final 1x1 convolution.

    ``forward`` maps a batch of 3-channel images to a ``(batch, num_classes)``
    tensor of class scores.
    """

    def __init__(self, num_classes=1000):
        super(SqueezeNet, self).__init__()
        self.num_classes = num_classes
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
            Fire(64, 16, 64, 64),
            Fire(128, 16, 64, 64),
            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
            Fire(128, 32, 128, 128),
            Fire(256, 32, 128, 128),
            nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True),
            Fire(256, 48, 192, 192),
            Fire(384, 48, 192, 192),
            Fire(384, 64, 256, 256),
            Fire(512, 64, 256, 256),
        )
        # Final convolution is initialized differently from the rest
        final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1)

        # Dropout, class-score convolution, ReLU and global average pool.
        # A fixed AvgPool2d(13) only collapses the map to 1x1 when the feature
        # map is exactly 13x13 (i.e. 224x224 inputs); adaptive pooling gives
        # the same result there and also handles other input resolutions.
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            final_conv,
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d(1),
        )

        # Initialize layers: small normal weights for the final convolution,
        # Kaiming-uniform for every other convolution, zero biases everywhere.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                if m is final_conv:
                    init.normal(m.weight.data, mean=0.0, std=0.01)
                else:
                    init.kaiming_uniform(m.weight.data)
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)`` for ``x``."""
        x = self.features(x)
        x = self.classifier(x)
        # The classifier ends in a 1x1 spatial map; drop the spatial dims.
        return x.view(x.size(0), self.num_classes)

### Initialize the model and move it to the GPU

In [4]:
# Build the network, wrap it in DataParallel and move it to the GPU in one step
model = torch.nn.DataParallel(SqueezeNet()).cuda()
#print(model.module)

### Define Loss

In [5]:
# Cross-entropy loss over the class scores, moved to the GPU
criterion = nn.CrossEntropyLoss()
criterion = criterion.cuda()

### Define solver (SGD)

In [6]:
# SGD with momentum and weight decay over all model parameters
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=learning_rate,
    momentum=momentum,
    weight_decay=weight_decay,
)

### Data loading specifics for ImageNet

In [7]:
# Data loading code: image folders for the two splits under IMAGENET_PATH
traindir = os.path.join(IMAGENET_PATH, 'train')
valdir = os.path.join(IMAGENET_PATH, 'val')
# Per-channel mean/std normalization, applied after ToTensor
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# Training pipeline: random 224x224 crop + random horizontal flip, shuffled.
# RandomResizedCrop is the current name of the deprecated RandomSizedCrop.
train_loader = torch.utils.data.DataLoader(
    datasets.ImageFolder(traindir, transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=batch_size, shuffle=True,
    num_workers=workers, pin_memory=True)

# Validation pipeline: deterministic resize to 256 then 224x224 center crop.
# Resize is the current name of the deprecated Scale.
val_loader = torch.utils.data.DataLoader(
    datasets.ImageFolder(valdir, transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=batch_size, shuffle=False,
    num_workers=workers, pin_memory=True)

### Train

In [None]:
def train(train_loader, model, criterion, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: network to optimize; it is switched to train mode.
        criterion: loss called as ``criterion(output, target)``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch index, used only in the progress printout.

    Every ``print_freq`` batches a progress line with timing, loss and
    top-1/top-5 precision (current value and running average) is printed.
    Returns nothing.
    """
    # Local import: the file never imports `time` explicitly.
    import time

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # `async` became a reserved word in Python 3.7, so `cuda(async=True)`
        # no longer parses; `non_blocking` is the replacement keyword.
        target = target.cuda(non_blocking=True)

        # compute output
        # NOTE(review): images are left on the host; presumably `model` is a
        # DataParallel wrapper that scatters them to the GPUs - confirm.
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        # .item() extracts the Python scalar; indexing a 0-dim loss with [0]
        # raises on current PyTorch.
        losses.update(loss.item(), images.size(0))
        top1.update(prec1[0], images.size(0))
        top5.update(prec5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1, top5=top5))

In [None]:
# Best top-1 validation precision seen so far. It has to exist before the
# first comparison inside the loop, otherwise the run dies with a NameError
# only after a complete epoch of training and validation.
best_prec1 = 0

for epoch in range(0, epochs):
    adjust_learning_rate(optimizer, epoch, learning_rate)

    # train for one epoch
    train(train_loader, model, criterion, optimizer, epoch)

    # evaluate on validation set
    prec1 = validate(val_loader, model, criterion)

    # remember best prec@1 and save checkpoint
    is_best = prec1 > best_prec1
    best_prec1 = max(prec1, best_prec1)
    save_checkpoint({
        'epoch': epoch + 1,
        # No `args` object is defined in any visible cell (there is no
        # argparse here), so the architecture name is recorded directly.
        'arch': 'squeezenet1_1',
        'state_dict': model.state_dict(),
        'best_prec1': best_prec1,
        'optimizer': optimizer.state_dict(),
    }, is_best)

Epoch: [0][0/20019]	Time 6.216 (6.216)	Data 3.115 (3.115)	Loss 7.1151 (7.1151)	Prec@1 0.000 (0.000)	Prec@5 0.000 (0.000)
Epoch: [0][100/20019]	Time 0.682 (0.863)	Data 0.189 (0.322)	Loss 6.9071 (6.9100)	Prec@1 0.000 (0.139)	Prec@5 1.562 (0.557)
Epoch: [0][200/20019]	Time 0.683 (0.846)	Data 0.000 (0.345)	Loss 6.9078 (6.9089)	Prec@1 0.000 (0.124)	Prec@5 0.000 (0.544)
Epoch: [0][300/20019]	Time 0.681 (0.849)	Data 0.000 (0.349)	Loss 6.9075 (6.9085)	Prec@1 0.000 (0.104)	Prec@5 0.000 (0.555)
Epoch: [0][400/20019]	Time 1.094 (0.843)	Data 0.864 (0.336)	Loss 6.9096 (6.9084)	Prec@1 0.000 (0.097)	Prec@5 0.000 (0.526)
Epoch: [0][500/20019]	Time 0.683 (0.846)	Data 0.386 (0.354)	Loss 6.9078 (6.9082)	Prec@1 0.000 (0.094)	Prec@5 0.000 (0.515)
Epoch: [0][600/20019]	Time 0.679 (0.846)	Data 0.000 (0.360)	Loss 6.9074 (6.9082)	Prec@1 0.000 (0.099)	Prec@5 0.000 (0.528)
Epoch: [0][700/20019]	Time 0.687 (0.848)	Data 0.000 (0.363)	Loss 6.9098 (6.9081)	Prec@1 0.000 (0.091)	Prec@5 0.000 (0.506)
Epoch: [0][800/200