In [1]:
import torch
import torchvision
from torch import nn, optim
from torch.nn import functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, utils
from torchvision import datasets
from torchvision.utils import save_image

import skimage 
import math
# import io
# import requests
# from PIL import Image

import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import sys
import os


Bibliography:

* [Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction](http://people.idsia.ch/~ciresan/data/icann2011.pdf)
    

In [2]:
# Mostly Taken from examples here:
# https://github.com/pytorch/examples/blob/master/mnist/main.py
# https://github.com/csgwon/pytorch-deconvnet/blob/master/models/vgg16_deconv.py
# Other resources
# https://github.com/pgtgrly/Convolution-Deconvolution-Network-Pytorch/blob/master/conv_deconv.py
# https://github.com/kvfrans/variational-autoencoder
# https://github.com/SherlockLiao/pytorch-beginner/blob/master/08-AutoEncoder/conv_autoencoder.py
# https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/convolutional_neural_network/main-gpu.py
# https://pgaleone.eu/neural-networks/2016/11/24/convolutional-autoencoders/

In [5]:
class CAEEncoder(nn.Module):
    """
    The Encoder = Q(z|X) for the Network

    Stacks ``levels`` blocks of
    ``Conv2d -> ReLU -> Conv2d -> ReLU -> MaxPool2d(2, stride=2)``.
    The number of feature maps starts at ``first_feature_count`` and doubles
    at every level. The max-pool indices of the most recent forward pass are
    kept in ``self.indices`` (one entry per level, shallowest first) so that a
    decoder can undo the pooling with max-unpooling.

    Args:
        w, h: width and height (in pixels) of the input images.
        channels: number of channels of the input images.
        levels: number of conv/pool blocks.
        kernel_size: size of the convolution kernels; padding is
            ``kernel_size // 2`` so odd kernels keep the spatial size.
        first_feature_count: feature maps produced by the first block.

    Attributes:
        l_features: ``[channels, f, 2f, 4f, ...]``, the channel count entering
            block ``i`` is ``l_features[i]`` and leaving it ``l_features[i+1]``.
        conv_dim: number of values per sample in the encoder output
            (pooled width * pooled height * last feature count).
    """
    def __init__(self, w,h, channels=3, levels=2, kernel_size=3, first_feature_count=16):
        super(CAEEncoder, self).__init__()
        # The original code read the undefined names `width`/`height` here.
        self.width = w
        self.height = h
        self.heigth = h  # misspelled attribute name kept for backward compatibility
        self.channels = channels
        self.levels = levels
        self.kernel_size = kernel_size
        self.first_feature_count = first_feature_count

        # Filled in by forward(); replaced by a fresh list on every call.
        self.indices = []

        padding = kernel_size // 2

        # One entry per level, so l_features[-1] is the channel count that the
        # last block really produces (conv_dim below relies on this).
        self.l_features = [channels]
        for i in range(levels):
            self.l_features.append(first_feature_count * (2 ** i))

        # nn.ModuleList (not a plain list) so the convolution weights are
        # registered and show up in .parameters(), .cuda(), state_dict(), ...
        self.layers = nn.ModuleList()
        for i in range(levels):
            nfeat = self.l_features[i + 1]
            layer = nn.Sequential(
                nn.Conv2d(self.l_features[i], nfeat, kernel_size=kernel_size, padding=padding),
                nn.ReLU(),
                nn.Conv2d(nfeat, nfeat, kernel_size=kernel_size, padding=padding),
                nn.ReLU(),
                # Last module of the Sequential, so the block returns (output, indices).
                nn.MaxPool2d(2, stride=2, return_indices=True)
            )
            self.layers.append(layer)

        # Each pooling step halves (floor) both spatial dimensions.
        scale = 2 ** levels
        self.conv_dim = (w // scale) * (h // scale) * self.l_features[-1]

    def forward(self, x):
        """Encode ``x`` and remember the pooling indices of every level."""
        self.indices = []
        out = x
        for layer in self.layers:
            out, idx = layer(out)
            self.indices.append(idx)
        return out

In [4]:
class CAEDecoder(torch.nn.Module):
    """
    The Decoder = P(X|z) for the Network

    Mirror of the encoder: for every level it max-unpools (2x2, stride 2) and
    then applies ``ConvTranspose2d -> ReLU -> ConvTranspose2d -> ReLU``; the
    very last activation is ``Tanh`` instead of ``ReLU``. The number of
    feature maps halves at every level until ``channels`` is reached.

    Args:
        maxpool_indices: where the max-pool indices come from. Either an
            object exposing an ``indices`` attribute (e.g. the encoder, whose
            list is re-read on every forward pass) or directly a sequence of
            index tensors ordered shallowest level first.
        width, height: size in pixels of the images to reconstruct.
        channels: number of channels of the reconstructed image.
        levels: number of unpool/deconv blocks.
        kernel_size: size of the transposed-convolution kernels; padding is
            ``kernel_size // 2`` so odd kernels keep the spatial size.
        first_feature_count: feature maps of the shallowest level.
    """
    def __init__(self, maxpool_indices, width, height, channels=3, levels=2, kernel_size=3, first_feature_count=16):
        super(CAEDecoder, self).__init__()
        padding = kernel_size // 2
        self.width = width
        self.height = height
        self.heigth = height  # misspelled attribute name kept for backward compatibility
        self.channels = channels
        self.levels = levels
        self.kernel_size = kernel_size
        self.first_feature_count = first_feature_count

        # Wrapped in a tuple on purpose: assigning an nn.Module directly would
        # register the encoder as a sub-module of the decoder and duplicate its
        # weights in the decoder's state_dict. Pickling the decoder still
        # follows this reference.
        self._indices_source = (maxpool_indices,)

        # [channels, f, 2f, ...]: l_features[k] channels exist at depth k.
        self.l_features = [channels]
        for i in range(levels):
            self.l_features.append(first_feature_count * (2 ** i))

        # Number of values per sample expected at the decoder input.
        scale = 2 ** levels
        self.conv_dim = (width // scale) * (height // scale) * self.l_features[-1]

        # layers[0] is the deepest block; nn.ModuleList registers the weights.
        self.layers = nn.ModuleList()
        for i in range(levels):
            in_feat = self.l_features[levels - i]
            out_feat = self.l_features[levels - i - 1]
            is_last = (i == levels - 1)
            self.layers.append(nn.Sequential(
                nn.ConvTranspose2d(in_feat, in_feat, kernel_size=kernel_size, padding=padding),
                nn.ReLU(),
                nn.ConvTranspose2d(in_feat, out_feat, kernel_size=kernel_size, padding=padding),
                nn.Tanh() if is_last else nn.ReLU()
            ))

    @property
    def indices(self):
        """Current pooling indices, shallowest level first."""
        source = self._indices_source[0]
        # An encoder-like object rebuilds its list on every forward pass, so
        # the attribute is looked up lazily instead of being cached.
        return getattr(source, "indices", source)

    def forward(self, x):
        """Decode ``x`` using the pooling indices of the matching encoder pass."""
        indices = self.indices
        if len(indices) < self.levels:
            raise RuntimeError(
                "CAEDecoder needs {} sets of max-pool indices but only {} are available; "
                "run the encoder first".format(self.levels, len(indices)))
        out = x
        for i in range(self.levels):
            rev_i = -(i + 1)  # the deepest pooling step is undone first
            out = F.max_unpool2d(out, indices[rev_i], 2, stride=2)
            out = self.layers[i](out)
        return out

In [5]:
class CAE(nn.Module):
    """
    Convolutional auto-encoder: a CAEEncoder followed by a CAEDecoder.

    Both halves are built with the same image size, channel count, number of
    levels and first-level feature count, and a fixed 3x3 kernel. The decoder
    receives the encoder itself as its first constructor argument.
    """
    def __init__(self, width, height, channels, levels=2, conv_layer_feat=16):
        super(CAE, self).__init__()
        self.width, self.height, self.channels = width, height, channels
        kernel_size = 3
        shared_args = (width, height, channels, levels, kernel_size, conv_layer_feat)
        self.encoder = CAEEncoder(*shared_args)
        self.decoder = CAEDecoder(self.encoder, *shared_args)

    def forward(self, x):
        """Encode ``x`` and immediately decode the result."""
        return self.decoder(self.encoder(x))

    def save_model(self, name, path):
        """Save encoder and decoder as two separate ``.pth`` files inside ``path``."""
        parts = (("cae_encoder_", self.encoder), ("cae_decoder_", self.decoder))
        for prefix, module in parts:
            torch.save(module, os.path.join(path, prefix + name + ".pth"))

In [10]:
def prime_factors(n):
    """
    Return the prime factorisation of ``n`` as an ascending list.

    Repeated factors appear once per occurrence (``12 -> [2, 2, 3]``).
    Values below 2 give an empty list.
    """
    found = []
    candidate = 2
    # Trial division: a candidate is only worth trying up to sqrt(n).
    while candidate * candidate <= n:
        if n % candidate == 0:
            found.append(candidate)
            n //= candidate
        else:
            candidate += 1
    # Whatever is left above 1 is itself a prime factor.
    if n > 1:
        found.append(n)
    found.sort()
    return found

In [25]:
#definitions of the operations for the full image autoencoder
# Per-channel normalisation shared by every preprocessing pipeline below.
# The active values are the ones used in the linked ImageNet example; the
# commented-out alternative (0.5 / 0.5) comes from the CIFAR-10 tutorial.
# NOTE(review): to_img() further down rescales with 0.5 * (x + 1), which
# matches the 0.5 / 0.5 alternative rather than the active values - confirm
# which normalisation is intended.
normalize = transforms.Normalize(
   mean=[0.485, 0.456, 0.406], # from example here https://github.com/pytorch/examples/blob/409a7262dcfa7906a92aeac25ee7d413baa88b67/imagenet/main.py#L94-L95
   std=[0.229, 0.224, 0.225]
#   mean=[0.5, 0.5, 0.5], # from example here http://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
#    std=[0.5, 0.5, 0.5]
)

#the whole image gets resized to a small image that can be quickly analyzed to get important points
def fullimage_preprocess(w=48,h=48):
    """
    Build the preprocessing pipeline for the whole (downsampled) image.

    The image is resized to ``w`` x ``h`` pixels, converted to a tensor and
    normalised with the module-level ``normalize`` transform.

    Args:
        w: target width in pixels.
        h: target height in pixels.

    Returns:
        A ``transforms.Compose`` to apply to a PIL image.
    """
    return transforms.Compose([
        # torchvision expects the size as (height, width); the original passed
        # (w, h), which swapped the two for non-square targets.
        transforms.Resize((h, w)), #this should be used ONLY if the image is bigger than this size
        transforms.ToTensor(),
        normalize
    ])

#the full resolution fovea just is a small 12x12 patch 
def crop_fovea(size=12):
    """
    Build the pipeline for the full-resolution fovea patch.

    Crops a ``size`` x ``size`` patch from the centre of the image, converts
    it to a tensor and applies the module-level ``normalize`` transform.
    """
    steps = [transforms.CenterCrop(size), transforms.ToTensor(), normalize]
    return transforms.Compose(steps)

def downsampleTensor(crop_size, final_size=16):
    """
    Build the pipeline for a downsampled peripheral patch.

    Centre-crops the image to ``crop_size``, resizes the crop with
    ``transforms.Resize(final_size)``, converts it to a tensor and applies the
    module-level ``normalize`` transform.
    """
    geometry = [
        transforms.CenterCrop(crop_size),
        transforms.Resize(final_size),
    ]
    to_normalised_tensor = [transforms.ToTensor(), normalize]
    return transforms.Compose(geometry + to_normalised_tensor)


In [21]:
# NOTE: all patches will be square
# full size image will be resized to a square image, because it's easy

class MultiResCAE(nn.Module):
    """
    Multi Resolution group of Convolutional Autoencoders
    This module intends to group several autoencoders that accompany different resolutions
    The goal of this module is be able to train and maintain all the filters in one place
    This model can be saved and loaded as a single element
    This model outputs tensor of dimension 1x1xN that is the concatenation of the output of all the encoders ensemble
    the FULL IMAGE is not considered in this

    For every resolution level ``i`` one encoder/decoder pair is created per
    kernel size listed in ``conv_sizes[i]``. The number of pooling levels of a
    pair is the number of times ``res_px[i]`` can be halved exactly.

    Args:
        channels: number of channels in the input image.
        res_levels: number of resolution levels (NOT including the full image).
        conv_layer_feat: first-level feature count, one entry per resolution.
        res_px: side (in pixels) of the square patch fed to each resolution.
        crop_sizes: side of the patch cropped from the input for each
            resolution (stored only; cropping is not implemented here).
        conv_sizes: kernel sizes per resolution, one autoencoder per size.
        ds_full_image_cae, full_image_size, full_img_conv_feat, full_conv_sizes:
            accepted for signature compatibility; not used by this class.

    Raises:
        ValueError: if a per-resolution list has fewer than ``res_levels`` entries.
    """
    def __init__(self, channels=3, res_levels=3, conv_layer_feat=[32,16,16], res_px=[12,16,20],
                 crop_sizes=[12,32,64],
                 # conv_sizes = [(3,5,7), (3,5,7,11), (3,5,7,11)],  # this is too much I think
                 conv_sizes = [(3,5,7), (3,5), (3,5)], 
                 ds_full_image_cae=True, full_image_size=32, full_img_conv_feat=16, full_conv_sizes=(3,5,7)):
        # The original called super(CAE, self), which raises a TypeError
        # because this class does not derive from CAE.
        super(MultiResCAE, self).__init__()

        per_level_args = (("conv_layer_feat", conv_layer_feat), ("res_px", res_px),
                          ("crop_sizes", crop_sizes), ("conv_sizes", conv_sizes))
        for label, values in per_level_args:
            if len(values) < res_levels:
                raise ValueError("{} needs at least res_levels={} entries, got {}".format(
                    label, res_levels, len(values)))

        self.channels = channels  # number of channels in the input image
        self.res_levels = res_levels  #number of resolution levels (NOT including the full image)
        # Copies are stored so the shared default lists can never be mutated.
        self.conv_layer_feat = list(conv_layer_feat)  # number of convolutional filters per CAE in the first level
        self.res_px = list(res_px)  # downsampled resolution in pixels for each resolution
        self.conv_sizes = [tuple(sizes) for sizes in conv_sizes]  # conv filter sizes per layer, one encoder per size per layer
        self.crop_sizes = list(crop_sizes)  # size of the patch to crop
        #compute the maximum number of levels that this resolution can handle, 
        #this will be the parameter given to create the resolution encoder
        # (prime_factors is a function: it must be called, not subscripted)
        self.max_levels = [prime_factors(px).count(2) for px in self.res_px]

        # ModuleLists so that every autoencoder's weights are registered.
        self.encoders = nn.ModuleList()
        self.decoders = nn.ModuleList()
        self.full_encoders = []
        self.full_decoders = []

        #separated as functions to be able to later LOAD the encoders instead of creating them each time
        self._create_encoders()
        self._create_decoders()

    def _create_encoders(self):
        """Create, per resolution level, one encoder per configured kernel size."""
        for i in range(self.res_levels):
            side = self.res_px[i]  # resolution of the (square) image for the encoder
            res_encoders = nn.ModuleList([
                CAEEncoder(side, side, self.channels, self.max_levels[i],
                           kernel_size, self.conv_layer_feat[i])
                for kernel_size in self.conv_sizes[i]
            ])
            self.encoders.append(res_encoders)

    def _create_decoders(self):
        """Create the decoder matching every encoder built by _create_encoders."""
        for i in range(self.res_levels):
            side = self.res_px[i]  # resolution of the (square) image for the decoder
            res_decoders = nn.ModuleList([
                # each decoder is handed its own encoder as the pooling-index source
                CAEDecoder(self.encoders[i][j], side, side, self.channels, self.max_levels[i],
                           kernel_size, self.conv_layer_feat[i])
                for j, kernel_size in enumerate(self.conv_sizes[i])
            ])
            self.decoders.append(res_decoders)

    def forward(self, x):
        """
        Not implemented yet.

        The original body only contained the plan below and then returned an
        undefined name; an explicit NotImplementedError replaces that NameError.
        """
        ########################
        # BEGIN Encoding
        ###
        # 
        # For the moment all crops are in the center of the given image
        # fovea crop
        # encode fovea
        #
        # next crops and downsamples
        # encode downsamples
        #
        # reshape and concatenate all encodings -> create a simple embedding (maybe later work with a multinomial probability distribution)
        ###
        # END Encoding
        ########################
        #BEGIN decoding
        raise NotImplementedError("MultiResCAE.forward is not implemented yet")

    def save_models(self, name, path):
        """Not implemented yet."""
        raise NotImplementedError()
        

In [21]:
# NOTE: all patches will be square
# full size image will be resized to a square image, because it's easy

class MultiFullCAE(nn.Module):
    """
    Group of Convolutional Autoencoders for a single input resolution
    The image is treated as monochrome

    One encoder/decoder pair is created per kernel size in
    ``full_conv_sizes``. The number of pooling levels is the number of times
    ``full_image_size`` can be halved exactly.

    Args:
        channels: number of channels in the input image.
        ds_full_image_cae: stored as ``ds_full_img_cae``.
            NOTE(review): the autoencoders are created regardless of this
            flag, as in the original code - confirm whether it should gate
            their creation.
        full_image_size: side (in pixels) of the square, resized input image.
        full_img_conv_feat: first-level feature count of every autoencoder.
        full_conv_sizes: kernel sizes, one autoencoder per size.
    """
    def __init__(self, channels=1, ds_full_image_cae=True, full_image_size=32, full_img_conv_feat=16, full_conv_sizes=(3,5,7)):
        # The original called super(CAE, self), which raises a TypeError
        # because this class does not derive from CAE.
        super(MultiFullCAE, self).__init__()
        self.channels = channels  # number of channels in the input image
        #this will be the parameter given to create the resolution encoder
        # (prime_factors is a function: it must be called, not subscripted)
        self.levels = prime_factors(full_image_size).count(2)
        self.ds_full_img_cae = ds_full_image_cae  # indicate if create or not the full image downsample conv encoder
        self.full_image_size = full_image_size  # image to which to redimension the entire input image (if previous is True)
        self.full_img_conv_feat = full_img_conv_feat  # number of convolutional filters to use per layer 
        self.full_conv_sizes = tuple(full_conv_sizes)  # sizes of the convolutional filters, one encoder per size
        # The original assigned an undefined `conv_sizes` here; the attribute
        # is kept as an alias of the only kernel-size list this class receives.
        self.conv_sizes = self.full_conv_sizes

        # ModuleLists so that every autoencoder's weights are registered.
        self.full_encoders = nn.ModuleList()
        self.full_decoders = nn.ModuleList()
        #separated as functions to be able to later LOAD the encoders instead of creating them each time
        self._create_full_encoders()
        self._create_full_decoders()

    def _create_full_encoders(self):
        """Create one encoder per configured kernel size."""
        side = self.full_image_size
        for cs in self.full_conv_sizes:
            enc = CAEEncoder(side, side, self.channels, self.levels, cs, self.full_img_conv_feat)
            self.full_encoders.append(enc)

    def _create_full_decoders(self):
        """Create the decoder matching every encoder built by _create_full_encoders."""
        side = self.full_image_size
        for i, cs in enumerate(self.full_conv_sizes):
            # each decoder is handed its own encoder as the pooling-index source
            dec = CAEDecoder(self.full_encoders[i], side, side, self.channels, self.levels, cs, self.full_img_conv_feat)
            self.full_decoders.append(dec)

    def forward(self, x):
        """
        Not implemented yet.

        The original body only contained the plan below and then returned an
        undefined name; an explicit NotImplementedError replaces that NameError.
        """
        #input = downsampled full image converted to monochrome
        ########################
        # BEGIN Encoding
        ###

        # for the moment this full image is computed each time, but in the future this will be 
        #     done ONLY if the input image changes
        #     maybe what we want to work with is only the difference from previous frames -> future when working in dynamic environments
        # encoder full downsampled image
        # 
        # join  all encodings into a single vector
        # END Encoding
        ########################
        #BEGIN decoding
        raise NotImplementedError("MultiFullCAE.forward is not implemented yet")

    def save_models(self, name, path):
        """Not implemented yet."""
        raise NotImplementedError()
        

In [22]:
# Exploratory: a Grayscale transform instance, only inspected with `tg?` in the
# next cell; no other cell shown here uses it.
tg = transforms.Grayscale()

In [23]:
tg?

Signature:      tg(img)
Type:           Grayscale
String form:    <torchvision.transforms.transforms.Grayscale object at 0x7f9c6080ef60>
File:           ~/DeepLearning/venv3/lib/python3.5/site-packages/torchvision/transforms/transforms.py
Docstring:     
Convert image to grayscale.

Args:
    num_output_channels (int): (1 or 3) number of channels desired for output image

Returns:
    PIL Image: Grayscale version of the input.
    - If num_output_channels == 1 : returned image is single channel
    - If num_output_channels == 3 : returned image is 3 channel with r == g == b
Call docstring:
Args:
    img (PIL Image): Image to be converted to grayscale.

Returns:
    PIL Image: Randomly grayscaled image.


In [7]:
def get_loaders(batch_size, transformation, dataset = datasets.CIFAR100, cuda=True):
    """
    Build shuffled train and test DataLoaders for a torchvision-style dataset.

    Args:
        batch_size: number of samples per batch for both loaders.
        transformation: transform passed to the dataset as ``transform``.
        dataset: dataset class, called as ``dataset(root, train=..., ...)``.
        cuda: when true, the loaders use one worker and pinned memory.

    Returns:
        ``(train_loader, test_loader)``; data lives under ``../data`` and only
        the training split is created with ``download=True``.
    """
    loader_options = {}
    if cuda:
        loader_options = {'num_workers': 1, 'pin_memory': True}

    data_root = '../data'

    def build_loader(split):
        # Both loaders shuffle, exactly like the training one.
        return torch.utils.data.DataLoader(
            split, batch_size=batch_size, shuffle=True, **loader_options)

    train_loader = build_loader(
        dataset(data_root, train=True, download=True, transform=transformation))
    test_loader = build_loader(
        dataset(data_root, train=False, transform=transformation))

    return train_loader, test_loader


In [13]:
# Hyper Parameters
# Earlier settings, kept for reference:
# num_epochs = 5
# batch_size = 100
# learning_rate = 0.001

# Values in use: read by the loader cell (batch_size), the optimizer cell
# (learning_rate) and the training loop (num_epochs) further down.
num_epochs = 100
batch_size = 128
learning_rate = 0.0001

In [14]:
# CAE accepts (width, height, channels, levels, conv_layer_feat). The previous
# call passed six positional arguments (12,12,3,500,200,32), which raises a
# TypeError with the CAE class defined in this notebook.
# NOTE(review): 2 levels / 32 first-level features is an assumption based on
# the "2 layers" configuration in the results notes - confirm.
model = CAE(12, 12, 3, levels=2, conv_layer_feat=32).cuda()

# Reconstruction loss between the decoded output and the input patch.
criterion = nn.MSELoss()
#criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)


In [15]:
def to_img(x):
    """
    Turn a batch of network outputs into displayable 3x12x12 images.

    Values are mapped with ``0.5 * (x + 1)`` (so [-1, 1] becomes [0, 1]),
    clamped to [0, 1] and reshaped to ``(batch, 3, 12, 12)``.
    """
    batch = x.size(0)
    rescaled = (x + 1) * 0.5
    return rescaled.clamp(0, 1).view(batch, 3, 12, 12)


In [16]:
# `full_resolution_crop` is not defined anywhere in this notebook (left-over
# kernel state), so this cell fails on a fresh kernel. crop_fovea() builds the
# 12x12 centre-crop pipeline that matches the 12x12 model input.
# NOTE(review): assumed to be the intended replacement - confirm.
transformation = crop_fovea(size=12)
train_loader, test_loader = get_loaders(batch_size, transformation)

Files already downloaded and verified


In [17]:
%%time

for epoch in range(num_epochs):
    for i, (img, labels) in enumerate(train_loader):
        img = Variable(img).cuda()
        # ===================forward=====================
#         print("encoding batch of  images")
        output = model(img)
#         print("computing loss")
        loss = criterion(output, img)
        # ===================backward====================
#         print("Backward ")
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    print('epoch [{}/{}], loss:{:.4f}'.format(epoch+1, num_epochs, loss.data[0]))
    if epoch % 10 == 0:
        pic = to_img(output.cpu().data)
        in_pic = to_img(img.cpu().data)
        save_image(pic, './cae_results/2x2-2xfc-out_image_{}.png'.format(epoch))
        save_image(in_pic, './cae_results/2x2-2xfc-in_image_{}.png'.format(epoch))
    if loss.data[0] < 0.15: #arbitrary number because I saw that it works well enough
        break


model.save_model("2x2-2xfc-layer", "CAE")

epoch [1/100], loss:0.4138
epoch [2/100], loss:0.2646
epoch [3/100], loss:0.3089
epoch [4/100], loss:0.2290
epoch [5/100], loss:0.2640
epoch [6/100], loss:0.3082
epoch [7/100], loss:0.2789
epoch [8/100], loss:0.2025
epoch [9/100], loss:0.1942
epoch [10/100], loss:0.2041
epoch [11/100], loss:0.2508
epoch [12/100], loss:0.2682
epoch [13/100], loss:0.2567
epoch [14/100], loss:0.2532
epoch [15/100], loss:0.2498
epoch [16/100], loss:0.2509
epoch [17/100], loss:0.2485
epoch [18/100], loss:0.2437
epoch [19/100], loss:0.2815
epoch [20/100], loss:0.1903
epoch [21/100], loss:0.2225
epoch [22/100], loss:0.2277
epoch [23/100], loss:0.2364
epoch [24/100], loss:0.2252
epoch [25/100], loss:0.2694
epoch [26/100], loss:0.1984
epoch [27/100], loss:0.2271
epoch [28/100], loss:0.3034
epoch [29/100], loss:0.2541
epoch [30/100], loss:0.2344
epoch [31/100], loss:0.2076
epoch [32/100], loss:0.2221
epoch [33/100], loss:0.2719
epoch [34/100], loss:0.1923
epoch [35/100], loss:0.2095
epoch [36/100], loss:0.2596
e

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


## Preliminary Results 

Experiments with the following configurations:
- 2 layers with 2 convolutional stages each  <- **best result**
- 2 layers with 2 convolutional stages each and 2 fully connected layers  <- bigger model and a bit slower to converge, but results are good too
- 2 layers with 2 convolutional stages each with batch normalization
- 2 layers with 4 convolutional stages each <- **worst result**


2 layers with 4 conv stages each does not give the same results as 2 layers with 2 conv stages

The configuration with 2 conv stages not only converges MUCH faster with a smaller model, but it also converges to a noticeably better result.

The same happens with batch normalization: without `BatchNorm2d` the model converges faster and is smaller.