!mkdir cifar10
!curl -o cifar-10-python.tar.gz https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
!tar -xvzf cifar-10-python.tar.gz -C cifar10

In [1]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import sys
import os
import time
import string
import random
import pickle

import numpy as np
import theano
import theano.tensor as T
import lasagne

# for the larger networks (n>=9), we need to adjust pythons recursion limit
sys.setrecursionlimit(10000)

# ##################### Load data from CIFAR-10 dataset #######################
# this code assumes the cifar dataset from 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
# has been extracted in current working directory

def unpickle(file):
    import pickle
    fo = open(file, 'rb')
    dict = pickle.load(fo, encoding='bytes')
    fo.close()
    return dict

def load_data():
    xs = []
    ys = []
    for j in range(5):
      d = unpickle('cifar10/cifar-10-batches-py/data_batch_' + str(j+1))
      x = d[b'data']
      y = d[b'labels']
      xs.append(x)
      ys.append(y)

    d = unpickle('cifar10/cifar-10-batches-py/test_batch')
    xs.append(d[b'data'])
    ys.append(d[b'labels'])

    x = np.concatenate(xs) / np.float32(255)
    y = np.concatenate(ys)
    x = np.dstack((x[:, :1024], x[:, 1024:2048], x[:, 2048:]))
    x = x.reshape((x.shape[0], 32, 32, 3)).transpose(0, 3, 1, 2)

    # subtract per-pixel mean
    pixel_mean = np.mean(x[0:50000],axis=0)
    #pickle.dump(pixel_mean, open("cifar10-pixel_mean.pkl","wb"))
    x -= pixel_mean

    # create mirrored images
    X_train = x[0:50000,:,:,:]
    Y_train = y[0:50000]
    X_train_flip = X_train[:,:,:,::-1]
    Y_train_flip = Y_train
    X_train = np.concatenate((X_train,X_train_flip),axis=0)
    Y_train = np.concatenate((Y_train,Y_train_flip),axis=0)

    X_test = x[50000:,:,:,:]
    Y_test = y[50000:]

    return (
        lasagne.utils.floatX(X_train),
        Y_train.astype('int32'),
        lasagne.utils.floatX(X_test),
        Y_test.astype('int32'),
    )

Using cuDNN version 5110 on context None
Mapped name None to device cuda0: GRID K520 (0000:00:03.0)


In [3]:
X_train, y_train, X_test, y_test = load_data()

In [4]:
import lasagne
from theano import tensor as T
from lasagne.nonlinearities import *

input_X = T.tensor4("X")
target_y = T.vector("target Y integer",dtype='int32')

In [5]:
from lasagne.layers import Conv2DLayer as ConvLayer
from lasagne.layers import ElemwiseSumLayer
from lasagne.layers import InputLayer
from lasagne.layers import DenseLayer
from lasagne.layers import GlobalPoolLayer
from lasagne.layers import PadLayer
from lasagne.layers import ExpressionLayer
from lasagne.layers import NonlinearityLayer
from lasagne.nonlinearities import softmax, rectify
from lasagne.layers import batch_norm

def build_cnn(input_var=None, n=5):
    # create a residual learning building block with two stacked 3x3 convlayers as in paper
    def residual_block(layer, increase_dim=False, projection=False):
        input_num_filters = layer.output_shape[1]
        if increase_dim:
            first_stride = (2, 2)
            out_num_filters = input_num_filters * 2
        else:
            first_stride = (1, 1)
            out_num_filters = input_num_filters

        stack_1 = batch_norm(
            ConvLayer(
                layer, num_filters=out_num_filters, filter_size=(3, 3),
                stride=first_stride, nonlinearity=rectify, pad='same',
                W=lasagne.init.HeNormal(gain='relu'), flip_filters=False
            )
        )
        stack_2 = batch_norm(
            ConvLayer(
                stack_1, num_filters=out_num_filters, filter_size=(3, 3),
                stride=(1, 1), nonlinearity=None, pad='same',
                W=lasagne.init.HeNormal(gain='relu'), flip_filters=False
            )
        )

        # add shortcut connections
        if increase_dim:
            # identity shortcut, as option A in paper
            identity = ExpressionLayer(
                layer, lambda X: X[:, :, ::2, ::2], lambda s: (s[0], s[1], int(s[2] / 2), int(s[3] / 2))
            )
            padding = PadLayer(identity, [int(out_num_filters / 4), 0, 0], batch_ndim=1)
            block = NonlinearityLayer(ElemwiseSumLayer([stack_2, padding]),nonlinearity=rectify)
        else:
            block = NonlinearityLayer(ElemwiseSumLayer([stack_2, layer]), nonlinearity=rectify)

        return block

    # Building the network
    l_in = InputLayer(shape=(None, 3, 32, 32), input_var=input_var)

    # first layer, output is 16 x 32 x 32
    layer = batch_norm(
        ConvLayer(
            l_in, num_filters=16, filter_size=(3, 3), stride=(1, 1),
            nonlinearity=rectify, pad='same',
            W=lasagne.init.HeNormal(gain='relu'), flip_filters=False
        )
    )

    # first stack of residual blocks, output is 16 x 32 x 32
    for i in range(n):
        layer = residual_block(layer)

    # second stack of residual blocks, output is 32 x 16 x 16
    layer = residual_block(layer, increase_dim=True)
    for i in range(1, n):
        layer = residual_block(layer)

    # third stack of residual blocks, output is 64 x 8 x 8
    layer = residual_block(layer, increase_dim=True)
    for i in range(1, n):
        layer = residual_block(layer)

    # average pooling
    layer = GlobalPoolLayer(layer)

    # fully connected layer
    layer = DenseLayer(
        layer, num_units=10,
        W=lasagne.init.HeNormal(),
        nonlinearity=softmax
    )

    return layer

In [6]:
def iterate_minibatches(inputs, targets, batchsize, shuffle=False, augment=False):
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        if augment:
            # as in paper : 
            # pad feature arrays with 4 pixels on each side
            # and do random cropping of 32x32
            padded = np.pad(inputs[excerpt], ((0, 0), (0, 0), (4, 4), (4, 4)), mode='edge')
            random_cropped = np.zeros(inputs[excerpt].shape, dtype=np.float32)
            crops = np.random.randint(0, 9, size=(batchsize, 2))
            for r in range(batchsize):
                random_cropped[r,:,:,:] = padded[r,:,crops[r,0]:(crops[r,0]+32),crops[r,1]:(crops[r,1]+32)]
            inp_exc = random_cropped
        else:
            inp_exc = inputs[excerpt]

        yield inp_exc, targets[excerpt]

In [7]:
net = build_cnn(input_X, n=9)

In [8]:
y_predicted = lasagne.layers.get_output(net)
all_weights = lasagne.layers.get_all_params(net, trainable=True)
all_layers = lasagne.layers.get_all_layers(net)
print(all_weights)

[W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, beta, gamma, W, b]


In [9]:
reg_lambda = 0.0001
current_learning_rate = 0.1
loss = lasagne.objectives.categorical_crossentropy(y_predicted, target_y).mean()
loss += reg_lambda * lasagne.regularization.regularize_layer_params(all_layers, lasagne.regularization.l2)
accuracy = lasagne.objectives.categorical_accuracy(y_predicted, target_y).mean()
grad_update = lasagne.updates.momentum(loss, all_weights, learning_rate=current_learning_rate, momentum=0.9)
train_fun = theano.function([input_X, target_y], [loss, accuracy], updates=grad_update, allow_input_downcast=True)
accuracy_fun = theano.function([input_X, target_y], accuracy, allow_input_downcast=True)

In [10]:
import time

epohs_number = 100
batch_size = 128 #размер мини-батча
batch_counter = 0

for epoch in range(epohs_number):
    batch_counter += 1

    # In each epoch, we do a full pass over the training data:
    train_err = 0
    train_acc = 0
    train_batches = 0
    start_time = time.time()
    for batch in iterate_minibatches(X_train, y_train, batch_size, augment=True, shuffle=True):
        inputs, targets = batch
        train_err_batch, train_acc_batch = train_fun(inputs, targets)
        train_err += train_err_batch
        train_acc += train_acc_batch
        train_batches += 1

    # And a full pass over the validation data:
    val_acc = 0
    val_batches = 0
    for batch in iterate_minibatches(X_test, y_test, batch_size):
        inputs, targets = batch
        val_acc += accuracy_fun(inputs, targets)
        val_batches += 1

    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(batch_counter, epohs_number, time.time() - start_time))
    print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
    print("  train accuracy:\t\t{:.2f} %".format(train_acc / train_batches * 100))
    print("  validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))
    
    if val_acc / val_batches * 100 > 93:
        break

    if epoch == 40 or epoch == 60:
        current_learning_rate = current_learning_rate * 0.1
        print("New learning rate: {}".format(current_learning_rate))
        grad_update = lasagne.updates.momentum(
            loss, all_weights, learning_rate=current_learning_rate, momentum=0.9
        )
        train_fun = theano.function(
            [input_X, target_y], [loss, accuracy], updates=grad_update, allow_input_downcast=True
        )
        accuracy_fun = theano.function([input_X, target_y], accuracy, allow_input_downcast=True)

Epoch 1 of 100 took 758.898s
  training loss (in-iteration):		2.388492
  train accuracy:		34.05 %
  validation accuracy:		56.42 %
Epoch 2 of 100 took 758.955s
  training loss (in-iteration):		1.450225
  train accuracy:		65.05 %
  validation accuracy:		72.66 %
Epoch 3 of 100 took 725.152s
  training loss (in-iteration):		1.094057
  train accuracy:		75.70 %
  validation accuracy:		77.49 %
Epoch 4 of 100 took 758.881s
  training loss (in-iteration):		0.929493
  train accuracy:		79.88 %
  validation accuracy:		81.07 %
Epoch 5 of 100 took 756.663s
  training loss (in-iteration):		0.829539
  train accuracy:		82.17 %
  validation accuracy:		82.92 %
Epoch 6 of 100 took 816.154s
  training loss (in-iteration):		0.766049
  train accuracy:		83.62 %
  validation accuracy:		83.55 %
Epoch 7 of 100 took 1124.616s
  training loss (in-iteration):		0.718687
  train accuracy:		84.76 %
  validation accuracy:		84.49 %
Epoch 8 of 100 took 1120.120s
  training loss (in-iteration):		0.684288
  train accuracy:

In [11]:
np.savez('93.29_params_.npz', *lasagne.layers.get_all_param_values(net))

In [12]:
test_acc = 0
test_batches = 0
for batch in iterate_minibatches(X_test, y_test, 500):
    inputs, targets = batch
    acc = accuracy_fun(inputs, targets)
    test_acc += acc
    test_batches += 1
print("Final results:")
print("  test accuracy:\t\t{:.2f} %".format(
    test_acc / test_batches * 100))

if test_acc / test_batches * 100 > 92.5:
    print("Achievement unlocked: колдун 80 уровня")
else:
    print("Нужно больше магии!")

Final results:
  test accuracy:		93.36 %
Achievement unlocked: колдун 80 уровня


Взял сетку и препроцессинг отсюда: https://github.com/Lasagne/Recipes/blob/master/papers/deep_residual_learning/Deep_Residual_Learning_CIFAR-10.py

Поменял только чтобы покраям картинок (при сдвиге) добавлялся не константный фон, а те же пиксели, что и по краям (еще немного улучшил код по стилю). Обучалось на одном серве амазона g2 на двоих, так что если бы удалось получить собственный серв, обучлось бы в 2 раза быстрее. Когда достигся результат больше 93% слелана остановка. Но сильно больше из этой сетки не вытянуть.