In [1]:
# config theano to use GPU, must be done before theano is imported
import os    
os.environ['THEANO_FLAGS'] = "device=cuda0,floatX=float32"#,optimizer=None,exception_verbosity=high"  

In [3]:
import theano

In [4]:
# http://deeplearning.net/tutorial/lenet.html
    
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from collections import OrderedDict

from theano.tensor.signal import pool
from theano.tensor.nnet import conv2d

class LeNetConvPoolLayer():
    def __init__(self, 
        rng, 
        input, # with 4 dimensions: rows of input(i.e. batch) * num of input maps per input * height per map * width per map; see: https://www.quora.com/Why-are-there-4-dimensions-to-convolve-over-the-Stanford-UFDL-example-in-convolutional-neural-networks
        image_shape, # tuple/list of len 4 representing the 4 dims of input: (batch size, num input feature maps, image height, image width)
        filter_shape, # tuple/list of len 4 representing: (number of filters, num input feature maps, filter height, filter width)
        poolsize=(2, 2), # eg. downsample every 2x2 bits to 1 bit
        activation=T.nnet.relu,
        border_mode='valid', # no padding; see doc for more options
        subsample=(1, 1) # unit stride        
    ):
        assert image_shape[1] == filter_shape[1] # num input feataure maps should be same for both arrs
        
        self.input = input
        fan_in = numpy.prod(filter_shape[1:]) # (num input feature maps * filter height * filter width) input nodes to each hidden node/unit(?) 
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize)) # (num output feature maps * filter height * filter width /  pooling size) output nodes per layer
        
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.W = theano.shared(
            numpy.asarray(
                rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                dtype=theano.config.floatX
            ),
            borrow=True
        )
        
        b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) # a 1D tensor(i.e. array) of len(num of filters) for adding a bias per output feature map
        self.b = theano.shared(value=b_values, borrow=True)

        # convolve input feature maps with filters
        # doc: http://deeplearning.net/software/theano/library/tensor/nnet/conv.html#theano.tensor.nnet.conv2d
        # simply explained; conv2d iterates over every sample, apply the SAME weights on each 'cropped' input(map) to create each output map
        # taking an analogy with MLP, each output feature map can be considered as one hidden node in an MLP, where each value in the map represents a different cropped position of the input image
        # (programmatically, I believe it flattens the num of maps * input height * input width for every sample; apply the transformtion; and then reshape it back to the expected output dimensions)
        conv_out = conv2d(
            input=input,
            filters=self.W,
            filter_shape=filter_shape,
            input_shape=image_shape,
            border_mode=border_mode,
            subsample=subsample
        )

        # pool each feature map individually, using maxpooling
        # doc: http://deeplearning.net/software/theano/library/tensor/signal/pool.html
        # note: 'Max pooling will be done over the 2 last dimensions', i.e. height * width
        pooled_out = pool.pool_2d(
            input=conv_out,
            ws=poolsize,
            ignore_border=True
        )

        # add the bias term. Since the bias is a vector (1D array), we reshape it to a tensor of shape (1, n_filters, 1, 1). 
        # since every output map uses one shared weight, only 1 bias is needed for each output map
        self.output = activation(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) # explanation of dimshuffle: https://stackoverflow.com/questions/42401420/how-the-function-dimshuffle-works-in-theano

        # shared_params
        self.params = [self.W, self.b]

In [42]:
class HiddenLayer(object):
    def __init__(
        self, 
        rng, 
        input, 
        n_in, 
        n_out, 
        p_dropout=0.0, 
        W=None, 
        b=None, 
        activation=T.tanh # if set actication=T.nnet.sigmoid, becomes logistic regresssion layer
    ): 
        self.input = input
        self.theano_rng = RandomStreams(rng.randint(2 ** 30))
        
        # `W` is initialized with `W_values` which is uniformely sampled
        # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
        # for tanh activation function
        # the output of uniform if converted using asarray to dtype
        # theano.config.floatX so that the code is runable on GPU
        # Note : optimal initialization of weights is dependent on the
        #        activation function used (among other things).
        #        For example, results presented in [Xavier10] suggest that you
        #        should use 4 times larger initial weights for sigmoid
        #        compared to tanh
        #        We have no info for other function, so we use the same as
        #        tanh.
        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, borrow=True)

        self.W = W
        self.b = b

        dropout_input = self.get_corrupted_input(input, p_dropout)
        lin_output = T.dot(dropout_input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]
        
    # add noise by setting corruption_level% of data to 0s
    def get_corrupted_input(self, input, corruption_level):   
        return self.theano_rng.binomial(size=input.shape, n=1,
                                        p=1 - corruption_level,
                                        dtype=theano.config.floatX) * input

In [43]:
class LogisticRegression(object):
    def __init__(self, input, n_in, n_out):
        self.input = input
        self.W = theano.shared(
            value=numpy.zeros(
                (n_in, n_out),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )

        # predict_proba
        self.p_y_given_x = T.nnet.softmax(T.dot(self.input, self.W) + self.b) # softmax=normalized sigmoid
        # predict
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.params = [self.W, self.b]


    # this is akin to cost = -1/m * sigma(ylog(wx) + (1-y)log(1-wx)) when y is binomial
    # in the current case y has n-labels, and only the prediction of the right label is picked out
    def negative_log_likelihood(self, y):
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    # perc of wrong predictions
    def errors(self, y):
        return T.mean(T.neq(self.y_pred, y)) # T.neq(a,b) checks a != b

In [44]:
# note: output size calculated by formula:
# (Width - Filter_size + 2*Padding / Stride) + 1
# stride is usually 1 in implementations such as conv2d

In [98]:
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from collections import OrderedDict

class CNN():
    def __init__(
        self,
        numpy_rng=numpy.random.RandomState(1234),
        n_in_maps=3, 
        in_width=32,
        in_height=32,
        n_out=10,
        conv_layer_sizes=[40,80],
        conv_filter_sizes=[(5, 5), (5, 5)],
        conv_padding_sizes=[(0, 0), (0, 0)],
        conv_subsample_sizes=[(1, 1), (1, 1)],
        conv_pooling_sizes=[(2, 2), (2, 2)],
        fully_connected_layer_sizes=[2000,2000],
        dropout_levels=[0.1,0.1],
        batch_size=100,
        lmbda=0.1
    ):        
        # init var
        self.rng = numpy_rng or numpy.random.RandomState(23455)
        self.x = T.matrix('x')
        self.y = T.ivector('y')
        self.index = T.lscalar('index')
        self.x_batch = self.x[self.index * batch_size: (self.index + 1) * batch_size]
        self.y_batch = self.y[self.index * batch_size: (self.index + 1) * batch_size]
        self.conv_layers = []
        self.fully_connected_layers = []
        self.params = [] # holds the shared/updatable vars
        weights = [] # just store the weights for calc regularization
        
        input_shape = (batch_size, n_in_maps, in_width, in_height)
        layer0_input = self.x_batch.reshape(input_shape) # cifar data comes in RGB(3) channels in 32x32 size

        # conv layers
        out_dims = [] # store (width, height) of each conv layers output
        for i in range(len(conv_layer_sizes)):
            out_n_maps = conv_layer_sizes[i]
            filter_size = conv_filter_sizes[i]
            border_mode = conv_padding_sizes[i]
            subsample = conv_subsample_sizes[i]
            poolsize = conv_pooling_sizes[i]
            
            if i == 0:
                layer_input = layer0_input
            else:
                # the input is the prev layer
                n_in_maps, in_width, in_height = out_dims[i-1]             
                layer_input = self.conv_layers[i-1].output # output is the final activation
            
            out_width  = ((in_width - filter_size[0] + 2 * border_mode[0]) / subsample[0] + 1 ) / poolsize[0]
            out_height = ((in_height - filter_size[1] + 2 * border_mode[1]) / subsample[1] + 1) / poolsize[1]
            out_dims.append((out_n_maps, out_width, out_height))

            conv_layer = LeNetConvPoolLayer(
                self.rng,
                input=layer_input,
                image_shape=(batch_size, n_in_maps, in_width, in_height), # apparently, theano conv2d accepts a defined list/tuple as arg only, therefore this arg cannot be simplified in to input.shape (which would be a tensor vector)
                filter_shape=(out_n_maps, n_in_maps, filter_size[0], filter_size[1]),
                poolsize=poolsize,
                border_mode=border_mode,
                subsample=subsample
            )
            weights.append(conv_layer.W)
            self.conv_layers.append(conv_layer)
            self.params.extend(conv_layer.params)
        
        final_conv_output = self.conv_layers[-1].output.flatten(2)
        final_conv_output_features = int(numpy.prod(out_dims[-1]))
        
        # fully connected layers
        for i in range(len(fully_connected_layer_sizes)):
            if i == 0:
                fully_connected_layer_input = self.conv_layers[-1].output.flatten(2)
                fully_connected_layer_n_in = int(numpy.prod(out_dims[-1]))
            else:
                fully_connected_layer_input = self.fully_connected_layers[i-1].output
                fully_connected_layer_n_in = fully_connected_layer_sizes[i-1]

            hidden_layer = HiddenLayer(
                self.rng,
                input=fully_connected_layer_input,
                n_in=fully_connected_layer_n_in,
                n_out=fully_connected_layer_sizes[i],
                p_dropout=dropout_levels[i],
            )
            weights.append(hidden_layer.W)
            self.fully_connected_layers.append(hidden_layer)
            self.params.extend(hidden_layer.params)

        # sigmoidal LR output layer
        log_layer = LogisticRegression(
            input=self.fully_connected_layers[-1].output, 
            n_in=fully_connected_layer_sizes[-1], 
            n_out=n_out
        )
        weights.append(log_layer.W)
        self.params.extend(log_layer.params)
    
        # L2 regularization
        l2_norm_squared = sum([(weight**2).sum() for weight in weights])
        self.cost = log_layer.negative_log_likelihood(self.y_batch) +  0.5 * lmbda * l2_norm_squared/batch_size
        self.errors = log_layer.errors(self.y_batch)

    def build_train_fn(self, train_x, train_y, learning_rate):
        # create a list of gradients for all model parameters
        grads = T.grad(self.cost, self.params)
        
        updates = OrderedDict()
        for grad, param in zip(grads, self.params):
            # make sure that the learning rate is of the right dtype
            updates[param] = param - grad * T.cast(learning_rate, dtype=theano.config.floatX)

        train_fn = theano.function(
            inputs=[self.index],
            outputs=self.cost,
            updates=updates,
            givens={
                self.x: train_x,
                self.y: train_y
            }
        )
        return train_fn

    def get_errors(self, test_x, test_y, batch_size):  
        n_batches = int(test_x.shape[0] / batch_size)
        get_batch_error = theano.function(
            inputs=[self.index],
            outputs = self.errors, # perc of wrong preds
            givens={
                self.x: test_x,
                self.y: test_y
            }
        )

        def score_func():
            return [get_batch_error(i) for i in range(n_batches)]

        return score_func

In [99]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [100]:
def load_cifar10_train():
    X = numpy.empty((0,3072), dtype=theano.config.floatX)
    y = []
    for i in range(5):
        i += 1
        unpacked = unpickle(f'cifar-10-batches-py/data_batch_{i}')
        X = numpy.concatenate((X, unpacked[b'data']))
        y = numpy.concatenate((y, unpacked[b'labels']))
    return X,y.astype('int32')

def load_cifar10_test():
    unpacked = unpickle(f'cifar-10-batches-py/test_batch')
    return numpy.array(unpacked[b'data'], dtype=theano.config.floatX), numpy.array(unpacked[b'labels'], dtype='int32')

X, y = load_cifar10_train()

from sklearn.model_selection import train_test_split
train_set_x, valid_set_x, train_set_y, valid_set_y = train_test_split(X, y, test_size=0.2, random_state=0)
test_set_x, test_set_y = load_cifar10_test()

# for test
train_set_x = train_set_x[0:10000,:]
valid_set_x = valid_set_x[0:1000,:]
test_set_x = test_set_x[0:1000,:]

train_set_y = train_set_y[0:10000]
valid_set_y = valid_set_y[0:1000]
test_set_y = test_set_y[0:1000]

In [119]:
batch_size = 50
cnn = CNN(conv_layer_sizes=[64,128], fully_connected_layer_sizes=[2048,2048], batch_size = batch_size)
train_fn = cnn.build_train_fn(train_set_x, train_set_y, 0.03)
get_validate_errors = cnn.get_errors(valid_set_x, valid_set_y, batch_size)
get_test_errors = cnn.get_errors(test_set_x, test_set_y, batch_size)

In [120]:
import timeit

def train_cnn(training_epochs=1000):
    print('... training the model')
    n_train_batches = int(train_set_x.shape[0] / batch_size)
    
    # early-stopping parameters
    patience = 10000 # look as this many examples regardless
    patience_increase = 2. # loop for n times more when a new best is found
    improvement_threshold = 0.995 # a relative improvement of this much is considered significant

    # go through this many minibatches before checking the network on
    # the validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2) # = n_train_batches

    best_validation_loss = numpy.inf
    test_score = 0.

    for epoch in range(training_epochs):
        for minibatch_index in range(n_train_batches):
            train_fn(minibatch_index)
            iter = epoch * n_train_batches + minibatch_index

            # for every 'validation_frequency' iters
            if (iter + 1) % validation_frequency == 0:
                validation_losses = get_validate_errors()
                curr_mean_validation_loss = numpy.mean(validation_losses, dtype='float64')
                print(f'epoch {epoch}, minibatch {minibatch_index + 1}/{n_train_batches}, validation error {curr_mean_validation_loss * 100.}%')

                # if we got the least validation errors until now
                if curr_mean_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough; which will allow more training = double of the curr loop count
                    if (curr_mean_validation_loss < best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = curr_mean_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = get_test_errors()
                    test_score = numpy.mean(test_losses, dtype='float64')
                    print(f'     epoch {epoch}, minibatch minibatch {minibatch_index + 1}/{n_train_batches}, test error of best model {test_score * 100.}%')

            # if no improvement in validation score for the last 50% iters
            if patience <= iter:
                return best_validation_loss, best_iter, test_score
    return best_validation_loss, best_iter, test_score

In [None]:
start_time = timeit.default_timer()
best_validation_loss, best_iter, test_score = train_cnn()
end_time = timeit.default_timer()

print(f'training time: {end_time - start_time}s.')
print(f'Optimization complete with best validation score of {best_validation_loss * 100.}%,\n'
    f'obtained at iteration {best_iter + 1},\n'
    f'with test performance {test_score * 100.}%')

... training the model
epoch 0, minibatch 200/200, validation error 75.6%
     epoch 0, minibatch minibatch 200/200, test error of best model 73.70000000000002%
epoch 1, minibatch 200/200, validation error 72.7%
     epoch 1, minibatch minibatch 200/200, test error of best model 73.10000000000001%


In [21]:
## MNIST dataset
import numpy as np
import pickle
import gzip

def load_data(dataset):
    f = gzip.open(dataset, 'rb')
    train_set, valid_set, test_set = pickle.load(f,encoding='latin1')
    f.close()
    return train_set, valid_set, test_set

datasets = load_data('mnist.pkl.gz')
# datasets = load_data('mnist_expanded.pkl.gz')
train_set_x, train_set_y = datasets[0]
valid_set_x, valid_set_y = datasets[1]
test_set_x,  test_set_y  = datasets[2]

train_set_x = np.array(train_set_x, dtype=theano.config.floatX)
valid_set_x = np.array(valid_set_x, dtype=theano.config.floatX)
test_set_x = np.array(test_set_x, dtype=theano.config.floatX)

train_set_y = np.array(train_set_y).astype('int32')
valid_set_y = np.array(valid_set_y).astype('int32')
test_set_y = np.array(test_set_y).astype('int32')

In [25]:
cnn = CNN(
    n_in_maps=1, 
    in_width=28,
    in_height=28,
    n_out=10,
    conv_layer_sizes=[20,40],
    fully_connected_layer_sizes=[100,100],
    dropout_levels=[0.3,0.3]
)
train_fn = cnn.build_train_fn(train_set_x, train_set_y, 0.1)
get_validate_errors = cnn.get_errors(valid_set_x, valid_set_y, 500)
get_test_errors = cnn.get_errors(test_set_x, test_set_y, 500)

In [26]:
start_time = timeit.default_timer()
best_validation_loss, best_iter, test_score = train_cnn()
end_time = timeit.default_timer()

print(f'training time: {end_time - start_time}s.')
print(f'Optimization complete with best validation score of {best_validation_loss * 100.}%,\n'
    f'obtained at iteration {best_iter + 1},\n'
    f'with test performance {test_score * 100.}%')

... training the model
epoch 0, minibatch 100/100, validation error 9.590000000000002%
     epoch 0, minibatch minibatch 100/100, test error of best model 9.82%
epoch 1, minibatch 100/100, validation error 5.510000000000001%
     epoch 1, minibatch minibatch 100/100, test error of best model 5.49%


KeyboardInterrupt: 

In [None]:
# expanding the data

In [21]:
"""
Take the 50,000 MNIST training images, and create an expanded set of
250,000 images, by displacing each training image up, down, left and
right, by one pixel.  Save the resulting file to
../data/mnist_expanded.pkl.gz.
Note that this program is memory intensive, and may not run on small
systems.
"""

import pickle
import gzip
import os.path
import random
import numpy as np
    
print("Expanding the MNIST training set")

if os.path.exists("mnist_expanded.pkl.gz"):
    print("The expanded training set already exists.  Exiting.")
else:
    with gzip.open("mnist.pkl.gz", 'rb') as f:
        u = pickle._Unpickler(f)
        u.encoding = 'latin1'
        training_data, validation_data, test_data = u.load()

    expanded_training_pairs = []
    j = 0 # counter
    for x, y in zip(training_data[0], training_data[1]):
        expanded_training_pairs.append((x, y))
        image = np.reshape(x, (-1, 28))
        j += 1
        if j % 1000 == 0: print("Expanding image number", j)
        # iterate over data telling us the details of how to
        # do the displacement
        for d, axis, index_position, index in [
                (1,  0, "first", 0),
                (-1, 0, "first", 27),
                (1,  1, "last",  0),
                (-1, 1, "last",  27)]:
            new_img = np.roll(image, d, axis)
            if index_position == "first": 
                new_img[index, :] = np.zeros(28)
            else: 
                new_img[:, index] = np.zeros(28)
            expanded_training_pairs.append((np.reshape(new_img, 784), y))
    random.shuffle(expanded_training_pairs)
    expanded_training_data = [list(d) for d in zip(*expanded_training_pairs)]
    print("Saving expanded data. This may take a few minutes.")
    f = gzip.open("mnist_expanded.pkl.gz", "w")
    pickle.dump((expanded_training_data, validation_data, test_data), f)
    f.close()

Expanding the MNIST training set
Expanding image number 1000
Expanding image number 2000
Expanding image number 3000
Expanding image number 4000
Expanding image number 5000
Expanding image number 6000
Expanding image number 7000
Expanding image number 8000
Expanding image number 9000
Expanding image number 10000
Expanding image number 11000
Expanding image number 12000
Expanding image number 13000
Expanding image number 14000
Expanding image number 15000
Expanding image number 16000
Expanding image number 17000
Expanding image number 18000
Expanding image number 19000
Expanding image number 20000
Expanding image number 21000
Expanding image number 22000
Expanding image number 23000
Expanding image number 24000
Expanding image number 25000
Expanding image number 26000
Expanding image number 27000
Expanding image number 28000
Expanding image number 29000
Expanding image number 30000
Expanding image number 31000
Expanding image number 32000
Expanding image number 33000
Expanding image num

In [39]:
np.array(train_set_x).shape

(250000, 784)