In [3]:
# Configure Theano to run on the GPU with float32 as the default float type.
# This must be done before theano is imported (the import happens in a later cell).
import os    
os.environ['THEANO_FLAGS'] = "device=cuda,floatX=float32"  # debugging extras, currently off: optimizer=None,exception_verbosity=high

In [4]:
import theano

In [5]:
# http://deeplearning.net/tutorial/lenet.html
    
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from collections import OrderedDict

from theano.tensor.signal import pool
from theano.tensor.nnet import conv2d

class LeNetConvPoolLayer():
    """Convolution -> max-pooling -> bias -> non-linearity, in the LeNet style.

    Attributes set on the instance:
        input:  the symbolic input handed to the constructor.
        W:      shared 4D filter bank of shape ``filter_shape``.
        b:      shared bias vector with one entry per output feature map.
        output: symbolic output of the layer.
        params: ``[W, b]``, the shared variables to be trained.
    """

    def __init__(self,
        rng,
        input, # 4D tensor: (batch, num input feature maps, height, width); see: https://www.quora.com/Why-are-there-4-dimensions-to-convolve-over-the-Stanford-UFDL-example-in-convolutional-neural-networks
        image_shape, # tuple/list of len 4 describing `input`: (batch size, num input feature maps, image height, image width)
        filter_shape, # tuple/list of len 4: (number of filters, num input feature maps, filter height, filter width)
        poolsize=(2, 2), # e.g. downsample every 2x2 window to a single value
        activation=T.tanh, # non-linearity applied to the pooled output; None means linear
        border_mode='valid', # no padding; see the conv2d doc for more options
        subsample=(1, 1) # unit stride
    ):
        """Build the symbolic graph of the layer.

        ``rng`` is only used through ``rng.uniform`` to draw the initial filters.

        ``activation`` used to be accepted but ignored: the output was always
        passed through ``T.tanh``. The argument is now honoured, and its default
        is ``T.tanh`` so that every call that does not pass ``activation`` builds
        exactly the same graph as before. ``None`` yields a linear output,
        mirroring ``HiddenLayer``.

        Raises:
            AssertionError: if ``image_shape`` and ``filter_shape`` disagree on
                the number of input feature maps.
        """
        assert image_shape[1] == filter_shape[1] # num input feature maps must match in both shapes

        self.input = input

        # (num input feature maps * filter height * filter width) inputs feed each output value
        fan_in = numpy.prod(filter_shape[1:])
        # (num output feature maps * filter height * filter width / pooling size)
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize))

        # initialise the filters uniformly in [-W_bound, W_bound]
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.W = theano.shared(
            numpy.asarray(
                rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                dtype=theano.config.floatX
            ),
            borrow=True
        )

        # one bias per output feature map, initialised to zero
        b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, borrow=True)

        # convolve the input feature maps with the filters
        # doc: http://deeplearning.net/software/theano/library/tensor/nnet/conv.html#theano.tensor.nnet.conv2d
        # the SAME filter weights are applied at every position of every sample to produce each output map
        conv_out = conv2d(
            input=input,
            filters=self.W,
            filter_shape=filter_shape,
            input_shape=image_shape,
            border_mode=border_mode,
            subsample=subsample
        )

        # pool each feature map individually, using max-pooling
        # doc: http://deeplearning.net/software/theano/library/tensor/signal/pool.html
        # note: 'Max pooling will be done over the 2 last dimensions', i.e. height * width
        pooled_out = pool.pool_2d(
            input=conv_out,
            ws=poolsize,
            ignore_border=True
        )

        # The bias is a vector, so it is broadcast as a (1, n_filters, 1, 1) tensor:
        # every output map shares a single bias value.
        # explanation of dimshuffle: https://stackoverflow.com/questions/42401420/how-the-function-dimshuffle-works-in-theano
        pre_activation = pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')
        self.output = (
            pre_activation if activation is None
            else activation(pre_activation)
        )

        # trainable shared parameters
        self.params = [self.W, self.b]

In [6]:
def unpickle(file):
    """Return the object stored in the pickle file at path ``file``.

    The file is opened in binary mode and decoded with ``encoding='bytes'``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    import pickle
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

In [7]:
def load_cifar10_train(data_dir='cifar-10-batches-py'):
    """Load the five CIFAR-10 training batches and stack them.

    Args:
        data_dir: directory holding ``data_batch_1`` .. ``data_batch_5``.
            Defaults to the relative path the notebook has always used.

    Returns:
        ``(X, y)`` where ``X`` stacks the ``b'data'`` entries of all batches,
        cast to ``theano.config.floatX``, and ``y`` stacks the ``b'labels'``
        entries as ``int32``.
    """
    n_files = 5  # data_batch_1 .. data_batch_5
    data_parts = []
    label_parts = []
    for batch_no in range(1, n_files + 1):
        unpacked = unpickle(f'{data_dir}/data_batch_{batch_no}')
        data_parts.append(unpacked[b'data'])
        label_parts.append(unpacked[b'labels'])

    # Concatenate once at the end instead of re-copying the growing arrays on
    # every iteration; the labels no longer take a detour through float64.
    X = numpy.concatenate(data_parts).astype(theano.config.floatX)
    y = numpy.concatenate(label_parts).astype('int32')
    return X, y

def load_cifar10_test():
    """Load the CIFAR-10 test batch.

    Returns:
        ``(images, labels)``: the ``b'data'`` entry as a
        ``theano.config.floatX`` array and the ``b'labels'`` entry as an
        ``int32`` array.
    """
    batch = unpickle('cifar-10-batches-py/test_batch')
    images = numpy.array(batch[b'data'], dtype=theano.config.floatX)
    labels = numpy.array(batch[b'labels'], dtype='int32')
    return images, labels

In [8]:
# Load the full CIFAR-10 training data (features X, integer labels y).
# NOTE: the name `y` is rebound to a symbolic theano vector in the model-building
# cell further down, so re-running cells out of order will not see these labels.
X, y = load_cifar10_train()

In [9]:
# Hold out 20% of the loaded training data as a validation set.
# random_state=0 makes the split reproducible.
from sklearn.model_selection import train_test_split
train_set_x, valid_set_x, train_set_y, valid_set_y = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
# Load the CIFAR-10 test batch (features and int32 labels).
test_set_x, test_set_y = load_cifar10_test()

In [11]:
# for test
# Keep only the first 1000 rows of every split so that a full run stays quick.
train_set_x, valid_set_x, test_set_x = (
    features[0:1000, :] for features in (train_set_x, valid_set_x, test_set_x)
)
train_set_y, valid_set_y, test_set_y = (
    labels[0:1000] for labels in (train_set_y, valid_set_y, test_set_y)
)

In [12]:
class HiddenLayer(object):
    """Fully-connected layer with optional dropout on its input.

    Attributes set on the instance:
        input:      the symbolic input handed to the constructor.
        theano_rng: ``RandomStreams`` used to sample the dropout mask.
        W, b:       shared weight matrix ``(n_in, n_out)`` and bias ``(n_out,)``.
        output:     ``activation(dot(dropped_input, W) + b)``.
        params:     ``[W, b]``, the shared variables to be trained.

    Dropout caveats: when ``p_dropout`` is non-zero the random mask is part of
    ``output`` itself, so every function compiled from this graph applies it,
    and the surviving inputs are not rescaled.
    """

    def __init__(
        self,
        rng,
        input,
        n_in,
        n_out,
        p_dropout=0.0,
        W=None,
        b=None,
        activation=T.tanh # with activation=T.nnet.sigmoid this becomes a logistic-regression-style layer
    ):
        """Build the symbolic graph of the layer.

        Args:
            rng: numpy random state; seeds the dropout stream (``rng.randint``)
                and draws the initial weights (``rng.uniform``).
            input: symbolic input; it is multiplied by a ``(n_in, n_out)`` matrix.
            n_in: number of input units.
            n_out: number of output units.
            p_dropout: fraction of input entries set to zero.
            W: optional existing shared weight matrix to reuse.
            b: optional existing shared bias vector to reuse.
            activation: non-linearity applied to the affine output; ``None``
                keeps the output linear.
        """
        self.input = input
        self.theano_rng = RandomStreams(rng.randint(2 ** 30))

        # `W` is initialised with values sampled uniformly from
        # [-sqrt(6. / (n_in + n_out)), sqrt(6. / (n_in + n_out))], the range
        # suggested for tanh. The sample is converted to theano.config.floatX
        # so that the code is runnable on GPU.
        # Note: the optimal initialisation depends on the activation function
        #       (among other things). Results presented in [Xavier10] suggest
        #       4 times larger initial weights for sigmoid than for tanh.
        #       For any other activation the tanh range is used.
        if W is None:
            bound = numpy.sqrt(6. / (n_in + n_out))
            W_values = numpy.asarray(
                rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, borrow=True)

        self.W = W
        self.b = b

        dropout_input = self.get_corrupted_input(input, p_dropout)
        lin_output = T.dot(dropout_input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]

    def get_corrupted_input(self, input, corruption_level):
        """Return ``input`` with a ``corruption_level`` fraction of entries zeroed.

        Each entry is kept with probability ``1 - corruption_level`` via a
        binomial mask. A plain-number rate of zero returns ``input`` unchanged,
        which gives the same values as an all-ones mask without adding a
        random-sampling op to the graph.
        """
        if isinstance(corruption_level, (int, float)) and corruption_level == 0:
            return input
        keep_mask = self.theano_rng.binomial(size=input.shape, n=1,
                                             p=1 - corruption_level,
                                             dtype=theano.config.floatX)
        return keep_mask * input

In [13]:
class LogisticRegression(object):
    """Softmax output layer with zero-initialised weights.

    Attributes set on the instance:
        input:       the symbolic input handed to the constructor.
        W, b:        shared weight matrix ``(n_in, n_out)`` and bias ``(n_out,)``.
        p_y_given_x: softmax of ``dot(input, W) + b``.
        y_pred:      argmax of ``p_y_given_x`` along axis 1.
        params:      ``[W, b]``.
    """

    def __init__(self, input, n_in, n_out):
        float_type = theano.config.floatX
        self.input = input

        # both parameters start at zero
        self.W = theano.shared(
            value=numpy.zeros((n_in, n_out), dtype=float_type),
            name='W',
            borrow=True
        )
        self.b = theano.shared(
            value=numpy.zeros((n_out,), dtype=float_type),
            name='b',
            borrow=True
        )

        # predict_proba: softmax over the affine scores
        scores = T.dot(self.input, self.W) + self.b
        self.p_y_given_x = T.nnet.softmax(scores)
        # predict: index of the highest-probability column
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        """Mean negative log-probability assigned to the labels ``y``.

        For each row only the log-probability at the column given by ``y`` is
        picked out; the result is the negated mean of those values.
        """
        log_probs = T.log(self.p_y_given_x)
        row_ids = T.arange(y.shape[0])
        return -T.mean(log_probs[row_ids, y])

    def errors(self, y):
        """Fraction of rows whose predicted label differs from ``y``."""
        mismatches = T.neq(self.y_pred, y) # element-wise y_pred != y
        return T.mean(mismatches)

In [14]:
# note: the output size (per spatial dimension) is given by the formula:
# (Width - Filter_size + 2*Padding) / Stride + 1
# the stride is usually 1 in implementations such as conv2d (subsample=(1, 1) is the default used here)

In [19]:
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from collections import OrderedDict

# Hyper-parameters of the network and of plain gradient descent.
learning_rate=0.1 
nkerns=[20, 40] # num of feature maps per conv layer
batch_size=500

# Symbolic inputs. `index` selects which mini-batch of x / y a compiled function works on.
# NOTE: `y` rebinds the name that an earlier cell used for the loaded training labels.
# The seeded rng is consumed in construction order by the layers below, so
# reordering the layer definitions changes the initial weights.
rng = numpy.random.RandomState(23455)
x = T.matrix('x')
y = T.ivector('y')
index = T.lscalar('index')
x_batch = x[index * batch_size: (index + 1) * batch_size]
y_batch = y[index * batch_size: (index + 1) * batch_size]

# Every batch is reshaped to exactly batch_size images, so a compiled function
# only works on slices that contain batch_size rows.
layer0_input = x_batch.reshape((batch_size, 3, 32, 32)) # cifar data comes in RGB(3) channels in 32x32 size

# layer0 output size before pooling = (32 - 5 + 2*2) + 1 = 32; after pooling = 32/2 = 16
layer0 = LeNetConvPoolLayer(
    rng,
    input=layer0_input,
    image_shape=(batch_size, 3, 32, 32), # apparently, theano conv2d accepts a defined list/tuple as arg only, therefore this arg cannot be simplified in to input.shape (which would be a tensor vector)
    filter_shape=(nkerns[0], 3, 5, 5),
    poolsize=(2, 2),
    border_mode=(2, 2)
)

# layer1 output size before pooling = (16 - 5 + 2*2) + 1 = 16; after pooling = 16/2 = 8
layer1 = LeNetConvPoolLayer(
    rng,
    input=layer0.output,
    image_shape=(batch_size, nkerns[0], 16, 16),
    filter_shape=(nkerns[1], nkerns[0], 5, 5),
    poolsize=(2, 2),
    border_mode=(2, 2)
)

layer2_input = layer1.output.flatten(2) # theano function that flattens all dims after the first into a single dim, ie. row *(num_feature_maps * width * height)

# mlp layer(s) after 2 conv layers
# n_in must be a concrete number (it sizes the weight matrix), hence the
# hand-computed nkerns[1] * 8 * 8 rather than a symbolic shape.
layer2 = HiddenLayer(
    rng,
    input=layer2_input,
    n_in=nkerns[1] * 8 * 8,
#     n_in=T.shape(layer2_input)[1],
    n_out=1000,
    p_dropout=0.3,
    activation=T.tanh
)

layer3 = HiddenLayer(
    rng,
    input=layer2.output,
    n_in=1000,
    n_out=1000,
    p_dropout=0.3,
    activation=T.tanh
)

# output layer: softmax over the 10 classes
log_layer = LogisticRegression(
    input=layer3.output, 
    n_in=1000, 
    n_out=10
)

# Both expressions are built on top of layer2/layer3, which were created with
# p_dropout=0.3, so `errors` is computed through the same dropout-enabled graph as `cost`.
# NOTE(review): this means validation/test error is measured with dropout active - confirm this is intended.
cost = log_layer.negative_log_likelihood(y_batch)
errors = log_layer.errors(y_batch)

# create a list of all model parameters to be fit by gradient descent
params = log_layer.params + layer3.params + layer2.params + layer1.params + layer0.params

# create a list of gradients for all model parameters
grads = T.grad(cost, params)

# plain SGD: param <- param - learning_rate * grad
updates = OrderedDict()
for grad, param in zip(grads, params):
    # make sure that the learning rate is of the right dtype
    updates[param] = param - grad * T.cast(learning_rate, dtype=theano.config.floatX)

# One call = one SGD step on mini-batch `index` of the training set; returns the batch cost.
train_fn = theano.function(
    inputs=[index],
    outputs=cost,
    updates=updates,
    givens={
        x: train_set_x,
        y: train_set_y
    }
)

def get_errors(X_, y_, batch_size):
    """Build a scorer that returns the per-batch error rate on ``(X_, y_)``.

    A theano function is compiled that evaluates the module-level ``errors``
    expression with ``x`` and ``y`` replaced by ``X_`` and ``y_``.

    Args:
        X_: feature rows substituted for ``x``.
        y_: labels substituted for ``y``.
        batch_size: only used to work out how many batches to score; the
            slice taken per batch is fixed by the graph behind ``errors``.

    Returns:
        A zero-argument function returning a list with one error value
        (fraction of wrong predictions) per batch.
    """
    batch_count = int(X_.shape[0] / batch_size)
    batch_error_fn = theano.function(
        inputs=[index],
        outputs=errors, # perc of wrong preds
        givens={x: X_, y: y_}
    )

    def score_func():
        scores = []
        for batch_no in range(batch_count):
            scores.append(batch_error_fn(batch_no))
        return scores

    return score_func

# Build the validation and test scorers; each get_errors call compiles its own theano function.
get_validate_errors = get_errors(valid_set_x, valid_set_y, batch_size)
get_test_errors = get_errors(test_set_x, test_set_y, batch_size)

In [20]:
import timeit

# Number of full mini-batches in the training set; int() truncates, so a trailing partial batch is never used.
n_train_batches = int(train_set_x.shape[0] / batch_size)
# Upper bound on the number of passes over the training set.
training_epochs=1000

def train_cnn(train_step=None, validate_fn=None, test_fn=None, n_batches=None, n_epochs=None):
    """Run mini-batch training with patience-based early stopping.

    Every argument is optional and falls back to the notebook-level object the
    original zero-argument version used, so ``train_cnn()`` behaves as before.

    Args:
        train_step: callable taking a mini-batch index and performing one
            update (default: ``train_fn``).
        validate_fn: zero-argument callable returning per-batch validation
            errors (default: ``get_validate_errors``).
        test_fn: zero-argument callable returning per-batch test errors
            (default: ``get_test_errors``).
        n_batches: mini-batches per epoch (default: ``n_train_batches``).
        n_epochs: maximum number of epochs (default: ``training_epochs``).

    Returns:
        ``(best_validation_loss, best_iter, test_score)``. ``best_iter`` is the
        0-based iteration of the best validation result, and ``test_score`` is
        the test error measured at that iteration. If no validation ever
        improves on the initial ``inf``, or there are no batches at all, the
        result is ``(inf, 0, 0.0)``; previously this case raised
        ``UnboundLocalError`` because ``best_iter`` was never assigned.
    """
    # The globals are only looked up when the matching argument is left out.
    train_step = train_fn if train_step is None else train_step
    validate_fn = get_validate_errors if validate_fn is None else validate_fn
    test_fn = get_test_errors if test_fn is None else test_fn
    n_batches = n_train_batches if n_batches is None else n_batches
    n_epochs = training_epochs if n_epochs is None else n_epochs

    print('... training the model')

    # early-stopping parameters
    patience = 10000 # run at least this many iterations regardless
    patience_increase = 2. # extend patience to this multiple of the current iteration on a significant improvement
    improvement_threshold = 0.995 # a relative improvement of this much is considered significant

    # go through this many minibatches before checking the network on
    # the validation set; with few batches per epoch this is once per epoch
    validation_frequency = min(n_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.

    for epoch in range(n_epochs):
        for minibatch_index in range(n_batches):
            train_step(minibatch_index)
            iteration = epoch * n_batches + minibatch_index

            # for every 'validation_frequency' iterations
            if (iteration + 1) % validation_frequency == 0:
                validation_losses = validate_fn()
                curr_mean_validation_loss = numpy.mean(validation_losses, dtype='float64')
                print(f'epoch {epoch}, minibatch {minibatch_index + 1}/{n_batches}, validation error {curr_mean_validation_loss * 100.}%')

                # if this is the lowest validation error so far
                if curr_mean_validation_loss < best_validation_loss:
                    # a large enough improvement extends the patience
                    if (curr_mean_validation_loss < best_validation_loss * improvement_threshold):
                        patience = max(patience, iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = curr_mean_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = test_fn()
                    test_score = numpy.mean(test_losses, dtype='float64')
                    print(f'     epoch {epoch}, minibatch minibatch {minibatch_index + 1}/{n_batches}, test error of best model {test_score * 100.}%')

            # stop as soon as the patience budget is used up
            if patience <= iteration:
                return best_validation_loss, best_iter, test_score
    return best_validation_loss, best_iter, test_score

# Time the whole training run with timeit.default_timer.
start_time = timeit.default_timer()
best_validation_loss, best_iter, test_score = train_cnn()
end_time = timeit.default_timer()

# best_iter is a 0-based iteration counter, hence the + 1 when reporting it;
# the error rates are fractions and are shown as percentages.
print(f'training time: {end_time - start_time}s.')
print(f'Optimization complete with best validation score of {best_validation_loss * 100.}%,\n'
    f'obtained at iteration {best_iter + 1},\n'
    f'with test performance {test_score * 100.}%')

... training the model
epoch 0, minibatch 2/2, validation error 88.8%
     epoch 0, minibatch minibatch 2/2, test error of best model 88.0%
epoch 1, minibatch 2/2, validation error 85.00000000000001%
     epoch 1, minibatch minibatch 2/2, test error of best model 83.1%
epoch 2, minibatch 2/2, validation error 83.5%
     epoch 2, minibatch minibatch 2/2, test error of best model 82.30000000000001%
epoch 3, minibatch 2/2, validation error 85.2%
epoch 4, minibatch 2/2, validation error 88.5%
epoch 5, minibatch 2/2, validation error 78.7%
     epoch 5, minibatch minibatch 2/2, test error of best model 76.5%
epoch 6, minibatch 2/2, validation error 91.0%
epoch 7, minibatch 2/2, validation error 82.30000000000001%
epoch 8, minibatch 2/2, validation error 77.8%
     epoch 8, minibatch minibatch 2/2, test error of best model 76.8%
epoch 9, minibatch 2/2, validation error 85.8%
epoch 10, minibatch 2/2, validation error 82.4%
epoch 11, minibatch 2/2, validation error 77.8%
epoch 12, minibatch 2/

KeyboardInterrupt: 