In [1]:
# Configure Theano through THEANO_FLAGS (this setting selects the CPU device with float32); must be done before theano is imported
import os    
os.environ['THEANO_FLAGS'] = "device=cpu,floatX=float32,optimizer=None,exception_verbosity=high"  

In [2]:
import theano

In [23]:
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from collections import OrderedDict

class dA:
    """Denoising auto-encoder with tied weights.

    The encoder maps the (optionally corrupted) input to a hidden code with
    ``sigmoid(x . W + b)``; the decoder reconstructs the input with
    ``sigmoid(h . W.T + b_prime)``, i.e. it reuses the transposed encoder
    weights.

    :param numpy_rng: ``numpy.random.RandomState`` used to draw the initial
        weights; when ``None`` a fresh ``RandomState(1234)`` is created.
    :param theano_rng: Theano ``RandomStreams`` used to draw the corruption
        mask; when ``None`` one is created, seeded from ``RandomState(1234)``.
    :param input: symbolic variable holding the data; when ``None`` a new
        ``T.dmatrix`` named ``'input'`` is created.
    :param n_visible: number of visible (input) units.
    :param n_hidden: number of hidden units.
    :param W: optional shared weights of shape ``(n_visible, n_hidden)``;
        created by :meth:`initial_W` when ``None``.
    :param bhid: optional shared hidden bias; zeros of length ``n_hidden``
        when ``None``.
    :param bvis: optional shared visible bias; zeros of length ``n_visible``
        when ``None``.
    """

    def __init__(
        self,
        numpy_rng=None,
        theano_rng=None,
        input=None,
        n_visible=784,
        n_hidden=500,
        W=None,
        bhid=None,
        bvis=None
    ):
        # The defaults are built here, per instance. Building them in the
        # signature would evaluate them once at definition time and silently
        # share the same RNG state / symbolic input between all instances.
        if numpy_rng is None:
            numpy_rng = numpy.random.RandomState(1234)
        if theano_rng is None:
            # a RandomStreams object is seeded with an integer drawn from a numpy rng
            theano_rng = RandomStreams(numpy.random.RandomState(1234).randint(2 ** 30))
        if input is None:
            input = T.dmatrix(name='input')

        self.x = input
        self.n_visible = n_visible
        self.n_hidden = n_hidden
        self.numpy_rng = numpy_rng
        self.theano_rng = theano_rng

        # Explicit `is None` checks: a truthiness test (`W or ...`) depends on
        # the object's __bool__ and raises for array-like parameters.
        # weights of visible layer to hidden layer
        self.W = W if W is not None else self.initial_W(
            rng=self.numpy_rng, n_hidden=n_hidden, n_visible=n_visible)
        # weights of hidden layer to reconstruction/visible layer = shared (tied) weights
        self.W_prime = self.W.T
        # bias of hidden layer
        self.b = bhid if bhid is not None else self.bias_obj(n=n_hidden, name='b')
        # bias of reconstruction/visible layer
        self.b_prime = bvis if bvis is not None else self.bias_obj(n=n_visible, name='b_prime')

        # shared variables updated during training
        self.params = [self.W, self.b, self.b_prime]

    def initial_W(self, rng=None, n_hidden=None, n_visible=None):
        """Return a shared ``(n_visible, n_hidden)`` weight matrix named 'W'.

        Values are drawn uniformly from
        ``[-4 * sqrt(6 / (n_hidden + n_visible)), +4 * sqrt(6 / (n_hidden + n_visible))]``
        using ``rng`` and stored as ``theano.config.floatX``.
        """
        bound = 4 * numpy.sqrt(6. / (n_hidden + n_visible))
        W = numpy.asarray(
            rng.uniform(low=-bound, high=bound, size=(n_visible, n_hidden)),
            dtype=theano.config.floatX
        )
        return theano.shared(value=W, name='W', borrow=True)

    def bias_obj(self, n=None, name=None):
        """Return a shared zero vector of length ``n`` (dtype floatX) called ``name``."""
        return theano.shared(
            value=numpy.zeros(n, dtype=theano.config.floatX),
            name=name,
            borrow=True
        )

    def get_corrupted_input(self, input, corruption_level):
        """Multiply ``input`` by a random 0/1 mask.

        Each mask entry is a single binomial draw that is 1 with probability
        ``1 - corruption_level``, so on average a ``corruption_level`` fraction
        of the entries is set to 0.
        """
        mask = self.theano_rng.binomial(size=input.shape, n=1,
                                        p=1 - corruption_level,
                                        dtype=theano.config.floatX)
        return mask * input

    def get_hidden_values(self, input):
        """Encode ``input``: ``sigmoid(input . W + b)``."""
        return T.nnet.sigmoid(T.dot(input, self.W) + self.b)

    def get_reconstructed_input(self, hidden):
        """Decode ``hidden``: ``sigmoid(hidden . W_prime + b_prime)``."""
        return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)

    def get_cost_updates(self, corruption_level, learning_rate):
        """Build the training cost and the gradient-descent updates.

        :param corruption_level: probability of zeroing each input entry.
        :param learning_rate: step size of the gradient-descent update.
        :returns: ``(cost, updates)`` where ``cost`` is the mean (over rows)
            of the per-row cross-entropy between the clean input and its
            reconstruction from the corrupted input, and ``updates`` is an
            ``OrderedDict`` mapping every entry of ``self.params`` to
            ``param - learning_rate * gradient``.
        """
        tilde_x = self.get_corrupted_input(self.x, corruption_level)
        y = self.get_hidden_values(tilde_x)
        z = self.get_reconstructed_input(y)
        # Cross-entropy reconstruction error, summed over the units of each row
        # (suitable when the inputs are bit vectors / values in [0, 1]).
        # An earlier version used the squared error T.mean((0.5 * (z - self.x)) ** 2).
        L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
        cost = T.mean(L)

        gparams = T.grad(cost, self.params)
        # cast once so the learning rate has the same dtype as the parameters
        lr = T.cast(learning_rate, dtype=theano.config.floatX)
        updates = OrderedDict()
        for gparam, param in zip(gparams, self.params):
            updates[param] = param - gparam * lr

        return (cost, updates)

In [24]:
# helper method to plot hidden layer

import numpy


def scale_to_unit_interval(ndar, eps=1e-8):
    """Return a copy of ``ndar`` rescaled so its values span 0 to (almost) 1.

    The minimum is shifted to 0 and the result is divided by
    ``max + eps``; ``eps`` keeps the division defined for a constant array.
    The argument itself is left untouched.
    """
    scaled = ndar.copy()
    # shift so the smallest value becomes 0 (written into the copy)
    numpy.subtract(scaled, scaled.min(), out=scaled)
    # divide by the (shifted) maximum, again writing into the copy
    numpy.multiply(scaled, 1.0 / (scaled.max() + eps), out=scaled)
    return scaled


def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
                       scale_rows_to_unit_interval=True,
                       output_pixel_vals=True):
    """
    Transform an array with one flattened image per row, into an array in
    which images are reshaped and laid out like tiles on a floor.

    This function is useful for visualizing datasets whose rows are images,
    and also columns of matrices for transforming those rows
    (such as the first layer of a neural net).

    :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
    be 2-D ndarrays or None;
    :param X: a 2-D array in which every row is a flattened image. When a
    4-tuple is given, each element is tiled separately and becomes one
    channel of the output; a ``None`` channel is filled with its default
    (0 for the first three channels, opaque for the fourth).

    :type img_shape: tuple; (height, width)
    :param img_shape: the original shape of each image

    :type tile_shape: tuple; (rows, cols)
    :param tile_shape: the number of images to tile (rows, cols)

    :type tile_spacing: tuple; (rows, cols)
    :param tile_spacing: number of blank (zero) pixels left between tiles

    :param output_pixel_vals: if output should be pixel values (i.e. uint8
    values, images multiplied by 255) or floats

    :param scale_rows_to_unit_interval: if the values need to be scaled before
    being plotted to [0,1] or not


    :returns: array suitable for viewing as an image.
    (See:`Image.fromarray`.)
    :rtype: a 2-d array for array input, or a (height, width, 4) array for
    tuple input; dtype is uint8 when ``output_pixel_vals`` is true, otherwise
    the dtype of X (for a tuple: of its first non-None channel).

    """

    assert len(img_shape) == 2
    assert len(tile_shape) == 2
    assert len(tile_spacing) == 2

    # Per axis: tiles plus the spacing between them (no spacing after the
    # last tile), i.e. (img + spacing) * n_tiles - spacing.
    out_shape = [
        (ishp + tsp) * tshp - tsp
        for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing)
    ]

    if isinstance(X, tuple):
        assert len(X) == 4
        if output_pixel_vals:
            out_dtype = 'uint8'
            # colors default to 0, alpha defaults to opaque
            channel_defaults = [0, 0, 0, 255]
        else:
            # A tuple has no `.dtype` attribute, so take the dtype from the
            # first channel that was actually supplied (float64 if none was).
            out_dtype = next(
                (channel.dtype for channel in X if channel is not None),
                numpy.dtype('float64')
            )
            channel_defaults = [0., 0., 0., 1.]

        out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
                                dtype=out_dtype)

        for i, channel in enumerate(X):
            if channel is None:
                # missing channel: fill the whole plane with its default
                out_array[:, :, i] = channel_defaults[i]
            else:
                # recursive call tiles this single channel
                out_array[:, :, i] = tile_raster_images(
                    channel, img_shape, tile_shape, tile_spacing,
                    scale_rows_to_unit_interval, output_pixel_vals)
        return out_array

    # Single-channel case.
    H, W = img_shape
    Hs, Ws = tile_spacing
    n_tile_rows, n_tile_cols = tile_shape

    out_array = numpy.zeros(
        out_shape, dtype='uint8' if output_pixel_vals else X.dtype)
    # pixel output maps the unit interval onto 0..255
    pixel_scale = 255 if output_pixel_vals else 1

    for tile_row in range(n_tile_rows):
        for tile_col in range(n_tile_cols):
            img_index = tile_row * n_tile_cols + tile_col
            if img_index >= X.shape[0]:
                # fewer images than tiles: leave the remaining tiles blank
                continue
            this_img = X[img_index].reshape(img_shape)
            if scale_rows_to_unit_interval:
                this_img = scale_to_unit_interval(this_img)
            top = tile_row * (H + Hs)
            left = tile_col * (W + Ws)
            out_array[top: top + H, left: left + W] = this_img * pixel_scale
    return out_array

In [62]:
import pickle
import gzip

def load_data(dataset):
    """Load a gzip-compressed pickle holding three data sets.

    :param dataset: path of the ``.pkl.gz`` file.
    :returns: the tuple ``(train_set, valid_set, test_set)`` exactly as
        stored in the file.
    :raises ValueError: if the pickled object does not unpack into three parts.

    NOTE: unpickling can execute arbitrary code; only use this on files from
    a trusted source.
    """
    # `with` guarantees the file is closed even when unpickling fails
    with gzip.open(dataset, 'rb') as f:
        # encoding='latin1' lets Python 3 read pickles written by Python 2
        train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
    return train_set, valid_set, test_set

# Read the three splits once, then unpack each (inputs, labels) pair.
datasets = load_data('mnist.pkl.gz')
(
    (train_set_x, train_set_y),
    (valid_set_x, valid_set_y),
    (test_set_x, test_set_y),
) = datasets

In [33]:
import os
import timeit

def test_da(
    learning_rate=0.1,
    training_epochs=15,
    batch_size=20,
    output_folder='dA_plots',
    X = train_set_x,
    corruption_level = 0.0
):
    """Train a single denoising auto-encoder (784 visible, 500 hidden units).

    :param learning_rate: step size of the gradient-descent updates.
    :param training_epochs: number of passes over ``X``.
    :param batch_size: number of rows of ``X`` per mini-batch.
    :param output_folder: currently unused; kept so existing calls keep working.
    :param X: 2-D array with one flattened 28x28 image per row.
    :param corruption_level: probability of zeroing each input entry
        (0.0 trains a plain auto-encoder).
    :returns: the trained ``dA`` instance.

    Prints the mean mini-batch cost after every epoch and the total training
    time at the end. Rows that do not fill a whole mini-batch are skipped.
    """
    index = T.lscalar()   # mini-batch index
    train = T.matrix('x')  # placeholder for the full data set, bound via `givens`
    x = train[index * batch_size: (index + 1) * batch_size]

    numpy_rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    da = dA(
        numpy_rng=numpy_rng,
        theano_rng=theano_rng,
        input=x,
        n_visible=28 * 28,
        n_hidden=500
    )

    cost, updates = da.get_cost_updates(
        corruption_level=corruption_level,
        learning_rate=learning_rate
    )

    train_da = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            train: X
        }
    )

    # start training
    start_time = timeit.default_timer()
    # integer division: only complete mini-batches are used
    n_train_batches = X.shape[0] // batch_size

    for epoch in range(training_epochs):
        batch_costs = [train_da(batch_index) for batch_index in range(n_train_batches)]
        print('Training epoch %d, cost is ' % epoch, numpy.mean(batch_costs))
    end_time = timeit.default_timer()
    training_time = end_time - start_time
    # Report the corruption level that was actually used; the old message
    # claimed "no corruption" for every run.
    print(f'Training with corruption level {corruption_level} ran for {training_time / 60.} m')

    return da

In [34]:
# Train a plain auto-encoder: test_da's default corruption_level is 0.0.
da = test_da()

Training epoch 0, cost is  63.28917
Training epoch 1, cost is  55.786655
Training epoch 2, cost is  54.76311
Training epoch 3, cost is  54.24205
Training epoch 4, cost is  53.888668
Training epoch 5, cost is  53.62035
Training epoch 6, cost is  53.40375
Training epoch 7, cost is  53.221992
Training epoch 8, cost is  53.0658
Training epoch 9, cost is  52.92956
Training epoch 10, cost is  52.809414
Training epoch 11, cost is  52.70244
Training epoch 12, cost is  52.60631
Training epoch 13, cost is  52.51917
Training epoch 14, cost is  52.439526
The no corruption code training an for 1.8524673257833153 m


<__main__.dA at 0x1c567e7390>

In [32]:
import PIL.Image as Image
# W has one column per hidden unit; transposing gives one flattened 28x28
# filter per row. A 10x10 grid shows the first 100 of them.
learned_filters = da.W.get_value(borrow=True).T
image = Image.fromarray(tile_raster_images(
    X=learned_filters,
    img_shape=(28, 28),
    tile_shape=(10, 10),
    tile_spacing=(1, 1),
))
image.save('filters_corruption_0.png')

In [36]:
# Retrain with 30% input corruption; this rebinds `da` to the new model.
da = test_da(corruption_level=0.3)

Training epoch 0, cost is  81.771416
Training epoch 1, cost is  73.42857
Training epoch 2, cost is  70.86327
Training epoch 3, cost is  69.33966
Training epoch 4, cost is  68.41347
Training epoch 5, cost is  67.72368
Training epoch 6, cost is  67.240135
Training epoch 7, cost is  66.8493
Training epoch 8, cost is  66.56639
Training epoch 9, cost is  66.35912
Training epoch 10, cost is  66.13366
Training epoch 11, cost is  65.989395
Training epoch 12, cost is  65.83441
Training epoch 13, cost is  65.71854
Training epoch 14, cost is  65.601074
The no corruption code training an for 2.0588454288666376 m


In [37]:
import PIL.Image as Image
# W has one column per hidden unit; transposing gives one flattened 28x28
# filter per row. A 10x10 grid shows the first 100 of them.
learned_filters = da.W.get_value(borrow=True).T
image = Image.fromarray(tile_raster_images(
    X=learned_filters,
    img_shape=(28, 28),
    tile_shape=(10, 10),
    tile_spacing=(1, 1),
))
image.save('filters_corruption_0_3.png')

In [None]:
################################
########## Stacked dA ##########
################################

In [38]:
class HiddenLayer(object):
    """Fully connected layer computing ``activation(input . W + b)``.

    :param rng: numpy random state used to draw the initial weights.
    :param input: symbolic input of the layer.
    :param n_in: number of input units.
    :param n_out: number of output units.
    :param W: optional shared weight matrix; created here when ``None``.
    :param b: optional shared bias vector; zeros when ``None``.
    :param activation: non-linearity applied to the linear output; ``None``
        leaves the output linear. With ``T.nnet.sigmoid`` the layer becomes a
        logistic-regression-style layer.
    """

    def __init__(self, rng, input, n_in, n_out, W=None, b=None,activation=T.tanh): # if set activation=T.nnet.sigmoid, becomes logistic regression layer
        float_type = theano.config.floatX
        self.input = input

        if W is None:
            # Weights are sampled uniformly from
            # [-sqrt(6 / (n_in + n_out)), +sqrt(6 / (n_in + n_out))], the range
            # used for tanh, and converted to floatX so the code can run on GPU.
            # The best initial range depends on the activation; results in
            # [Xavier10] suggest 4 times larger weights for sigmoid. Any other
            # activation gets the tanh range for lack of better information.
            bound = numpy.sqrt(6. / (n_in + n_out))
            W_values = numpy.asarray(
                rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
                dtype=float_type
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4
            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b = theano.shared(
                value=numpy.zeros((n_out,), dtype=float_type),
                name='b',
                borrow=True
            )

        self.W, self.b = W, b

        pre_activation = T.dot(input, self.W) + self.b
        if activation is None:
            self.output = pre_activation
        else:
            self.output = activation(pre_activation)

        # parameters of the model
        self.params = [self.W, self.b]

In [39]:
class LogisticRegression(object):
    """Softmax classification layer with zero-initialised parameters.

    :param input: symbolic input of the layer.
    :param n_in: number of input units.
    :param n_out: number of classes.
    """

    def __init__(self, input, n_in, n_out):
        float_type = theano.config.floatX
        self.input = input

        zero_weights = numpy.zeros((n_in, n_out), dtype=float_type)
        self.W = theano.shared(value=zero_weights, name='W', borrow=True)

        zero_bias = numpy.zeros((n_out,), dtype=float_type)
        self.b = theano.shared(value=zero_bias, name='b', borrow=True)

        # class probabilities (predict_proba): softmax of the linear scores
        class_scores = T.dot(self.input, self.W) + self.b
        self.p_y_given_x = T.nnet.softmax(class_scores)
        # predicted class (predict): index of the largest probability per row
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        """Mean negative log-probability assigned to the labels ``y``.

        For every row only the log-probability of its own label is picked
        out; the result is the negated mean of those values.
        """
        log_probs = T.log(self.p_y_given_x)
        row_ids = T.arange(y.shape[0])
        return -T.mean(log_probs[row_ids, y])

    def errors(self, y):
        """Fraction of rows whose predicted class differs from ``y``."""
        is_wrong = T.neq(self.y_pred, y)  # T.neq(a, b) is the element-wise a != b
        return T.mean(is_wrong)

In [56]:
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

class SdA:
    """Stacked denoising auto-encoder.

    A stack of sigmoid ``HiddenLayer`` objects, each paired with a ``dA`` that
    shares its weights and hidden bias, topped by a ``LogisticRegression``
    layer. The dAs are used for layer-wise pre-training; the sigmoid layers
    plus the logistic layer form the network that is fine-tuned.

    :param numpy_rng: ``numpy.random.RandomState`` for weight initialisation;
        a fresh ``RandomState(1234)`` when ``None``.
    :param theano_rng: Theano ``RandomStreams``; when ``None`` one is created,
        seeded from ``numpy_rng``.
    :param n_ins: dimension of the network input.
    :param n_outs: number of output classes.
    :param hidden_layers_sizes: one entry per hidden layer, giving its number
        of units; must not be empty.
    :param corruption_levels: accepted for interface compatibility but not
        used by this class; the corruption level is an input of each
        function returned by :meth:`pretraining_functions`.
    """

    def __init__(
        self,
        numpy_rng=None,
        theano_rng=None,
        n_ins=784,
        n_outs=10,
        hidden_layers_sizes=(500, 500),  # each elem represents one layer with n (value of elem) nodes
        corruption_levels=(0.1, 0.1)
    ):
        # Built per instance: a RandomState created in the signature would be
        # shared (and advanced) by every SdA constructed with the default.
        if numpy_rng is None:
            numpy_rng = numpy.random.RandomState(1234)

        self.sigmoid_layers = []
        self.da_layers = []
        self.params = []  # holds the shared/updatable vars
        self.n_layers = len(hidden_layers_sizes)
        assert self.n_layers > 0

        self.numpy_rng = numpy_rng
        self.theano_rng = (
            theano_rng if theano_rng is not None
            else RandomStreams(numpy_rng.randint(2 ** 30))
        )
        self.index = T.lscalar('index')
        self.batch_size = T.lscalar('batch_size')
        self.x = T.matrix('x')
        self.x_batch = self.x[self.index * self.batch_size : (self.index + 1) * self.batch_size]
        self.y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels
        self.y_batch = self.y[self.index * self.batch_size : (self.index + 1) * self.batch_size]

        for i in range(self.n_layers):
            output_size = hidden_layers_sizes[i]
            if i == 0:
                # the first layer reads the mini-batch itself
                input_size = n_ins
                layer_input = self.x_batch
            else:
                # later layers read the activation of the previous layer
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(
                rng=self.numpy_rng,
                input=layer_input,
                n_in=input_size,
                n_out=output_size,
                activation=T.nnet.sigmoid
            )
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)  # sigmoid_layer.params is [W, b]

            # dA that pre-trains the W and b of the sigmoid layer it is paired with
            da_layer = dA(
                numpy_rng=self.numpy_rng,
                theano_rng=self.theano_rng,
                input=layer_input,
                n_visible=input_size,
                n_hidden=output_size,
                W=sigmoid_layer.W,
                bhid=sigmoid_layer.b
            )
            self.da_layers.append(da_layer)
            # Note: the visible (reconstruction) bias of each dA is not added
            # to self.params, whereas W and the hidden bias are already there
            # through the sigmoid layer.

        # The sigmoid layers produce neither a prediction nor an error, so a
        # LogisticRegression layer is put on top of the last sigmoid layer.
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # fine-tuning cost = negative log-likelihood of the logistic layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y_batch)
        # fraction of wrong predictions in the mini-batch
        self.errors = self.logLayer.errors(self.y_batch)

    def pretraining_functions(self, X, batch_size):
        """Compile one pre-training function per dA layer.

        :param X: data bound to the symbolic input ``self.x``.
        :param batch_size: number of rows per mini-batch.
        :returns: list of Theano functions, one per layer; each is called as
            ``fn(index, corruption_level, learning_rate)``, performs one
            update step on mini-batch ``index`` and returns the cost.
        """
        corruption_level = T.scalar('corruption')
        learning_rate = T.scalar('lr')

        pretrain_fns = []
        for da in self.da_layers:
            cost, updates = da.get_cost_updates(corruption_level, learning_rate)

            fn = theano.function(
                inputs=[self.index, corruption_level, learning_rate],
                outputs=cost,
                updates=updates,
                givens={
                    self.batch_size: batch_size,
                    self.x: X
                }
            )
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_train_function(
        self,
        train_x,
        train_y,
        batch_size,
        learning_rate
    ):
        """Compile the supervised fine-tuning step.

        :param train_x: training inputs bound to ``self.x``.
        :param train_y: training labels bound to ``self.y``.
        :param batch_size: number of rows per mini-batch.
        :param learning_rate: step size of the gradient-descent update.
        :returns: Theano function ``train_fn(index)`` that updates all entries
            of ``self.params`` on mini-batch ``index`` and returns the
            fine-tuning cost.
        """
        # gradients of the whole network, derived symbolically by theano
        gparams = T.grad(self.finetune_cost, self.params)

        # An ordered list of (shared variable, new value) pairs keeps the
        # update order deterministic, unlike a plain dict.
        lr = T.cast(learning_rate, dtype=theano.config.floatX)
        updates = [
            (param, param - gparam * lr)
            for param, gparam in zip(self.params, gparams)
        ]

        train_fn = theano.function(
            inputs=[self.index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.batch_size: batch_size,
                self.x: train_x,
                self.y: train_y
            }
        )
        return train_fn

    def get_errors(self, X, y, batch_size):
        """Build a scorer for the data set ``(X, y)``.

        :param X: inputs bound to ``self.x``.
        :param y: labels bound to ``self.y``.
        :param batch_size: number of rows per mini-batch.
        :returns: a zero-argument function that returns the list of error
            rates (fraction of wrong predictions), one per complete
            mini-batch.
        """
        # integer division: a trailing incomplete mini-batch is not scored
        n_batches = X.shape[0] // batch_size
        get_batch_error = theano.function(
            inputs=[self.index],
            outputs=self.errors,
            givens={
                self.batch_size: batch_size,
                self.x: X,
                self.y: y
            }
        )

        def score_func():
            return [get_batch_error(i) for i in range(n_batches)]

        return score_func

In [64]:
# Pre-training hyper-parameters. For a quick smoke test use e.g.
# pretrain_lr=0.01 and pretraining_epochs=1.
batch_size = 10
pretrain_lr = 0.001
pretraining_epochs = 15
corruption_levels = [0.1, 0.2, 0.3]  # one corruption level per hidden layer

# The SdA label vector is a T.ivector, so the labels must be int32.
train_set_y, valid_set_y, test_set_y = (
    labels.astype('int32')
    for labels in (train_set_y, valid_set_y, test_set_y)
)

numpy_rng = numpy.random.RandomState(89677)
n_train_batches = int(train_set_x.shape[0] / batch_size)

print('... building the model')
sda = SdA(
    numpy_rng=numpy_rng,
    n_ins=28 * 28,
    hidden_layers_sizes=[500, 100, 20],
    n_outs=10
)

pretraining_fns = sda.pretraining_functions(
    X=train_set_x,
    batch_size=batch_size
)

print('... pre-training the model')
start_time = timeit.default_timer()

# Pre-train layer-wise: finish all epochs of one layer before the next.
for i in range(sda.n_layers):
    pretrain_layer = pretraining_fns[i]
    layer_corruption = corruption_levels[i]
    for epoch in range(pretraining_epochs):
        costs = [
            pretrain_layer(batch_index, layer_corruption, pretrain_lr)
            for batch_index in range(n_train_batches)
        ]
        print(f'Pre-training layer {i}, epoch {epoch}, mean cost: ')
        print(numpy.mean(costs, dtype='float64'))

end_time = timeit.default_timer()

print(f'Preraining time: {end_time - start_time} s.')

... building the model
... pre-training the model
Pre-training layer 0, epoch 0, mean cost: 
114.97539083404541
Pre-training layer 0, epoch 1, mean cost: 
89.29495334701538
Pre-training layer 0, epoch 2, mean cost: 
82.76587084350587
Pre-training layer 0, epoch 3, mean cost: 
79.09152413101197
Pre-training layer 0, epoch 4, mean cost: 
76.63940047073365
Pre-training layer 0, epoch 5, mean cost: 
74.84518805465699
Pre-training layer 0, epoch 6, mean cost: 
73.42404341583251
Pre-training layer 0, epoch 7, mean cost: 
72.28049770202637
Pre-training layer 0, epoch 8, mean cost: 
71.30632259674073
Pre-training layer 0, epoch 9, mean cost: 
70.50293253479003
Pre-training layer 0, epoch 10, mean cost: 
69.81221449508666
Pre-training layer 0, epoch 11, mean cost: 
69.18325417785644
Pre-training layer 0, epoch 12, mean cost: 
68.61638901062011
Pre-training layer 0, epoch 13, mean cost: 
68.16601371383668
Pre-training layer 0, epoch 14, mean cost: 
67.71901187667847
Pre-training layer 1, epoch 0

In [65]:
# Fine-tuning hyper-parameters.
finetune_lr = 0.1
training_epochs = 1000  # upper bound; train_mlp may return earlier

print('... getting the finetuning functions')
# One supervised update step per call, on the training split.
train_fn = sda.build_train_function(
    train_x=train_set_x,
    train_y=train_set_y,
    batch_size=batch_size,
    learning_rate=finetune_lr,
)
# Zero-argument scorers returning the per-mini-batch error rates.
get_validate_errors = sda.get_errors(X=valid_set_x, y=valid_set_y, batch_size=batch_size)
get_test_errors = sda.get_errors(X=test_set_x, y=test_set_y, batch_size=batch_size)

def train_mlp():
    """Fine-tune the network with patience-based early stopping.

    Reads the notebook globals ``n_train_batches``, ``training_epochs``,
    ``train_fn``, ``get_validate_errors`` and ``get_test_errors``.

    The validation error is measured every ``validation_frequency``
    mini-batches. Whenever it reaches a new minimum the test error is
    measured too, and if the improvement is better than
    ``improvement_threshold`` (relative) the patience is extended to
    ``patience_increase`` times the current iteration. Training stops when
    the iteration count reaches the patience or after ``training_epochs``
    epochs.

    :returns: ``(best_validation_loss, best_iter, test_score)``: the lowest
        mean validation error, the (0-based) iteration at which it was
        measured and the mean test error at that point. If no validation
        pass ever ran, this is ``(inf, 0, 0.)``.
    """
    print('... finetuning the model')

    # early-stopping parameters
    patience = 4 * n_train_batches  # train for at least this many mini-batches, i.e. 4 epochs
    patience_increase = 2.  # extend patience to this multiple of the iteration on a new best
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant

    # Number of mini-batches between two validation passes; with the values
    # above this is n_train_batches, i.e. once per epoch. Integer division
    # keeps it an int.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0  # defined up front so the returns below never hit an unbound name
    test_score = 0.

    for epoch in range(training_epochs):
        for minibatch_index in range(n_train_batches):
            train_fn(minibatch_index)
            # global mini-batch counter (named `iteration` to avoid shadowing the builtin `iter`)
            iteration = epoch * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                validation_losses = get_validate_errors()
                curr_mean_validation_loss = numpy.mean(validation_losses, dtype='float64')
                print(f'epoch {epoch}, minibatch {minibatch_index + 1}/{n_train_batches}, validation error {curr_mean_validation_loss * 100.}%')

                # new best validation error
                if curr_mean_validation_loss < best_validation_loss:
                    # a significant improvement buys more training time
                    if curr_mean_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = curr_mean_validation_loss
                    best_iter = iteration

                    # score the new best model on the test set
                    test_losses = get_test_errors()
                    test_score = numpy.mean(test_losses, dtype='float64')
                    print(f'     epoch {epoch}, minibatch {minibatch_index + 1}/{n_train_batches}, test error of best model {test_score * 100.}%')

            # out of patience: no significant improvement recently
            if patience <= iteration:
                return best_validation_loss, best_iter, test_score
    return best_validation_loss, best_iter, test_score

# Time the fine-tuning run itself. Without resetting start_time here, the
# value from the pre-training cell would be used and the reported time would
# also include pre-training and any idle time between the cells.
start_time = timeit.default_timer()
best_validation_loss, best_iter, test_score = train_mlp()
end_time = timeit.default_timer()

print(f'training time: {end_time - start_time}s.')
print(f'Optimization complete with best validation score of {best_validation_loss * 100.}%,\n'
    f'obtained at iteration {best_iter + 1},\n'
    f'with test performance {test_score * 100.}%')

... getting the finetuning functions




... finetuning the model
epoch 0, minibatch 5000/5000, validation error 7.290000000000001%
     epoch 0, minibatch minibatch 5000/5000, test error of best model 7.870000000000001%
epoch 1, minibatch 5000/5000, validation error 5.5200000000000005%
     epoch 1, minibatch minibatch 5000/5000, test error of best model 6.09%
epoch 2, minibatch 5000/5000, validation error 4.5%
     epoch 2, minibatch minibatch 5000/5000, test error of best model 4.9799999999999995%
epoch 3, minibatch 5000/5000, validation error 4.13%
     epoch 3, minibatch minibatch 5000/5000, test error of best model 4.38%
epoch 4, minibatch 5000/5000, validation error 3.8600000000000003%
     epoch 4, minibatch minibatch 5000/5000, test error of best model 4.140000000000001%
epoch 5, minibatch 5000/5000, validation error 3.5700000000000003%
     epoch 5, minibatch minibatch 5000/5000, test error of best model 3.91%
epoch 6, minibatch 5000/5000, validation error 3.55%
     epoch 6, minibatch minibatch 5000/5000, test erro

KeyboardInterrupt: 