In [1]:
# conda build conda=4.6.8=py36_0; python v3.6 is needed for theano1.0.3/4

In [1]:
# Ask Theano for the GPU backend via THEANO_FLAGS.
# This must run before the first `import theano` for the flags to take effect.
import os
os.environ.update(THEANO_FLAGS="device=cuda,mode=FAST_RUN,floatX=float32")

In [2]:
import theano

ERROR (theano.gpuarray): Could not initialize pygpu, support disabled
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/theano/gpuarray/__init__.py", line 227, in <module>
    use(config.device)
  File "/anaconda3/lib/python3.6/site-packages/theano/gpuarray/__init__.py", line 214, in use
    init_dev(device, preallocate=preallocate)
  File "/anaconda3/lib/python3.6/site-packages/theano/gpuarray/__init__.py", line 99, in init_dev
    **args)
  File "pygpu/gpuarray.pyx", line 658, in pygpu.gpuarray.init
  File "pygpu/gpuarray.pyx", line 587, in pygpu.gpuarray.pygpu_init
pygpu.gpuarray.GpuArrayException: b'Could not load "/Library/Frameworks/CUDA.framework/CUDA": dlopen(/Library/Frameworks/CUDA.framework/CUDA, 5): image not found'


In [40]:
# How theano.scan works
# scan(fn=func, outputs_info=<initial values fed to func>, n_steps=<number of iterations>)

# outputs_info is either a single element, or a list whose length equals the number of outputs of func
# outputs_info plays two roles: 1. it holds the args passed into func on the first iteration;
# 2. it marks which returned values are fed back into func on the subsequent iterations

# if func takes 1 arg but returns 2 values, outputs_info=[pass_in, None] feeds the first output back into func; outputs_info=[None, pass_in] feeds the second output back into func
# outputs holds every output of func for each iteration; note that each output element is grouped into its own array
# e.g. func returns [a,b]; outputs = [[n_steps of a], [n_steps of b]]
a = theano.shared(1)
b = theano.shared(100)
def func(x):
    return [x+1, x+10]
outputs, updates = theano.scan(func, outputs_info=[None, a], n_steps=10)
f = theano.function([], outputs=outputs, updates=updates)
print(f(), a.get_value(), b.get_value())

# scan expects the step function to return one of:
# 1. the outputs, or
# 2. a dict of updates (each key must be a shared var; each value is the expression for its new value), or
# 3. a tuple: (outputs, updates); in this case the outputs must be parenthesized (see func3 below)

# in func1, `a` is passed into func as the initial value, but since func does not return an updates dict, `a` itself is not updated
a = theano.shared(1)
b = theano.shared(100)
def func1(x):
    return x+1
outputs, updates = theano.scan(func1, outputs_info=a, n_steps=10)
f = theano.function([], outputs=outputs, updates=updates)
print(f(), a.get_value(), b.get_value())

# in func2, only the updates dict is returned: b is updated, but outputs is empty
a = theano.shared(1)
b = theano.shared(100)
def func2():
    return {b: b+100}
outputs, updates = theano.scan(func2, outputs_info=None, n_steps=10)
f = theano.function([], outputs=outputs, updates=updates)
print(f(), a.get_value(), b.get_value())

# in func3, both outputs and updates are returned, and b is updated as instructed by the updates dict
a = theano.shared(1)
b = theano.shared(100)
def func3(x):
    return (x+1), {b: b+100} # the first element must be wrapped in parentheses
outputs, updates = theano.scan(func3, outputs_info=a, n_steps=10)
f = theano.function([], outputs=outputs, updates=updates)
print(f(), a.get_value(), b.get_value())

# back to func1: scan returns no update for b, but b is added to updates manually, so b is updated
# also note that when an update is returned by the scan step function, it is performed n_steps times per function call;
# but when an update is added outside the step function, it is performed only once per function call
a = theano.shared(1)
b = theano.shared(100)
outputs, updates = theano.scan(func1, outputs_info=a, n_steps=10)
updates[b] = b+100
f = theano.function([], outputs=outputs, updates=updates)
print(f(), a.get_value(), b.get_value())

[array([ 2, 12, 22, 32, 42, 52, 62, 72, 82, 92]), array([ 11,  21,  31,  41,  51,  61,  71,  81,  91, 101])] 1 100
[ 2  3  4  5  6  7  8  9 10 11] 1 100
[] 1 1100
[ 2  3  4  5  6  7  8  9 10 11] 1 1100
[ 2  3  4  5  6  7  8  9 10 11] 1 200


In [46]:
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

class RBM(object):
    """Restricted Boltzmann Machine (RBM) expressed as a Theano graph.

    The model owns three shared variables: the weight matrix ``W`` of shape
    (n_visible, n_hidden), the hidden bias ``hbias`` (length n_hidden) and the
    visible bias ``vbias`` (length n_visible). ``get_cost_updates`` builds the
    CD-k / PCD-k training updates on top of the sampling helpers below.
    """

    def __init__(
        self,
        input=None,
        n_visible=784,
        n_hidden=500,
        W=None,
        hbias=None,
        vbias=None,
        numpy_rng=None,
        theano_rng=None
    ):
        """Create the shared parameters and the random streams.

        :param input: symbolic input matrix (one example per row); a fresh
            ``T.matrix('input')`` is created when omitted.
        :param n_visible: number of visible units.
        :param n_hidden: number of hidden units.
        :param W: optional shared weight matrix to reuse (e.g. from a DBN layer).
        :param hbias: optional shared hidden-bias vector to reuse.
        :param vbias: optional shared visible-bias vector to reuse.
        :param numpy_rng: ``numpy.random.RandomState`` used for weight
            initialisation and to seed ``theano_rng``; defaults to seed 1234.
        :param theano_rng: Theano ``RandomStreams``; derived from ``numpy_rng``
            when omitted.
        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # Optional arguments are resolved with explicit ``is None`` checks
        # instead of ``x or default`` so that the result never depends on the
        # truth value of a Theano variable.
        if numpy_rng is None:
            numpy_rng = numpy.random.RandomState(1234)
        self.numpy_rng = numpy_rng

        # to generate random numbers in theano, a RandomStreams needs to be seeded from a numpy rng
        # (uses self.numpy_rng so that the default numpy_rng=None case works)
        if theano_rng is None:
            theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30))
        self.theano_rng = theano_rng

        if W is None:
            W = self.initial_W(rng=self.numpy_rng, n_hidden=n_hidden, n_visible=n_visible)
        self.W = W

        # hbias: an array of length n_hidden, for positive phase (forward prop)
        if hbias is None:
            hbias = self.bias_obj(n=n_hidden, name='hbias')
        self.hbias = hbias

        # vbias: an array of length n_visible, for negative phase (backward prop)
        if vbias is None:
            vbias = self.bias_obj(n=n_visible, name='vbias')
        self.vbias = vbias

        # initialize input layer for standalone RBM or layer0 of DBN
        if input is None:
            input = T.matrix('input')
        self.input = input

        # shared variables updated during training
        self.params = [self.W, self.hbias, self.vbias]

    def initial_W(self, rng=None, n_hidden=None, n_visible=None):
        """Return a shared (n_visible, n_hidden) matrix drawn uniformly from
        +/- 4 * sqrt(6 / (n_hidden + n_visible))."""
        bound = 4 * numpy.sqrt(6. / (n_hidden + n_visible))
        W = numpy.asarray(
            rng.uniform(low=-bound, high=bound, size=(n_visible, n_hidden)),
            dtype=theano.config.floatX
        )
        return theano.shared(value=W, name='W', borrow=True)

    def bias_obj(self, n=None, name=None):
        """Return a shared zero vector of length ``n`` named ``name``."""
        return theano.shared(
            value=numpy.zeros(n, dtype=theano.config.floatX),
            name=name,
            borrow=True
        )

    # forward prop/positive phase: sigmoid(input * w + h_bias)
    def propup(self, vis):
        """Return [pre-sigmoid activation, sigmoid activation] of the hidden units given ``vis``."""
        pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias
        return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]

    # backward prop/negative phase: sigmoid(hidden * w.T + v_bias)
    def propdown(self, hid):
        """Return [pre-sigmoid activation, sigmoid activation] of the visible units given ``hid``."""
        pre_sigmoid_activation = T.dot(hid, self.W.T) + self.vbias
        return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]

    # propup followed by a binomial draw
    def sample_h_given_v(self, v0_sample):
        """Return [pre-sigmoid, mean, binary sample] of the hidden layer given a visible sample."""
        pre_sigmoid_h1, h1_mean = self.propup(v0_sample)
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape,
                                             n=1, p=h1_mean,
                                             dtype=theano.config.floatX)
        return [pre_sigmoid_h1, h1_mean, h1_sample]

    # propdown followed by a binomial draw
    def sample_v_given_h(self, h0_sample):
        """Return [pre-sigmoid, mean, binary sample] of the visible layer given a hidden sample."""
        pre_sigmoid_v1, v1_mean = self.propdown(h0_sample)
        v1_sample = self.theano_rng.binomial(size=v1_mean.shape,
                                             n=1, p=v1_mean,
                                             dtype=theano.config.floatX)
        return [pre_sigmoid_v1, v1_mean, v1_sample]

    # gibbs sampling, using h0 (initial hidden layer) to generate h1 (new hidden layer)
    def gibbs_hvh(self, h0_sample):
        """One Gibbs step hidden -> visible -> hidden.

        Returns the three visible-side values followed by the three
        hidden-side values; the last element (h1_sample) is what scan feeds
        back in ``get_cost_updates``.
        """
        pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [pre_sigmoid_v1, v1_mean, v1_sample,
                pre_sigmoid_h1, h1_mean, h1_sample]

    # gibbs sampling, using v0 to generate v1
    def gibbs_vhv(self, v0_sample):
        """One Gibbs step visible -> hidden -> visible.

        Returns the three hidden-side values followed by the three
        visible-side values (v1_sample last).
        """
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample)
        return [pre_sigmoid_h1, h1_mean, h1_sample,
                pre_sigmoid_v1, v1_mean, v1_sample]

    # the energy is defined as: -v_bias*input - h_bias*hidden - hidden * weight * input
    # summing out binary hidden units gives the free energy:
    # -v_bias*input - sum(log(1 + e^(h_bias + weight*input)))
    # http://deeplearning.net/tutorial/rbm.html
    def free_energy(self, v_sample):
        """Return the free energy of each row of ``v_sample`` (one value per example)."""
        wx_b = T.dot(v_sample, self.W) + self.hbias
        vbias_term = T.dot(v_sample, self.vbias)
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
        return -hidden_term - vbias_term

    # persistent=None for Contrastive Divergence (CD) (default): gibbs sampling starts from the hidden sample generated from the input
    # persistent=shared variable for Persistent Contrastive Divergence (PCD): gibbs sampling starts from the state stored in the persistent chain
    def get_cost_updates(self, lr=0.1, persistent=None, k=1):
        """Build the CD-k / PCD-k training step.

        :param lr: learning rate (cast to floatX).
        :param persistent: None for CD, or a shared variable holding the
            hidden state of the persistent chain for PCD.
        :param k: number of Gibbs steps per update.
        :returns: ``(monitoring_cost, updates)`` where ``updates`` contains
            the scan updates, the parameter updates and, for PCD, the update
            of the persistent chain.
        """
        pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input)
        chain_start = ph_sample if persistent is None else persistent
        (
            [
                pre_sigmoid_nvs,
                nv_means,
                nv_samples,
                pre_sigmoid_nhs,
                nh_means,
                nh_samples
            ],
            updates
        ) = theano.scan(
            self.gibbs_hvh,
            # only the last output (the hidden sample) is fed back into gibbs_hvh
            outputs_info=[None, None, None, None, None, chain_start],
            n_steps=k
        )

        chain_end = nv_samples[-1]

        cost = T.mean(self.free_energy(self.input)) - T.mean(self.free_energy(chain_end))
        # We must not compute the gradient through the gibbs sampling
        # gparams holds the gradient of the cost w.r.t. each shared variable in self.params, i.e. W, hbias, vbias
        gparams = T.grad(cost, self.params, consider_constant=[chain_end])

        # these updates are added outside of the scan step function (gibbs_hvh), so the
        # parameters are updated once after every CD-k (not k times per call)
        for gparam, param in zip(gparams, self.params):
            # make sure that the learning rate is of the right dtype
            updates[param] = param - gparam * T.cast(lr, dtype=theano.config.floatX)

        if persistent is not None:
            # store the end of the chain so the next call resumes from it (persistent must be a shared var)
            updates[persistent] = nh_samples[-1]
            # pseudo-likelihood is a better proxy for PCD
            monitoring_cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            monitoring_cost = self.get_reconstruction_cost(pre_sigmoid_nvs[-1])

        return monitoring_cost, updates

    # cost ~= N_bits * log-probability of one bit
    #       = N * log( e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) ), where x_{\i} is x_i with bit_i_idx flipped (1 to 0, or 0 to 1)
    # instead of sampling bit_i_idx randomly, this implementation cycles through the bits, one per call
    # note: the increment of bit_i_idx is added to `updates`, so it advances whenever those updates are applied
    def get_pseudo_likelihood_cost(self, updates):
        """Return the stochastic pseudo-likelihood cost and register the bit-index increment in ``updates``."""
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')

        # binarize the input image by rounding to nearest integer
        xi = T.round(self.input)

        # calculate free energy for the given bit configuration
        fe_xi = self.free_energy(xi)

        # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns to a new theano var
        # this allows xi_flip to follow bit_i_idx when it is updated
        xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])

        # calculate free energy with bit flipped
        fe_xi_flip = self.free_energy(xi_flip)

        # log sigmoid(FE(x_flip) - FE(x)) == log( e^(-FE(x)) / (e^(-FE(x)) + e^(-FE(x_flip))) )
        cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - fe_xi)))

        # increment bit_i_idx modulo the number of visible units as part of updates
        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

        return cost

    def get_reconstruction_cost(self, pre_sigmoid_nv):
        """Return the mean (over rows) of the summed Bernoulli log-likelihood of
        ``self.input`` under sigmoid(``pre_sigmoid_nv``); values are <= 0."""
        cross_entropy = T.mean(
            T.sum(
                self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) +
                (1 - self.input) * T.log(1 - T.nnet.sigmoid(pre_sigmoid_nv)),
                axis=1
            )
        )

        return cross_entropy

In [47]:
# helper method to plot hidden layer

import numpy


def scale_to_unit_interval(ndar, eps=1e-8):
    """Return a copy of ``ndar`` shifted to start at 0 and divided by (max + eps).

    The input array is left untouched; the result keeps the input's dtype
    because both operations are applied in place on the copy.
    """
    scaled = ndar.copy()
    numpy.subtract(scaled, scaled.min(), out=scaled)
    numpy.multiply(scaled, 1.0 / (scaled.max() + eps), out=scaled)
    return scaled


def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
                       scale_rows_to_unit_interval=True,
                       output_pixel_vals=True):
    """
    Transform an array with one flattened image per row, into an array in
    which images are reshaped and laid out like tiles on a floor.

    This function is useful for visualizing datasets whose rows are images,
    and also columns of matrices for transforming those rows
    (such as the first layer of a neural net).

    :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
    be 2-D ndarrays or None;
    :param X: a 2-D array in which every row is a flattened image.

    :type img_shape: tuple; (height, width)
    :param img_shape: the original shape of each image

    :type tile_shape: tuple; (rows, cols)
    :param tile_shape: the number of images to tile (rows, cols)

    :type tile_spacing: tuple; (rows, cols)
    :param tile_spacing: number of blank pixels left between adjacent tiles

    :param output_pixel_vals: if output should be pixel values (i.e. uint8
    values) or floats

    :param scale_rows_to_unit_interval: if the values need to be scaled before
    being plotted to [0,1] or not


    :returns: array suitable for viewing as an image.
    (See:`Image.fromarray`.)
    :rtype: a 2-D array (single channel) or an array of shape
    (height, width, 4) (tuple input). dtype is uint8 when
    ``output_pixel_vals`` is True, otherwise the dtype of X (for a tuple:
    the dtype of its first non-None channel).

    """

    assert len(img_shape) == 2
    assert len(tile_shape) == 2
    assert len(tile_spacing) == 2

    # Each output dimension is n_tiles * (image size + spacing), minus the
    # spacing that would otherwise trail the last tile.
    out_shape = [
        (ishp + tsp) * tshp - tsp
        for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing)
    ]

    if isinstance(X, tuple):
        assert len(X) == 4

        # colors default to 0, alpha defaults to 1 (opaque)
        if output_pixel_vals:
            out_dtype = 'uint8'
            channel_defaults = [0, 0, 0, 255]
        else:
            # X is a tuple here and has no ``dtype`` attribute, so take the
            # dtype from the first channel that was actually provided.
            provided = [channel for channel in X if channel is not None]
            # float64 fallback when every channel is None is a choice made
            # here; the original code had no working behaviour for this case.
            out_dtype = provided[0].dtype if provided else numpy.dtype('float64')
            channel_defaults = [0., 0., 0., 1.]

        # Create an output numpy ndarray to store the image
        out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
                                dtype=out_dtype)

        for i in range(4):
            if X[i] is None:
                # missing channel: fill it with its default value
                out_array[:, :, i] = channel_defaults[i]
            else:
                # recursive call tiles this single channel
                out_array[:, :, i] = tile_raster_images(
                    X[i], img_shape, tile_shape, tile_spacing,
                    scale_rows_to_unit_interval, output_pixel_vals)
        return out_array

    else:
        # if we are dealing with only one channel
        H, W = img_shape
        Hs, Ws = tile_spacing

        # generate a matrix to store the output
        dt = X.dtype
        if output_pixel_vals:
            dt = 'uint8'
        out_array = numpy.zeros(out_shape, dtype=dt)

        # pixel values are scaled to 0..255, floats are left as they are
        c = 255 if output_pixel_vals else 1

        for tile_row in range(tile_shape[0]):
            for tile_col in range(tile_shape[1]):
                img_idx = tile_row * tile_shape[1] + tile_col
                # tiles beyond the last row of X stay blank
                if img_idx < X.shape[0]:
                    this_x = X[img_idx]
                    if scale_rows_to_unit_interval:
                        # scale values of this image to be between 0 and 1
                        this_img = scale_to_unit_interval(
                            this_x.reshape(img_shape))
                    else:
                        this_img = this_x.reshape(img_shape)
                    # add the slice to the corresponding position in the
                    # output array
                    out_array[
                        tile_row * (H + Hs): tile_row * (H + Hs) + H,
                        tile_col * (W + Ws): tile_col * (W + Ws) + W
                    ] = this_img * c
        return out_array

In [48]:
import pickle
import gzip

def load_data(dataset):
    """Load a gzip-compressed pickle holding three data splits.

    :param dataset: path to the ``.pkl.gz`` file.
    :returns: ``(train_set, valid_set, test_set)`` exactly as stored in the
        pickle.
    :raises ValueError: if the pickle does not unpack into exactly three items.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the file even when unpickling fails.
    with gzip.open(dataset, 'rb') as f:
        # encoding='latin1' is passed through to pickle.load unchanged
        train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
    return train_set, valid_set, test_set

# Load the MNIST pickle once and unpack each split into its (inputs, labels) pair.
datasets = load_data('mnist.pkl.gz')
(
    (train_set_x, train_set_y),
    (valid_set_x, valid_set_y),
    (test_set_x, test_set_y),
) = datasets

In [49]:
import os
import timeit
import PIL.Image as Image

# specific to training mnist from a zip file of the data
def test_rbm(
    X=None,
    learning_rate=0.1,
    training_epochs=15,
    batch_size=20,
    output_folder='rbm_plots',
    n_hidden=500
):
    """Train an RBM on ``X`` with persistent CD (k=15) and return it.

    :param X: training matrix, one flattened 28x28 image per row.
    :param learning_rate: learning rate passed to ``get_cost_updates``.
    :param training_epochs: number of passes over the training batches.
    :param batch_size: rows per minibatch (also the number of persistent chains).
    :param output_folder: currently unused -- filter plotting is disabled.
    :param n_hidden: number of hidden units.
    :returns: the trained ``RBM`` instance.
    """
    # symbolic minibatch: rows [index * batch_size, (index + 1) * batch_size) of the training matrix
    index = T.lscalar()
    train = T.matrix('x')
    batch_start = index * batch_size
    batch_stop = (index + 1) * batch_size
    x = train[batch_start:batch_stop]

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    # persistent chain: one hidden sample per row, initialised to zeros
    chain_init = numpy.zeros((batch_size, n_hidden), dtype=theano.config.floatX)
    persistent_chain = theano.shared(chain_init, borrow=True)

    # 28 * 28 visible units: the dimensions of an MNIST image
    rbm = RBM(input=x, n_visible=28 * 28, n_hidden=n_hidden,
              numpy_rng=rng, theano_rng=theano_rng)
    # pass persistent=None here instead to train with plain CD-k
    cost, updates = rbm.get_cost_updates(lr=learning_rate,
                                         persistent=persistent_chain,
                                         k=15)

    # compiled training step: takes a batch index, applies the updates, returns the monitoring cost
    train_rbm = theano.function(inputs=[index], outputs=cost, updates=updates,
                                givens={train: X}, name='train_rbm')

    plotting_time = 0.
    start_time = timeit.default_timer()

    n_train_batches = int(X.shape[0] / batch_size)
    for epoch in range(training_epochs):
        # one pass over the training set, collecting the cost of every batch
        mean_cost = [train_rbm(batch_index)
                     for batch_index in range(n_train_batches)]

        print('Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost))

        # Filter plotting is disabled. When enabled, this is where the weight
        # matrix was tiled with tile_raster_images (28x28 images, 10x10 tiles,
        # 1px spacing) and saved as 'filters_at_epoch_%i.png' inside
        # output_folder. The timing bookkeeping is kept so that plotting time
        # stays excluded from the reported training time.
        plotting_start = timeit.default_timer()
        plotting_time += timeit.default_timer() - plotting_start

    # training time excludes the time spent plotting
    elapsed = timeit.default_timer() - start_time
    pretraining_time = elapsed - plotting_time
    print ('Training took %f minutes' % (pretraining_time / 60.))

    return rbm

In [50]:
# Train a standalone RBM on the MNIST training images with persistent CD-k (PCD):
# test_rbm passes a persistent chain and k=15 to RBM.get_cost_updates.
rbm = test_rbm(train_set_x)

KeyboardInterrupt: 

In [None]:
def sample(rbm, n_chains=20, n_samples=10):
    """Sample from a trained RBM and save the result as 'samples.png'.

    Starts ``n_chains`` Gibbs chains from consecutive test images (read from
    the notebook-level ``test_set_x``), runs them 1000 steps between plots and
    writes one image row per plot, ``n_samples`` rows in total.

    :param rbm: trained RBM providing ``gibbs_vhv``.
    :param n_chains: number of parallel chains (columns of the image).
    :param n_samples: number of plotted rows.
    """
    n_test = test_set_x.shape[0]
    rng = numpy.random.RandomState(123)

    # random window of n_chains consecutive test examples used to start the chains
    test_idx = rng.randint(n_test - n_chains)
    chain_rows = slice(test_idx, test_idx + n_chains)
    persistent_vis_chain = theano.shared(
        numpy.asarray(test_set_x[chain_rows], dtype=theano.config.floatX)
    )
    print(test_set_y[chain_rows])

    # number of Gibbs steps executed per call of the compiled function
    plot_every = 1000
    # gibbs_vhv returns six values; only the last one (the visible sample) is fed back
    scan_outputs, updates = theano.scan(
        rbm.gibbs_vhv,
        outputs_info=[None] * 5 + [persistent_vis_chain],
        n_steps=plot_every
    )
    vis_mfs, vis_samples = scan_outputs[4], scan_outputs[5]

    # each call of sample_fn leaves the chain at its last visible sample,
    # so the next call continues from there
    updates[persistent_vis_chain] = vis_samples[-1]

    sample_fn = theano.function(
        [],
        [vis_mfs[-1], vis_samples[-1]],
        updates=updates,
        name='sample_fn'
    )

    # canvas for the plot; 29 = 28 pixels per image + 1 pixel of separation
    canvas_shape = (29 * n_samples + 1, 29 * n_chains - 1)
    image_data = numpy.zeros(canvas_shape, dtype='uint8')
    for idx in range(n_samples):
        # every call advances the chains by plot_every more Gibbs steps;
        # only the final mean-field activation is plotted
        vis_mf, vis_sample = sample_fn()
        print(' ... plotting sample ', idx) # note: each sample is a whole layer (one image per chain), drawn as one row of tiles
        top = 29 * idx
        image_data[top:top + 28, :] = tile_raster_images(
            X=vis_mf,
            img_shape=(28, 28),
            tile_shape=(1, n_chains),
            tile_spacing=(1, 1)
        )

    Image.fromarray(image_data).save('samples.png')

In [None]:
# Draw samples from the RBM trained above; sample() saves the tiled image to 'samples.png'.
sample(rbm)

In [8]:
class HiddenLayer(object):
    """Fully connected layer: output = activation(input . W + b)."""

    def __init__(self, rng, input, n_in, n_out, W=None, b=None,activation=T.tanh): # with activation=T.nnet.sigmoid this becomes a sigmoid (logistic) layer
        """Create the layer and, unless given, its shared parameters.

        :param rng: numpy RandomState used to draw the initial weights.
        :param input: symbolic input with n_in columns.
        :param n_in: number of input units.
        :param n_out: number of output units.
        :param W: optional shared (n_in, n_out) weight matrix to reuse.
        :param b: optional shared bias vector of length n_out to reuse.
        :param activation: elementwise non-linearity, or None for a linear layer.
        """
        self.input = input

        if W is None:
            # W is drawn uniformly from [-sqrt(6/(n_in+n_out)), sqrt(6/(n_in+n_out))],
            # the range suited to tanh, and stored as floatX so that the code
            # can run on the GPU. The best range depends on the activation:
            # results in [Xavier10] suggest 4 times larger initial weights for
            # sigmoid than for tanh. Other activations reuse the tanh range.
            bound = numpy.sqrt(6. / (n_in + n_out))
            W_values = numpy.asarray(
                rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4
            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b = theano.shared(
                value=numpy.zeros((n_out,), dtype=theano.config.floatX),
                name='b',
                borrow=True
            )

        self.W, self.b = W, b

        lin_output = T.dot(input, self.W) + self.b
        if activation is None:
            self.output = lin_output
        else:
            self.output = activation(lin_output)

        # parameters of the model
        self.params = [self.W, self.b]

In [9]:
class LogisticRegression(object):
    """Multi-class logistic regression (softmax) layer with zero-initialised parameters."""

    def __init__(self, input, n_in, n_out):
        """Build the softmax output on top of ``input``.

        :param input: symbolic input with n_in columns.
        :param n_in: number of input units.
        :param n_out: number of classes.
        """
        floatX = theano.config.floatX
        self.input = input
        self.W = theano.shared(
            value=numpy.zeros((n_in, n_out), dtype=floatX),
            name='W',
            borrow=True
        )
        self.b = theano.shared(
            value=numpy.zeros((n_out,), dtype=floatX),
            name='b',
            borrow=True
        )

        # predict_proba: softmax over the class scores of each row
        class_scores = T.dot(self.input, self.W) + self.b
        self.p_y_given_x = T.nnet.softmax(class_scores)
        # predict: index of the most probable class for each row
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.params = [self.W, self.b]

    # multi-class counterpart of the binary cost -1/m * sum(y*log(p) + (1-y)*log(1-p)):
    # y holds one of n labels per row, and only the log-probability of the correct label is picked out
    def negative_log_likelihood(self, y):
        """Return the mean negative log-probability assigned to the labels ``y``."""
        log_probs = T.log(self.p_y_given_x)
        row_ids = T.arange(y.shape[0])
        return -T.mean(log_probs[row_ids, y])

    # fraction of wrong predictions
    def errors(self, y):
        """Return the mean of the elementwise ``y_pred != y`` indicator."""
        mismatches = T.neq(self.y_pred, y)  # T.neq(a,b) checks a != b
        return T.mean(mismatches)

In [175]:
import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

class DBN:
    """Deep Belief Network: a stack of RBMs that share their weights with a sigmoid MLP.

    Layer i of the MLP (a ``HiddenLayer``) and RBM i use the same ``W`` and
    hidden bias, so pre-training the RBMs initialises the MLP. A
    ``LogisticRegression`` layer on top produces the predictions used for
    fine-tuning.
    """

    def __init__(
        self, 
        numpy_rng=None, 
        theano_rng=None, 
        n_ins=784,
        hidden_layers_sizes=[500, 500],  # each element is one layer with that many nodes (the list is only read, never mutated)
        n_outs=10
    ):
        """Build the symbolic graph of the whole network.

        :param numpy_rng: numpy RandomState for weight initialisation;
            defaults to seed 1234.
        :param theano_rng: Theano RandomStreams shared by all RBMs; derived
            from ``numpy_rng`` when omitted.
        :param n_ins: dimension of the input.
        :param hidden_layers_sizes: sizes of the hidden layers (at least one).
        :param n_outs: number of output classes.
        """
        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = [] # holds the shared/updatable vars
        self.n_layers = len(hidden_layers_sizes)
        assert self.n_layers > 0

        # resolve the default RNGs explicitly so that DBN() works without arguments
        if numpy_rng is None:
            numpy_rng = numpy.random.RandomState(1234)
        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        self.theano_rng = theano_rng

        # symbolic inputs; `index` keeps its name because compiled functions may be called with index=...
        self.index = T.lscalar('index')
        self.batch_size = T.lscalar('batch_size')
        self.x = T.matrix('x')
        self.x_batch = self.x[self.index * self.batch_size : (self.index + 1) * self.batch_size]
        self.y = T.ivector('y') # the labels are presented as 1D vector of [int] labels
        self.y_batch = self.y[self.index * self.batch_size : (self.index + 1) * self.batch_size]
        
        for i in range(self.n_layers):
            output_size = hidden_layers_sizes[i]
            if i == 0:
                # the first layer reads the input batch
                input_size = n_ins
                layer_input = self.x_batch
            else:
                # subsequent layers read the output of the previous layer
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[-1].output # output is the final activation, i.e. sigmoid(X*W+b)
            
            # sigmoid hidden layer of the MLP
            sigmoid_layer = HiddenLayer(
                rng=numpy_rng,
                input=layer_input,
                n_in=input_size,
                n_out=output_size,
                activation=T.nnet.sigmoid
            )
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params) # sigmoid_layer.params is [W,b]; extend adds both items individually, append would add the list as one item
            
            # RBM layer, which pretrains the W and b that will be used by the MLP
            rbm_layer = RBM(
                numpy_rng=numpy_rng,
                # pass the resolved stream so that every RBM shares the same
                # random stream instead of each creating its own
                theano_rng=self.theano_rng,
                input=layer_input,
                n_visible=input_size,
                n_hidden=output_size,
                W=sigmoid_layer.W,
                hbias=sigmoid_layer.b
            )
            self.rbm_layers.append(rbm_layer)
            # note: in this implementation the vbias of each RBM is not a param of the DBN (W and hbias are already included through the sigmoid layer)
            
        # the sigmoid layers neither produce a prediction nor report the error of the model,
        # so a LogisticRegression layer providing both is stacked on top of them;
        # its input is the activation of the last sigmoid layer, its output the actual prediction
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        
        # finetune_cost = negative log-likelihood of the logistic regression layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y_batch)
        # fraction of wrong predictions on the batch
        self.errors = self.logLayer.errors(self.y_batch)
        
    def pretraining_functions(self, X, k, learning_rate, batch_size): 
        """Compile one CD-k training function per RBM layer.

        :param X: training inputs, substituted for ``self.x``.
        :param k: number of Gibbs steps (iterations of gibbs_hvh) per update.
        :param learning_rate: learning rate of the RBM updates.
        :param batch_size: rows per minibatch.
        :returns: list of functions; each takes a batch index and returns the
            monitoring cost of its layer.
        """
        pretrain_fns = []
        for rbm in self.rbm_layers:
            # using CD-k here (persistent=None) for training each RBM
            cost, updates = rbm.get_cost_updates(
                learning_rate,
                persistent=None, 
                k=k
            )

            fn = theano.function(
                inputs=[self.index],
                outputs=cost,
                updates=updates,
                givens={
                    self.batch_size: batch_size,
                    self.x: X
                }
            )
            pretrain_fns.append(fn)

        return pretrain_fns
    
    def build_train_function(
        self, 
        train_x, 
        train_y, 
        batch_size, 
        learning_rate
    ):
        """Compile the supervised fine-tuning step (plain gradient descent).

        :param train_x: training inputs, substituted for ``self.x``.
        :param train_y: int32 training labels, substituted for ``self.y``.
        :param batch_size: rows per minibatch.
        :param learning_rate: learning rate (cast to floatX).
        :returns: function taking a batch index, applying one update and
            returning the fine-tuning cost of that batch.
        """
        # gradients of the MLP cost, computed symbolically by theano
        gparams = T.grad(self.finetune_cost, self.params)

        # A list of (shared variable, new value) pairs keeps the update order
        # deterministic; theano warns when a plain dict with several entries
        # is passed as `updates`.
        lr = T.cast(learning_rate, dtype=theano.config.floatX)
        updates = [
            (param, param - gparam * lr)
            for param, gparam in zip(self.params, gparams)
        ]

        train_fn = theano.function(
            inputs=[self.index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.batch_size: batch_size,
                self.x: train_x,
                self.y: train_y
            }
        )
        return train_fn

    def get_errors(self, X, y, batch_size):  
        """Return a zero-argument function listing the error rate of every full batch.

        :param X: inputs, substituted for ``self.x``.
        :param y: int32 labels, substituted for ``self.y``.
        :param batch_size: rows per minibatch; a trailing partial batch is ignored.
        """
        n_batches = X.shape[0] // batch_size
        get_batch_error = theano.function(
            inputs=[self.index],
            outputs=self.errors, # fraction of wrong predictions
            givens={
                self.batch_size: batch_size,
                self.x: X,
                self.y: y
            }
        )
        
        def score_func():
            return [get_batch_error(i) for i in range(n_batches)]
        
        return score_func

In [174]:
# Pre-training hyper-parameters
batch_size = 10
k = 1                     # Gibbs steps per CD update
pretrain_lr = 0.01
pretraining_epochs = 100

# For a quick smoke test, shrink the run instead, keeping inputs and labels aligned:
# pretraining_epochs = 1
# train_set_x = train_set_x[0:1000, :]
# train_set_y = train_set_y[0:1000]

# DBN.y is an ivector, so the labels must be int32.
# The labels are deliberately NOT sliced here: truncating train_set_y while
# train_set_x keeps its full length leaves later batches without labels during
# fine-tuning, and would also make re-running this cell change the data.
train_set_y = train_set_y.astype('int32')
valid_set_y = valid_set_y.astype('int32')
test_set_y = test_set_y.astype('int32')


numpy_rng = numpy.random.RandomState(123)
n_train_batches = train_set_x.shape[0] // batch_size

print('... building the model')
dbn = DBN(
    numpy_rng=numpy_rng, 
    n_ins=28 * 28,
    hidden_layers_sizes=[1000, 1000, 1000],
    n_outs=10
)
pretraining_fns = dbn.pretraining_functions(
    X=train_set_x,
    k=k,
    learning_rate=pretrain_lr,
    batch_size=batch_size
)

print('... pre-training the model')
start_time = timeit.default_timer()

# Pre-train layer-wise: every layer gets all its epochs before the next layer starts
for i in range(dbn.n_layers):
    for epoch in range(pretraining_epochs):
        costs = []
        for batch_index in range(n_train_batches):
            costs.append(pretraining_fns[i](index=batch_index))
        print(f'Pre-training layer {i}, epoch {epoch}, mean cost: ')
        print(numpy.mean(costs, dtype='float64'))

end_time = timeit.default_timer()

print(f'Training time: {end_time - start_time} s.')

... building the model
... pre-training the model
Pre-training layer 0, epoch 0, mean cost: 
-98.58770587005615
Pre-training layer 0, epoch 1, mean cost: 
-83.83068933868408
Pre-training layer 0, epoch 2, mean cost: 
-80.71768848648071
Pre-training layer 0, epoch 3, mean cost: 
-79.04617987442016
Pre-training layer 0, epoch 4, mean cost: 
-77.91485452041626


KeyboardInterrupt: 

In [None]:
# Fine-tuning hyper-parameters
finetune_lr, training_epochs = 0.1, 1000

print('... getting the finetuning functions')
# supervised training step over the training set
train_fn = dbn.build_train_function(
    train_set_x,
    train_set_y,
    batch_size,
    finetune_lr
)
# per-batch error functions for the validation and test sets
get_validate_errors, get_test_errors = (
    dbn.get_errors(split_x, split_y, batch_size)
    for split_x, split_y in (
        (valid_set_x, valid_set_y),
        (test_set_x, test_set_y),
    )
)

def train_mlp():
    """Fine-tune the model with minibatch updates and patience-based early stopping.

    Reads these notebook-level names: ``n_train_batches``, ``training_epochs``,
    ``train_fn``, ``get_validate_errors`` and ``get_test_errors``.

    After every ``validation_frequency`` minibatches the mean validation error
    is computed. Whenever it is the lowest seen so far, the test error is
    measured as well, and if the improvement is significant the patience
    budget is extended. Training stops once the iteration counter reaches the
    patience budget or all epochs are done.

    Returns:
        tuple: ``(best_validation_loss, best_iter, test_score)``.
        ``best_iter`` is the 0-based iteration index of the best validation
        result; it stays ``0`` (with ``best_validation_loss == inf`` and
        ``test_score == 0.``) if no validation result ever improved on the
        initial value.
    """
    print('... finetuning the model')

    # early-stopping parameters
    patience = 4 * n_train_batches  # look at this many minibatches regardless, i.e. 4 epochs
    patience_increase = 2.  # on a significant new best, allow training up to this multiple of the current iteration
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant

    # go through this many minibatches before checking the network on
    # the validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)  # = n_train_batches

    best_validation_loss = numpy.inf
    # Initialised up front so the return statements never hit an unbound name
    # when no validation result improves (e.g. NaN losses) or there are no batches.
    best_iter = 0
    test_score = 0.

    for epoch in range(training_epochs):
        for minibatch_index in range(n_train_batches):
            train_fn(minibatch_index)
            # global minibatch counter across epochs (0-based)
            iteration = epoch * n_train_batches + minibatch_index

            # for every 'validation_frequency' iterations
            if (iteration + 1) % validation_frequency == 0:
                validation_losses = get_validate_errors()
                curr_mean_validation_loss = numpy.mean(validation_losses, dtype='float64')
                print(f'epoch {epoch}, minibatch {minibatch_index + 1}/{n_train_batches}, validation error {curr_mean_validation_loss * 100.}%')

                # if we got the least validation errors until now
                if curr_mean_validation_loss < best_validation_loss:
                    # extend patience if the improvement is good enough, which
                    # allows training to continue up to double the current iteration count
                    if curr_mean_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = curr_mean_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = get_test_errors()
                    test_score = numpy.mean(test_losses, dtype='float64')
                    print(f'     epoch {epoch}, minibatch {minibatch_index + 1}/{n_train_batches}, test error of best model {test_score * 100.}%')

            # stop early once the patience budget is used up
            if patience <= iteration:
                return best_validation_loss, best_iter, test_score
    return best_validation_loss, best_iter, test_score

# Restart the clock right before fine-tuning. Otherwise `start_time` still
# holds the value taken before pre-training, and the reported duration would
# include pre-training and any idle time between the two cells.
start_time = timeit.default_timer()
best_validation_loss, best_iter, test_score = train_mlp()
end_time = timeit.default_timer()

print(f'training time: {end_time - start_time}s.')
print(f'Optimization complete with best validation score of {best_validation_loss * 100.}%,\n'
    f'obtained at iteration {best_iter + 1},\n'
    f'with test performance {test_score * 100.}%')