
Initial mcRBM tutorial

WARNING: far from finished and math has not been proof-read yet ! :)
1 parent 6481f4a commit 50d404d5c0522840db70c54fd055621b4dca97ac — @gdesjardins committed Nov 1, 2010
Showing with 1,069 additions and 0 deletions.
  1. +781 −0 code/mcrbm/mcrbm.py
  2. +133 −0 code/mcrbm/test_mcrbm.py
  3. BIN data/training_colorpatches_16x16_demo.mat
  4. +155 −0 doc/mcrbm.txt
code/mcrbm/test_mcrbm.py
@@ -0,0 +1,133 @@
+import cPickle
+import sys
+
+import numpy
+import numpy as np     # the script refers to numpy under both names
+
+from theano import function, tensor
+import theano.tensor as TT
+
+from pylearn.algorithms.mcRBM import *
+import pylearn.datasets.cifar10
+import pylearn.dataset_ops.cifar10
+import pylearn.dataset_ops.image_patches
+import pylearn.dataset_ops.tinyimages
+from pylearn.shared.layers.logreg import LogisticRegression
+
+def l2(X):
+    return numpy.sqrt((X**2).sum())
+
+def _default_rbm_alloc(n_I, n_K=256, n_J=100):
+    return mcRBM.alloc(n_I, n_K, n_J)
+
+def _default_trainer_alloc(rbm, train_batch, batchsize, l1_penalty, l1_penalty_start,
+        persistent_chains, **kwargs):
+    # any extra keyword arguments (e.g. initial_lr_per_example) are forwarded to the trainer
+    return mcRBMTrainer.alloc(rbm, train_batch, batchsize, l1_penalty=l1_penalty,
+            l1_penalty_start=l1_penalty_start, persistent_chains=persistent_chains,
+            **kwargs)
+
+def test_reproduce_ranzato_hinton_2010(dataset='MAR',
+        n_train_iters=5000,
+        rbm_alloc=_default_rbm_alloc,
+        trainer_alloc=_default_trainer_alloc,
+        lr_per_example=.075,
+        l1_penalty=1e-3,
+        l1_penalty_start=1000,
+        persistent_chains=True,
+        ):
+
+    batchsize = 128
+    ## specific to MAR dataset ##
+    n_vis = 105
+    n_patches = 10240
+    epoch_size = n_patches
+
+    tile = pylearn.dataset_ops.image_patches.save_filters_of_ranzato_hinton_2010
+
+    batch_idx = TT.iscalar()
+    batch_range = batch_idx * batchsize + np.arange(batchsize)
+
+    train_batch = pylearn.dataset_ops.image_patches.ranzato_hinton_2010_op(batch_range)
+
+    imgs_fn = function([batch_idx], outputs=train_batch)
+
+    # allocate the model and its trainer
+    trainer = trainer_alloc(
+            rbm_alloc(n_I=n_vis),
+            train_batch,
+            batchsize,
+            initial_lr_per_example=lr_per_example,
+            l1_penalty=l1_penalty,
+            l1_penalty_start=l1_penalty_start,
+            persistent_chains=persistent_chains)
+    rbm = trainer.rbm
+
+    # compile the update function; with persistent chains we also monitor gradient norms
+    if persistent_chains:
+        grads = trainer.contrastive_grads()
+        learn_fn = function([batch_idx],
+                outputs=[grads[0].norm(2), grads[0].norm(2), grads[1].norm(2)],
+                updates=trainer.cd_updates())
+    else:
+        learn_fn = function([batch_idx], outputs=[], updates=trainer.cd_updates())
+
+    # the sampler holds the state of the negative-phase (persistent or CD-1) chains
+    if persistent_chains:
+        smplr = trainer.sampler
+    else:
+        smplr = trainer._last_cd1_sampler
+
+    if dataset == 'cifar10patches8x8':
+        cPickle.dump(
+                pylearn.dataset_ops.cifar10.random_cifar_patches_pca(
+                    n_vis, None, 'float32', n_patches, R, C,),
+                open('test_mcRBM.pca.pkl', 'w'))
+
+ print "Learning..."
+ last_epoch = -1
+ for jj in xrange(n_train_iters):
+ epoch = jj*batchsize / epoch_size
+
+ print_jj = epoch != last_epoch
+ last_epoch = epoch
+
+ if print_jj:
+ tile(imgs_fn(jj), "imgs_%06i.png"%jj)
+ if persistent_chains:
+ tile(smplr.positions.value, "sample_%06i.png"%jj)
+ tile(rbm.U.value.T, "U_%06i.png"%jj)
+ tile(rbm.W.value.T, "W_%06i.png"%jj)
+
+ print 'saving samples', jj, 'epoch', jj/(epoch_size/batchsize)
+
+ print 'l2(U)', l2(rbm.U.value),
+ print 'l2(W)', l2(rbm.W.value),
+ print 'l1_penalty',
+ try:
+ print trainer.effective_l1_penalty.value
+ except:
+ print trainer.effective_l1_penalty
+
+ print 'U min max', rbm.U.value.min(), rbm.U.value.max(),
+ print 'W min max', rbm.W.value.min(), rbm.W.value.max(),
+ print 'a min max', rbm.a.value.min(), rbm.a.value.max(),
+ print 'b min max', rbm.b.value.min(), rbm.b.value.max(),
+ print 'c min max', rbm.c.value.min(), rbm.c.value.max()
+
+ if persistent_chains:
+ print 'parts min', smplr.positions.value.min(),
+ print 'max',smplr.positions.value.max(),
+ print 'HMC step', smplr.stepsize.value,
+ print 'arate', smplr.avg_acceptance_rate.value
+
+
+ l2_of_Ugrad = learn_fn(jj)
+
+ if persistent_chains and print_jj:
+ print 'l2(U_grad)', float(l2_of_Ugrad[0]),
+ print 'l2(U_inc)', float(l2_of_Ugrad[1]),
+ print 'l2(W_inc)', float(l2_of_Ugrad[2]),
+ #print 'FE+', float(l2_of_Ugrad[2]),
+ #print 'FE+[0]', float(l2_of_Ugrad[3]),
+ #print 'FE+[1]', float(l2_of_Ugrad[4]),
+ #print 'FE+[2]', float(l2_of_Ugrad[5]),
+ #print 'FE+[3]', float(l2_of_Ugrad[6])
+
+ if jj % 2000 == 0:
+ print ''
+ print 'Saving rbm...'
+ cPickle.dump(rbm, open('mcRBM.rbm.%06i.pkl'%jj, 'w'), -1)
+ if persistent_chains:
+ print 'Saving sampler...'
+ cPickle.dump(smplr, open('mcRBM.smplr.%06i.pkl'%jj, 'w'), -1)
+
+
+ return rbm, smplr
data/training_colorpatches_16x16_demo.mat: Binary file not shown.
doc/mcrbm.txt
@@ -0,0 +1,155 @@
+.. _mcRBM:
+
+Mean-Covariance Restricted Boltzmann Machines (mcRBM)
+======================================================
+
+.. raw:: latex
+
+ \bigskip
+
+Notation
+++++++++
+
+* :math:`\v \in \mathbb{R}^{D\times 1}`: D Gaussian visible units
+* :math:`\h^c \in \{0,1\}^{N\times 1}`: N covariance-hidden units
+* :math:`\P \in \mathbb{R}^{F\times N}`: weight matrix connecting N covariance-hidden units to F factors
+* :math:`\C \in \mathbb{R}^{D\times F}`: weight matrix connecting F factors to D visible units
+* :math:`\b^c \in \mathbb{R}^{N\times 1}`: biases of N covariance-hidden units
+* :math:`\h^m \in \{0,1\}^{M\times 1}`: M mean-hidden units
+* :math:`\W \in \mathbb{R}^{D\times M}`: weight matrix connecting M mean-hidden units to D visible units
+* :math:`\b^m \in \mathbb{R}^{M\times 1}`: biases of M mean-hidden units
+
+Energy Functions
+++++++++++++++++
+
+Covariance Energy
+-----------------
+
+.. math::
+ :label: cov_energy
+
+ \E^c(\v,\h^c) = -\frac{1}{2}
+ \sum_{f=1}^F \sum_{k=1}^N P_{fk} h_k^c (\sum_{i=1}^D C_{if} v_i)^2 -
+ \sum_{k=1}^N b_k^c h_k^c
+
+Mean Energy
+-----------
+
+.. math::
+ :label: mean_energy
+
+ \E^m(\v,\h^m) = - \sum_{j=1}^M \sum_{i=1}^D W_{ij} h_j^m v_i
+ - \sum_{j=1}^M b_j^m h_j^m
+
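+As a concrete illustration, the following NumPy sketch evaluates both energies for
+randomly drawn parameters and unit values, using the shapes given in the notation
+above. All names, sizes and values are purely illustrative, and the snippet is
+independent of the accompanying ``mcrbm.py`` implementation.
+
+.. code-block:: python
+
+    import numpy as np
+
+    rng = np.random.RandomState(0)
+    D, F, N, M = 16, 32, 8, 8            # illustrative sizes only
+
+    C = rng.randn(D, F) * 0.1            # visible-to-factor weights
+    P = rng.randn(F, N) * 0.1            # factor to covariance-hidden weights
+    b_c = np.zeros(N)                    # covariance-hidden biases
+    W = rng.randn(D, M) * 0.1            # visible to mean-hidden weights
+    b_m = np.zeros(M)                    # mean-hidden biases
+
+    v = rng.randn(D)                     # real-valued visible units
+    h_c = rng.randint(2, size=N)         # binary covariance-hidden units
+    h_m = rng.randint(2, size=M)         # binary mean-hidden units
+
+    # E^c(v,h^c) = -1/2 sum_{f,k} P_fk h^c_k (sum_i C_if v_i)^2 - sum_k b^c_k h^c_k
+    factor_resp = np.dot(C.T, v)         # (C^T v)_f, one response per factor
+    E_c = -0.5 * np.dot(h_c, np.dot(P.T, factor_resp ** 2)) - np.dot(b_c, h_c)
+
+    # E^m(v,h^m) = -sum_{i,j} W_ij h^m_j v_i - sum_j b^m_j h^m_j
+    E_m = -np.dot(h_m, np.dot(W.T, v)) - np.dot(b_m, h_m)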
+
+Conditionals: :math:`p(\h^m | \v)`
+----------------------------------
+
+This is the same derivation as with standard RBMs. We start with the observation
+that the mean-hidden units are conditionally-independent given the visible
+units, hence :math:`p(\h^m | \v) = \prod_{j=1}^M p(\h_j^m | \v)`.
+
+We can then derive :math:`p(\h_j^m | \v)` as follows:
+
+.. math::
+
+ p(\h_j^m | \v)
+ &= \frac {p(\h_j^m, \v)} {p(\v)} \nonumber \\
+ &= \frac {p(\h_j^m, \v)} {\sum_{h_j^m} p(\h_j^m, \v)} \nonumber \\
+ &= \frac
+ {\exp(\sum_{i=1}^D W_{ij} h_j^m v_i + b_j^m h_j^m)}
+ {\sum_{h_j^m} \exp(\sum_{i=1}^D W_{ij} h_j^m v_i + b_j^m h_j^m)} \nonumber \\
+ &= \frac
+ {\exp(\sum_{i=1}^D W_{ij} h_j^m v_i + b_j^m h_j^m)}
+ {1 + \exp(\sum_{i=1}^D W_{ij} v_i + b_j^m)} \nonumber
+
+The activation probability of a mean-hidden unit, :math:`p(\h_j^m=1 |\v)`, is thus:
+
+.. math::
+
+ p(\h_j^m=1 |\v) &= \sigma(\sum_{i=1}^D W_{ij} v_i + b_j^m) \nonumber
+
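+In NumPy, this is a single matrix-vector product followed by an element-wise
+sigmoid. The sketch below (names, sizes and values are again purely illustrative)
+computes :math:`p(\h_j^m=1|\v)` for all mean-hidden units at once and draws one
+Bernoulli sample per unit:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def sigmoid(x):
+        return 1.0 / (1.0 + np.exp(-x))
+
+    rng = np.random.RandomState(1)
+    D, M = 16, 8                         # illustrative sizes only
+    W = rng.randn(D, M) * 0.1            # visible to mean-hidden weights
+    b_m = np.zeros(M)                    # mean-hidden biases
+    v = rng.randn(D)
+
+    # p(h^m_j = 1 | v) = sigmoid(sum_i W_ij v_i + b^m_j), for every j at once
+    p_hm = sigmoid(np.dot(W.T, v) + b_m)
+    h_m = (rng.uniform(size=M) < p_hm).astype('int8')   # one Bernoulli draw per unit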
+
+Conditionals: :math:`p(\h^c | \v)`
+----------------------------------
+
+It is straightforward to show that the covariance-hidden units are also
+conditionally independent. This is due to the fact that :math:`\E^c(\v,\h^c)` is
+linear in :math:`\h^c`, thus:
+
+.. math::
+
+ p(\h^c, \v)
+ &= \frac{1}{Z} \exp(-\E^c(\v,\h^c)) \nonumber \\
+ &= \frac{1}{Z} \exp(\frac{1}{2} \sum_{f=1}^F \sum_{k=1}^N P_{fk} h_k^c (\sum_{i=1}^D C_{if} v_i)^2 +
+ \sum_{k=1}^N b_k^c h_k^c) \nonumber \\
+ &= \frac{1}{Z} \prod_{k=1}^N \exp(\frac{1}{2} \sum_{f=1}^F P_{fk} h_k^c (\sum_{i=1}^D C_{if} v_i)^2 + b_k^c h_k^c) \nonumber \\
+ &= \frac{1}{Z} \prod_{k=1}^N \exp(-\E^c(\v,\h_k^c)) \nonumber
+
+Since the joint distribution factorizes into one term per covariance-hidden unit,
+the rest of the derivation proceeds exactly as for :math:`p(\h^m | \v)` above,
+substituting :math:`\E^c(\v,\h^c)` for :math:`\E^m(\v,\h^m)`. This yields:
+
+.. math::
+
+ p(\h_k^c=1 |\v) &= \sigma(\frac{1}{2} \sum_{f=1}^F P_{fk} (\sum_{i=1}^D
+ C_{if} v_i)^2 + b_k^c) \nonumber
+
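+The covariance-hidden activations can be computed in the same vectorized way;
+the only difference is that the visible units are first projected onto the
+factors through :math:`\C` and squared, before being pooled by :math:`\P`.
+The sketch below uses illustrative names, sizes and values:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def sigmoid(x):
+        return 1.0 / (1.0 + np.exp(-x))
+
+    rng = np.random.RandomState(2)
+    D, F, N = 16, 32, 8                  # illustrative sizes only
+    C = rng.randn(D, F) * 0.1            # visible-to-factor weights
+    P = rng.randn(F, N) * 0.1            # factor to covariance-hidden weights
+    b_c = np.zeros(N)                    # covariance-hidden biases
+    v = rng.randn(D)
+
+    # p(h^c_k = 1 | v) = sigmoid( 1/2 sum_f P_fk (sum_i C_if v_i)^2 + b^c_k )
+    squared_factor_resp = np.dot(C.T, v) ** 2
+    p_hc = sigmoid(0.5 * np.dot(P.T, squared_factor_resp) + b_c)
+    h_c = (rng.uniform(size=N) < p_hc).astype('int8')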
+
+Conditionals: :math:`p(\v | \h^c,\h^m)`
+---------------------------------------
+
+From basic probability, we can write:
+
+.. math::
+
+ p(\v | \h^c, \h^m) &= \frac {p(\v,\h)} {p(\h)} \nonumber \\
+ &= \frac{1}{p(\h)} \frac{1}{Z} \exp(
+ \frac{1}{2} \sum_{f=1}^F \sum_{k=1}^N P_{fk} h_k^c (\sum_{i=1}^D C_{if} v_i)^2 + \sum_{k=1}^N b_k^c h_k^c +
+ \sum_{j=1}^M \sum_{i=1}^D W_{ij} h_j^m v_i + \sum_{j=1}^M b_j^m h_j^m) \nonumber \\
+ &= \frac{1}{Z_2} \exp( \frac{1}{2} \v^T(\C \text{diag}(\P\h^c) \C^T)\v + {\b^c}^T\h^c + \v^T\W\h^m + {\b^m}^T\h^m) \nonumber
+
+
+
+
+Setting :math:`\Sigma^{-1} = - \C\text{diag}(\P\h^c)\C^T`, we can now write:
+
+.. math::
+
+ p(\v | \h^c, \h^m) &=
+ \frac{1}{Z_2} \exp(-\frac{1}{2} \v^T\Sigma^{-1}\v + {\b^c}^T\h^c + \v^T\W\h^m + {\b^m}^T\h^m) \nonumber \\
+ &= \frac{1}{Z_2} \exp(-\frac{1}{2} \v^T\Sigma^{-1}\v + \v\W\h^m) \exp({\b^c}^T\h^c + {\b^m}^T\h^m) \nonumber \\
+ &= \frac{1}{Z_3} \exp(-\frac{1}{2} \v^T\Sigma^{-1}\v + \v\W\h^m) \nonumber
+
+Since :math:`\v` is a vector of Gaussian random variables, we need to bring
+the above into the form :math:`\frac{1}{Z} \exp(-\frac{1}{2} (\v-\mu)^T
+\Sigma^{-1} (\v-\mu))`. We can do this by completing the square and
+matching the cross-term, which gives
+:math:`\v^T \Sigma^{-1} \mu = \v^T \W \h^m`, and hence :math:`\mu = \Sigma \W \h^m`.
+
+Our conditional distribution can thus be written as:
+
+.. math::
+
+ p(\v | \h^c, \h^m) &= \mathcal{N}(\Sigma \W \h^m, \Sigma) \nonumber \\
+ \text{ with } \Sigma^{-1} &= - \C\text{diag}(\P\h^c)\C^T \nonumber
+
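+The sketch below samples from this conditional Gaussian with NumPy. Two things
+in it are assumptions made only for the sake of the example: the entries of
+:math:`\P` are taken to be non-positive so that :math:`\Sigma^{-1}` is positive
+semi-definite, and a small ridge is added so that the toy precision matrix is
+safely invertible; the implementation in ``mcrbm.py`` may handle this
+differently.
+
+.. code-block:: python
+
+    import numpy as np
+
+    rng = np.random.RandomState(3)
+    D, F, N, M = 16, 32, 8, 8            # illustrative sizes only
+    C = rng.randn(D, F) * 0.1
+    P = -np.abs(rng.randn(F, N)) * 0.1   # non-positive, so -C diag(P h^c) C^T is PSD
+    W = rng.randn(D, M) * 0.1
+    h_c = rng.randint(2, size=N)         # hidden configuration to condition on
+    h_m = rng.randint(2, size=M)
+
+    # Sigma^{-1} = -C diag(P h^c) C^T   (plus a small ridge, for this toy example only)
+    Sigma_inv = -np.dot(C * np.dot(P, h_c), C.T) + 1e-3 * np.eye(D)
+    Sigma = np.linalg.inv(Sigma_inv)
+    mu = np.dot(Sigma, np.dot(W, h_m))   # mu = Sigma W h^m
+
+    v_sample = rng.multivariate_normal(mu, Sigma)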
+
+Free-Energy
+-----------
+
+By definition, the free-energy :math:`\F(\v)` of a given visible configuration :math:`\v`
+is :math:`\F(\v) = -\log \sum_{\h} e^{-\E(\v,\h)}`. We can derive the free-energy of
+the mcRBM as follows:
+
+.. math::
+
+ \F(\v) = &-\log \sum_{h^c} \sum_{h^m} \exp(-\E^c(\v,\h^c) - \E^m(\v,\h^m)) \nonumber \\
+ = &-\log \sum_{h^c} \sum_{h^m} \left[ \prod_{k=1}^N \exp(-\E^c(\v,\h_k^c))
+ \prod_{j=1}^M \exp(-\E^m(\v,\h_j^m)) \right] \nonumber \\
+ = &-\log \left[ \prod_{k=1}^N (1 + \exp(-\E^c(\v,\h_k^c=1)))
+ \prod_{j=1}^M (1 + \exp(-\E^m(\v,\h_j^m=1))) \right]\nonumber \\
+ = &-\sum_{k=1}^N \log(1 + \exp(-\E^c(\v,\h_k^c=1)))
+ -\sum_{j=1}^M \log(1 + \exp(-\E^m(\v,\h_j^m=1))) \nonumber \\
+ = &-\sum_{k=1}^N \log(1 + \exp(\frac{1}{2} \sum_{f=1}^F P_{fk} (\sum_{i=1}^D C_{if} v_i)^2 + b_k^c)) \nonumber \\
+ &-\sum_{j=1}^M \log(1 + \exp(\sum_{i=1}^D W_{ij} v_i + b_j^m)) \nonumber
+
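+Each term in this last expression is a softplus, :math:`\log(1+e^x)`, applied to
+the total input of one hidden unit, so the free-energy is cheap to evaluate. The
+sketch below computes :math:`\F(\v)` with NumPy for randomly drawn, purely
+illustrative parameters (``np.logaddexp(0, x)`` is a numerically stable
+:math:`\log(1+e^x)`):
+
+.. code-block:: python
+
+    import numpy as np
+
+    rng = np.random.RandomState(4)
+    D, F, N, M = 16, 32, 8, 8            # illustrative sizes only
+    C = rng.randn(D, F) * 0.1
+    P = rng.randn(F, N) * 0.1
+    b_c = np.zeros(N)
+    W = rng.randn(D, M) * 0.1
+    b_m = np.zeros(M)
+    v = rng.randn(D)
+
+    # one softplus argument per covariance-hidden and per mean-hidden unit
+    cov_terms = 0.5 * np.dot(P.T, np.dot(C.T, v) ** 2) + b_c
+    mean_terms = np.dot(W.T, v) + b_m
+
+    # F(v) = -sum_k log(1 + exp(cov_terms[k])) - sum_j log(1 + exp(mean_terms[j]))
+    F_v = -np.logaddexp(0, cov_terms).sum() - np.logaddexp(0, mean_terms).sum()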
