In [1]:
"""This tutorial introduces restricted boltzmann machines (RBM) using Theano.

Boltzmann Machines (BMs) are a particular form of energy-based model which
contain hidden variables. Restricted Boltzmann Machines further restrict BMs
to those without visible-visible and hidden-hidden connections.
"""
from __future__ import print_function, division
import os
import sys
import timeit
from six.moves import cPickle as pickle

import numpy as np
import pandas as pd

import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

from lib.mlp import HiddenLayer, LogisticRegression
from lib.rbm import RBM, RSM
from lib.dbn import DBN

# Run from the project root so that relative paths (e.g. 'data/...' and the
# saved model-parameter directory) resolve. The location can be overridden
# with the DL_TOPIC_MODELLING_DIR environment variable; when it is unset the
# original author's directory is used, so existing behaviour is unchanged.
os.chdir(os.environ.get('DL_TOPIC_MODELLING_DIR',
                        '/home/ekhongl/Codes/DL - Topic Modelling'))

Using gpu device 0: Tesla K40c (CNMeM is disabled, cuDNN 5105)


In [3]:
class load_DBN(object):
    """Rebuild a DBN from pickled per-layer RBM parameters and score inputs.

    The network is reconstructed with ``DBN(n_ins=..., hidden_layers_sizes=...)``
    and the state of each RBM layer is then restored from a pickle file found
    in ``model_src``. ``score`` propagates data upward through every RBM layer
    and returns the resulting values of the last layer.
    """

    def __init__(self, architecture = [], opt_epoch = [], model_src = 'dbn_params_2000'):
        """Reconstruct the DBN and load the saved state of every RBM layer.

        Parameters
        ----------
        architecture : list of int
            ``architecture[0]`` is the number of input units; the remaining
            entries are the hidden-layer sizes.
        opt_epoch : list
            One entry per hidden layer; entry ``i`` selects the file
            ``dbn_layer<i>_epoch_<opt_epoch[i]>.pkl`` inside ``model_src``.
        model_src : str
            Directory that holds the pickled layer parameters.

        Raises
        ------
        AssertionError
            If ``model_src`` is not a string, or if ``architecture`` does not
            have exactly one more entry than ``opt_epoch``.
        """
        # The list defaults are only read, never mutated, so sharing them
        # between calls is harmless.

        # ensure model source directory is valid (must be a real string; the
        # previous `or` check accepted any non-None value)
        assert model_src is not None and isinstance(model_src, str), "dir to load model parameters not indicated"
        assert len(architecture) == (len(opt_epoch)+1) , "len of network inputs must be 1 more than len of hidden layers"

        # reconstruct the DBN class
        self.n_layers = len(architecture[1:])
        self.dbn = DBN( n_ins=architecture[0],
                        hidden_layers_sizes = architecture[1:] )

        # load saved model, one pickle per RBM layer
        for i, epoch in enumerate(opt_epoch):
            model_pkl = os.path.join(model_src,
                        'dbn_layer' + str(i) + '_epoch_' + str(epoch) + '.pkl')
            # SECURITY: unpickling can execute arbitrary code; only load
            # parameter files that come from a trusted source.
            with open(model_pkl, 'rb') as pkl_file:
                layer_state = pickle.load(pkl_file)
            self.dbn.rbm_layers[i].__setstate__(layer_state)

    @staticmethod
    def _n_minibatches(n_rows, batch_size):
        """Return the number of minibatches of ``batch_size`` covering ``n_rows``.

        This is a ceiling division: a final, smaller batch is counted when
        ``n_rows`` is not a multiple of ``batch_size``.

        Raises
        ------
        ValueError
            If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        return int((n_rows + batch_size - 1) // batch_size)

    def score(self, input, batch_size = 2000):
        """Propagate ``input`` up through all RBM layers, minibatch by minibatch.

        Parameters
        ----------
        input : theano shared variable
            Data to score; it must support ``get_value(borrow=True)`` and
            slicing along its first axis (one row per example).
        batch_size : int
            Number of rows evaluated per call of the compiled function.

        Returns
        -------
        numpy.ndarray
            The per-batch results concatenated along axis 0.

        Raises
        ------
        ValueError
            If ``batch_size`` is not positive, or if ``input`` has no rows
            (``np.concatenate`` cannot join an empty list).
        """
        train_set_x = input

        # compute number of minibatches for scoring
        n_rows = train_set_x.get_value(borrow=True).shape[0]
        N_splits = self._n_minibatches(n_rows, batch_size)

        # allocate symbolic variables for the data
        index = T.lscalar()    # index to a [mini]batch
        x = T.matrix('x')      # one example per row

        # input_rSum must be specified for the RSM layer
        self.dbn.rbm_layers[0].input_rSum = x.sum(axis=1)

        # chain propup through every RBM layer; only the second value returned
        # by propup is fed to the next layer
        # (presumably the post-sigmoid activation -- confirm in lib.rbm)
        activation = x
        for i in range(self.n_layers):
            _, activation = self.dbn.rbm_layers[i].propup(activation)

        # compile a function mapping a minibatch index to the last layer's
        # values for that slice of the input
        score = theano.function(
            inputs = [index],
            outputs = activation,
            givens={
                x: train_set_x[index * batch_size: (index + 1) * batch_size]
            }
        )

        return np.concatenate( [score(ii) for ii in range(N_splits)], axis=0 )

In [2]:
# Read the document-term matrix CSV: the first column is kept as the label
# and the remaining columns as the features fed to the network.
dtm_path = 'data/dtm_20news.csv'
dtm_raw = np.genfromtxt(dtm_path, dtype='float32', delimiter=',', skip_header=1)
dat_y, dat_x = dtm_raw[:, 0], dtm_raw[:, 1:]

# The header row holds the column names; drop the first (label) column name.
vocab = np.genfromtxt(dtm_path, dtype=str, delimiter=',', max_rows=1)[1:]

# Wrap the features in a theano shared variable so they can be sliced per batch.
test_input = theano.shared(dat_x)

In [4]:
# Layer sizes: number of input units followed by the three hidden-layer sizes.
layer_sizes = [2756, 500, 500, 128]
# Saved epoch to restore for each hidden layer (selects the pickle file name).
best_epochs = [900, 5, 10]
model = load_DBN(architecture=layer_sizes, opt_epoch=best_epochs)

Building layer: 0
   Input units: 2756
  Output units: 500
Building layer: 1
   Input units: 500
  Output units: 500
Building layer: 2
   Input units: 500
  Output units: 128


In [5]:
# Extract the top-layer features for every row, using the default batch size.
output = model.score(test_input)

In [18]:
# Column names for the exported table: the label first, then bit0 .. bit127.
colnames = ['_label_'] + ['bit' + str(idx) for idx in range(128)]

In [19]:
# Put the labels in front of the extracted features and write them to CSV
# without the pandas row index.
features_table = pd.DataFrame(np.column_stack([dat_y, output]), columns=colnames)
features_table.to_csv('data/dbn_features.csv', index=False)