### Restricted Boltzmann Machine - Topic Modelling

In [1]:
import numpy as np
from collections import Counter, defaultdict
import pandas as pd
from scipy.sparse import coo_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
np_rng = np.random.RandomState(1234) # seeded RandomState shared by the sampling functions below, so runs are reproducible

In [2]:
# Load the text dataset; the relative path means "amazon.xlsx" must be in the working directory.

df = pd.read_excel("amazon.xlsx")

In [3]:
# Preview the first rows to confirm the file loaded as expected
df.head()

Unnamed: 0,Text
0,So there is no way for me to plug it in here i...
1,"Good case, Excellent value."
2,Great for the jawbone.
3,Tied to charger for conversations lasting more...
4,The mic is great.


In [4]:
# Build a bag-of-words (term count) model over the documents in df.Text.
# Only one dataset is used here: the vectoriser is fitted on and applied to the same texts.

tf = CountVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='strict',
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    stop_words=None,
    token_pattern=r"(?u)\b\w\w+\b",  # tokens of two or more word characters
    ngram_range=(1, 1),  # unigrams only
    analyzer='word',
    max_df=50,  # integer => absolute count: per scikit-learn docs, drops terms found in more than 50 documents
    min_df=1,
    max_features=None,
    vocabulary=None,
    binary=False,  # keep raw counts rather than 0/1 presence flags
    dtype=np.int64
)

# Learn the vocabulary from the Text column
tf.fit(df.Text)

# Encode every document as a sparse row of term counts (documents x vocabulary terms)
trainX = tf.transform(df.Text)

In [5]:
# Sanity check: total token count of document 1, then the matrix shape
# (documents x vocabulary terms).
# Summing the sparse row directly avoids densifying the whole matrix just to inspect one row.
print(trainX[1].sum())
trainX.shape

3


(1000, 1825)

Now that you have the bag of words model, let's define the number of visible and hidden units.

In [6]:
# One visible unit per vocabulary term (number of columns of the bag-of-words matrix)
visibleUnits = trainX.shape[1] # vocabulary size

# Number of hidden units, i.e. the number of topics to learn
hiddenUnits = 5 # hyperparameter

In [7]:
# Display the number of visible units (the vocabulary size)
visibleUnits

1825

In [8]:
# utility Functions

# define the sigmoid function
def sigmoid(X):
    """Element-wise logistic function 1 / (1 + exp(-X)), computed without overflow.

    Parameters
    ----------
    X : scalar or array-like
        Input value(s).

    Returns
    -------
    Scalar (for scalar input) or ndarray with the same shape as ``X``, with
    values in [0, 1].
    """
    X = np.asarray(X)
    # exp(-|X|) lies in (0, 1], so it can never overflow; the naive form
    # exp(-X) overflows (RuntimeWarning / inf) for large negative X.
    decay = np.exp(-np.abs(X))
    # X >= 0: 1 / (1 + e^-X);  X < 0: e^X / (1 + e^X) -- algebraically identical.
    result = np.where(X >= 0, 1. / (1. + decay), decay / (1. + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves n-d arrays untouched.
    return result[()]

In [9]:
def sampleHiddenLayer(v_sample):
    """Compute hidden-unit activation probabilities and draw a binary hidden sample.

    Reads the module-level ``W``, ``hiddenBias`` and ``np_rng``.

    Parameters
    ----------
    v_sample : array
        Visible-layer values; multiplied with ``W`` via ``np.dot``.

    Returns
    -------
    list
        ``[hPdf, h_sample]``: the activation probabilities and a 0/1 sample
        of the same shape drawn from them.
    """
    # Activation probabilities (what the notebook refers to as Eq 4).
    pre_activation = np.dot(v_sample, W) + hiddenBias
    activation_prob = sigmoid(pre_activation)

    # One Bernoulli draw per entry, expressed as a binomial with a single trial.
    hidden_state = np_rng.binomial(1, activation_prob, activation_prob.shape)
    return [activation_prob, hidden_state]

In [10]:
def sampleVisibleLayer(h_sample, D):
    """Compute the per-document word distribution and sample word counts from it.

    Reads the module-level ``W``, ``visibleBias`` and ``np_rng``.

    Parameters
    ----------
    h_sample : 2-D array, one row per document
        Hidden-layer states for the batch.
    D : sequence of numbers, one per document
        Total word count of each document; row ``i`` of the sample is a
        multinomial draw of ``D[i]`` words.

    Returns
    -------
    list
        ``[vPdf, v_sample]``: ``vPdf`` holds a softmax distribution over the
        vocabulary in each row (rows sum to 1); ``v_sample`` holds the sampled
        word counts, so ``v_sample[i].sum() == D[i]``.
    """
    logits = np.dot(h_sample, W.T) + visibleBias
    # Subtracting the row maximum leaves the softmax unchanged but keeps exp()
    # from overflowing to inf (which would turn the whole row into NaN).
    logits = logits - logits.max(axis=1, keepdims=True)
    numerator = np.exp(logits)
    vPdf = numerator / numerator.sum(axis=1, keepdims=True)

    # Size everything from the batch actually passed in, not from the global
    # batchSize, so a batch of any size (e.g. a shorter final one) works.
    n_docs = vPdf.shape[0]

    # Multinomial sampling is done row by row because each document has its
    # own number of words D[i].
    v_sample = np.zeros(vPdf.shape)
    for i in range(n_docs):
        # int(): multinomial needs an integer trial count even if D is float-typed.
        v_sample[i] = np_rng.multinomial(int(D[i]), vPdf[i])
    return [vPdf, v_sample]

### Sampling

In [11]:
def gibbs(h_sample, D):
    """Run one Gibbs step: hidden -> visible -> hidden.

    Parameters
    ----------
    h_sample : hidden-layer sample the step starts from.
    D : per-document word counts, forwarded to ``sampleVisibleLayer``.

    Returns
    -------
    list
        ``[vPdf, v_sample, hPdf, h_sample]`` where the last two entries are
        the hidden probabilities and sample re-drawn from the new visible sample.
    """
    # Reconstruct the visible layer from the hidden state ...
    visible_probs, visible_draw = sampleVisibleLayer(h_sample, D)
    # ... then resample the hidden layer from that reconstruction.
    hidden_probs, hidden_draw = sampleHiddenLayer(visible_draw)
    return [visible_probs, visible_draw, hidden_probs, hidden_draw]

In [12]:
def cd_k(data,k):
    """Run k steps of contrastive divergence (CD-k) on one batch.

    Parameters
    ----------
    data : 2-D array of word counts, one row per document.
    k : int
        Number of Gibbs steps; must be at least 1.

    Returns
    -------
    tuple
        ``(hiddenPDF_data, visibleSamples, hiddenPDF)``: the hidden
        probabilities given the data, and the visible sample and hidden
        probabilities produced by the last Gibbs step.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 (no Gibbs step would be run, so there
        would be nothing to return).
    """
    if k < 1:
        raise ValueError("cd_k requires k >= 1 Gibbs steps, got %r" % (k,))

    # Total word count of each document; fixes the size of every multinomial draw.
    D = data.sum(axis=1)

    # Positive phase: hidden activations driven by the data. The sampled states
    # seed the Gibbs chain.
    hiddenPDF_data, hiddenSamples = sampleHiddenLayer(data)

    # Negative phase: each step restarts from the previous step's hidden sample.
    for _ in range(k):
        _, visibleSamples, hiddenPDF, hiddenSamples = gibbs(hiddenSamples, D)
    return hiddenPDF_data, visibleSamples, hiddenPDF


In [13]:
"""
visibleUnits: no of words in your Bag of words Model
hiddenUnits: no of topics
batchSize: data slice to be selected 
epochs: no of iterations
eta: learning rate
mrate: momentum coefficient
W : weights between the visible and hidden layer
visibleBias, hiddenBias: biases for visible and hidden layer respectively
"""

# Mini-batch size: number of documents used for each parameter update
batchSize = 200

epochs = 100  # number of full passes over the training data
eta = 0.05  # learning rate
mrate = 0.5  # momentum coefficient
# Re-create the generator with the same seed so the initialisation below starts from a known state
np_rng = np.random.RandomState(1234)

# Initialise weights and visible biases with Gaussian noise scaled by weightinit; hidden biases start at zero.
# W is drawn before visibleBias, so reordering these two lines changes the initial values.
weightinit=0.01
W = weightinit * np_rng.randn(visibleUnits, hiddenUnits)
visibleBias = weightinit * np_rng.randn(visibleUnits)
hiddenBias = np.zeros((hiddenUnits))

# Number of Gibbs sampling steps per contrastive-divergence update
k=2

In [14]:
def train(dataX,k):
    """Train the model with mini-batch CD-k and momentum.

    Updates the module-level ``W``, ``visibleBias`` and ``hiddenBias`` in
    place. Reads ``visibleUnits``, ``hiddenUnits``, ``epochs``, ``batchSize``,
    ``eta``, ``mrate`` and ``np_rng`` from module scope.

    Parameters
    ----------
    dataX : 2-D array of word counts, one row per document.
        Not modified: shuffling is done on a private copy.
    k : int
        Number of Gibbs steps, forwarded to ``cd_k``.
    """
    # Only the names that are rebound/updated need the global declaration;
    # the hyperparameters are read-only here.
    global W, visibleBias, hiddenBias

    # Shuffle a private copy so the caller's array keeps its row order.
    # The copy is shuffled in place every epoch, exactly as the original array
    # was, so the random stream and the resulting batches are unchanged.
    dataX = np.array(dataX, copy=True)

    # Momentum (velocity) terms, one per parameter, all starting at zero.
    mW = np.zeros((visibleUnits, hiddenUnits))
    mvisibleBias = np.zeros((visibleUnits))
    mhiddenBias = np.zeros((hiddenUnits))

    for epoch in range(epochs):
        np_rng.shuffle(dataX)  # new row order each epoch

        for i in range(0, dataX.shape[0], batchSize):
            mData = dataX[i:i + batchSize]  # current mini-batch
            # Contrastive divergence on the batch for k Gibbs steps
            hiddenPDF_data, visibleSamples, hiddenPDF = cd_k(mData,k)

            # Gradient estimate = data statistics minus reconstruction statistics,
            # blended with the previous update via the momentum coefficient.
            # NOTE(review): the weight term is a sum over the batch while the bias
            # terms are batch means, so they are scaled differently -- confirm
            # this is intended.
            mW = mW * mrate + (np.dot(mData.T, hiddenPDF_data) - np.dot(visibleSamples.T, hiddenPDF))
            mvisibleBias = mvisibleBias * mrate + np.mean(mData - visibleSamples, axis=0)
            mhiddenBias = mhiddenBias * mrate + np.mean(hiddenPDF_data - hiddenPDF, axis=0)

            # Apply the updates, scaled by the learning rate.
            W += eta * mW
            visibleBias += eta * mvisibleBias
            hiddenBias += eta * mhiddenBias

#### Train the Model

In [15]:
# Train on the dense bag-of-words matrix with k Gibbs steps; this updates W, visibleBias and hiddenBias in place
train(trainX.toarray(),k)

In [16]:
def worddist(topic, voc, top_n=15):
    """Print the most probable words for each topic (hidden unit).

    Each topic is probed by switching on exactly one hidden unit (a one-hot
    vector) and taking the softmax over the vocabulary that the visible layer
    produces for it. Reads the module-level ``W`` and ``visibleBias``.

    Parameters
    ----------
    topic : int
        Number of topics; must equal the number of hidden units (columns of ``W``).
    voc : dict
        Maps each word to its column index in the vocabulary.
    top_n : int, optional
        How many of the highest-probability words to print per topic
        (default 15).

    Returns
    -------
    None. For every topic, prints a header line followed by ``word probability``
    lines in decreasing order of probability.
    """
    # Row i of the identity matrix activates hidden unit i only.
    vecTopics = np.eye(topic)

    for i, vecTopic in enumerate(vecTopics):
        logits = np.dot(vecTopic, W.T) + visibleBias
        # Subtracting the maximum leaves the softmax unchanged but prevents
        # exp() from overflowing to inf (and the distribution becoming NaN).
        numerator = np.exp(logits - logits.max())
        word_distribution = (numerator / numerator.sum()).flatten()

        # sorted() is stable, so words with equal probability keep the
        # iteration order of `voc`.
        ranked = sorted(
            ((word, word_distribution[column]) for word, column in voc.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )

        print('topic', str(i), ':', vecTopic)
        for word, prob in ranked[:top_n]:
            print(word, str(prob))
        print('\n')

In [17]:
# Print the top words of every topic, using the vocabulary (word -> column index) learned by the vectoriser
worddist( hiddenUnits, tf.vocabulary_)

topic 0 : [1. 0. 0. 0. 0.]
actually 0.02767914759045432
pda 0.023380123859689444
real 0.021818503727544132
simple 0.015835207707529095
drain 0.013295122822284653
50 0.010538843387989572
down 0.00891493652872779
world 0.008891607408844055
useful 0.008786481873627934
neat 0.008339275610760949
like 0.007613590028872028
machine 0.007581361040531757
stated 0.007124520268533332
weak 0.006990490558752492
usable 0.006977622291836275


topic 1 : [0. 1. 0. 0. 0.]
noise 0.03353141348056335
piece 0.020420758749466503
failed 0.014389161558974895
hate 0.011527950767977302
big 0.008602591898304191
day 0.008295919124564023
breaking 0.006703870631010143
bose 0.0058457935946980245
front 0.005601051426804026
ear 0.005209908617927042
most 0.005131890991124724
jawbone 0.005086079477561938
nyc 0.0049339380980908286
size 0.004896378830246795
protector 0.0048269046639245064


topic 2 : [0. 0. 1. 0. 0.]
had 0.04475881121860015
headset 0.028756744944506925
ve 0.018543849781484753
so 0.01211628531025495
use 0.01

The output shows, for every topic, the highest-probability words together with the probability the model assigns to each.