In [27]:
import numpy as np
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms
from torchvision.utils import make_grid , save_image

In [30]:
### Part 1 : Creating the architecture of the Neural Network 

# Create the class RBM 
class RBM(): 
    # self is the object 
    # all the variables attached to the object will be created with self. 
    # nv number of visible nodes 
    # nh number of hidden nodes 
    def __init__(self, nv, nh): 
        #initialize the parameters we optimize during the training weights and bias
        #weights used for the probability of the visible nodes given the hidden nodes (p_v_given_h))
        # torch.rand : random normal distribution mean=0, variance=1 
        self.W = torch.randn(nh,nv)
        # bias probability of the hidden nodes given the visible nodes (p_h_given_v))
        # fake dimension for the batch = 1
        self.a = torch.randn(1,nh)
        # bias probability of the visible nodes is activated 
        #given the value of the hidden nodes (p_v_given_h))
        self.b = torch.randn(1, nv)
        # we can add more, like learning rate... in this section to be used in train()

    def sample_h(self, x): 
        # probability h is activated given the value v is the sigmoid(Wx+a).
        # torch.mm make the product of two tensors. 
        # W.t()take the transpose because W is used for the p_v_given_h.
        wx=torch.mm(x,self.W.t())
        # .expand_as(wx) : expand the mini-batch.
        activation=wx+self.a.expand_as(wx)
        # probability p_h_given_v is the probability that the note drama genre is activated. 
        # v value is the input value. If v is a film drama, p_h_given_v will be hight. 
        # If v is not a film drama, p_h_given_v will be low.
        p_h_given_v=torch.sigmoid(activation)
        # Bernouilli RBM. we predict the user loves the movie or not (0 or 1).
        # activation or not activation of the nh neurons. 
        return p_h_given_v, torch.bernouilli(p_h_given_v)

    def sample_v(self, y): 
        # probability h is activated given the value v is the sigmoid(Wx+a).
        # torch.mm make the product of two tensors. 
        wy=torch.mm(y,self.W)
        # .expand_as(wx) : expand the mini-batch.
        activation=wy+self.b.expand_as(wy)
        p_v_given_h=torch.sigmoid(activation)
        # Bernouilli RBM. we predict the user loves the movie or not (0 or 1).
        # activation or not activation of the nv neurons. 
        return p_v_given_h, torch.bernouilli(p_v_given_h)

        # Contrastive divergence Algorithm
        # Optimize the weights to minimize the energy.
        # ~ Maximize the Log-Likelihood of the model. 
        # Need to approximate the gradients with the algorithm contrastive divergence. 
    def train(self,v0,vk,ph0,phk):
        #ph0,phk regarding to 1 user, this train function
        # vk: visible nodes after k round trips of sampling
        # ph: ph0: vector prob at first iteration of hidden nodes = 1, given v0
        # phk vector prob at k iteration, for h = 1 given vk
        
        self.W += torch.mm(v0.t(),ph0)-torch.mm(vk.t(),phk)
        # add ,0 for the tensor of two dimension 
        self.b += torch.sum((v0-vk),0) #keep format of v as 2d dim
        self.a += torch.sum(ph0-phk,0)

In [39]:
torch.sum(torch.FloatTensor([7]),0)

tensor(7.)

In [None]:
### Part 2 : Create the RBM Object 
# number of movies
nv=len(training_set[0]) 
# parameter is tunable is the number of features that we want to detect 
#hidden nodes rep some features rbm it to learn
# features ~ genre, actors, director, oscar, date.... 
nh=100 #to tune
# update the weights after serveral observations, also tunable
batch_size=100 # each batch train how many samples
# Creation of the object of the class RBM()
rbm=RBM(nv,nh)

In [40]:
### Part 3 : Training the RBM 
nb_epoch = 10 
# upper bound is no included nb_epoch+1 

# First for loop : epoch for loop 
for epoch in range (1,nb_epoch+1):
    #loss function initialized to 0 at the beginning of the trainning 
    train_loss = 0
    # counter which is a float . 
    s = 0.

    # Second for loop : user forloop 
    # 0 lower bound 
    # nb_users-batch_size upper bound 
    # batch_size is the step of each batch (100)
    # First batch is from user id=0 ti user id =99
    
    #taking a batch of users
    for id_user in range(0,nb_users-batch_size,batch_size): #range(a,b,c)
        # at the beginning v0=vk 
        # vk is going to be updated
        # id_user,id_user+batch_size ~id_user+100
        
        vk=training_set[id_user:id_user+batch_size] #output of gibbs sampling, now dealing with specific user
        v0=training_set[id_user:id_user+batch_size]
        #initial probability prob hidden node at start = 1 given original ratings 
        ph0, _ = rbm.sample_h(v0)
        
        
        # k steps in the random walk
        # Third for loop : Contrastive divergence
        for k in range(10): #why 10? mcmc technique
            #call sample_h on visible nodes, get the first sampling of the first hidden node
            _,hk=rbm.sample_h(vk) #v0 is the target, will not change, we take vk
            #next we use the obtained hk to get the sampled vk
            _,vk=rbm.sample_v(hk) #update vk, second sample of the visible nodes, until all the k steps are sampled and updated vk
            
            #vk are visible nodes in the kstep random walk
            
            #above are the actual sampled values 
            
            # we don't want to learn where is no rating by the user
            # no update when -1 rating.
            vk[v0<0]=v0[v0<0] #training are not done on these -1 ratings
            
            phk,_=rbm.sample_h(vk) #get the probability of hk, last sample of the visible nodes in the kstep random walk, vk is the input, using this to get the phk
            
            # maximum likelihood to update the parameters, no return type
            rbm.train(v0,vk,ph0,phk)
            
            # Contrastive divergence to approximate the gradient
            
            # Compare vk updated after the training to v0 the target. 
            # simple distance in absolute value 
            # [vO>=0] take only the value with ratings / coherence with vk[v0<0]=[v0<0]
            #update the loss value
            
            # difference between target v0, and the prediction (last sample of the visible node of the contrastive divergence random walk)
            # mean absolute difference between truth and predictions
            train_loss+=torch.mean(torch.abs(v0[vO>=0]-vk[vO>=0]))
            
            #rmse version
            #train_loss+=np.sqrt(torch.mean((v0[v0>=0] - vk[v0>=0])**2)) # RMSE here
            s += 1.
    # in each epoch , see overall loss function        
    print('epoch: ' +str(epoch) +' loss: '+str(train_loss/s))

NameError: name 'nb_users' is not defined

In [None]:
# making prediction on the test set
# testing RBM
# mcmc

test_loss = 0
# counter which is a float . 
s = 0. #each step increment by 1

#taking a batch of users, no need batch in testing
for id_user in range(nb_users):
    # v is input, need to keep training set, input used to activate the neurons to activate hidden states
    # use training set input ratings to activate the neurons and to make predictions on the testset
    v=training_set[id_user]
    
    #vt is the target, original ratings of testset
    vt=test_set[id_user]
    
    # not consider rating of -1 in testset
    if len(vt[vt>=0]) >0:
        # then to make prediction
        # sample hidden nodes first 
        _,h = rbm.sample_h(v)
        # then use the sampled hidden nodes h to sample visible nodes v
        _,v = rbm.sample_v(h)
        
        test_loss+=torch.mean(torch.abs(vt[vt>=0]-v[v>=0])) # excluding places where there are no rating, for fair comparison
        #rmse version
        #test_loss += np.sqrt(torch.mean((vt[vt>=0] - v[vt>=0])**2)) # RMSE here
        s += 1.
    print('test loss:'+str(test_loss/s))
    