# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
pd.set_option('display.max_rows', 50)
movies = pd.read_csv('dataset/ml-1m/movies.dat', sep=  '::', header= None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('dataset/ml-1m/users.dat', sep=  '::', header= None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('dataset/ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [3]:
print(movies.shape, users.shape, ratings.shape)

(3883, 3) (6040, 5) (1000209, 4)


In [4]:
movies.head(50)

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


First column: Movie ID,

Second Column: Movie's name and year

Third Column: Genre

In [5]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


First column of the users: User ID,

Second column: Gender,

Third column: Age,

Fourth column: Jobs,

Fifth column: Zip code

In [6]:
ratings.head(50)

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


First column: Users' ID

Second column: Movies' ID

Third column: Ratings (1 to 5)

Fourth column: Timestamps

In [7]:
training_set = pd.read_csv('dataset/ml-100k/u1.base', delimiter = '\t') # Seperator of u1.base is tab.
training_set = np.array(training_set, dtype = 'int') # Converting df to array for pytorch.
test_set = pd.read_csv('dataset/ml-100k/u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype='int')

In [8]:
print(f"""Training set: {training_set.shape ,type(training_set)},
Test set: {test_set.shape, type(test_set)}""")

Training set: ((79999, 4), <class 'numpy.ndarray'>),
Test set: ((19999, 4), <class 'numpy.ndarray'>)


In [9]:
# Getting the user and movie num.
max_user = max(max(training_set[:,0]), max(test_set[:,0]))
max_movie = max(max(training_set[:,1]), max(test_set[:,1]))
max_user, max_movie = int(max_user), int(max_movie)
print(f"Max user: {max_user},\nMax movie: {max_movie}")

Max user: 943,
Max movie: 1682


In [10]:
def convert(data):
  """
  Converting training and test sets into a matrix.
  Rows = Users, Columns = Movies, Cells = Ratings. (0 if user didn't rate)
  """
  new_data = []
  for id_users in range(1, max_user +1):
    id_movies = data[:, 1][data[:, 0] == id_users]
    id_ratings = data[:, 2][data[:, 0] == id_users]
    ratings = np.zeros(max_movie) # Rating 0 if user didn't rate the mv.
    ratings[id_movies - 1] = id_ratings # Movie indexes start at 1.
    new_data.append(list(ratings))
  return new_data

In [11]:
training_set = convert(training_set)
test_set = convert(test_set)

In [12]:
print_training_set = pd.DataFrame(training_set)
print_training_set.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Converting the data into Torch tensors.
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [14]:
# Converting the ratings into binary ratings(0 or 1)
training_set[training_set ==0] = -1
training_set[training_set ==1] = 0 # Or operator doesn't work.
training_set[training_set == 2] = 0
training_set[training_set >=3] =1
test_set[test_set == 0] = -1
test_set[test_set ==1]=0
test_set[ test_set ==2]=0
test_set[test_set >=3] = 1

# Building the RBM

In [15]:
class RBM():
  def __init__(self, nv, nh): # nv = number of visible nodes, nh = number of hidden nodes.
    self.W = torch.randn(nh, nv) # Random Weights.
    self.a = torch.randn(1, nh) # Bias for hidden nodes.
    self.b = torch.randn(1, nv) # Bias for visible nodes.

  def sample_h(self, X):
    wx = torch.mm(X,self.W.t()) # Matrix multiplication of transpose of weights and xs
    activation = wx + self.a.expand_as(wx) # Adding bias to the wx
    p_h_given_v = torch.sigmoid(activation) # Applying sigmoid funct.
    return p_h_given_v, torch.bernoulli(p_h_given_v) # Returning probabilities and samples.

  def sample_v(self, y): # Same function for visible nodes.
    wy = torch.mm(y,self.W) # Transpose no needed.
    activation = wy + self.b.expand_as(wy)
    p_v_given_h = torch.sigmoid(activation)
    return p_v_given_h, torch.bernoulli(p_v_given_h)
  def train(self, v0, vk, ph0, phk):
    """
    v0: is input vector,
    vk: visible nodes obtained after k samplings.
    ph0: probabilities of hidden nodes given the input vector v0.
    phk: probabilities of hidden nodes given the visible nodes obtained after k samplings.
    """
    self.W += (torch.mm(v0.t(),ph0) - torch.mm(vk.t(),phk)).t()
    self.b += torch.sum(v0 - vk, 0) # 0 is in order to compute the sum along a dimension 0.
    self.a += torch.sum(ph0 - phk, 0)
  def predict(self, x):
    _,h = self.sample_h(x)
    _,v = self.sample_v(h)
    return v

In [16]:
nv = len(training_set[0])
nh = 128
batch_size = 32
RBM = RBM(nv, nh)

# Training the RBM

In [17]:
epochs = 10
for epoch in range(1, epochs +1):
  train_loss = 0
  s = float(0)
  for id_user in range(0, max_user - batch_size, batch_size):
    vk = training_set[id_user: id_user+ batch_size]# Ratings of all the movies by specific user.
    v0 = training_set[id_user: id_user+ batch_size]
    ph0,_ = RBM.sample_h(X =v0) # Probabilities of hidden nodes given the input vector v0.
    for k in range(10): # MCMC
      _,hk = RBM.sample_h(v0) # Sampling the hidden nodes.
      _,vk = RBM.sample_v(hk)
      vk[v0<0] = v0[v0<0] # Freezing the visible nodes.
    phk,_ = RBM.sample_h(vk)
    RBM.train(v0, vk, ph0, phk)
    train_loss += torch.mean(torch.abs(v0[v0>0] - vk[v0>0])) # calculating train loss with getting the mean values of abs values.
    s += float(1)
  print(f'epoch: '+ str(epochs) + ' \nloss: ' + str(float(train_loss/s))) # Normalizing the train loss by dividing it w s.

epoch: 10 
loss: 0.2437972128391266
epoch: 10 
loss: 0.1484570950269699
epoch: 10 
loss: 0.14752009510993958
epoch: 10 
loss: 0.15216125547885895
epoch: 10 
loss: 0.14635948836803436
epoch: 10 
loss: 0.1494373083114624
epoch: 10 
loss: 0.150056391954422
epoch: 10 
loss: 0.14691174030303955
epoch: 10 
loss: 0.14780856668949127
epoch: 10 
loss: 0.15108922123908997


# Testing the RBM

In [18]:
test_loss = 0
s = float(0)
for id_user in range(max_user):
  v = training_set[id_user: id_user+1] # Input vector
  vt = test_set[id_user: id_user+1] # Target vector
  if len(vt[vt>=0]) >0:
    _,h = RBM.sample_h(v)
    _,v = RBM.sample_v(h)
    test_loss += torch.mean(torch.abs(vt[vt>0] - v[vt>0]))
    s += float(1)
print(f'Test loss: ' + str(float(test_loss/s)))

Test loss: 0.13269174098968506
