# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
pd.set_option('display.max_rows', 50)
movies = pd.read_csv('dataset/ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('dataset/ml-1m/users.dat', sep = '::', header= None, engine= 'python', encoding = 'latin-1')
ratings = pd.read_csv('dataset/ml-1m/ratings.dat', sep = '::', header= None, engine = 'python', encoding = 'latin-1')

In [3]:
print(movies.shape, users.shape, ratings.shape)

(3883, 3) (6040, 5) (1000209, 4)


In [4]:
movies.head(50)

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


First column: Movie ID,

Second Column: Movie's name and year

Third Column: Genre

In [5]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


First column of the users: User ID,

Second column: Gender,

Third column: Age,

Fourth column: Jobs,

Fifth column: Zip code

In [6]:
ratings.head(20)

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


First column: Users' ID

Second column: Movies' ID

Third column: Ratings (1 to 5)

Fourth column: Timestamps

In [7]:
training_set = pd.read_csv('dataset/ml-100k/u1.base', delimiter = '\t')
training_set = np.array(training_set, dtype = 'int')
test_set = pd.read_csv('dataset/ml-100k/u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype = 'int')

In [8]:
print(f'Training set: {training_set.shape}\nTest set: {test_set.shape}')

Training set: (79999, 4)
Test set: (19999, 4)


In [9]:
max_user = max(max(training_set[:, 0]), max(test_set[:, 0]))
max_movie = max(max(training_set[:, 1]), max(test_set[:, 1]))
max_user, max_movie = int(max_user), int(max_movie)
print(f'Max user: {max_user}\nMax movie: {max_movie}')

Max user: 943
Max movie: 1682


In [10]:
def convert(data):
  """
  Converting training and test sets into a matrix.
  Rows = Users
  Column = Movies
  Cells = Ratings (0 if user didn't rate)
  """
  new_data= []
  for id_users in range(1, max_user+1):
    id_movies = data[:, 1][data[:, 0 ] == id_users]
    id_ratings = data[:, 2][data[:,0] == id_users]
    ratings = np.zeros(max_movie)
    ratings[id_movies -1] = id_ratings
    new_data.append(list(ratings))
  return new_data

In [11]:
training_set = convert(training_set)
test_set = convert(test_set)

In [12]:
print_training_set = pd.DataFrame(training_set)
print_training_set.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

# Building the AutoEncoder

### Creating the architecture of the Neural Network

In [14]:
class SAE(nn.Module):
  def __init__(self,):

    super(SAE, self).__init__()
    self.fc1 = nn.Linear(max_movie, 32) #First param. is num of features, second param. is number of nodes in next layer (First hidden layer)
    self.fc2 = nn.Linear(32, 16) # Second hidden Layer
    self.fc3 = nn.Linear(16, 32) # Third hidden Layer
    self.fc4 = nn.Linear(32, max_movie) # Output Layer
    self.activation = nn.Sigmoid() # Activation function
  def forward(self, x):

    x = self.activation(self.fc1(x)) # First layer activation funct.
    x = self.activation(self.fc2(x)) # Second layer activation funct.
    x = self.activation(self.fc3(x))
    x = self.fc4(x)
    return x

In [15]:
sae = SAE()
criterion = nn.MSELoss() # mse loss function
optimizer = optim.RMSprop(sae.parameters(), lr = 0.008, weight_decay = 0.6)

### Training the Stacked AutoEncoder

In [16]:
nb_epoch = 256

for epoch in range(1, nb_epoch +1):

  train_loss = 0
  s = float(0) # number of users that rated at least 1 movie.

  for id_user in range(max_user):
    input = Variable(training_set[id_user]).unsqueeze(0) # Adding a batch dimension
    target = input.clone()

    if ( torch.sum(target.data > 0) == 0): # If user does not have any ratings. (All values 0)
      continue

    output = sae(input)
    target.require_grad = False # To don't compute the gradient with respect to the target
    output[target == 0] = 0 # Setting the unrated movies to 0.

    loss = criterion(output, target)
    mean_corrector = max_movie/float(torch.sum(target.data > 0)+ 1e-10) # 1e-10 is to make sure denominator is not 0.
    loss.backward() # Backward propogation
    train_loss += np.sqrt(loss.data*mean_corrector)

    s += float(1)

    optimizer.step() # Updating the weights
  print(f'Epoch: {epoch}\n Train Loss: {train_loss/s}')


Epoch: 1
 Train Loss: 1.6790573596954346
Epoch: 2
 Train Loss: 1.080127239227295
Epoch: 3
 Train Loss: 1.0485514402389526
Epoch: 4
 Train Loss: 1.0373848676681519
Epoch: 5
 Train Loss: 1.032153844833374
Epoch: 6
 Train Loss: 1.0293275117874146
Epoch: 7
 Train Loss: 1.0275236368179321
Epoch: 8
 Train Loss: 1.0263525247573853
Epoch: 9
 Train Loss: 1.0254571437835693
Epoch: 10
 Train Loss: 1.02488112449646
Epoch: 11
 Train Loss: 1.0242732763290405
Epoch: 12
 Train Loss: 1.0242154598236084
Epoch: 13
 Train Loss: 1.0237410068511963
Epoch: 14
 Train Loss: 1.023386001586914
Epoch: 15
 Train Loss: 1.0231573581695557
Epoch: 16
 Train Loss: 1.022688627243042
Epoch: 17
 Train Loss: 1.0225651264190674
Epoch: 18
 Train Loss: 1.0208393335342407
Epoch: 19
 Train Loss: 1.018100619316101
Epoch: 20
 Train Loss: 1.0190101861953735
Epoch: 21
 Train Loss: 1.014690637588501
Epoch: 22
 Train Loss: 1.0154927968978882
Epoch: 23
 Train Loss: 1.0112253427505493
Epoch: 24
 Train Loss: 1.0110222101211548
Epoch: 25

### Testing the SAE

In [17]:
  test_loss = 0
  s = float(0)
  for id_user in range(max_user):
    input = Variable(training_set[id_user]).unsqueeze(0) # Keeping the training set as input.
    target = Variable(test_set[id_user]).unsqueeze(0)

    if ( torch.sum(target.data > 0) == 0):
      continue

    output = sae(input)
    target.require_grad = False
    output[target == 0] = 0 # Setting the unrated movies to 0.

    loss = criterion(output, target)
    mean_corrector = max_movie/float(torch.sum(target.data > 0)+ 1e-10)

    test_loss += np.sqrt(loss.data*mean_corrector)

    s += float(1)

  print(f'Loss: {test_loss/s}')


Loss: 0.9519652128219604
