# AutoEncoders

# Part 1 - Data Preprocessing

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

#http://ufldl.stanford.edu/tutorial/unsupervised/Autoencoders/

Importing the Training and Test Datasets

In [None]:
test_set = pd.read_csv('/test_set.csv') 
train_set = pd.read_csv('/training_set.csv') 

train_set

Unnamed: 0,User,Movie,Rating,Timestamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1287,5,978302039
...,...,...,...,...
750116,6040,1091,1,956716541
750117,6040,1094,5,956704887
750118,6040,562,5,956704746
750119,6040,1096,4,956715648


In [None]:
test_set
#so 250088 + 750121 = 1 million total ratings 

Unnamed: 0,User,Movie,Rating,Timestamp
0,1,1193,5,978300760
1,1,1197,3,978302268
2,1,2804,5,978300719
3,1,595,5,978824268
4,1,938,4,978301752
...,...,...,...,...
250083,6040,3735,4,960971654
250084,6040,2791,4,956715569
250085,6040,527,5,956704219
250086,6040,2003,1,956716294


Converting our Dataframes into Numpy Arrays

In [None]:
train_set = np.array(train_set, dtype = 'int') #specified the data type for our data, which is all integers anyway since its ratings and IDs
test_set = np.array(test_set, dtype = 'int') #specified the data type for our data, which is all integers anyway since its ratings and IDs

train_set

array([[        1,       661,         3, 978302109],
       [        1,       914,         3, 978301968],
       [        1,      3408,         4, 978300275],
       ...,
       [     6040,       562,         5, 956704746],
       [     6040,      1096,         4, 956715648],
       [     6040,      1097,         4, 956715569]])

Getting the total number of users and movies

In [None]:
num_users = int(max(max(test_set[:,0]),max(train_set[:,0]))) #getting the maximum user ID in the training and test set from all the rows, and column 0 which is the user ID
num_movies = int(max(max(test_set[:,1]),max(train_set[:,1]))) #getting the maximum movie ID in the training and test set from all the rows, and column 1 which is the movie ID

print(num_users) #so max userID is 6040 and max movieID is 3952
print(num_movies)

6040
3952


Converting Datasets into Array with 6040 rows (users) and 3952 columns (ratings)

In [None]:
#Here we're going to create a list of lists. Basically we'll have 6040 lists (# of users), and each list will have 3952 movies (with movies they haven't rated equal to 0). We do this with a numpy array to make it easier to work with pytorch afterwards
def ratingslist(data):
  ratings_list = []
  for user_id in range(1,num_users+1):
    #starting from 1 since first user_ID is 1 and ending on 6040+1 since python doesn't include the upper bound
    movie_id = data[:,1][data[:,0] == user_id] #so here we're taking all the rows from column 1, which is the movie ids from the training/test set. We're also making it so that it only takes the movies watched by a specific user id. To do this, we basically took column 0 and said it must be equal to user_id from the training/test set
    rating_id = data[:,2][data[:,0] == user_id] #so here we're taking all the rows from column 2, which is the ratings from the training/test sets. We're also making it so that it only takes the ratings by a specific user id. To do this, we basically took column 0 and said it must be equal to user_id from the training/test set
    rating = np.zeros(num_movies) #creating a list of 3952 movies initialized with all zeros, so we can then populate it with ratings from users afterwards, and any movies they didn't rate will be given a value of 0
    rating[movie_id - 1] = rating_id #did -1 because python index starts at 0, but movie_id starts with 1
    ratings_list.append(list(rating)) #now adding the ratings of user_id to the ratings list
  return ratings_list

train_set = ratingslist(train_set) #now we're using our function to convert our training set into a list of lists
test_set = ratingslist(test_set) #same above but for test set

In [None]:
print("Number of Rows (users) =",len(test_set)) #so it's 6040 users
print("Number of Cols (ratings) =", len(test_set[0])) #and the number of columns is 3952 ratings (1 for each movie, 0 if unwatched movie)
print("User 1, Movie 661, Rating =",train_set[0][660]) #we can see for user 1, movie 660, the rating given was 3 which reflects the first row in the training set when first imported
print("User 6040, Movie 535, Rating =",test_set[6039][534]) #we can see for user 6040, movie 535, the rating given was 4 which reflects the last row in the test set when first imported

Number of Rows (users) = 6040
Number of Cols (ratings) = 3952
User 1, Movie 661, Rating = 3.0
User 6040, Movie 535, Rating = 4.0


Converting Array into Torch Tensors

In [None]:
#so we could create a neural network with numpy arrays, but pytorch arrays are far more efficient, which is why we're using it
#for autoencoders, pytorch is more simple to use than tensorflow
train_set = torch.FloatTensor(train_set) #FloatTensor expects a list of lists
test_set = torch.FloatTensor(test_set) #FloatTensor expects a list of lists

In [None]:
#Converting to Torch Tensor kept the shapes the same
print("Number of Rows (users) =",len(test_set)) #so it's 6040 users
print("Number of Cols (ratings) =", len(test_set[0])) #and the number of columns is 3952 ratings (1 for each movie, 0 if unwatched movie)
print("User 1, Movie 661, Rating =",train_set[0][660]) #we can see for user 1, movie 660, the rating given was 3 which reflects the first row in the training set when first imported
print("User 6040, Movie 535, Rating =",test_set[6039][534]) #we can see for user 6040, movie 535, the rating given was 4 which reflects the last row in the test set when first imported

Number of Rows (users) = 6040
Number of Cols (ratings) = 3952
User 1, Movie 661, Rating = tensor(3.)
User 6040, Movie 535, Rating = tensor(4.)


# Part 2 - Creating the Class for Future Neural Network Objects

In [None]:
#stacked autoencoder
#so here we're using inheritance, from PyTorch nn.Module. We do this so that we can use all the variables and functions from parent class Module, which will be useful for us when developing a SAE
class stacked_autoencoder(nn.Module):
  def __init__(self, ):
    #We put , then blank, as this will consider the variables of the module class because we are doing inheritance
    super(stacked_autoencoder, self).__init__() #super is a function that allows us to use Module's functions and variables
    self.fc1 = nn.Linear(num_movies, 32) #first full connection. num_movies is the number of input nodes (number of features in the input vector), and 32 is the number of hidden nodes
    self.fc2 = nn.Linear(32, 16) #second full connection. 32 is the input for this connection, and 16 is the 2nd hidden layer
    self.fc3 = nn.Linear(16, 32) #so now we're just decoding, so we're trying to determine the output with this next hidden layer
    self.fc4 = nn.Linear(32, num_movies) #this is the output, so it finishes with the same number of nodes as the input layers
    self.activation = nn.Sigmoid() #using the sigmoid activation function. Experimentation happened with the rectified linear unit activation function but the results were not as good
  
  #This next function is to create the forward propagations through our Stacked auto encoder
  #x is our input vector of features 
  def forward(self, x):
    x = self.activation(self.fc1(x)) #fc1 takes the input vectors x, which is then taken in by activation function
    x = self.activation(self.fc2(x)) #encoding x into the 2nd hidden layer
    x = self.activation(self.fc3(x)) #decoding x into the 3rd hidden layer
    x = self.fc4(x) #now just getting the output
    return x

SAE = stacked_autoencoder()

criterion = nn.MSELoss() #loss function which is MSE
optimizer = optim.RMSprop(SAE.parameters(), lr = 0.01, weight_decay = 0.5) #Weight decay is used to reduce the learning rate after every epoch, and that is used to regulate the convergence and prevent overfitting (0.5 was found through experimentation)
#Adam optimizer did not have as good results as RMSProp optimizer

In [None]:
SAE

stacked_autoencoder(
  (fc1): Linear(in_features=3952, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=3952, bias=True)
  (activation): Sigmoid()
)

# Part 3 -Training our SAE

In [None]:
#Adding fake batch dimension for PyTorch
user6040 = train_set[6039]
user6040_var = Variable(train_set[6039])
user6040_fb = Variable(train_set[6039]).unsqueeze(0)

print(user6040, len(user6040))
print(user6040_var, len(user6040_var))
print(user6040_fb, len(user6040_fb), len(user6040_fb[0])) #see how it's a nested array now, thus creating a batch of a single input vector

tensor([3., 0., 0.,  ..., 0., 0., 0.]) 3952
tensor([3., 0., 0.,  ..., 0., 0., 0.]) 3952
tensor([[3., 0., 0.,  ..., 0., 0., 0.]]) 1 3952


In [13]:
import time #To keep track of how long it takes to train our model

start_time = time.time()

num_epoch = 500

for epoch in range(1, num_epoch + 1):
  training_loss = 0 #initializing the training loss
  s = 0. #the . just makes it a float. s is going to be used to count the number of users who did not rate any movies, so that we don't perform any computation on them. Here we are just initializing it
  
  #below we're just using num_users to go from 0 to 6040 user indexes
  for user_id in range(num_users):
    #so below, first we're taking the index number user_id in the training set
    input = Variable(train_set[user_id]).unsqueeze(0) #A NN in PyTorch and Keras cannot accept a vector of one dimension, but it does accept a batch of input vectors. So here we're adding an additional, fake dimension which will correspond to a batch, which will be index 0, so the first dimension. so when we unsqueezed our variable (train set input), we added a new dimension of index 0. All this creates a batch of a single input vector
    target = input.clone() #so we're just cloning the input, so we can compare the output to the original input
    
    if torch.sum(target.data > 0) > 0:
       #if statement is checking if there is a user that has 1 or more rated movies
       output = SAE(input) #running our input vector data for user_id through our SAE class, which then at the end spits out the output from the forward function
       target.require_grad = False #this is just for optimization. We only want to apply Stochastic Gradient Descent only to the input, and not the clone which is target. So this require_grad function set to false just means we don't compute the gradient WRT the target
       output[target == 0] = 0 #take the indexes of output when target data (which has the same indexes as output since it's just a clone of input) has no ratings (equal to zero) equal to zero. This way we don't take these empty inputs indexes in the computation of the error, so they won't impact the updating of the weights.
       loss = criterion(output, target) #the loss just compares the MSE of the vector of inputs (target) vs the output
       mean_corrector = num_movies/float(torch.sum(target.data > 0) + 1e-10) #calculating the average number of movies that had a non-zero rating and adding 1e-10 just incase the denominator is 0. We need this mean corrector to represent the average of the errors, but by only considering the movies that were rated
       loss.backward() #this just calls the backward method for the loss, and it will tell in which direction we need to update the different weights (i.e do we need to increase or decrease the weights)
       training_loss += np.sqrt(loss.data*mean_corrector) #computing the error by square rooting the mean squared error, which is in index 0 of the loss data and is adjusted by multiplying the mean correction
       s += 1. #adding to the number of users that rated at least 1 movie
       optimizer.step() #step is just a method of using the optimizer from the RMSprop class. Backwards decided the direction in which to change the weight, whereas optimizer step decides the magnitude of said increase or decrease
  if epoch%25 == 0:
    print('epoch: ' + str(epoch) + ' loss:' + str(training_loss/s))

print(f'Training took {(time.time() - start_time)/60} minutes')

epoch: 25 loss:tensor(0.9749)
epoch: 50 loss:tensor(0.9470)
epoch: 75 loss:tensor(0.9171)
epoch: 100 loss:tensor(0.9063)
epoch: 125 loss:tensor(0.8942)
epoch: 150 loss:tensor(0.8797)
epoch: 175 loss:tensor(0.8722)
epoch: 200 loss:tensor(0.8631)
epoch: 225 loss:tensor(0.8585)
epoch: 250 loss:tensor(0.8540)
epoch: 275 loss:tensor(0.8487)
epoch: 300 loss:tensor(0.8444)
epoch: 325 loss:tensor(0.8396)
epoch: 350 loss:tensor(0.8362)
epoch: 375 loss:tensor(0.8316)
epoch: 400 loss:tensor(0.8289)
epoch: 425 loss:tensor(0.8222)
epoch: 450 loss:tensor(0.8181)
epoch: 475 loss:tensor(0.8131)
epoch: 500 loss:tensor(0.8077)
Training took 234.49684039751688 minutes


# Part 4 - Testing our Test Set

In [None]:
test_loss = 0 #initializing the test loss
s = 0. #the . just makes it a float. s is going to be used to count the number of users who did not rate any movies, so that we don't perform any computation on them. Here we are just initializing it
  
#below we're just using num_users to go from 0 to 6040 user indexes
for user_id in range(num_users):
  #so below, first we're taking the index number user_id in the training set
  input = Variable(train_set[user_id]).unsqueeze(0) #so we keep the training set, because the training set is the input that will be used to activate the hidden neurons for our test set predictions. Because the training set does not have any of the ratings from the test set, we have to use the training set as the input so it can try to predict the ratings of answers it does not contain, then compare it afterwards with the test set results
  target = Variable(test_set[user_id]).unsqueeze(0) #so here, the target is the test set, which makes sense because we're going to compare our predicted training set results from input with our test set

  if torch.sum(target.data > 0) > 0:
    #if statement is checking if there is a user that has 1 or more rated movies
    output = SAE(input) #running our input vector data for user_id through our SAE class, which then at the end spits out the output from the forward function
    target.require_grad = False #this is just for optimization. We only want to apply Stochastic Gradient Descent only to the input, and not the clone which is target. So this require_grad function set to false just means we don't compute the gradient WRT the target
    output[target == 0] = 0 #take the indexes of output when target data (which has the same indexes as output since it's just a clone of input) has no ratings (equal to zero) equal to zero. This way we don't take these empty inputs indexes in the computation of the error, so they won't impact the updating of the weights.
    loss = criterion(output, target) #the loss just compares the MSE of the vector of inputs (target) vs the output
    mean_corrector = num_movies/float(torch.sum(target.data > 0) + 1e-10) #calculating the average number of movies that had a non-zero rating and adding 1e-10 just incase the denominator is 0. We need this mean corrector to represent the average of the errors, but by only considering the movies that were rated
    #we removed loss.backward because we don't need to update weights since we're not performing backpropagation on the test set, we're just predicting in one go
    test_loss += np.sqrt(loss.data*mean_corrector) #computing the error by square rooting the mean squared error, which is in index 0 of the loss data and is adjusted by multiplying the mean correction
    s += 1. #adding to the number of users that rated at least 1 movie
    #we removed optimizer because we don't need to update weights since we're not performing backpropagation on the test set, we're just predicting in one go
print('test loss:' + str(test_loss/s))

test loss:tensor(0.8958)
