In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
movies = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [3]:
### Latin encoding is used because some of the movies have special characters

In [4]:
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [6]:
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
### movie ids will be used instead of movie titles for recommendations

In [8]:
### In users data first coloumn is the user id, second is gender, third is age, fourth is occupatin code and the last one is the zip code

In [9]:
### In ratings coloumns are user id, movies id, ratings and timestamps of user rating that movie

In [10]:
### Here we will not use this 1 million data set instead we will use its subset u1base and u1test, there are 5 such different subsets

In [11]:
# Preparing the training set and the test set
training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t')
training_set = np.array(training_set, dtype = 'int')
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype = 'int')

In [12]:
np.save('training_set.npy',training_set)
np.save('test_set.npy',test_set)

In [11]:
training_set = np.load('training_set.npy')

In [12]:
test_set = np.load('test_set.npy')

In [13]:
# Getting the number of users and movies
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

In [14]:
# Why doing this ? Because we want to create a new structure of data, having the shape of a 2d array where:

#    the rows are the users,
#    the columns are the movies,
#    the cells are the ratings.

In [15]:
def convert(data):
    new_data = []
    for id_users in range(1,nb_users+1):
        id_movies = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        #all rows such that first coloumn is id_users
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data

In [16]:
training_set = convert(training_set)

In [17]:
test_set = convert(test_set)

In [18]:
training_set[0]

[0.0,
 3.0,
 4.0,
 3.0,
 3.0,
 0.0,
 4.0,
 1.0,
 5.0,
 0.0,
 2.0,
 0.0,
 5.0,
 0.0,
 5.0,
 5.0,
 0.0,
 4.0,
 5.0,
 0.0,
 1.0,
 4.0,
 0.0,
 0.0,
 4.0,
 3.0,
 0.0,
 4.0,
 1.0,
 3.0,
 0.0,
 5.0,
 0.0,
 2.0,
 1.0,
 0.0,
 2.0,
 3.0,
 0.0,
 3.0,
 2.0,
 5.0,
 4.0,
 0.0,
 5.0,
 4.0,
 0.0,
 5.0,
 0.0,
 5.0,
 0.0,
 4.0,
 0.0,
 0.0,
 5.0,
 0.0,
 5.0,
 4.0,
 5.0,
 0.0,
 0.0,
 0.0,
 2.0,
 0.0,
 0.0,
 4.0,
 0.0,
 4.0,
 0.0,
 0.0,
 3.0,
 0.0,
 0.0,
 0.0,
 4.0,
 0.0,
 4.0,
 0.0,
 4.0,
 0.0,
 0.0,
 0.0,
 3.0,
 0.0,
 0.0,
 0.0,
 5.0,
 4.0,
 5.0,
 0.0,
 0.0,
 0.0,
 5.0,
 2.0,
 4.0,
 0.0,
 0.0,
 0.0,
 3.0,
 0.0,
 2.0,
 0.0,
 0.0,
 0.0,
 2.0,
 4.0,
 0.0,
 0.0,
 5.0,
 1.0,
 5.0,
 0.0,
 0.0,
 0.0,
 5.0,
 3.0,
 0.0,
 0.0,
 5.0,
 0.0,
 0.0,
 3.0,
 4.0,
 5.0,
 0.0,
 2.0,
 5.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 4.0,
 0.0,
 4.0,
 3.0,
 5.0,
 1.0,
 3.0,
 0.0,
 3.0,
 2.0,
 0.0,
 4.0,
 0.0,
 4.0,
 3.0,
 0.0,
 2.0,
 0.0,
 0.0,
 5.0,
 3.0,
 0.0,
 0.0,
 4.0,
 0.0,
 3.0,
 0.0,
 0.0,
 0.0,
 4.0,
 0.0,
 0.0,
 5.0,
 5.0,
 2.0

In [19]:
#np.save('training_set_proc.npy',training_set)
#np.save('test_set_proc.npy',test_set)
training_set = np.load('training_set_proc.npy')
test_set = np.load('test_set_proc.npy')

In [20]:
# Converting list of lists into torch tensors(mutidimensional arrays with a single data type)
# These are more efficient

In [21]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [61]:
class SAE(nn.Module):
    def __init__(self, ):
        # super function is used to use classes of parent class
        super(SAE,self).__init__()
        # by this we can get all the inherited classes of nn.Module
        # first argument is the features, second is the the number of units
        self.fc1 = nn.Linear(nb_movies, 20)
        self.fc2 = nn.Linear(20,10) #second layer has 10 neurons
        self.fc3 = nn.Linear(10,20)
        self.fc4 = nn.Linear(20,nb_movies)
        self.activation = nn.Sigmoid()
        self.activation_t = nn.Tanh()
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation_t(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x

In [67]:
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.Adam(sae.parameters(),lr=0.01,weight_decay=0.5)
#decay is used for regulating the steps of the optimizer

In [68]:
#The following code optimiztion techniques are only for pytorch

In [69]:
#Training the AutoEncoders

In [70]:
nb_epoch = 200

In [71]:
for epoch in range(1,nb_epoch+1):
    train_loss = 0
    #number of users who at least rated one movie to reduce the computation
    s = 0. #RMSE needs a float
    for id_user in range(nb_users):
        input = Variable(training_set[id_user,:]).unsqueeze(0) #dimension for a batch is also added here on the 0th axis
        #weights in this code are updated after each i/p vector
        target = input.clone() #target is the input
        if torch.sum(target.data > 0) > 0:
            # if the user has rated atleast one movie
            output = sae(input) #predicted ratings are returned for this particular user
            target.require_grad = False #gradient is a clone of i/p so now its gradient won't be calculated
            output[target == 0] = 0 #These are zeros so they don't need to be included in the computation
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10) #denominator should not be zero
            # it is the average of the error with non-zero ratings
            loss.backward()
            train_loss += np.sqrt(loss.data[0]*mean_corrector) #loss.data[0] is the loss value
            s+=1
            optimizer.step()
            # backward decides the direction(increased or decreased) of the weights and optimizer decides their intensities
    print('epoch: '+str(epoch)+ 'loss: '+ str(train_loss/s)) #train loss divided by number of users with atleast one rating
    



epoch: 1loss: tensor(1.8198)
epoch: 2loss: tensor(1.0934)
epoch: 3loss: tensor(1.0492)
epoch: 4loss: tensor(1.0343)
epoch: 5loss: tensor(1.0270)
epoch: 6loss: tensor(1.0226)
epoch: 7loss: tensor(1.0198)
epoch: 8loss: tensor(1.0178)
epoch: 9loss: tensor(1.0164)
epoch: 10loss: tensor(1.0153)
epoch: 11loss: tensor(1.0145)
epoch: 12loss: tensor(1.0139)
epoch: 13loss: tensor(1.0134)
epoch: 14loss: tensor(1.0130)
epoch: 15loss: tensor(1.0127)
epoch: 16loss: tensor(1.0124)
epoch: 17loss: tensor(1.0122)
epoch: 18loss: tensor(1.0120)
epoch: 19loss: tensor(1.0118)
epoch: 20loss: tensor(1.0117)
epoch: 21loss: tensor(1.0116)
epoch: 22loss: tensor(1.0115)
epoch: 23loss: tensor(1.0114)
epoch: 24loss: tensor(1.0113)
epoch: 25loss: tensor(1.0112)
epoch: 26loss: tensor(1.0112)
epoch: 27loss: tensor(1.0111)
epoch: 28loss: tensor(1.0111)
epoch: 29loss: tensor(1.0110)
epoch: 30loss: tensor(1.0110)
epoch: 31loss: tensor(1.0109)
epoch: 32loss: tensor(1.0107)
epoch: 33loss: tensor(1.0111)
epoch: 34loss: tens

In [72]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
    input = Variable(training_set[id_user,:]).unsqueeze(0) #because we are predicting ratings for that user
    target = Variable(test_set[id_user,:]).unsqueeze(0)
    if torch.sum(target.data > 0) > 0:
        s+=1.
        output = sae(input)
        target.require_grad = False
        output[target == 0] = 0
        pred_loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(pred_loss.data[0]*mean_corrector)
print('test loss: '+ str(test_loss/s))

test loss: tensor(0.9602)


  del sys.path[0]
