In [3]:
# AutoEncoders

# Importing the libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [4]:
# Importing the dataset
movies = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')



In [5]:
# Preparing the training set and the test set
training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t')
training_set = np.array(training_set, dtype = 'int')#user_id,movie_id,ratings
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype = 'int')
test_set

array([[        1,        10,         3, 875693118],
       [        1,        12,         5, 878542960],
       [        1,        14,         5, 874965706],
       ...,
       [      459,       934,         3, 879563639],
       [      460,        10,         3, 882912371],
       [      462,       682,         5, 886365231]])

In [6]:
test_set.shape
training_set.shape

(79999, 4)

In [7]:
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

In [8]:
# Converting the data into an array with users in lines and movies in columns
def convert(data):
    new_data = []
    for id_users in range(1, nb_users + 1):
        id_movies = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data
training_set = convert(training_set)
test_set = convert(test_set)

# Converting the data into Torch tensors
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)


In [9]:
# Creating the architecture of the Neural Network
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(nb_movies, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 20)
        self.fc4 = nn.Linear(20, nb_movies)
        self.activation = nn.Sigmoid()
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        return x
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)


In [None]:
# # Training the SAE
# nb_epoch = 200
# for epoch in range(1, nb_epoch + 1):
#     train_loss = 0
#     s = 0.
#     for id_user in range(nb_users):
#         input = Variable(training_set[id_user]).unsqueeze(0)
#         target = input.clone()
#         if torch.sum(target.data > 0) > 0:
#             output = sae(input)
#             target = Variable(test_set[id_user]).unsqueeze(0)
#             output[target == 0] = 0
#             loss = criterion(output, target)
#             mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
#             loss.backward()
#             train_loss += np.sqrt(loss.data*mean_corrector)
#             s += 1.
#             optimizer.step()
#     print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))


In [None]:
# # Testing the SAE
# test_loss = 0
# s = 0.
# for id_user in range(nb_users):
#     input = Variable(training_set[id_user]).unsqueeze(0)
#     #target = Variable(test_set[id_user])
#     target = Variable(test_set[id_user]).unsqueeze(0)
#     if torch.sum(target.data > 0) > 0:
#         output = sae(input)
#         target.require_grad = False
#         output[(target == 0).unsqueeze(0)] = 0
#         loss = criterion(output, target)
#         mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
#         test_loss += np.sqrt(loss.data*mean_corrector)
#         s += 1.
# print('test loss: '+str(test_loss/s))

In [10]:
# Training the SAE
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()

            train_loss += np.sqrt(loss.item()*mean_corrector)

            s += 1.
            optimizer.step()
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))


epoch: 1 loss: 1.7709042567689843
epoch: 2 loss: 1.0965754964860517
epoch: 3 loss: 1.0534383831499718
epoch: 4 loss: 1.0384831942945731
epoch: 5 loss: 1.0310411598643894
epoch: 6 loss: 1.0265394315652279
epoch: 7 loss: 1.0236717612413753
epoch: 8 loss: 1.0218471287277018
epoch: 9 loss: 1.0206894074360924
epoch: 10 loss: 1.0199459626543812
epoch: 11 loss: 1.0187776035476237
epoch: 12 loss: 1.018592852231897
epoch: 13 loss: 1.0179201363477548
epoch: 14 loss: 1.0177318024337845
epoch: 15 loss: 1.0171923139775543
epoch: 16 loss: 1.0169028362203498
epoch: 17 loss: 1.01675616639095
epoch: 18 loss: 1.0166977369984982
epoch: 19 loss: 1.0163173729500576
epoch: 20 loss: 1.016263251043462
epoch: 21 loss: 1.0162129893726592
epoch: 22 loss: 1.0158141493587498
epoch: 23 loss: 1.0158155136690972
epoch: 24 loss: 1.0158389030074746
epoch: 25 loss: 1.0156052730244782
epoch: 26 loss: 1.0156876881544432
epoch: 27 loss: 1.0151888525399544
epoch: 28 loss: 1.0150464392089025
epoch: 29 loss: 1.012855103790791

In [None]:
# # Testing the SAE
# test_loss = 0
# s = 0.
# for id_user in range(nb_users):
#     input = Variable(training_set[id_user]).unsqueeze(0)
#     #target = Variable(test_set[id_user])
#     target = Variable(test_set[id_user]).unsqueeze(0)
#     if torch.sum(target.data > 0) > 0:
#         output = sae(input)
#         target.require_grad = False
#         output[(target == 0).unsqueeze(0)] = 0
#         loss = criterion(output, target)
#         mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
#         test_loss += np.sqrt(loss.data[0]*mean_corrector)
#         s += 1.
# print('test loss: '+str(test_loss/s))

In [None]:
# Testing the SAE
test_loss = 0
s = 0.
for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0)
    target = Variable(test_set[id_user]).unsqueeze(0)


    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        #test_loss += np.sqrt(loss.data[0]*mean_corrector)
        test_loss += np.sqrt(loss.item()*mean_corrector)


        s += 1.
print('test loss: '+str(test_loss/s))

In [None]:
for i in range(1,1682):
    print(target[0][i])

In [None]:
target.shape

In [11]:
user_id = 0
movie_title = movies.iloc[:nb_movies, 1:2]
user_rating = training_set.data.numpy()[user_id, :].reshape(-1,1)
user_target = test_set.data.numpy()[user_id, :].reshape(-1,1)
 
user_input = Variable(training_set[user_id]).unsqueeze(0)
predicted = sae(user_input)
predicted = predicted.data.numpy().reshape(-1,1)
 
# Join all info in one dataset
result_array = np.hstack([movie_title, user_target, predicted])
result_array = result_array[result_array[:, 1] > 0]
result_df = pd.DataFrame(data=result_array, columns=['Movie', 'Target Rating', 'Predicted'])

In [12]:
result_df

Unnamed: 0,Movie,Target Rating,Predicted
0,GoldenEye (1995),3,3.83971
1,Dracula: Dead and Loving It (1995),5,4.38459
2,Nixon (1995),5,3.80584
3,Sense and Sensibility (1995),3,3.18524
4,Money Train (1995),4,3.29192
...,...,...,...
131,Legends of the Fall (1994),2,2.69334
132,Major Payne (1994),4,3.90847
133,Little Odessa (1994),1,2.25827
134,My Crazy Life (Mi vida loca) (1993),4,2.43196
