In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [3]:
movies = pd.read_csv("./ml-1m/movies.dat",
                    sep="::",
                    header=None,
                    engine='python',
                    encoding='latin-1')

In [4]:
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
users =  pd.read_csv("./ml-1m/users.dat",
                    sep="::",
                    header=None,
                    engine='python',
                    encoding='latin-1')

In [6]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
ratings = pd.read_csv("./ml-1m/ratings.dat",
                    sep="::",
                    header=None,
                    engine='python',
                    encoding='latin-1')

In [8]:
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
training_set = pd.read_csv("./ml-100k/u1.base",
                          delimiter="\t")

In [10]:
training_set.head()

Unnamed: 0,1,1.1,5,874965758
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561


In [11]:
# user 1 rated movie 2, 3 stars

In [12]:
training_set = np.array(training_set,
                        dtype='int')

In [13]:
test_set = pd.read_csv("./ml-100k/u1.test",
                          delimiter="\t")
test_set = np.array(test_set,
                        dtype='int')

In [14]:
# Getting the number of users and movies
nb_users = int(max(max(training_set[:,0]),max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]),max(test_set[:,1])))


In [15]:
# convert data into array with users in row movies in col
def convert(data):
    new_data = []
    for id_users in range(1,nb_users+1):
        id_movies = data[:,1][data[:,0]==id_users]
        id_ratings = data[:,2][data[:,0]==id_users] 
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data
training_set = convert(training_set)
test_set = convert(test_set)

In [16]:
# converting the data into Torch tensors
training_set = torch.FloatTensor(training_set)

In [17]:
test_set = torch.FloatTensor(test_set)

In [18]:
#convert the ratings into binary ratings 1 (liked) and 0 (not liked)
training_set[training_set == 0] = -1 
training_set[training_set == 1] = 0
training_set[training_set == 2] = 0 
training_set[training_set >= 3] = 1

In [19]:
test_set[test_set == 0] = -1 
test_set[test_set == 1] = 0
test_set[test_set == 2] = 0 
test_set[test_set >= 3] = 1

In [20]:
# creating the architecture of th NN
class SAE(nn.Module):
    def __init__(self):
        super(SAE, self).__init__()
        self.fc1 = nn.Linear(nb_movies, 20) # first full conn related to AE
                        # 20 nodes in first layer based on experiment
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 20) # decoding
        self.fc4 = nn.Linear(20, nb_movies) # decoding
        self.activation = nn.Sigmoid()
    
    def forward(self, x):
        x = self.activation(self.fc1(x)) # x is the input we encode it to get the new input for other layers
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x) # as it is last layer no need to activate
        return x
    
        

In [23]:
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5) # rmsprop or adam
                                                        # decay reduces lr per epoch

In [26]:
# training the SAE
nb_epoch = 200
for epoch in range(1,nb_epoch+1):
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0) # creating abtch
        target = input.clone()
        if torch.sum(target.data>0) > 0:
            output = sae(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data>0)+1e-10) # 1e-10 to preven div by 0
            loss.backward()
            train_loss += np.sqrt(loss.data[0]*mean_corrector)
            s += 1.
            optimizer.step()
    print("epoch %s train loss %s"%(epoch,train_loss/s))

epoch 1 train loss 1.82798624412
epoch 2 train loss 1.82802893396
epoch 3 train loss 1.8285193833
epoch 4 train loss 1.82764103203
epoch 5 train loss 1.82806072787
epoch 6 train loss 1.8273230385
epoch 7 train loss 1.82769977666
epoch 8 train loss 1.82626810181
epoch 9 train loss 1.82679140303
epoch 10 train loss 1.82542770814
epoch 11 train loss 1.8299314668
epoch 12 train loss 1.82685539901
epoch 13 train loss 1.8273053466
epoch 14 train loss 1.82601864739
epoch 15 train loss 1.82688608681
epoch 16 train loss 1.82673168651
epoch 17 train loss 1.82749747508
epoch 18 train loss 1.82671417672
epoch 19 train loss 1.82716760723
epoch 20 train loss 1.82547779526
epoch 21 train loss 1.82536338848
epoch 22 train loss 1.82634005517
epoch 23 train loss 1.8267282068
epoch 24 train loss 1.82424911443
epoch 25 train loss 1.82548006971
epoch 26 train loss 1.82469146715
epoch 27 train loss 1.82602284409
epoch 28 train loss 1.82550487027
epoch 29 train loss 1.82711138501
epoch 30 train loss 1.825436

In [None]:
# loss should be less than 1

In [None]:
# Testing the SAE

In [27]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0) # creating abtch
    target = Variable(test_set[id_user]).unsqueeze(0)
    if torch.sum(target.data>0) > 0:
        output = sae(input)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data>0)+1e-10) # 1e-10 to preven div by 0
        test_loss += np.sqrt(loss.data[0]*mean_corrector)
        s += 1.

print("test loss %s"%(test_loss/s))

test loss 2.24569293067
