In [1]:
import numpy as np
from scipy.sparse.linalg import svds
import pandas as pd
import random
import scipy 
from scipy import sparse
import math
%matplotlib inline

In [2]:
def load(filename="ratings.data"):
    
    file = np.loadtxt(filename, dtype=int)
    
    ratings = sparse.csr_matrix((file[:, 2], (file[:, 0]-1, file[:, 1]-1)),dtype=float)
    non_zero_ratings = sparse.csr_matrix((np.ones(file[:, 2].shape),(file[:, 0]-1, file[:, 1]-1)), dtype=bool )
        
        
    return ratings, non_zero_ratings

In [3]:
ratings, non_zero_ratings = load(filename='ratings.data')
ratings = ratings.toarray()
non_zero_ratings = non_zero_ratings.toarray()

In [4]:
def objective(P, Q, ratings, mask, reg_param):
    ratings = (ratings - Q.dot(P)) * mask
    grad_P = -np.dot(Q.T,ratings) + reg_param * P
    grad_Q = -np.dot(ratings, P.T) + reg_param * Q
    return grad_P, grad_Q

def search(P0, Q0, t): 
    P = P0
    Q = Q0
    iter = 0
    while (iter < 10):
        if t == 0:
            f,_ = objective(P, Q, ratings, non_zero_ratings, reg_param)
            f = f[1]
            P = P - (0.5 * (0.5 ** 9)) * f
        else:
            _,f = objective(P, Q, ratings, non_zero_ratings, reg_param)
            f = f[1]
            Q = Q - (0.5 * (0.5 ** 9)) * f
        
        iter += 1
    if t == 0:
        return P 
    else:
        return Q

In [5]:
def ALS(P0, Q0, non_zero_ratings, reg_param, max_iter):
    Q = Q0
    P = P0
    k = 0
    
    _,grad_Q = objective(P0, Q0, ratings, non_zero_ratings, reg_param)
    grad_P,_ = objective(P0, Q0, ratings, non_zero_ratings, reg_param)
    
    while k < max_iter:

        P = search(P, Q, 0)
        Q = search(P, Q, 1)
        
        _,grad_Q = objective(P, Q, ratings, non_zero_ratings, reg_param)
        grad_P,_ = objective(P, Q, ratings, non_zero_ratings, reg_param)
        
        k += 1
        
        print("iteration = %d"%k)

    return P, Q

In [6]:
reg_param = 0.2 # regularization parameter 
Q0,S,P0 = svds(ratings, k=6) #SVD decomposition
max_iter = 300 # maximum of iterations
P, Q = ALS(P0, Q0, non_zero_ratings, reg_param, max_iter)

iteration = 1
iteration = 2
iteration = 3
iteration = 4
iteration = 5
iteration = 6
iteration = 7
iteration = 8
iteration = 9
iteration = 10
iteration = 11
iteration = 12
iteration = 13
iteration = 14
iteration = 15
iteration = 16
iteration = 17
iteration = 18
iteration = 19
iteration = 20
iteration = 21
iteration = 22
iteration = 23
iteration = 24
iteration = 25
iteration = 26
iteration = 27
iteration = 28
iteration = 29
iteration = 30
iteration = 31
iteration = 32
iteration = 33
iteration = 34
iteration = 35
iteration = 36
iteration = 37
iteration = 38
iteration = 39
iteration = 40
iteration = 41
iteration = 42
iteration = 43
iteration = 44
iteration = 45
iteration = 46
iteration = 47
iteration = 48
iteration = 49
iteration = 50
iteration = 51
iteration = 52
iteration = 53
iteration = 54
iteration = 55
iteration = 56
iteration = 57
iteration = 58
iteration = 59
iteration = 60
iteration = 61
iteration = 62
iteration = 63
iteration = 64
iteration = 65
iteration = 66
iteration = 67
iter

In [7]:
u = 5
predicted_ratings = np.dot(Q, P)

predicted_ratings = predicted_ratings * (1 - non_zero_ratings)
predicted_ratings[predicted_ratings > 5] = 5
predicted_ratings[predicted_ratings <= 0] = 1
N = 5

res = sorted(range(len(predicted_ratings[u])), key = lambda sub: (predicted_ratings[u])[sub])[-N:]

for i in range(N):
    print("rank %d movie for user %s is %d"%((N-i),u,res[i]))
    print ("The rating of this movie is : %0.3f "%(predicted_ratings[u])[res[i]])

rank 5 movie for user 5 is 1121
The rating of this movie is : 4.683 
rank 4 movie for user 5 is 1200
The rating of this movie is : 4.738 
rank 3 movie for user 5 is 1188
The rating of this movie is : 4.836 
rank 2 movie for user 5 is 813
The rating of this movie is : 4.841 
rank 1 movie for user 5 is 1292
The rating of this movie is : 5.000 


In [8]:
predicted_ratings = np.dot(Q, P)
predicted_ratings[predicted_ratings > 5] = 5
predicted_ratings[predicted_ratings <= 0] = 1

np.savetxt("final_ratings.csv", predicted_ratings, 
              delimiter = ",")

rmse = 0
n = 0
for i in range(len(predicted_ratings)):
    for j in range(len(predicted_ratings[0])):
        if non_zero_ratings[i][j] != False:
            n += 1
            rmse += (predicted_ratings[i][j] - ratings[i][j])**2
rmse = rmse/n
rmse = math.sqrt(rmse)
print("RMSE = " + str(rmse))

RMSE = 1.0062409189320278
