In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Ratings Database file
Ratings = pd.read_csv("ratings.dat", sep = "::", engine = "python", header = None)
Ratings.columns = ["userId","movieId","ratings","timestamp"]
Ratings.head()

Unnamed: 0,userId,movieId,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
#Movie Database file:
movie = pd.read_csv("movies.dat", sep = "::", header = None, engine = 'python')
movie.columns = ["movieId","title","genres"]
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
#User Database file:
users = pd.read_csv("users.dat", sep = "::", header = None, engine = "python")
users.columns = ["userId", "gender", "age", "occupation", "zipcode"]
users.head()

Unnamed: 0,userId,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
number_of_users = Ratings.userId.unique().shape[0]
number_of_movies = movie.movieId.unique().shape[0]
print ('Number of Users = ' + str(number_of_users) + ' and Number of Movies = ' + str(number_of_movies))

Number of Users = 6040 and Number of Movies = 3883


Creating a matrix of the Users, movies and ratings given by each user to the movies

In [6]:
#rating_s = Ratings.iloc[0:1000000,:]
#movie_s = movie.iloc[0:1000000,:]

In [9]:
Mat = Ratings.pivot(index = 'userId', columns = 'movieId', values = 'ratings').fillna(0)
Mat.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
np_matrix = Mat.as_matrix()


  """Entry point for launching an IPython kernel.


Normalizing the matrix

In [11]:
matrix_means = np.mean(np_matrix, axis = 1)
norm_matrix = np_matrix - matrix_means.reshape(-1,1)

Now importing the SVD library from scipy and implementing Singula Value Decomposition on the normalized matrix

In [12]:
from scipy.sparse.linalg import svds

Mu, Sigma, V = svds(norm_matrix, k = 100)

sigma = np.diag(Sigma)



Making the predictions and denromalising the matrix:

In [13]:
pred_matrix = np.dot(np.dot(Mu,sigma),V) + matrix_means.reshape(-1,1)

In [14]:
predictions = pd.DataFrame(pred_matrix,columns = Mat.columns)
predictions.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,5.157608,0.184833,0.348341,-0.022609,0.139622,-0.156937,-0.061122,0.072117,0.018278,-0.372566,...,-0.111771,-0.00246,0.016625,-0.107081,-0.051609,0.022706,-0.114028,0.009476,0.070798,-0.195959
1,0.557186,0.296927,0.078853,-0.013888,0.028675,1.09216,-0.054492,0.114191,0.090106,1.695371,...,0.002564,-0.02291,-0.031687,0.072002,-0.008174,-0.418219,-0.225593,-0.005716,0.033955,0.039606
2,2.176318,0.396428,0.302057,-0.117164,-0.00633,0.077833,0.000836,0.064654,-0.018309,1.062417,...,0.036894,-0.008054,0.026507,0.053735,0.025591,0.024825,0.1698,0.061687,0.028985,-0.243151
3,0.194185,0.155507,0.046863,0.047477,-0.014495,0.247765,-0.05758,-0.006338,0.007387,-0.42324,...,-0.049155,-0.010652,0.007342,-0.005267,-0.031352,-0.166973,0.022989,-0.033161,-0.011156,-0.129075
4,0.243474,-0.491501,-0.008307,0.139973,-0.204174,1.664607,-0.133342,-0.047117,-0.118995,0.129404,...,0.054157,0.0654,0.004748,-0.072018,-0.106567,-0.590538,0.219853,-0.062958,0.105441,0.009634


Function for calling the recommenedations

In [15]:
def Recommendations(pred, userid, movies, og_ratings, num_rec):
    #Sorting the user's predictions
    row_no = userid - 1 
    sorted_pred = predictions.iloc[row_no].sort_values(ascending = False)
    
    #Merging user data and movie info:
    user_data = og_ratings[og_ratings.userId == (userid)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId')
            . sort_values(['ratings'], ascending = False))
    print ('User {0} has already {1} movies'.format(userid, user_full.shape[0]))
    print ('Recommending highest {0} predicted ratings movies that are not already rated'.format(num_rec))
    
    #Recommend the highest ratings for unrated movies:
    recommendations = (movies[~movies['movieId'].isin(user_full['movieId'])].
                       merge(pd.DataFrame(sorted_pred).reset_index(), how = 'left',
                            left_on = 'movieId',
                            right_on = 'movieId').rename(columns = {row_no : 'Predictions'}).
                       sort_values('Predictions', ascending = False).iloc[:num_rec,:-1]
                      )
    return user_full, recommendations 


In [16]:
rated, Predictions = Recommendations(predictions, 1310 , movie, Ratings, 20) 

User 1310 has already 24 movies
Recommending highest 20 predicted ratings movies that are not already rated


In [17]:
rated.head(20)

Unnamed: 0,userId,movieId,ratings,timestamp,title,genres
5,1310,2248,5,974781573,Say Anything... (1989),Comedy|Drama|Romance
6,1310,2620,5,974781573,This Is My Father (1998),Drama|Romance
7,1310,3683,5,974781935,Blood Simple (1984),Drama|Film-Noir
15,1310,1704,5,974781573,Good Will Hunting (1997),Drama
1,1310,1293,5,974781839,Gandhi (1982),Drama
12,1310,3101,4,974781573,Fatal Attraction (1987),Thriller
11,1310,1343,4,974781534,Cape Fear (1991),Thriller
20,1310,2000,4,974781892,Lethal Weapon (1987),Action|Comedy|Crime|Drama
18,1310,3526,4,974781892,Parenthood (1989),Comedy|Drama
17,1310,3360,4,974781935,Hoosiers (1986),Drama


Evaluating the model:

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [19]:
predictions_f =pd.DataFrame(pred_matrix,
                            columns = Mat.columns, index = Mat.index)
pred_df = predictions_f.stack().reset_index(name = 'ratings')

In [20]:
len(Ratings)

1000209

In [21]:
pred_ratings = pd.merge(pred_df,Ratings[['userId','movieId']], on = ['userId','movieId'], how = 'inner')

In [22]:
pred_ratings.head()

Unnamed: 0,userId,movieId,ratings
0,1,1,5.157608
1,1,48,1.051625
2,1,150,2.283549
3,1,260,2.525955
4,1,527,4.922815


In [23]:
mean_absolute_error(pred_ratings.loc[:,'ratings'],Ratings.loc[:,'ratings'])

1.9848981098988192

In [25]:
mean_squared_error(pred_ratings.loc[:,'ratings'],Ratings.loc[:,'ratings'])

5.508488362366881