In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one;
# later cells list several expressions in one cell and rely on each being shown.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Load the (userId, movieId, rating, timestamp) table from the local CSV file.
ratings = pd.read_csv(filepath_or_buffer='4a - ratings.csv')

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,2.5,1260759144
1,1,2,3.0,1260759179
2,1,3,3.0,1260759182
3,2,1,2.0,1260759185
4,2,2,4.0,1260759205


In [6]:
ratings.shape

(8, 4)

In [9]:
# Load the movie metadata (movieId, title, genres) from the local CSV file.
movies = pd.read_csv(filepath_or_buffer='4a - movies.csv')

In [10]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [11]:
movies.shape

(3, 3)

In [12]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 30% of the ratings; the fixed random_state keeps the split reproducible.
trainDF, tempDF = train_test_split(ratings, test_size=0.3, random_state=100)
# testDF keeps the true ratings of the held-out rows for later evaluation.
testDF = tempDF.copy()
# tempDF keeps the same rows with the rating blanked out.  `.assign` builds a new
# frame instead of writing through attribute access onto the slice returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
tempDF = tempDF.assign(rating=np.nan)

In [15]:
tempDF.head()

Unnamed: 0,userId,movieId,rating,timestamp
1,1,2,,1260759179
4,2,2,,1260759205
5,2,3,,1260759151


In [16]:
# Discard any held-out row that has a missing value in any column, then preview.
testDF = testDF.dropna(axis=0, how='any')
testDF.head()

Unnamed: 0,userId,movieId,rating,timestamp
1,1,2,3.0,1260759179
4,2,2,4.0,1260759205
5,2,3,2.0,1260759151


In [17]:
# Stack the training rows on top of the blanked-out held-out rows and rebind
# `ratings` to the result.  The previous row labels are kept as an 'index' column.
ratings = pd.concat(objs=[trainDF, tempDF], axis=0).reset_index(drop=False)

In [18]:
ratings

Unnamed: 0,index,userId,movieId,rating,timestamp
0,2,1,3,3.0,1260759182
1,6,3,1,2.0,1260759187
2,3,2,1,2.0,1260759185
3,7,3,2,2.0,1260759148
4,0,1,1,2.5,1260759144
5,1,1,2,,1260759179
6,4,2,2,,1260759205
7,5,2,3,,1260759151


In [19]:
ratings.shape

(8, 5)

## Matrix Factorization via Singular Value Decomposition

In [20]:
# Reshape `ratings` into a user x movie matrix: one row per userId, one column
# per movieId, cell = rating.  Missing cells (unrated or blanked-out) become 0.
R_df = (
    ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(value=0)
)
R_df.tail()

movieId,1,2,3
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.5,0.0,3.0
2,2.0,0.0,0.0
3,2.0,2.0,0.0


In [25]:
from scipy.sparse.linalg import svds
# Truncated SVD keeping the k = 2 largest singular values.
# svds accepts ndarrays, sparse matrices and LinearOperators; recent SciPy
# versions reject a pandas DataFrame ("type not understood") and require a
# floating dtype, so hand it an explicit float ndarray.
U, sigma, Vt = svds(R_df.to_numpy(dtype=float), k = 2)

In [26]:
U.shape
sigma.shape
Vt.shape

(3, 2)

(2,)

(2, 3)

In [27]:
# svds returns the singular values as a 1-D vector; the reconstruction below
# multiplies by them as a diagonal matrix.  np.diag on a 2-D input extracts the
# diagonal instead, so convert only while sigma is still 1-D -- this keeps the
# cell safe to re-run.
if sigma.ndim == 1:
    sigma = np.diag(sigma)

In [28]:
sigma.shape

(2, 2)

In [29]:
# Rebuild the rating matrix from its truncated factors: U . sigma . Vt.
# No per-user mean was subtracted before the decomposition, so the product is
# used directly as the predicted ratings.
all_user_predicted_ratings = U.dot(sigma).dot(Vt)
# Columns carry the movie ids from R_df; rows keep a default positional index.
preds_df = pd.DataFrame(data=all_user_predicted_ratings, columns=R_df.columns)

In [30]:
preds_df.head()

movieId,1,2,3
0,2.598575,-0.142404,2.904836
1,1.502352,0.718913,0.480424
2,2.221265,1.680357,-0.213606


In [37]:
# Return the movies with the highest predicted rating that the specified user hasn't already rated.
# The user's predictions are taken from that user's row of the predictions matrix.
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=2):
    """Recommend the top predicted movies a user has not rated yet.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId.
        Rows are looked up positionally: row ``userID - 1`` is used for
        ``userID`` (i.e. user ids are assumed to start at 1 and be contiguous).
    userID : int
        Id of the user to recommend for (1-based).
    movies_df : pandas.DataFrame
        Movie metadata; must contain a ``movieId`` column.
    original_ratings_df : pandas.DataFrame
        Ratings with ``userId``, ``movieId`` and ``rating`` columns.  Rows whose
        ``rating`` is missing (e.g. blanked-out held-out rows) are treated as
        "not rated", so those movies remain eligible for recommendation.
    num_recommendations : int, default 2
        Maximum number of movies to return.

    Returns
    -------
    tuple
        ``(user_full, recommendations, sorted_user_predictions, user_data,
        user_full)`` where ``user_full`` is the user's rated movies joined with
        the movie metadata (highest rating first), ``recommendations`` is the
        metadata of the best-scoring unrated movies (the score column itself is
        not included), ``sorted_user_predictions`` is the user's predicted
        ratings in descending order and ``user_data`` is the user's rated rows.
        ``user_full`` appears twice to keep the original return shape.

    Raises
    ------
    IndexError
        If ``userID`` does not map to a row of ``predictions_df``.
    """
    # Get and sort the user's predictions.
    user_row_number = userID - 1  # UserID starts at 1, not 0
    # Reject ids that fall outside the matrix; without this check userID <= 0
    # would silently wrap around (negative iloc) to a different user's row.
    if not 0 <= user_row_number < len(predictions_df):
        raise IndexError(
            'userID {0} is out of range for a predictions matrix with {1} rows'.format(
                userID, len(predictions_df)))
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)

    # Rows the user has genuinely rated: a missing rating means "not rated".
    belongs_to_user = original_ratings_df['userId'] == userID
    has_rating = original_ratings_df['rating'].notna()
    user_data = original_ratings_df[belongs_to_user & has_rating]

    # Add title and genres to the user's rated movies.
    user_full = (user_data.merge(movies_df, how='left', on='movieId')
                          .sort_values(['rating'], ascending=False))

    print ('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print ('Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # Name the score column and the movie-id axis explicitly instead of relying
    # on the Series being named after the row number / the columns axis being named.
    prediction_scores = (sorted_user_predictions.rename('Predictions')
                                                .rename_axis('movieId')
                                                .reset_index())

    # Recommend the highest predicted rating movies that the user hasn't rated yet.
    unrated_movies = movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
    recommendations = (unrated_movies.merge(prediction_scores, how='left', on='movieId')
                                     .sort_values('Predictions', ascending=False)
                                     .head(num_recommendations)
                                     .drop(columns='Predictions'))

    return user_full, recommendations, sorted_user_predictions, user_data, user_full

# Recommend up to two unseen movies for user 3 and unpack every returned piece.
(already_rated,
 predictions,
 sorted_user_predictions,
 user_data,
 user_full) = recommend_movies(
    predictions_df=preds_df,
    userID=3,
    movies_df=movies,
    original_ratings_df=ratings,
    num_recommendations=2,
)

User 3 has already rated 2 movies.
Recommending the highest 2 predicted ratings movies not already rated.


In [38]:
already_rated.head()

Unnamed: 0,index,userId,movieId,rating,timestamp,title,genres
0,6,3,1,2.0,1260759187,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,7,3,2,2.0,1260759148,Jumanji (1995),Adventure|Children|Fantasy


In [39]:
predictions

Unnamed: 0,movieId,title,genres
0,3,Grumpier Old Men (1995),Comedy|Romance


In [40]:
user_data

Unnamed: 0,index,userId,movieId,rating,timestamp
1,6,3,1,2.0,1260759187
3,7,3,2,2.0,1260759148


In [41]:
user_full

Unnamed: 0,index,userId,movieId,rating,timestamp,title,genres
0,6,3,1,2.0,1260759187,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,7,3,2,2.0,1260759148,Jumanji (1995),Adventure|Children|Fantasy


In [42]:
sorted_user_predictions

movieId
1    2.221265
2    1.680357
3   -0.213606
Name: 2, dtype: float64