In [1]:
import pandas as pd
import numpy as np

In [2]:
# Reading in the datasets
ratings = pd.read_csv("./ml-latest-small/ratings.csv")
movies = pd.read_csv("./ml-latest-small/movies.csv")

In [3]:
# Will hold the original ratings in case we need it
ratings_orig = pd.read_csv("./ml-latest-small/ratings.csv")

In [4]:
# Drop timestamp as it's useless
ratings = ratings.drop('timestamp', axis = 1)

In [5]:
# We will move userId to be used as rows and the columns are the movieIds.
# Ratings are the values connecting them
ratings = ratings.pivot(index = 'userId', columns = 'movieId', values = 'rating')

In [6]:
# For every non-zero, replace with zero.
ratings = ratings.loc[:, (ratings != 0).any(axis=0)].fillna(0)

In [7]:
# Drop movies where zeros make up 96% or more of the values.
ratings = ratings.drop(columns = ratings.columns[ratings.eq(0).mean()>.96])

In [8]:
# Grab the values in the matrix and save them to an array
R = ratings.values

In [9]:
# Demean values
user_ratings_mean = np.mean(R, axis = 1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [10]:
# Calculate predictions using SVD
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 50)
sigma = np.diag(sigma)
predicted_vals = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
predictions = pd.DataFrame(predicted_vals, columns = ratings.columns)

In [11]:
predictions.head()

movieId,1,2,3,5,6,7,10,11,16,17,...,122904,122918,134130,134853,139385,148626,152081,164179,166528,168252
0,2.174649,0.659697,2.091007,0.192992,3.940126,-0.493126,0.627697,-0.519809,1.361071,-1.086418,...,-0.086233,-0.274074,0.356349,0.356265,0.079419,-0.367548,-0.195483,-0.090178,0.521414,-0.407441
1,0.06097,0.052479,0.036968,0.17125,0.027508,0.01874,-0.055286,0.020832,0.236797,-0.071228,...,0.977263,0.291638,1.127986,0.460487,0.840598,0.657713,0.131678,0.347916,0.165585,0.290193
2,0.127628,-0.010636,0.073499,0.077154,0.175419,0.057637,-0.101946,-0.015052,0.003674,0.006481,...,0.039306,0.037536,0.021886,0.052746,0.028765,0.05676,0.065145,0.119868,0.116146,0.102112
3,0.892571,0.017375,0.103089,0.306313,0.739485,0.581056,-0.288516,0.609125,-0.009794,1.192792,...,-0.141499,-0.072629,0.549952,-0.341834,0.111099,-0.021229,-0.163385,0.046524,0.272169,-0.390143
4,2.279114,0.688459,-0.277265,0.194918,0.689789,0.159226,0.634422,1.091651,0.626655,0.976669,...,-0.095349,-0.092354,-0.230943,0.370129,-0.130987,0.029347,0.100553,-0.068037,-0.009995,-0.058656


In [12]:
test = pd.DataFrame(predictions, index=ratings.index-1)

In [13]:
# Get user's rated movies
#def get_user_prev_rated(userId):
    #ratings_for_user = ratings_orig.loc[ratings_orig['userId'] == userId]
    #return ratings_for_user.movieId.tolist()

In [14]:
def get_preds(given_userId, list_of_movies, num_recs):
    # Sort predictions of user...
    sorted_preds_for_user = test.iloc[given_userId - 1].sort_values(ascending = False).to_frame()
    
    # Remove existing / selected favorite movies from predictions
    for i in list_of_movies:
        try:
            sorted_preds_for_user = sorted_preds_for_user.drop(i)
        except:
            pass
    return sorted_preds_for_user.head(num_recs)

In [15]:
def get_movie_titles(movie_list):
    return movies[movies['movieId'].isin(movie_list)]

In [16]:
def get_closest_user(list_of_movies):
    
    # Just getting a random row to use to fill in...
    user_row = ratings.tail(1).copy()
    
    # Set all values to 0
    for col in user_row.columns:
        user_row[col].values[:] = 0
    
    # Set fav movie picks to 5.0
    for item in list_of_movies:
        user_row[item].values[:] = 5.0
        
    # Find closest user...
    closest_userId = distance_calc(user_row)
    
    return closest_userId
    

In [17]:
import sklearn.metrics
from scipy.spatial.distance import cdist
from scipy.spatial.distance import squareform
def distance_calc(row):
    # Calculate cosine similarity of user and existing users
    distances = cdist(ratings, row, metric = "cosine")
    
    # Most similar is where the value is smallest
    most_similar_user = np.where(distances == distances.min())
    
    # Printing user that matches most similarly to the selected movies
    print(most_similar_user[0][0] + 1, " ", distances[most_similar_user])
    
    # Return userId (plus 1 to account for index starting at 0)
    return most_similar_user[0][0] + 1

In [18]:
#get_closest_user(test_list) 

In [19]:
def movie_recs(movie_picks, num_recs):
    userId = get_closest_user(movie_picks)
    recs = get_preds(userId, movie_picks, num_recs)
    movie_pred = recs.index.values.tolist()
    result = get_movie_titles(movie_pred)
    return result

In [20]:
test_list = [1, 1073, 7, 2, 10]
movie_recs(test_list, 10)

31   [0.71036282]


Unnamed: 0,movieId,title,genres
55,62,Mr. Holland's Opus (1995),Drama
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
546,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
592,733,"Rock, The (1996)",Action|Adventure|Thriller
594,736,Twister (1996),Action|Adventure|Romance|Thriller
615,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
793,1036,Die Hard (1988),Action|Crime|Thriller
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
969,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi
1044,1356,Star Trek: First Contact (1996),Action|Adventure|Sci-Fi|Thriller


In [22]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller
