In [1]:
import pandas as pd
import numpy as np

In [2]:
# Reading in the datasets
ratings = pd.read_csv("./ml-latest-small/ratings.csv")
movies = pd.read_csv("./ml-latest-small/movies.csv")

In [3]:
# Will hold the original ratings in case we need it
ratings_orig = pd.read_csv("./ml-latest-small/ratings.csv")

In [4]:
# Drop timestamp as it's useless
ratings = ratings.drop('timestamp', axis = 1)

In [5]:
# We will move userId to be used as rows and the columns are the movieIds.
# Ratings are the values connecting them
ratings = ratings.pivot(index = 'userId', columns = 'movieId', values = 'rating')

In [6]:
# For every non-zero, replace with zero.
ratings = ratings.loc[:, (ratings != 0).any(axis=0)].fillna(0)

In [22]:
# Drop movies where zeros make up 96% or more of the values.
ratings = ratings.drop(columns = ratings.columns[ratings.eq(0).mean()>.96])
ratings.tail()

movieId,1,2,3,5,6,7,10,11,16,17,...,122904,122918,134130,134853,139385,148626,152081,164179,166528,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,2.5,0.0,0.0,0.0,0.0,2.5,0.0,2.5,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,5.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,4.5,0.0,...,3.0,0.0,3.5,3.5,4.5,4.0,4.0,5.0,4.0,5.0


In [8]:
# Grab the values in the matrix and save them to an array
R = ratings.values

In [9]:
# Demean values
user_ratings_mean = np.mean(R, axis = 1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [10]:
# Calculate predictions using SVD
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 50)
sigma = np.diag(sigma)
predicted_vals = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
predictions = pd.DataFrame(predicted_vals, columns = ratings.columns)

In [23]:
predictions.tail()

movieId,1,2,3,5,6,7,10,11,16,17,...,122904,122918,134130,134853,139385,148626,152081,164179,166528,168252
605,2.691238,0.639627,-0.344326,0.078933,-0.598245,1.189496,-0.073306,2.759341,2.023287,4.70587,...,-0.615049,0.101532,0.184955,0.221364,1.083402,0.440659,0.205794,-0.080209,0.383722,0.286241
606,4.300821,0.838868,0.071419,-0.345009,1.902292,-0.218698,1.697727,0.517541,0.288706,0.355984,...,-0.509067,-0.029406,0.112195,-0.185898,-0.391073,0.401774,-0.632276,0.091938,0.541517,-0.019172
607,1.961274,3.007102,1.394107,0.421778,2.144006,-0.048559,3.097115,-0.098778,4.259754,-0.166169,...,-0.484139,-0.045502,-0.887535,-0.656908,-0.373424,-0.390409,0.195212,0.061246,-0.009497,-0.070646
608,1.588512,0.049925,-0.090018,0.10057,-0.032322,-0.121441,1.319429,0.270559,0.227107,-0.182479,...,-0.142084,-0.107763,-0.009311,0.035327,0.03361,0.029412,0.001128,-0.097509,-0.087665,-0.188699
609,3.696206,1.422076,-0.747591,0.54977,4.100314,-0.061452,0.706461,-0.478728,2.409137,0.61892,...,3.498497,0.81896,3.777155,3.971829,4.120764,2.806086,3.765668,4.10255,3.730291,3.395128


In [12]:
test = pd.DataFrame(predictions, index=ratings.index-1)

In [13]:
# Get user's rated movies
#def get_user_prev_rated(userId):
    #ratings_for_user = ratings_orig.loc[ratings_orig['userId'] == userId]
    #return ratings_for_user.movieId.tolist()

In [14]:
def get_preds(given_userId, list_of_movies, num_recs):
    # Sort predictions of user...
    sorted_preds_for_user = test.iloc[given_userId - 1].sort_values(ascending = False).to_frame()
    
    # Remove existing / selected favorite movies from predictions
    for i in list_of_movies:
        try:
            sorted_preds_for_user = sorted_preds_for_user.drop(i)
        except:
            pass
    return sorted_preds_for_user.head(num_recs)

In [15]:
def get_movie_titles(movie_list):
    return movies[movies['movieId'].isin(movie_list)]

In [16]:
def get_closest_user(list_of_movies):
    
    # Just getting a random row to use to fill in...
    user_row = ratings.tail(1).copy()
    
    # Set all values to 0
    for col in user_row.columns:
        user_row[col].values[:] = 0
    
    # Set fav movie picks to 5.0
    for item in list_of_movies:
        user_row[item].values[:] = 5.0
        
    # Find closest user...
    closest_userId = distance_calc(user_row)
    
    return closest_userId
    

In [17]:
import sklearn.metrics
from scipy.spatial.distance import cdist
from scipy.spatial.distance import squareform
def distance_calc(row):
    # Calculate cosine similarity of user and existing users
    distances = cdist(ratings, row, metric = "cosine")
    
    # Most similar is where the value is smallest
    most_similar_user = np.where(distances == distances.min())
    
    # Printing user that matches most similarly to the selected movies
    print(most_similar_user[0][0] + 1, " ", distances[most_similar_user])
    
    # Return userId (plus 1 to account for index starting at 0)
    return most_similar_user[0][0] + 1

In [18]:
#get_closest_user(test_list) 

In [19]:
def movie_recs(movie_picks, num_recs):
    userId = get_closest_user(movie_picks)
    recs = get_preds(userId, movie_picks, num_recs)
    movie_pred = recs.index.values.tolist()
    result = get_movie_titles(movie_pred)
    return result

In [20]:
test_list = [1, 1073, 7, 2, 10]
movie_recs(test_list, 10)

31   [0.71036282]


Unnamed: 0,movieId,title,genres
55,62,Mr. Holland's Opus (1995),Drama
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
546,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
592,733,"Rock, The (1996)",Action|Adventure|Thriller
594,736,Twister (1996),Action|Adventure|Romance|Thriller
615,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
793,1036,Die Hard (1988),Action|Crime|Thriller
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
969,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi
1044,1356,Star Trek: First Contact (1996),Action|Adventure|Sci-Fi|Thriller


In [21]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller
