In [1]:
import os

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [2]:
# Folder holding the input CSV files. Set the RECOMMENDER_DATA_DIR environment
# variable to run the notebook on another machine; the original local folder is
# kept as the fallback so existing setups behave exactly as before.
data_path = os.environ.get(
    'RECOMMENDER_DATA_DIR',
    'C:/Users/bayra/Desktop/tez proje/Recommender_Deneme_withbigdata/Data/',
)
# File names, joined onto data_path by the loading cells.
movies = 'movies.csv'
ratings = 'ratings.csv'

In [3]:
# Read the movie catalogue, keeping only the id and title columns with compact dtypes.
movie_column_dtypes = {'movieId': 'int32', 'title': 'str'}
df_movies = pd.read_csv(
    os.path.join(data_path, movies),
    usecols=list(movie_column_dtypes),
    dtype=movie_column_dtypes,
)

In [4]:
# Read the ratings, keeping only user, movie and rating with compact dtypes.
rating_column_dtypes = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}
df_ratings = pd.read_csv(
    os.path.join(data_path, ratings),
    usecols=list(rating_column_dtypes),
    dtype=rating_column_dtypes,
)


In [5]:
# Preview the first rows of the movie catalogue to sanity-check the load.
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [6]:
# Non-null count per column; equal counts mean no movie is missing its title.
df_movies.count()

movieId    58098
title      58098
dtype: int64

In [7]:
# Preview the first rows of the ratings table to sanity-check the load.
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


In [8]:
# Non-null count per column of the ratings as loaded (before any subsetting).
df_ratings.count()

userId     27753444
movieId    27753444
rating     27753444
dtype: int64

In [9]:
# Work on the first 2,700,000 rating rows only (presumably to keep the dense
# matrix below tractable). From here on df_ratings refers to this subset.
# NOTE(review): the cut is by row count, so the last user in the subset may have
# only part of their ratings included - confirm this is acceptable.
df_ratings = df_ratings.iloc[:2700000]

# Dense user x movie matrix: one row per userId, one column per movieId, and 0
# wherever the user has not rated the movie.
df_movie_features = (
    df_ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

In [10]:
# Preview the user x movie matrix; 0.0 marks a pair with no rating (see fillna(0) above).
df_movie_features.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193541,193595,193599,193679,193731,193751,193793,193843,193861,193866
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,4.0,0.0,0.0,2.0,4.5,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Rating matrix as a plain NumPy array (users in rows, movies in columns).
R = df_movie_features.to_numpy()
# Per-user mean across all movie columns.
# NOTE(review): this mean also averages the 0 placeholders written by fillna(0)
# for unrated movies, so it is not the mean of the ratings the user actually
# gave - confirm this is intended.
user_ratings_mean = R.mean(axis=1)
# Centre every user's row on that user's mean.
R_demeaned = R - user_ratings_mean[:, np.newaxis]

In [12]:
# Number of singular values/vectors requested from the truncated SVD.
N_LATENT_FACTORS = 2500

# Truncated SVD of the de-meaned rating matrix. svds (imported in the top import
# cell) requires k to be strictly smaller than both matrix dimensions with its
# default solver, so k is capped to stay valid if the ratings subset shrinks.
U, sigma, Vt = svds(
    R_demeaned,
    k=min(N_LATENT_FACTORS, min(R_demeaned.shape) - 1),
)

In [13]:
# Convert the 1-D vector of singular values into a diagonal matrix.
# np.diag turns a vector into a matrix but a matrix back into a vector, so the
# guard keeps this cell idempotent: re-running it leaves the matrix untouched
# instead of collapsing sigma into a vector again.
sigma = np.diag(sigma) if np.ndim(sigma) == 1 else sigma

In [14]:
# Rebuild the full user x movie matrix from the three SVD factors, then add each
# user's mean back onto that user's row.
all_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_ratings_mean[:, np.newaxis]

In [15]:
# Wrap the predictions in a DataFrame that reuses the movieId column labels of
# the rating matrix. Rows get a default 0-based positional index, not the userId.
preds_df = pd.DataFrame(
    data=all_user_predicted_ratings,
    columns=df_movie_features.columns,
)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193541,193595,193599,193679,193731,193751,193793,193843,193861,193866
0,-0.001805,0.004916,-0.00765,0.016462,0.009517,-0.01025,-0.003702,-0.050206,-0.021457,0.016299,...,-0.008994,-0.001699,-0.000926,-0.000619,0.002588,-0.002329,-0.000104,-0.001573,0.005591,-0.00063
1,0.003287,-0.007,0.001363,-0.020111,0.013199,-0.017895,-0.006187,0.013144,-0.037345,0.011093,...,-0.002696,0.001243,-0.002551,0.001717,0.000333,0.003,0.0003,-0.002603,0.006994,0.001134
2,-0.004889,-0.004948,-0.014977,0.209254,-0.079538,-0.004694,0.015566,0.058887,-0.080556,0.007117,...,0.001893,0.002978,0.003697,0.001983,-0.00029,-0.000575,0.001388,0.002031,0.000417,-1.3e-05
3,3.989829,4.003641,-0.003599,-0.053216,1.982832,4.516026,0.048566,0.025551,-0.02352,3.962451,...,-0.021685,0.018405,-0.024775,0.004601,-0.004256,0.000737,-0.001072,-0.035645,-0.008838,0.000315
4,-0.002364,-0.005747,-0.014954,-0.051007,0.011461,-0.004288,0.029346,-0.051646,-0.053934,0.006833,...,-0.016355,-0.002245,0.013603,-0.008942,-0.000654,-0.001409,-0.005141,-0.013673,-0.007711,-0.001054


In [16]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Return a user's rated movies and the best-predicted movies they have not rated.

    Parameters
    ----------
    preds_df : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId. Rows are
        read by position.
    userID : int
        User id as it appears in ``original_ratings_df['userId']``.
    movies_df : pandas.DataFrame
        Movie catalogue with a ``movieId`` column (plus e.g. ``title``).
    original_ratings_df : pandas.DataFrame
        Ratings with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``user_full``: the user's ratings joined with the movie information,
        sorted by rating, highest first.
        ``recommendations``: up to ``num_recommendations`` rows of ``movies_df``
        for movies the user has not rated, ordered by predicted rating, highest
        first (the prediction column itself is not returned).

    Raises
    ------
    IndexError
        If no row of ``preds_df`` can be matched to ``userID``.
    """
    # Locate the user's row. When preds_df has one row per distinct user in the
    # ratings (the case when it was built from a pivot of those ratings, which
    # orders rows by userId), use the user's rank among the sorted ids; this
    # stays correct even if the ids are not contiguous or do not start at 1.
    known_user_ids = np.sort(original_ratings_df['userId'].unique())
    if len(known_user_ids) == len(preds_df):
        user_row_number = int(np.searchsorted(known_user_ids, userID))
        if (user_row_number >= len(known_user_ids)
                or known_user_ids[user_row_number] != userID):
            raise IndexError(f"userID {userID!r} has no ratings, so it has no prediction row")
    else:
        # Legacy convention: ids are 1..n and row i belongs to user i + 1.
        user_row_number = userID - 1
        # Without this check userID=0 would silently wrap around to the last row.
        if not 0 <= user_row_number < len(preds_df):
            raise IndexError(f"userID {userID!r} is outside the {len(preds_df)} prediction rows")

    # The user's predictions as a two-column frame (movieId, Predictions). The
    # names are set explicitly instead of relying on the row and column labels
    # that preds_df happens to carry.
    user_predictions = (
        preds_df.iloc[user_row_number]
        .rename('Predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.userId == userID]
    user_full = (
        user_data
        .merge(movies_df, how='left', on='movieId')
        .sort_values('rating', ascending=False)
    )

    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    unseen_movies = movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
    recommendations = (
        unseen_movies
        .merge(user_predictions, how='left', on='movieId')
        .sort_values('Predictions', ascending=False)
        # Catalogue movies without any prediction sort last as NaN; drop them so
        # they can never be returned as recommendations.
        .dropna(subset=['Predictions'])
        # Keep the top rows and drop the trailing Predictions column.
        .iloc[:num_recommendations, :-1]
    )

    return user_full, recommendations

In [17]:
# Pick any user id here; the call returns that user's rated movies together
# with the top 5 recommendations.
already_rated, predictions = recommend_movies(
    preds_df=preds_df,
    userID=1,
    movies_df=df_movies,
    original_ratings_df=df_ratings,
    num_recommendations=5,
)

In [18]:
# The selected user's five highest-rated movies (already_rated is sorted by
# rating, highest first).
already_rated.head(5)

Unnamed: 0,userId,movieId,rating,title
3,1,1257,4.5,Better Off Dead... (1985)
4,1,1449,4.5,Waiting for Guffman (1996)
7,1,2134,4.5,Weird Science (1985)
12,1,3424,4.5,Do the Right Thing (1989)
8,1,2478,4.0,¡Three Amigos! (1986)


In [19]:
# Recommended movies for the same user: titles they have not rated yet,
# ordered by predicted rating.
predictions
# NOTE(review): an earlier note here read "k=1000", but the svds cell uses 2500
# factors - confirm which setting produced the saved output.

Unnamed: 0,movieId,title
2169,2261,One Crazy Summer (1986)
513,519,RoboCop 3 (1993)
4568,4678,UHF (1989)
4756,4867,Riding in Cars with Boys (2001)
3757,3864,Godzilla 2000 (Gojira ni-sen mireniamu) (1999)


In [20]:
# Benchmark Surprise's SVD on the same ratings subset with 3-fold cross-validation.
# NOTE(review): `ratings` held the CSV file name until this point; it is rebound
# to a DataFrame here, so re-running the ratings-loading cell afterwards would fail.
# load_from_df expects exactly three columns in the order user, item, rating,
# so the columns are selected explicitly.
ratings = df_ratings[['userId', 'movieId', 'rating']]

# The Reader object tells Surprise how to interpret the ratings. Its default
# scale is 1-5, and predictions are clipped to the scale; the previews above
# show half-star ratings, so the bounds are taken from the data instead.
reader = Reader(
    rating_scale=(float(ratings['rating'].min()), float(ratings['rating'].max()))
)
# Dataset creation
data = Dataset.load_from_df(ratings, reader)
# Define the SVD algorithm object
svd = SVD()
# Evaluate the performance in terms of RMSE
cross_validate(svd, data, measures=["RMSE"], cv = 3)

{'test_rmse': array([0.84313092, 0.84242579, 0.84220795]),
 'fit_time': (92.88638854026794, 93.74057364463806, 93.75009846687317),
 'test_time': (9.704850435256958, 9.292328834533691, 9.757637977600098)}