In [36]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

In [37]:
ratings = pd.read_csv('../Data/final_rating_not_spare.csv', usecols=['customer_id', 'product_id', 'stars'])
ratings.head()

Unnamed: 0,product_id,customer_id,stars
0,9672,53270,5
1,9672,91213,4
2,3038,56289,5
3,3038,88524,5
4,3038,72126,5


In [38]:
customer_index = np.unique(ratings['customer_id'])
customer_index = np.sort(customer_index)
ratings['id_customer'] = ratings['customer_id'].apply(lambda x: np.where(customer_index==int(x))[0][0])

In [39]:
ratings.drop('customer_id', axis=1, inplace=True)
ratings.rename(columns={'id_customer': 'customer_id'}, inplace=True)

In [40]:
product_index = np.unique(ratings['product_id'])
product_index = np.sort(product_index)
ratings['id_product'] = ratings['product_id'].apply(lambda x: np.where(product_index==int(x))[0][0])

In [41]:
ratings.drop('product_id', axis=1, inplace=True)
ratings.rename(columns={'id_product': 'product_id'}, inplace=True)

In [43]:
from sklearn.model_selection import train_test_split
# rate_train, rate_test = train_test_split(ratings.values, test_size=0.2, random_state=43)
ratings_train, ratings_test = train_test_split(ratings, test_size=0.2, random_state=43)

In [45]:
ratings_train.drop_duplicates(['customer_id', 'product_id'], inplace=True)

In [46]:
df_user_item = ratings_train.pivot(
    index='customer_id',
    columns='product_id',
    values='stars'
).fillna(0)

In [47]:
df_user_item

product_id,0,1,2,3,4,5,6,7,8,9,...,5278,5279,5281,5282,5283,5284,5285,5286,5287,5288
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
R = df_user_item.to_numpy()
user_ratings_mean = np.mean(R, axis = 1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [49]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned, k = 50)
# that the Sigma$ returned is just the values instead of a diagonal matrix. 
# This is useful, but since I'm going to leverage matrix multiplication to get predictions 
# I'll convert it to the diagonal matrix form.
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [50]:
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = df_user_item.columns)
preds_df.head()

product_id,0,1,2,3,4,5,6,7,8,9,...,5278,5279,5281,5282,5283,5284,5285,5286,5287,5288
0,-0.016753,0.001506,-0.000212,-0.010394,-0.015349,0.125442,0.006218,-0.002914,0.001482,-0.014572,...,-0.058238,0.008696,0.015771,-0.003345,0.014083,-0.018446,0.007885,0.137571,0.097186,0.015229
1,-0.028165,0.024714,0.014486,0.023683,0.019405,0.197782,0.006146,0.014539,0.008769,0.03528,...,-0.176639,0.381157,0.176594,0.107502,0.365465,0.281195,-0.035337,0.072143,0.09063,0.068457
2,0.006113,0.012839,0.007712,0.013159,0.012787,0.057691,0.008873,0.008674,0.005415,0.017087,...,0.062976,0.006448,0.006654,-0.005021,-0.057055,0.003907,-0.03993,0.011081,-0.020031,0.018426
3,-0.016529,-0.009661,-0.007994,6.4e-05,0.029681,0.279781,-0.012563,-0.007531,-0.007881,-0.034222,...,-0.075196,0.057691,-0.020425,0.095484,0.062251,0.062915,0.00395,0.044334,0.044819,-0.037207
4,0.002613,0.006956,0.002177,0.004251,0.003383,0.041645,0.0047,0.001865,0.00178,0.002047,...,0.002746,0.023177,0.025085,0.010163,0.028126,0.012,0.00641,-0.003005,0.010907,0.01665


In [54]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # UserID starts at 1, not 0
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False) # UserID starts at 1
#     print(preds_df.iloc[user_row_number])
#     print(sorted_user_predictions)
    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.customer_id == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'product_id', right_on = 'product_id').
                     sort_values(['stars'], ascending=False)
                 )
#     print(user_full)
#     print 'User {0} has already rated {1} movies.'.format(userID, user_full.shape[0])
#     print 'Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations)
    #                left_on = 'movieId',
#                right_on = 'movieId').
# merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left').rename(columns = {user_row_number: 'Predictions'}).
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_df[~movies_df['product_id'].isin(user_full['product_id'])]).merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left', left_on = 'product_id',
               right_on = 'product_id').rename(columns = {user_row_number: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :-1]
                      

    return user_full, recommendations

In [58]:
df_movies = ratings_train[['product_id']]
df_movies

Unnamed: 0,product_id
3733,5001
19465,4790
15647,2431
17852,4234
6187,2859
...,...
18448,3428
7985,500
18687,5053
19776,1661


In [62]:
already_rated, predictions = recommend_movies(preds_df, 2, df_movies, ratings_train, 10)

In [63]:
already_rated

Unnamed: 0,stars,customer_id,product_id
0,5,2,5040
1,5,2,5040
23,5,2,4539
24,5,2,4539
25,5,2,4539
26,5,2,4539
27,5,2,4539
28,5,2,3956
29,5,2,3956
30,5,2,3956


In [64]:
predictions

Unnamed: 0,product_id
15781,4906
11745,4906
10003,4906
7719,4906
5052,4906
1798,4906
1925,4906
15324,4906
6524,4906
14961,4906
