In [1]:
import numpy as np
import pandas as pd
import math
import json
import time
import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
#from sklearn.externals import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
import warnings; warnings.simplefilter('ignore')
%matplotlib inline

Data load

In [2]:
#data download
#https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Electronics.csv

# Reading data
rating_data = pd.read_csv("D:/amazon sports/ratingdata.csv", names=['productId', 'userId','Rating','timestamp']) # Loading data
rating_data.head() # Displaying the first 5 lines of the dataset

Unnamed: 0,productId,userId,Rating,timestamp
0,60009810,A1N070NS9CJQ2I,5.0,1026864000
1,60009810,A3P0KRKOBQK1KN,5.0,1025913600
2,60009810,A192HO2ICJ75VU,5.0,1025654400
3,60009810,A2T278FKFL3BLT,4.0,1025395200
4,60009810,A2ZUXVTW8RXBXW,5.0,1025222400


In [3]:
# Dropping the columns
rating_data = rating_data.drop(['timestamp'], axis = 1) 

In [4]:
# Check the number of rows and columns
rows, columns = rating_data.shape
print("No of rows: ", rows) 
print("No of columns: ", columns) 

No of rows:  20994353
No of columns:  3


Value

In [5]:
# loop through the columns and check the missing values
for col in rating_data.columns:
    pct_missing = rating_data[col].isnull().mean()
    print(f'{col} - {pct_missing :.1%}')

productId - 0.0%
userId - 0.0%
Rating - 0.0%


In [6]:
# Summary statistics of 'rating' variable
rating_data[['Rating']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rating,20994353.0,4.073685,1.385792,1.0,4.0,5.0,5.0,5.0


In [7]:
most_rated = rating_data.groupby('userId').size().sort_values(ascending=False)[:10]
most_rated

userId
A680RUE1FDO8B     633
A3OXHLG6DIBRW8    593
ADLVFFE4VBT8      549
A1X1CEGHTHMBL1    498
A6FIAB28IS79      491
A5JLAU2ARJ0BO     479
A31N0XY2UTB25C    471
A3OA4DV4L81N1D    424
A3LGT6UZL99IW1    424
A2LXX47A0KMJVX    418
dtype: int64

In [8]:
counts = rating_data['userId'].value_counts()
rating_data_final = rating_data[rating_data['userId'].isin(counts[counts >= 50].index)]
rating_data_final.head()

Unnamed: 0,productId,userId,Rating
222,380709473,A3MV1KKHX51FYT,4.0
306,511189877,A2I2KPNJDQ9SL0,5.0
380,511189877,A2DFM26VLNVYNY,5.0
649,594033926,A34GB2ZA1JLGND,5.0
743,594481902,AT09WGFUM934H,3.0


In [19]:
final_ratings_matrix = pd.pivot_table(rating_data_final,index=['userId'], columns = 'productId', values = "Rating")
final_ratings_matrix.fillna(0,inplace=True)
print('Shape of final_ratings_matrix: ', final_ratings_matrix.shape)

given_num_of_ratings = np.count_nonzero(final_ratings_matrix)
print('given_num_of_ratings = ', given_num_of_ratings)

possible_num_of_ratings = final_ratings_matrix.shape[0] * final_ratings_matrix.shape[1]
print('possible_num_of_ratings = ', possible_num_of_ratings)

density = (given_num_of_ratings/possible_num_of_ratings)
density *= 100
print ('density: {:4.2f}%'.format(density))

Shape of final_ratings_matrix:  (5143, 115961)
given_num_of_ratings =  380713
possible_num_of_ratings =  596387423
density: 0.06%


data split

In [9]:
train_data, test_data = train_test_split(rating_data_final, test_size = 0.3, random_state=0)

print(train_data.head(5))

           productId          userId  Rating
970061    B000BK1QR0  A2AVX8HN2XX0WQ     5.0
4104981   B003UT6C9G  A2RJE018IGW0K1     5.0
9460163   B00E964X1S   ABIVKBMSIPEDY     4.0
12656759  B00NO73IN2  A3HQKJ7S1U19P7     5.0
16982673  B01DQQLH74  A24D5F1AHSXKNV     4.0


In [10]:
def shape():
    print("Test data shape: ", test_data.shape)
    print("Train data shape: ", train_data.shape)
shape() 

Test data shape:  (118218, 3)
Train data shape:  (275841, 3)


Collaborative Filtering model : SVD

In [11]:
df_CF = pd.concat([train_data, test_data]).reset_index()
df_CF.tail()

Unnamed: 0,index,productId,userId,Rating
394054,218238,B000067SG3,A2GMJ3DJ7CBM1D,5.0
394055,15868269,B016RVPZ0U,A2H44WVZS59KKT,4.0
394056,3328908,B0022NH7AE,A1PG70NH85K859,5.0
394057,3945640,B003IRR0YQ,A3PD8JD9L4WEII,4.0
394058,2398334,B0013BW84G,A3NCIN6TNL0MGA,5.0


In [12]:
#User-based Collaborative Filtering
# Matrix with row per 'user' and column per 'item' 
pivot_df = pd.pivot_table(df_CF,index=['userId'], columns = 'productId', values = "Rating")
pivot_df.fillna(0,inplace=True)
print(pivot_df.shape)
pivot_df.head()

(5143, 115961)


productId,0101635370,0380709473,0511189877,059403390X,0594033926,0594450268,0594451647,0594481902,059449771X,073530498X,...,B01HIPVVRK,B01HIQYWU2,B01HIS30OY,B01HIS3FGW,B01HIS5N3K,B01HIS5O7A,B01HIWBU7Y,B01HIZEW1C,B01HJDNL60,B01HJH42KU
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100UD67AHFODS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1016Q5UDME15Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1053TBGTIEQ8V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A105S56ODHGJEK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
pivot_df['user_index'] = np.arange(0, pivot_df.shape[0], 1)
pivot_df.head()

productId,0101635370,0380709473,0511189877,059403390X,0594033926,0594450268,0594451647,0594481902,059449771X,073530498X,...,B01HIQYWU2,B01HIS30OY,B01HIS3FGW,B01HIS5N3K,B01HIS5O7A,B01HIWBU7Y,B01HIZEW1C,B01HJDNL60,B01HJH42KU,user_index
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100UD67AHFODS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
A1016Q5UDME15Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
A1053TBGTIEQ8V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
A105S56ODHGJEK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [14]:
pivot_df.set_index(['user_index'], inplace=True)

# Actual ratings given by users
pivot_df.head()

productId,0101635370,0380709473,0511189877,059403390X,0594033926,0594450268,0594451647,0594481902,059449771X,073530498X,...,B01HIPVVRK,B01HIQYWU2,B01HIS30OY,B01HIS3FGW,B01HIS5N3K,B01HIS5O7A,B01HIWBU7Y,B01HIZEW1C,B01HJDNL60,B01HJH42KU
user_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


SVD method

In [15]:
from scipy.sparse.linalg import svds
# Singular Value Decomposition
U, sigma, Vt = svds(pivot_df, k = 50)
# Construct diagonal array in SVD
sigma = np.diag(sigma)

In [16]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 

# Predicted ratings
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = pivot_df.columns)
preds_df.head()

productId,0101635370,0380709473,0511189877,059403390X,0594033926,0594450268,0594451647,0594481902,059449771X,073530498X,...,B01HIPVVRK,B01HIQYWU2,B01HIS30OY,B01HIS3FGW,B01HIS5N3K,B01HIS5O7A,B01HIWBU7Y,B01HIZEW1C,B01HJDNL60,B01HJH42KU
0,0.021593,-6.2e-05,-0.006128,0.01147,-0.002269,0.001048,0.004962,-0.006803,0.020092,-0.004586,...,-0.000533,-0.001344,0.023728,0.003422,0.004695,-0.001145,0.002347,0.000595,0.011422,0.001904
1,0.009173,-0.000134,0.031991,0.022009,0.003264,0.002744,0.011656,0.003334,0.018474,0.010694,...,-0.008403,0.011274,0.004159,0.005562,0.046653,-0.000872,-0.000168,0.020069,0.003437,-0.00163
2,0.002085,-0.000116,0.005626,0.015694,-0.00126,0.000837,0.001712,0.006979,-0.004201,0.015469,...,-0.005193,0.004763,-0.000641,-0.004654,-0.005681,-0.001153,0.000872,-2.5e-05,0.00106,-0.000748
3,0.002258,9.2e-05,0.005765,0.005221,0.00213,0.001132,0.004067,0.004337,0.008326,-0.002275,...,-0.000279,-0.000181,0.00233,0.001733,0.0021,0.000336,0.000884,0.004053,-0.0003,0.001241
4,0.000206,-1.5e-05,-0.001496,-0.001778,-0.000818,0.000313,-0.00214,0.000585,-0.000586,0.003663,...,0.000159,0.000875,0.00087,0.013846,0.025136,0.000984,-0.000327,0.011553,0.000726,-0.000142


In [17]:
# Recommend the items with the highest predicted ratings

def recommend_items(userID, pivot_df, preds_df, num_recommendations):
      
    user_idx = userID-1 # index starts at 0
    
    # Get and sort the user's ratings
    sorted_user_ratings = pivot_df.iloc[user_idx].sort_values(ascending=False)
    #sorted_user_ratings
    sorted_user_predictions = preds_df.iloc[user_idx].sort_values(ascending=False)
    #sorted_user_predictions

    temp = pd.concat([sorted_user_ratings, sorted_user_predictions], axis=1)
    temp.index.name = 'Recommended Items'
    temp.columns = ['user_ratings', 'user_predictions']
    
    temp = temp.loc[temp.user_ratings == 0]   
    temp = temp.sort_values('user_predictions', ascending=False)
    print('\nBelow are the recommended items for user(user_id = {}):\n'.format(userID))
    print(temp.head(num_recommendations))

In [18]:
#Enter 'userID' and 'num_recommendations' for the user #
userID = 150
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)


Below are the recommended items for user(user_id = 150):

                   user_ratings  user_predictions
Recommended Items                                
B005LFT3QG                  0.0          0.671232
B008VQ8IKY                  0.0          0.556429
B0000BZL0U                  0.0          0.525385
B004RORMF6                  0.0          0.419268
B002HAJQGA                  0.0          0.403691


Evaluate

In [20]:
# Actual ratings given by the users
final_ratings_matrix.head()

productId,0101635370,0380709473,0511189877,059403390X,0594033926,0594450268,0594451647,0594481902,059449771X,073530498X,...,B01HIPVVRK,B01HIQYWU2,B01HIS30OY,B01HIS3FGW,B01HIS5N3K,B01HIS5O7A,B01HIWBU7Y,B01HIZEW1C,B01HJDNL60,B01HJH42KU
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100UD67AHFODS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1016Q5UDME15Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1053TBGTIEQ8V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A105S56ODHGJEK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Average ACTUAL rating for each item
final_ratings_matrix.mean().head()

productId
0101635370    0.002528
0380709473    0.000778
0511189877    0.001944
059403390X    0.000972
0594033926    0.000972
dtype: float64

In [22]:
# Predicted ratings 
preds_df.head()

productId,0101635370,0380709473,0511189877,059403390X,0594033926,0594450268,0594451647,0594481902,059449771X,073530498X,...,B01HIPVVRK,B01HIQYWU2,B01HIS30OY,B01HIS3FGW,B01HIS5N3K,B01HIS5O7A,B01HIWBU7Y,B01HIZEW1C,B01HJDNL60,B01HJH42KU
0,0.021593,-6.2e-05,-0.006128,0.01147,-0.002269,0.001048,0.004962,-0.006803,0.020092,-0.004586,...,-0.000533,-0.001344,0.023728,0.003422,0.004695,-0.001145,0.002347,0.000595,0.011422,0.001904
1,0.009173,-0.000134,0.031991,0.022009,0.003264,0.002744,0.011656,0.003334,0.018474,0.010694,...,-0.008403,0.011274,0.004159,0.005562,0.046653,-0.000872,-0.000168,0.020069,0.003437,-0.00163
2,0.002085,-0.000116,0.005626,0.015694,-0.00126,0.000837,0.001712,0.006979,-0.004201,0.015469,...,-0.005193,0.004763,-0.000641,-0.004654,-0.005681,-0.001153,0.000872,-2.5e-05,0.00106,-0.000748
3,0.002258,9.2e-05,0.005765,0.005221,0.00213,0.001132,0.004067,0.004337,0.008326,-0.002275,...,-0.000279,-0.000181,0.00233,0.001733,0.0021,0.000336,0.000884,0.004053,-0.0003,0.001241
4,0.000206,-1.5e-05,-0.001496,-0.001778,-0.000818,0.000313,-0.00214,0.000585,-0.000586,0.003663,...,0.000159,0.000875,0.00087,0.013846,0.025136,0.000984,-0.000327,0.011553,0.000726,-0.000142


In [23]:
# Average PREDICTED rating for each item
preds_df.mean().head()

productId
0101635370    0.002100
0380709473    0.000008
0511189877    0.002000
059403390X    0.000897
0594033926    0.000886
dtype: float64

In [24]:
rmse_df = pd.concat([final_ratings_matrix.mean(), preds_df.mean()], axis=1)
rmse_df.columns = ['Avg_actual_ratings', 'Avg_predicted_ratings']
print(rmse_df.shape)
rmse_df['item_index'] = np.arange(0, rmse_df.shape[0], 1)
rmse_df.head()

(115961, 2)


Unnamed: 0_level_0,Avg_actual_ratings,Avg_predicted_ratings,item_index
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0101635370,0.002528,0.0021,0
0380709473,0.000778,8e-06,1
0511189877,0.001944,0.002,2
059403390X,0.000972,0.000897,3
0594033926,0.000972,0.000886,4


In [25]:
RMSE = round((((rmse_df.Avg_actual_ratings - rmse_df.Avg_predicted_ratings) ** 2).mean() ** 0.5), 5)
print('\nRMSE SVD Model = {} \n'.format(RMSE))


RMSE SVD Model = 0.00102 



Recommend

In [26]:
# Enter 'userID' and 'num_recommendations' for the user #
userID = 200
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)


Below are the recommended items for user(user_id = 200):

                   user_ratings  user_predictions
Recommended Items                                
B003MTTJOY                  0.0          0.850580
B00OBRE5UE                  0.0          0.840573
B006W8U2MU                  0.0          0.813467
B00BP5KOPA                  0.0          0.805084
B003L1ZYYW                  0.0          0.696841


In [27]:
# Enter 'userID' and 'num_recommendations' for the user #
userID = 121
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)


Below are the recommended items for user(user_id = 121):

                   user_ratings  user_predictions
Recommended Items                                
B005LJQO9G                  0.0          0.507258
B001MSU1FS                  0.0          0.499059
B003XM9774                  0.0          0.476672
B005LJQMCK                  0.0          0.449679
B004YDUZ22                  0.0          0.446790
