In [1]:
import numpy as np
import pandas as pd
import math
import json
import time
import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
#from sklearn.externals import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
import warnings; warnings.simplefilter('ignore')
%matplotlib inline

Data load

In [2]:
# Data download (source listed by the author):
# https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/Electronics.csv

# Load the ratings. Passing `names` makes pandas treat every line of the file
# as data and label the four columns explicitly.
rating_data = pd.read_csv(
    "D:/amazon sports/ratingdata.csv",
    names=['productId', 'userId', 'Rating', 'timestamp'],
)
rating_data.head()  # preview the first 5 rows

Unnamed: 0,productId,userId,Rating,timestamp
0,60009810,A1N070NS9CJQ2I,5.0,1026864000
1,60009810,A3P0KRKOBQK1KN,5.0,1025913600
2,60009810,A192HO2ICJ75VU,5.0,1025654400
3,60009810,A2T278FKFL3BLT,4.0,1025395200
4,60009810,A2ZUXVTW8RXBXW,5.0,1025222400


In [5]:
# Drop the timestamp column.
# The cell rebinds `rating_data`, so a second run would otherwise raise
# KeyError because the column is already gone; errors='ignore' makes the
# cell safe to re-run.
rating_data = rating_data.drop(columns=['timestamp'], errors='ignore')

In [7]:
# Report the dimensions of the ratings table
rows = rating_data.shape[0]
columns = rating_data.shape[1]
print("No of rows: ", rows)
print("No of columns: ", columns)

No of rows:  20994353
No of columns:  3


Missing values

In [3]:
# Share of missing values per column, computed in one vectorised pass
missing_share = rating_data.isnull().mean()
for col, pct_missing in missing_share.items():
    print(f'{col} - {pct_missing :.1%}')

productId - 0.0%
userId - 0.0%
Rating - 0.0%
timestamp - 0.0%


In [8]:
# Descriptive statistics for the 'Rating' column, laid out as a single row
rating_data[['Rating']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rating,20994353.0,4.073685,1.385792,1.0,4.0,5.0,5.0,5.0


In [11]:
# The ten users with the most rating rows, largest count first
ratings_per_user = rating_data.groupby('userId').size()
most_rated = ratings_per_user.sort_values(ascending=False).head(10)
most_rated

userId
A680RUE1FDO8B     633
A3OXHLG6DIBRW8    593
ADLVFFE4VBT8      549
A1X1CEGHTHMBL1    498
A6FIAB28IS79      491
A5JLAU2ARJ0BO     479
A31N0XY2UTB25C    471
A3OA4DV4L81N1D    424
A3LGT6UZL99IW1    424
A2LXX47A0KMJVX    418
dtype: int64

In [13]:
# Keep only ratings from users who have rated at least 50 products
counts = rating_data['userId'].value_counts()
frequent_users = counts[counts >= 50].index
is_frequent_user = rating_data['userId'].isin(frequent_users)
rating_data_final = rating_data[is_frequent_user]
rating_data_final.head()

Unnamed: 0,productId,userId,Rating
222,380709473,A3MV1KKHX51FYT,4.0
306,511189877,A2I2KPNJDQ9SL0,5.0
380,511189877,A2DFM26VLNVYNY,5.0
649,594033926,A34GB2ZA1JLGND,5.0
743,594481902,AT09WGFUM934H,3.0


Data split

In [14]:
# Hold out 30% of the filtered ratings for testing; the fixed seed keeps the
# split reproducible across runs.
train_data, test_data = train_test_split(
    rating_data_final,
    test_size=0.3,
    random_state=0,
)

print(train_data.head())

           productId          userId  Rating
970061    B000BK1QR0  A2AVX8HN2XX0WQ     5.0
4104981   B003UT6C9G  A2RJE018IGW0K1     5.0
9460163   B00E964X1S   ABIVKBMSIPEDY     4.0
12656759  B00NO73IN2  A3HQKJ7S1U19P7     5.0
16982673  B01DQQLH74  A24D5F1AHSXKNV     4.0


In [15]:
def shape():
    """Print the shapes of the test and train splits, test first."""
    for label, frame in (("Test", test_data), ("Train", train_data)):
        print(f"{label} data shape: ", frame.shape)


shape()

Test data shape:  (118218, 3)
Train data shape:  (275841, 3)


Popularity Recommender model (non-personalised)

In [17]:
# Recommendation score: number of non-null userId entries per product in the
# training split.
train_data_grouped = (
    train_data.groupby('productId')
    .agg({'userId': 'count'})
    .reset_index()
    .rename(columns={'userId': 'user_count'})
)

# Sort by score (highest first); ties are broken by productId (ascending)
train_data_sort = train_data_grouped.sort_values(
    ['user_count', 'productId'], ascending=[False, True]
)

# Rank by score; method='first' breaks ties by row order, so the rank follows
# the sorted order above.
train_data_sort['Rank'] = train_data_sort['user_count'].rank(ascending=False, method='first')

# Keep the top 10 recommendations. Take an explicit copy so that later column
# assignments on this frame never touch (or warn about) train_data_sort.
popularity_recommendations = train_data_sort.head(10).copy()
popularity_recommendations

Unnamed: 0,productId,user_count,Rank
14007,B000VS4HDM,267,1.0
26081,B003L1ZYYW,264,2.0
16282,B0015DYMVO,224,3.0
21185,B001TH7GUU,217,4.0
9587,B000FNFSPY,214,5.0
6196,B0006BB9MG,196,6.0
786,B00004ZCJI,193,7.0
34924,B005LDLP8W,192,8.0
2975,B00009KLAE,189,9.0
66471,B00M55C0NS,188,10.0


In [18]:
# Use popularity based recommender model to make predictions
def recommend(user_id, recommendations=None):
    """Return the popularity-based recommendations labelled with a user id.

    The popularity model is non-personalised: every user receives the same
    ranked products, and ``user_id`` is only attached as a label column.

    Parameters
    ----------
    user_id : scalar
        Identifier written into the ``userId`` column of every returned row.
    recommendations : pandas.DataFrame, optional
        Ranked products to return. When omitted, the notebook-level
        ``popularity_recommendations`` frame is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``userId`` as the first column, followed by the
        remaining columns of the source frame in their original order.
        The source frame is left unmodified.
    """
    if recommendations is None:
        recommendations = popularity_recommendations

    # Work on a copy: assigning into the shared frame would leave a stale
    # `userId` column behind in it after every call.
    user_recommendations = recommendations.copy()

    # Add the user_id column for which the recommendations are generated
    user_recommendations['userId'] = user_id

    # Bring userId to the front by name rather than by position, so the
    # result is correct even if the source frame already had that column.
    other_cols = [col for col in user_recommendations.columns if col != 'userId']
    return user_recommendations[['userId'] + other_cols]

In [20]:
# Example call: the popularity model is non-personalised, so the id only
# labels the returned rows.
userId = 150
recommend(user_id=userId)

Unnamed: 0,userId,productId,user_count,Rank
14007,150,B000VS4HDM,267,1.0
26081,150,B003L1ZYYW,264,2.0
16282,150,B0015DYMVO,224,3.0
21185,150,B001TH7GUU,217,4.0
9587,150,B000FNFSPY,214,5.0
6196,150,B0006BB9MG,196,6.0
786,150,B00004ZCJI,193,7.0
34924,150,B005LDLP8W,192,8.0
2975,150,B00009KLAE,189,9.0
66471,150,B00M55C0NS,188,10.0
