In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', -1)

In [24]:
ratings = pd.read_csv("ratings_Electronics.csv",names = ['user_id','prod_id','ratings','timestamp'])

In [25]:
ratings.columns

Index(['user_id', 'prod_id', 'ratings', 'timestamp'], dtype='object')

In [26]:
ratings.drop(columns='timestamp',inplace=True)

In [27]:
ratings.to_csv('product_ratings.csv')

In [28]:
ratings.columns

Index(['user_id', 'prod_id', 'ratings'], dtype='object')

In [29]:
ratings.dtypes

user_id    object 
prod_id    object 
ratings    float64
dtype: object

In [30]:
ratings.shape

(7824482, 3)

In [31]:
ratings_count = ratings['user_id'].value_counts()

In [32]:
ratings_subset = ratings[ratings['user_id'].isin(ratings_count[ratings_count>50].index)]

In [33]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from scipy import stats
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))

In [34]:
data = Dataset.load_from_df(ratings_subset[['user_id','prod_id','ratings']],reader)

In [35]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.30,random_state=123)

In [38]:
type(trainset)

surprise.trainset.Trainset

In [39]:
user_records = trainset.ur
type(user_records)

collections.defaultdict

In [40]:
# Getting mean ratings for each product and renaming the count column as #ratings
ratings_subset_grouped = ratings_subset.groupby(['prod_id']).agg({'ratings':'mean'}).reset_index()
ratings_subset_grouped.rename(columns = {'ratings': 'mean_rating'}, inplace = True)
ratings_subset_grouped.head()

Unnamed: 0,prod_id,mean_rating
0,594481813,3.0
1,970407998,2.5
2,972683275,5.0
3,1400501466,3.0
4,1400501520,5.0


In [43]:
df = ratings_subset.groupby(['prod_id']).agg({'user_id':'count'}).reset_index()
df.head(3)

Unnamed: 0,prod_id,user_id
0,594481813,1
1,970407998,2
2,972683275,3


In [44]:
ratings_subset_grouped['#ratings'] = df['user_id']
ratings_subset_grouped.head()

Unnamed: 0,prod_id,mean_rating,#ratings
0,594481813,3.0,1
1,970407998,2.5,2
2,972683275,5.0,3
3,1400501466,3.0,5
4,1400501520,5.0,1


In [45]:
ratings_subset_grouped['score'] = ratings_subset_grouped['mean_rating']*ratings_subset_grouped['#ratings']
ratings_subset_grouped.head()

Unnamed: 0,prod_id,mean_rating,#ratings,score
0,594481813,3.0,1,3.0
1,970407998,2.5,2,5.0
2,972683275,5.0,3,15.0
3,1400501466,3.0,5,15.0
4,1400501520,5.0,1,5.0


In [46]:
ratings_subset_grouped['rank'] = ratings_subset_grouped['score'].rank(ascending = 0, method = 'first')
ratings_sorted = ratings_subset_grouped.sort_values(by='rank')
ratings_sorted.head()

Unnamed: 0,prod_id,mean_rating,#ratings,score,rank
24262,B003ES5ZUU,4.858757,177,860.0,1.0
38146,B0088CJT4U,4.215686,204,860.0,2.0
10820,B000N99BBC,4.773006,163,778.0,3.0
37407,B007WTAJTO,4.692308,156,732.0,4.0
37765,B00829TIEK,4.431507,146,647.0,5.0


In [47]:
# Top 15 popular products
ratings_sorted[["prod_id","rank"]].head(15)

Unnamed: 0,prod_id,rank
24262,B003ES5ZUU,1.0
38146,B0088CJT4U,2.0
10820,B000N99BBC,3.0
37407,B007WTAJTO,4.0
37765,B00829TIEK,5.0
37761,B00829THK0,6.0
38471,B008DWCRQW,7.0
28121,B004CLYEDC,8.0
21486,B002R5AM7C,9.0
22850,B0034CL2ZI,10.0


In [48]:
from surprise import KNNWithMeans
from surprise import accuracy

In [49]:
CollaborativeFiltering = KNNWithMeans(k=5)
CollaborativeFiltering.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x14efe8b00>

In [50]:
len(testset)

36652

In [51]:
testset[0:5]

[('A2NT3W1F1XYBFZ', 'B002LITT56', 5.0),
 ('A2MCRCK1V61FWQ', 'B0042RUSY0', 5.0),
 ('AOWF9T81XMX2S', 'B0002LEMWE', 5.0),
 ('A1Z7U9K6X3FEOU', 'B00845RYXE', 4.0),
 ('A200DY76VXVGP9', 'B009CP99GK', 4.0)]

In [52]:
# Evalute on test set with Collaborative Filtering Model
test_pred = CollaborativeFiltering.test(testset)

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 1.0715


1.0715151981450601

In [53]:
# View a particular prediction
test_pred[2]

# To access a particular value, say estimate simply mention test_pred[12].est

Prediction(uid='AOWF9T81XMX2S', iid='B0002LEMWE', r_ui=5.0, est=5, details={'actual_k': 3, 'was_impossible': False})

In [54]:
test_pred[2].details["actual_k"]

3

In [55]:
# convert results to dataframe
test_pred_df = pd.DataFrame(test_pred)
test_pred_df["was_impossible"] = [x["was_impossible"] for x in test_pred_df["details"]]

In [58]:
test_pred_df.loc[test_pred_df.was_impossible].head(5)

Unnamed: 0,uid,iid,r_ui,est,details,was_impossible
1,A2MCRCK1V61FWQ,B0042RUSY0,5.0,4.263824,"{'was_impossible': True, 'reason': 'User and/or item is unkown.'}",True
3,A1Z7U9K6X3FEOU,B00845RYXE,4.0,4.263824,"{'was_impossible': True, 'reason': 'User and/or item is unkown.'}",True
7,A24P4E3RJ6AX94,B00007FH6F,1.0,4.263824,"{'was_impossible': True, 'reason': 'User and/or item is unkown.'}",True
9,A1EXGL6L0QQ0M5,B004HB2X4O,3.0,4.263824,"{'was_impossible': True, 'reason': 'User and/or item is unkown.'}",True
10,A1LHMSY3Q46PJS,B000F1SFLA,5.0,4.263824,"{'was_impossible': True, 'reason': 'User and/or item is unkown.'}",True


In [59]:
testset_new = trainset.build_anti_testset()

In [60]:
len(testset_new)

54558165

In [61]:
# Recommending top 5 new products
testset_new[0:5]

[('ATS2855497V0I', 'B003BYIM0W', 4.263824413288275),
 ('ATS2855497V0I', 'B0081CRTO4', 4.263824413288275),
 ('ATS2855497V0I', 'B00DR0PDNE', 4.263824413288275),
 ('ATS2855497V0I', 'B00B0CQCK4', 4.263824413288275),
 ('ATS2855497V0I', 'B004FG16NA', 4.263824413288275)]

In [62]:
predictions = CollaborativeFiltering.test(testset_new[0:10000])

In [63]:
predictions_df = pd.DataFrame([[x.iid,x.est] for x in predictions])

In [64]:
predictions_df.columns = ["prod_id","est_rating"]
predictions_df.sort_values(by = ["prod_id", "est_rating"],ascending=False,inplace=True)

In [65]:
predictions_df.head()

Unnamed: 0,prod_id,est_rating
5254,B00L8I6SFY,4.457522
744,B00L403O94,4.787395
5578,B00L3YHF6O,4.400534
2629,B00L2P3TRS,3.928571
8598,B00L26YDA4,3.307187


In [66]:
top_10_recos = predictions_df.groupby("prod_id").head(10)

In [67]:
top_10_recos.head(10)

Unnamed: 0,prod_id,est_rating
5254,B00L8I6SFY,4.457522
744,B00L403O94,4.787395
5578,B00L3YHF6O,4.400534
2629,B00L2P3TRS,3.928571
8598,B00L26YDA4,3.307187
1220,B00L21HC7A,3.76832
5038,B00L1NZTSS,5.0
6925,B00L1I727Y,3.928571
2967,B00KWPRSJY,4.505891
9335,B00KWMNDDM,4.361458
