In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter would also hide any warnings pandas raises
# for the in-place DataFrame edits further down — consider narrowing it to
# specific warning categories.
warnings.filterwarnings('ignore')

In [3]:
# Load the raw dataset from a CSV file expected in the current working directory.
df_main = pd.read_csv('sample30.csv')

In [4]:
# Keep only the columns needed for the recommender. The explicit copy makes
# df_reco independent of df_main, so the in-place clean-up steps below operate
# on a real frame rather than on a slice of the raw data.
df_reco = df_main[['name','reviews_username','reviews_rating']].copy()

In [5]:
# count the missing values in each column

df_reco.isna().sum()

name                 0
reviews_username    63
reviews_rating       0
dtype: int64

In [6]:
# remove every row that contains at least one NaN

df_reco = df_reco.dropna(axis=0)

In [7]:
# count rows that exactly repeat an earlier row (all columns of df_reco equal)

df_reco.duplicated().sum()

2198

In [8]:
# keep only the first occurrence of each fully identical row

df_reco = df_reco.drop_duplicates()

In [9]:
# count (product, user) pairs that occur more than once, ignoring the rating value

df_reco[['name', 'reviews_username']].duplicated().sum()

151

From the above we note that some users have given two or more different ratings to the same product. Let us replace those ratings with their average, so that each (product, user) pair ends up with a single rating.

In [10]:
# mean rating of every (product, user) pair that appears more than once;
# keep=False flags all occurrences of such a pair, not only the repeats

df_mean = (
    df_reco
    .loc[df_reco.duplicated(subset=['name', 'reviews_username'], keep=False)]
    .groupby(by=['name', 'reviews_username'])
    .mean()
)

df_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,reviews_rating
name,reviews_username,Unnamed: 2_level_1
100:Complete First Season (blu-Ray),dontdodat,3.500000
Alex Cross (dvdvideo),mookie,4.500000
"Aveeno Baby Continuous Protection Lotion Sunscreen with Broad Spectrum SPF 55, 4oz",byamazon customer,2.666667
"Avery174 Ready Index Contemporary Table Of Contents Divider, 1-8, Multi, Letter",gellis,4.500000
"Avery174 Ready Index Contemporary Table Of Contents Divider, 1-8, Multi, Letter",the office guro,3.500000
...,...,...
Tostitos Bite Size Tortilla Chips,debb,4.000000
Tostitos Bite Size Tortilla Chips,rick,4.500000
Tostitos Bite Size Tortilla Chips,sandy,3.000000
Windex Original Glass Cleaner Refill 67.6oz (2 Liter),laura,1.500000


In [11]:
# attach the averaged rating to every row of df_reco (left join on product and user);
# the original rating becomes reviews_rating_x and the average reviews_rating_y

df_merged = (
    df_reco
    .merge(df_mean, how='left', on=['name', 'reviews_username'])
    .sort_values(by=['name', 'reviews_username'])
)

df_merged.head()

Unnamed: 0,name,reviews_username,reviews_rating_x,reviews_rating_y
19543,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,brewno,3,
19545,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,embum,5,
19547,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,granny,5,
19542,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,smokey bear,3,
19546,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,spicesea,5,


In [12]:
# missing values per column after the merge
df_merged.isna().sum()

name                    0
reviews_username        0
reviews_rating_x        0
reviews_rating_y    27453
dtype: int64

Since only a few (product, user) pairs were rated more than once, most rows have NaN in the reviews_rating_y column. Let us combine the two columns so that the final rating is the average from reviews_rating_y where it exists and the original rating from reviews_rating_x otherwise.

In [13]:
# A: rows without an averaged rating -> keep the original rating
A = (
    df_merged
    .loc[df_merged['reviews_rating_y'].isnull(), ['name', 'reviews_username', 'reviews_rating_x']]
    .rename(columns={'reviews_rating_x': 'reviews_rating_final'})
)

# B: rows that do have an averaged rating -> use the average
B = (
    df_merged
    .loc[df_merged['reviews_rating_y'].notnull(), ['name', 'reviews_username', 'reviews_rating_y']]
    .rename(columns={'reviews_rating_y': 'reviews_rating_final'})
)

# stack both parts; the sorted view below is only displayed, df_final itself stays unsorted
df_final = pd.concat([A, B])
df_final.sort_values(by=['name', 'reviews_username'])

Unnamed: 0,name,reviews_username,reviews_rating_final
19543,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,brewno,3.0
19545,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,embum,5.0
19547,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,granny,5.0
19542,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,smokey bear,3.0
19546,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,spicesea,5.0
...,...,...,...
20525,Yes To Grapefruit Rejuvenating Body Wash,sheila,1.0
20534,Yes To Grapefruit Rejuvenating Body Wash,skeel,4.0
20531,Yes To Grapefruit Rejuvenating Body Wash,td33,3.0
20553,Yes To Grapefruit Rejuvenating Body Wash,trishaxo2u,5.0


In [14]:
# count fully duplicated rows in `df_final`; pairs that were rated several times
# now carry the same averaged rating on each of their rows
df_final.duplicated().sum()

151

In [15]:
# keep a single row per identical (product, user, rating) combination in 'df_final'

df_final = df_final.drop_duplicates()

In [16]:
# preview the de-duplicated ratings table
df_final.head()

Unnamed: 0,name,reviews_username,reviews_rating_final
19543,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,brewno,3.0
19545,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,embum,5.0
19547,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,granny,5.0
19542,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,smokey bear,3.0
19546,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. ...,spicesea,5.0


In [17]:
# distribution of the final ratings (fractional values come from the averaged pairs)
df_final.loc[:, 'reviews_rating_final'].value_counts()

5.000000    19456
4.000000     5434
3.000000     1183
1.000000     1048
2.000000      369
4.500000       75
3.500000       15
2.500000        2
1.500000        2
3.333333        2
2.666667        1
3.666667        1
Name: reviews_rating_final, dtype: int64

    R: rating matrix (user X item)
    P: user features matrix (user X latent feature)
    Q: item features matrix (item X latent feature)
    K: number of latent features
    steps: number of iterations
    alpha: learning rate
    beta: regularization parameter

In [18]:
# Create a user-product matrix: one row per user, one column per product.
# Pairs without a rating are filled with 0, which the factorisation treats as "not rated".
df_pivot = df_final.pivot(
    index='reviews_username',
    columns='name',
    values='reviews_rating_final'
).fillna(0)


R = np.array(df_pivot)
# N: number of users
N = R.shape[0]
# M: number of products
M = R.shape[1]
# K: number of latent features
K = 50

# Seeded generator so that Restart & Run All reproduces the same initial factors.
rng = np.random.default_rng(42)
# NOTE(review): entries drawn uniformly from [0, 1) give an expected initial
# prediction of about K/4 (12.5 for K=50), well above the ratings present in
# df_final — consider scaling the initial factors.
P = rng.random((N, K))
Q = rng.random((M, K))


In [19]:
def matrix_factorization(R, P, Q, K, steps=50, alpha=0.0002, beta=0.02):
    '''
    Factorise a rating matrix as R ~ P @ Q.T with stochastic gradient descent.

    Only strictly positive entries of R are treated as observed ratings; all
    other entries (e.g. the 0 used to fill missing ratings) are skipped.

    R: rating matrix (user X item)
    P: |U| * K initial user features matrix (left unmodified)
    Q: |D| * K initial item features matrix (left unmodified)
    K: number of latent features to update
    steps: maximum number of passes over the observed ratings
    alpha: learning rate
    beta: regularization parameter

    Returns the trained (P, Q) as new float arrays with the same shapes as
    the inputs.
    '''
    R = np.asarray(R)

    # Train on private float copies so the caller's initial matrices are not
    # overwritten (transposing alone would only create a view onto Q).
    P = np.array(P, dtype=float)
    Q = np.array(Q, dtype=float).T

    # Coordinates of the observed ratings in row-major order, computed once
    # instead of re-scanning the whole dense matrix in Python on every step.
    observed = np.argwhere(R > 0)

    for step in range(steps):
        for i, j in observed:
            # prediction error for this rating
            eij = R[i, j] - np.dot(P[i, :], Q[:, j])

            for k in range(K):
                # gradient step with learning rate alpha and L2 penalty beta;
                # the Q update uses the just-updated P[i, k], as before
                P[i, k] = P[i, k] + alpha * (2 * eij * Q[k, j] - beta * P[i, k])
                Q[k, j] = Q[k, j] + alpha * (2 * eij * P[i, k] - beta * Q[k, j])

        # regularised squared error over the observed ratings
        e = 0
        for i, j in observed:
            e = e + pow(R[i, j] - np.dot(P[i, :], Q[:, j]), 2)
            # L2 regularisation over the first K latent features
            e = e + (beta / 2) * (np.sum(P[i, :K] ** 2) + np.sum(Q[:K, j] ** 2))

        # stop early once the error is negligible
        if e < 0.001:
            break

    return P, Q.T

In [21]:
def matrix_factorization_optimised(R, P, Q, K, steps=50, alpha=0.0002, beta=0.02):
    '''
    Factorise a rating matrix as R ~ P @ Q.T with stochastic gradient descent,
    updating whole latent vectors at once instead of one feature at a time.

    Only strictly positive entries of R are treated as observed ratings; all
    other entries (e.g. the 0 used to fill missing ratings) are skipped.

    R: rating matrix (user X item)
    P: |U| * K initial user features matrix (left unmodified)
    Q: |D| * K initial item features matrix (left unmodified)
    K: number of latent features included in the regularisation term
    steps: maximum number of passes over the observed ratings
    alpha: learning rate
    beta: regularization parameter

    Returns the trained (P, Q) as new float arrays with the same shapes as
    the inputs.
    '''
    R = np.asarray(R)

    # Train on private float copies so the caller's initial matrices are not
    # overwritten (transposing alone would only create a view onto Q).
    P = np.array(P, dtype=float)
    Q = np.array(Q, dtype=float).T

    # Locate the observed ratings once, in row-major order, instead of
    # re-scanning the whole dense matrix in Python on every step.
    rows, cols = np.nonzero(R > 0)
    ratings = R[rows, cols]

    for step in range(steps):
        for i, j, r_ij in zip(rows, cols, ratings):
            # prediction error for this rating
            eij = r_ij - np.dot(P[i, :], Q[:, j])

            # gradient step with learning rate alpha and L2 penalty beta;
            # the Q update uses the just-updated P[i, :], as before
            P[i, :] = P[i, :] + alpha * (2 * eij * Q[:, j] - beta * P[i, :])
            Q[:, j] = Q[:, j] + alpha * (2 * eij * P[i, :] - beta * Q[:, j])

        # regularised squared error over the observed ratings, vectorised
        residuals = ratings - np.sum(P[rows, :] * Q[:, cols].T, axis=1)
        penalty = np.sum(P[rows, :K] ** 2) + np.sum(Q[:K, cols] ** 2)
        e = np.sum(residuals ** 2) + (beta / 2) * penalty

        # stop early once the error is negligible
        if e < 0.001:
            break

    return P, Q.T

In [22]:
# learn the user and product factors, then rebuild the dense matrix of predicted ratings
nP, nQ = matrix_factorization_optimised(R=R, P=P, Q=Q, K=K)

nR = nP @ nQ.T

In [23]:
# label the predictions with the same users (rows) and products (columns) as df_pivot
pred_R = pd.DataFrame(data=nR, index=df_pivot.index, columns=df_pivot.columns)

In [24]:
# inspect the ratings predicted by the matrix-factorisation (MF) model, users x products
pred_R

name,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest,100:Complete First Season (blu-Ray),2017-2018 Brownline174 Duraflex 14-Month Planner 8 1/2 X 11 Black,"2x Ultra Era with Oxi Booster, 50fl oz","42 Dual Drop Leaf Table with 2 Madrid Chairs""",4C Grated Parmesan Cheese 100% Natural 8oz Shaker,5302050 15/16 FCT/HOSE ADAPTOR,Africa's Best No-Lye Dual Conditioning Relaxer System Super,Alberto VO5 Salon Series Smooth Plus Sleek Shampoo,Alex Cross (dvdvideo),...,Walkers Stem Ginger Shortbread,"Wallmount Server Cabinet (450mm, 9 RU)","Way Basics 3-Shelf Eco Narrow Bookcase Storage Shelf, Espresso - Formaldehyde Free - Lifetime Guarantee","WeatherTech 40647 14-15 Outlander Cargo Liners Behind 2nd Row, Black",Wedding Wishes Wedding Guest Book,Weleda Everon Lip Balm,Wilton Black Dots Standard Baking Cups,Windex Original Glass Cleaner Refill 67.6oz (2 Liter),Yes To Carrots Nourishing Body Wash,Yes To Grapefruit Rejuvenating Body Wash
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,7.150294,5.519434,7.079770,9.749926,11.042287,6.678009,11.696839,6.439968,8.258086,4.714687,...,11.236139,11.390738,5.844891,6.788433,12.100599,8.007179,10.206940,4.257564,5.207056,3.077792
00sab00,7.736406,4.235384,7.079238,6.956261,10.028495,8.307530,12.107881,6.982579,8.128253,4.123104,...,9.702003,9.321165,6.075149,5.803756,11.010840,8.036310,10.514212,3.807626,4.811508,4.655842
01impala,7.714596,5.731142,8.123919,10.334370,12.067533,9.118010,12.230252,7.192361,9.986191,4.464586,...,13.305093,12.578117,6.664514,7.542204,13.035477,8.374233,12.280614,5.368482,5.982731,4.356612
02dakota,7.170768,4.799320,6.277110,8.213799,10.881713,6.630253,10.900526,5.464641,6.930283,3.780186,...,11.340491,11.634236,5.040549,4.539202,11.109477,6.516942,9.162541,3.854837,4.453180,4.278425
02deuce,6.521045,5.252639,8.482999,7.656528,10.756380,8.799478,12.031361,6.465298,8.037675,3.563356,...,10.602341,11.411359,6.313700,6.236421,12.299358,6.831235,9.991723,4.088185,5.267679,4.011708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zxcsdfd,6.265867,3.967552,7.519615,7.167004,9.070708,7.465212,10.640175,6.129092,7.892971,3.989972,...,10.080374,9.886476,5.673058,5.848622,10.555968,6.446202,8.808352,3.713176,4.529923,3.718866
zxjki,6.634312,4.864927,6.071578,7.330549,11.008057,8.159163,10.895540,6.541809,7.496563,3.776395,...,9.858839,10.505681,5.246307,5.494603,11.187844,8.273601,10.324178,4.054322,4.921018,3.406819
zyiah4,6.555006,4.968052,8.646245,10.219678,10.565182,7.750344,10.674643,6.541763,9.108351,4.367355,...,11.990090,11.184025,5.544045,6.354455,11.792505,8.586349,10.373727,4.153259,4.960819,5.527479
zzdiane,8.951885,6.058828,7.525995,9.501307,11.536640,7.774639,12.474834,7.433385,8.157762,4.425532,...,12.853627,11.372923,6.252841,6.007771,12.149711,8.913349,11.383230,5.062307,5.453087,4.960017


------------------------------------------------------------------------------------------------------------