# Matrix Factorization Recommender System

In [1]:
import pandas as pd
import numpy as np
import ast

#To Ignore Warnings in Output
import warnings
warnings.filterwarnings('ignore')

## Data Load

In [2]:
# Read the business and review tables from their CSV exports
business_csv = '../data/output_csv/business_PA_Philly_clean.csv'
review_csv = '../data/output_csv/review_PA_Philly_clean.csv'

business = pd.read_csv(business_csv)
review = pd.read_csv(review_csv)

In [3]:
# Preview the first rows of the business table
business.head()

Unnamed: 0,business_id,name,address,city,postal_code,latitude,longitude,stars,review_count,categories,...,Seafood Markets,Wraps,Shaved Ice,Cupcakes,Greek,Flowers & Gifts,Home & Garden,French,Candy Stores,Chocolatiers & Shops
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,19107,39.955505,-75.155564,4.0,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,19106,39.953949,-75.143226,4.0,245,"Sushi Bars, Restaurants, Japanese",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,19147,39.943223,-75.162568,4.5,205,"Korean, Restaurants",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,19123,39.962582,-75.135657,3.5,65,"Eatertainment, Arts & Entertainment, Brewpubs,...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,19104,39.954573,-75.194894,3.0,56,"Restaurants, Automotive, Delis, Gas Stations, ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Preview the first rows of the review table
review.head()

Unnamed: 0,review_id,user_id,business_id,stars,year
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,2015
1,oyaMhzBSwfGgemSGuZCdwQ,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,5,2013
2,Xs8Z8lmKkosqW5mw_sVAoA,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5,2014
3,JBWZmBy69VMggxj3eYn17Q,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5,2018
4,YcLXh-3UC9y6YFAI9xxzPQ,G0DHgkSsDozqUPWtlxVEMw,oBhJuukGRqPVvYBfTkhuZA,4,2015


In [5]:
# Only the id -> name mapping is needed from the business table from here on
business = business.loc[:, ['business_id', 'name']]

In [6]:
#Building Rating Matrix
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

# csr_matrix SUMS the values of duplicate (row, col) coordinates. If a user
# reviewed the same business more than once, the raw reviews would therefore
# produce a "rating" equal to the sum of their stars (possibly above 5).
# Collapse every (user, business) pair to its mean star rating first.
pair_ratings = review.groupby(['user_id', 'business_id'], as_index=False)['stars'].mean()

# Sorted unique ids define the row (user) and column (business) order
user_u = sorted(pair_ratings.user_id.unique())
business_u = sorted(pair_ratings.business_id.unique())

cat_type_user = CategoricalDtype(categories=user_u, ordered=True)
cat_type_business = CategoricalDtype(categories=business_u, ordered=True)

# Category codes are the positions of each id in user_u / business_u
row = pair_ratings.user_id.astype(cat_type_user).cat.codes
col = pair_ratings.business_id.astype(cat_type_business).cat.codes

data = pair_ratings['stars'].tolist()

# users x businesses matrix; coordinates without a review stay implicit zeros
sparse_matrix = csr_matrix((data, (row, col)), shape=(len(user_u), len(business_u)))


In [7]:
# Wrap the sparse matrix in a sparse-backed DataFrame labelled by user id (rows)
# and business id (columns)
ratings = pd.DataFrame.sparse.from_spmatrix(sparse_matrix,index=user_u, columns=business_u)

In [8]:
# Replace any missing value with 0 (0 is treated as "no rating" downstream).
# Rebind instead of using inplace=True so the frame built in the previous cell
# is not mutated behind the reader's back.
ratings = ratings.fillna(0)

# Recommendation Engine - Matrix Factorization

In [9]:
def _regularized_squared_error(R, P, Q, K, beta, observed):
    '''Squared error over the observed ratings plus the L2 penalty on the factors involved.'''
    total = 0
    for i, j in observed:
        residual = R[i, j] - np.dot(P[i, :], Q[:, j])
        penalty = np.dot(P[i, :K], P[i, :K]) + np.dot(Q[:K, j], Q[:K, j])
        total = total + residual ** 2 + (beta / 2) * penalty
    return total


def matrix_factorization(R, P, Q, K, steps=10, alpha=0.0002, beta=0.02):
    '''
    Factorize a ratings matrix as R ~ P @ Q with stochastic gradient descent.

    Only strictly positive entries of R are treated as observed ratings; all
    other entries are considered missing and are ignored by both the updates
    and the error.

    Inputs:
    R     : The ratings, a 2-D NumPy array (of dimension M x N)
    P     : an initial matrix of dimension M x K (updated in place)
    Q     : an initial matrix of dimension K x N (updated in place)
    K     : the number of latent features
    steps : the maximum number of steps (passes over the observed ratings)
            to perform the optimization
    alpha : the learning rate
    beta  : the regularization parameter

    Outputs:
    the final matrices P and Q (the same array objects that were passed in)

    Side effects:
    prints the regularized squared error after the last completed step, or
    the error of the initial P and Q when steps is 0.
    '''
    # Find the observed ratings once instead of scanning all M x N cells in
    # Python on every step. np.nonzero returns indices in row-major order,
    # i.e. the same visiting order as nested "for i / for j" loops.
    rows, cols = np.nonzero(R > 0)
    observed = list(zip(rows.tolist(), cols.tolist()))

    e = None
    for step in range(steps):
        for i, j in observed:
            # Calculating error for this rating with the current factors
            eij = R[i, j] - np.dot(P[i, :], Q[:, j])
            # Gradient step on the user factors ...
            p_new = P[i, :K] + alpha * (2 * eij * Q[:K, j] - beta * P[i, :K])
            # ... then on the item factors, which deliberately see the already
            # updated user factors (same order as an element-by-element loop).
            Q[:K, j] = Q[:K, j] + alpha * (2 * eij * p_new - beta * Q[:K, j])
            P[i, :K] = p_new
        e = _regularized_squared_error(R, P, Q, K, beta, observed)
        if e < 0.001: # tolerance
            break
    if e is None:
        # No optimization step ran (steps <= 0): report the starting error
        e = _regularized_squared_error(R, P, Q, K, beta, observed)
    print(e)
    return P, Q

In [10]:
# Fix the seed so the random starting factors are reproducible
np.random.seed(862)

# Problem dimensions: M users (rows) by N items (columns)
M, N = ratings.shape
K = 3 # Number of latent features

# Random starting point for the factors: P is M x K, Q is K x N
P = np.random.rand(M, K)
Q = np.random.rand(K, N)

# Plain NumPy copy of the ratings for the optimizer
rating_np = np.array(ratings)

In [11]:
# Fit the factors with the default steps / alpha / beta; the function prints the
# final error. matrix_factorization writes into P and Q in place, so re-running
# this cell continues from the current factors rather than from the random
# initial estimate.
P, Q = matrix_factorization(rating_np, P, Q, K)

5274180.381547268


In [12]:
# Reconstruct the full user x business score matrix from the learned factors,
# labelled like the original ratings frame
predicted_rating = pd.DataFrame(P @ Q, index=ratings.index, columns=ratings.columns)
predicted_rating.head()

Unnamed: 0,-0TffRSXXIlBYVbb5AwfTg,-1B9pP_CrRBJYPICE5WbRA,-3ArWZfDjfab8qVHf3WVtg,-3m_nXlyvdKAVNNmVirpGQ,-5Rah4ZvWsDu4oilUZxhtw,-63ytt5vkWof-M9NDGTkng,-6MEKOmFu6jckT3pruSxHg,-ATiAtTikuGuqvaW2O6tNA,-AanHawaDlzWHQjrqRRWig,-Bhoyo7LL97tgt9Hze0Saw,...,zuEdIZKAYBDfPjyFg6B34Q,zuKnCtZQKZqnvEaKVnwVVQ,zucC7rHpXPYBu7aEqj0NUw,zujdPV3HT-Y-CKE1GgkMHQ,zvvl3c1FO3O3BZdhusficA,zwTmOj4B_OVPMTMYijQiKg,zwd4dyQ5ovnjVojWfAuhMw,zxRmQ_FWVowh8rlzLCSURQ,zxY4DgtXsVHihSUpsmwamg,zz3E7kmJI2r2JseE6LAnrw
--2tyArRmSoyKx5r-FVG0A,3.397622,2.077425,1.738649,1.253063,1.722688,1.237353,1.771825,2.708738,2.055915,1.790953,...,1.558995,1.164857,1.4002,2.086463,1.720596,1.761967,1.939186,1.689514,1.536984,1.038674
--2vR0DIsmQ6WfcSzKWigw,0.95927,0.726982,0.523192,0.229271,0.664386,0.297795,0.459906,0.775994,0.578088,0.507707,...,0.508844,0.219423,0.490653,0.76571,0.371206,0.380335,0.573632,0.414872,0.481286,0.227588
--4AjktZiHowEIBCMd4CZA,2.047476,0.963972,0.84411,0.853213,0.774308,0.894449,1.102629,1.684094,1.27172,1.059288,...,0.81493,0.892854,0.669113,0.92337,1.145499,1.059785,1.122486,1.203261,0.976481,0.741929
--4_p6Z3tKadJcr9Non_Vw,1.197353,0.608547,0.514005,0.470731,0.502213,0.503459,0.635439,0.982955,0.740893,0.621277,...,0.497295,0.489659,0.420195,0.594289,0.642564,0.600112,0.664216,0.679602,0.575359,0.414284
--6GckBYtTa4hj8pT09oAg,2.909388,1.865881,1.702342,1.21725,1.442973,0.966901,1.559576,2.221149,1.719683,1.557068,...,1.354615,0.976871,1.228809,1.856425,1.580222,1.777279,1.666201,1.326922,1.13795,0.874972


In [13]:
UID = '--2tyArRmSoyKx5r-FVG0A'

# Businesses whose stored rating for this user is 0 are treated as unrated
unrated_mask = ratings.loc[UID, :] == 0

# Predicted scores for those unrated businesses only
missing_ratings = predicted_rating.loc[UID][unrated_mask]

# Label the scores with the matching business ids
missing_ratings = pd.Series(missing_ratings, index=ratings.columns[unrated_mask])

# Highest predicted score first
missing_ratings = missing_ratings.sort_values(ascending=False)

In [14]:
# Recommendations
# Number of top-ranked unrated businesses to report
TOP_N = 10

mat_fact = []
# Cap at the number of candidates so a user with fewer than TOP_N unrated
# businesses does not raise IndexError.
for i in range(min(TOP_N, len(missing_ratings))):
    rec_rest_id = missing_ratings.index[i]
    # Resolve the business name once and reuse it for the list and the printout
    rec_name = business[business['business_id'] == rec_rest_id]['name'].values[0]
    mat_fact.append(rec_name)
    print("my number ", i+1, " recommendation is ", rec_name,
          ", with a predicted rating of", missing_ratings.iloc[i])

my number  1  recommendation is  Reading Terminal Market , with a predicted rating of 6.470318707092084
my number  2  recommendation is  Zahav , with a predicted rating of 5.137247280366128
my number  3  recommendation is  Barbuzzo , with a predicted rating of 4.827370774024177
my number  4  recommendation is  El Vez , with a predicted rating of 4.675630711586587
my number  5  recommendation is  Parc , with a predicted rating of 4.58619918941663
my number  6  recommendation is  Dalessandro’s Steaks & Hoagies , with a predicted rating of 4.522453587367938
my number  7  recommendation is  Green Eggs Café , with a predicted rating of 4.437961799925259
my number  8  recommendation is  Talula's Garden , with a predicted rating of 4.433322021896766
my number  9  recommendation is  Pat's King of Steaks , with a predicted rating of 4.423061574044172
my number  10  recommendation is  Monk's Cafe , with a predicted rating of 4.297135072117197


In [15]:
# Names of the recommended businesses collected by the loop above
mat_fact

['Reading Terminal Market',
 'Zahav',
 'Barbuzzo',
 'El Vez',
 'Parc',
 'Dalessandro’s Steaks & Hoagies',
 'Green Eggs Café',
 "Talula's Garden",
 "Pat's King of Steaks",
 "Monk's Cafe"]