## A reccommendation system with collaborative filtering

In [1]:
import json
import numpy as np
import pandas as pd
from scipy import optimize

In [2]:
# load the data
def json_to_df(path):
    data = []
    with open(path) as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame.from_dict(data)

In [3]:
review_df = json_to_df('yelp_dataset/yelp_academic_dataset_review.json')
business_df = json_to_df('yelp_dataset/yelp_academic_dataset_business.json')
#user_df = json_to_df('yelp_dataset/yelp_academic_dataset_user.json')

In [4]:
## Subset the business to only contain toronto restaurants (mainly because of memory issue)
business_df = business_df[business_df['categories'].str.contains('Restaurants') == True]
business_df = business_df[business_df['city'] == 'Toronto']

In [10]:
business_id_list = list(business_df['business_id'])

In [11]:
business_df.shape

(7578, 15)

In [12]:
## Subset reviews to reviews on restaurants only
review_df = review_df[review_df['business_id'].isin(business_id_list)]

In [13]:
## Find user id for users who reviewed restaurants
user_id_list = list(review_df['user_id'].unique())

In [14]:
print("There are {} restaurants and {} users who reviewed them.".\
      format(len(business_id_list), len(user_id_list)))

There are 7578 restaurants and 80854 users who reviewed them.


In [18]:
review_df.sort_values(by='user_id', inplace=True)

In [19]:
## Make a dummy variable which index user_id
id_list = list(review_df['user_id'])
dummy = [0] * len(review_df)
last = None
counter = -1
for i in range(len(dummy)):
    current = id_list[i]
    if current != last:
        counter += 1
    dummy[i] = counter
    last = current

In [20]:
review_df['dummy_user_id'] = dummy

In [21]:
review_df.sort_values(by='business_id', inplace=True)

In [22]:
## Make a dummy variable which index business_id
id_list = list(review_df['business_id'])
dummy = [0] * len(review_df)
last = None
counter = -1
for i in range(len(dummy)):
    current = id_list[i]
    if current != last:
        counter += 1
    dummy[i] = counter
    last = current

In [23]:
review_df['dummy_business_id'] = dummy

In [24]:
## Create a m*k matrix which holds rating for restaurants(star)
## Each row represents a unique restaurant and each column represents a unique user.
## All ratings should be from 1-5, 0 represents missing value
m = len(business_id_list)
k = len(user_id_list)
rating_mtrx = np.zeros((m,k))
did_rate = np.zeros((m,k))

In [25]:
## store rated ratings into the matrix
ratings = list(review_df['stars'])
i_index = list(review_df['dummy_business_id'])
j_index = list(review_df['dummy_user_id'])
for c in range(len(ratings)):
    ## rating valid only if > 0
    if ratings[c] > 0:
        rating_mtrx[i_index[c], j_index[c]] = ratings[c]
        did_rate[i_index[c], j_index[c]] = 1

In [27]:
rating_mtrx.shape

(7578, 80854)

In [28]:
## Mean Normalize All The Ratings, only on entries that has ratings

def normalize_ratings(ratings, did_rate):
    num_restaurants = ratings.shape[0]
    ratings_mean = np.zeros(shape = (num_restaurants, 1))
    ratings_norm = np.zeros(shape = ratings.shape)

    for i in range(num_restaurants):
        # Get all the indexes where there is a 1
        idx = np.where(did_rate[i] ==1)[0]

        # Calculate mean rating of ith restaurant only from user's that gave a rating
        ratings_mean[i] = np.mean(ratings[i, idx])
        ratings_norm[i, idx] = ratings[i, idx] - ratings_mean[i]

    return (ratings_norm, ratings_mean)

In [29]:
ratings_norm, ratings_mean = normalize_ratings(rating_mtrx, did_rate)

In [31]:
## Define cost function
def calculate_cost(X_and_theta, ratings, did_rate, num_users, num_restaurants, num_features, reg_param):
    # Retrieve the X and theta matrixes from X_and_theta, based on their dimensions
    # ------------------------------------------------------------------------------------------------------
    # Get the first (m * 3) rows in the 3*(m+k) X 1 column vector
    first_3m = X_and_theta[:num_restaurants * num_features]
    # Reshape this column vector into a m X 3 matrix
    X = first_3m.reshape((num_features, num_restaurants)).transpose()
    # Get the rest of the numers, after the first 3*m
    last_3k = X_and_theta[num_restaurants * num_features:]
    # Reshape this column vector into a k X 3 matrix
    theta = last_3k.reshape(num_features, num_users).transpose()

    # we calculate the sum of squared errors here.  
    # in other words, we calculate the squared difference between our predictions and ratings
    cost = np.sum( (X.dot( theta.T ) * did_rate - ratings) ** 2 ) / 2
    print(cost)
    # we get the sum of the square of every element of X and theta
    regularization = (reg_param / 2) * (np.sum( theta**2 ) + np.sum(X**2))
    print('--')
    print(regularization)
    return cost + regularization

In [35]:
def calculate_gradient(X_and_theta, ratings, did_rate, num_users, num_restaurants, num_features, reg_param):
    # Retrieve the X and theta matrixes from X_and_theta, based on their dimensions 
    # --------------------------------------------------------------------------------------------------------
    # Get the first (m * 3) rows in the 3*(m+k) X 1 column vector
    first_3m = X_and_theta[:num_restaurants * num_features]
    # Reshape this column vector into a m X 3 matrix
    X = first_3m.reshape((num_features, num_restaurants)).transpose()
    # Get the rest of the numers, after the first 3*m
    last_3k = X_and_theta[num_restaurants * num_features:]
    # Reshape this column vector into a k X 3 matrix
    theta = last_3k.reshape(num_features, num_users).transpose()

    # we multiply by did_rate because we only want to consider observations for which a rating was given
    difference = X.dot(theta.T) * did_rate - ratings

    # we calculate the gradients (derivatives) of the cost with respect to X and theta
    X_grad = difference.dot( theta ) + reg_param * X
    theta_grad = difference.T.dot( X ) + reg_param * theta

    # wrap the gradients back into a column vector 
    return np.r_[X_grad.T.flatten(), theta_grad.T.flatten()]

In [40]:
X_and_theta = initial_X_and_theta
first_3m = X_and_theta[:num_restaurants * num_features]
# Reshape this column vector into a m X 3 matrix
X = first_3m.reshape((num_features, num_restaurants)).transpose()
# Get the rest of the numers, after the first 3*m
last_3k = X_and_theta[num_restaurants * num_features:]
# Reshape this column vector into a k X 3 matrix
theta = last_3k.reshape(num_features, num_users).transpose()

In [37]:
num_restaurants, num_users = m, k
num_features = 3

# Initialize Parameters theta (user_prefs), X (restaurant_features)
restaurant_features = np.random.randn(num_restaurants, num_features)
user_prefs = np.random.randn(num_users, num_features)
initial_X_and_theta = np.r_[restaurant_features.T.flatten(), user_prefs.T.flatten()]

In [50]:

# Regularization paramater
reg_param = 30.0

# fprime simply refers to the derivative (gradient) of the calculate_cost function
# We iterate 100 times
minimized_cost_and_optimal_params = optimize.fmin_cg(calculate_cost, fprime=calculate_gradient, x0=initial_X_and_theta, \
                            args=(rating_mtrx, did_rate, num_users, num_restaurants, num_features, reg_param), \
                            maxiter=100, disp=True, full_output=True )

2970200.9450644115
--
3964363.309003002
2957586.4456436317
--
3952160.4604763077
2909774.5315716956
--
3903655.096369522
2757567.616272013
--
3714530.1199423824
2597943.093631479
--
3036373.8942338326
2825175.3462391687
--
1730983.3628583185
3136005.5957635585
--
896025.8539524083
2541970.2075725854
--
1226811.810483822
3038868.8058608314
--
816752.6241703387
2451069.876944005
--
1033020.0908068882
2366230.669043193
--
726687.6447741814
5030889.853515998
--
348404.3461208316
2399536.729741383
--
474599.3493470634
3244406.1373481257
--
214275.64433952398
2360093.7025356437
--
289107.89096901205
2145955.107376609
--
188058.43303563894
1858906.2342516426
--
216200.69599597677
1832994.459647082
--
435774.1981282416
1669984.3308034488
--
275921.3641172167
1541288.9625791858
--
266233.0577411079
1554616.1207574545
--
283921.5839481097
1474117.6842916596
--
270838.5470608167
1359375.2417066635
--
300854.7629288458
1255764.6974733698
--
355488.31568402087
1263352.616301798
--
335196.0422939369

In [53]:
# Retrieve the minimized cost and the optimal values of the movie_features (X) and user_prefs (theta) matrices
cost, optimal_restaurant_features_and_user_prefs = \
minimized_cost_and_optimal_params[1], minimized_cost_and_optimal_params[0]

In [54]:
first_3m = optimal_restaurant_features_and_user_prefs[:num_restaurants * num_features]
restaurant_features = first_3m.reshape((num_features, num_restaurants)).transpose()
last_3k = optimal_restaurant_features_and_user_prefs[num_restaurants * num_features:]
user_prefs = last_3k.reshape(num_features, num_users ).transpose()

In [57]:
## Making predictions
all_predictions = restaurant_features.dot(user_prefs.T) + ratings_mean