# Recommend Business to Users - Algorithm 2

### Here we utilize different collaborative filtering techniques to recommend business to users. For evaluation and comparison of the different approaches we use RMSE as we are predicting the user's rating for a particular resturant based on his historical rating data. Let's start with item-item based filtering. 

###### Note referred cambridgespark article on collaborative filtering

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances as pdist
from sklearn import cross_validation as cv
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Read the input data frame
df = pd.read_csv('20PlusReviews_Excluding_Rare_Biz_Users.csv')

In [3]:
df.head()

Unnamed: 0,Business Id,User Id,Review Id,User_Stars,Text_encoded,Business_Stars,Useful,Funny,Cool
0,dKdApYVFDSNYsNOso6NYlA,bLbSNkLggFnqwNNzzq-Ijw,LwszgYoywAhMaIdt3zPgug,4,Crave those crazy squares!! Not breakfast cere...,3.0,10,9,10
1,dKdApYVFDSNYsNOso6NYlA,w0Yyvns5BCIL5s_7JuFrug,wxKpyfSQPMGNHSKWFFIbDQ,3,"Well, White Castle kind of is what it is... I ...",3.0,8,2,2
2,dKdApYVFDSNYsNOso6NYlA,VYOQLKuR0Ugy9lU-QiOuLw,kTdvdp0dFu2Q4hAO4hk2Dw,4,Located inside the Casino Royale Hotel & Casin...,3.0,0,0,0
3,dKdApYVFDSNYsNOso6NYlA,qlaVQkSxKcqfHJhoZU3rKg,hwEz-5l4BP5nYoGZD9xDrg,2,What a disappointment! I was really looking f...,3.0,1,2,1
4,dKdApYVFDSNYsNOso6NYlA,8OeTLey-p-WaL9ErNEci1Q,s2MW1ToXDIsD-WcdV__dPQ,2,I am being generous with 2 stars.\\nSo disappo...,3.0,5,9,5


In [4]:
# Extract only the User, Business and Rating columns
df = df[['User Id', 'Business Id', 'User_Stars']]
df.rename(columns={'User Id': 'user_id', 'Business Id': 'business_id', 'User_Stars' : 'stars'}, inplace=True)


In [5]:
# Check the number of users and businesses
users = df.user_id.unique().shape[0]
business = df.business_id.unique().shape[0]
print ('Number of users = ' + str(users) + ' | Number of business = ' + str(business))

Number of users = 3002 | Number of business = 4087


In [6]:
sparsity=round(1.0-len(df)/float(users*business),3)
print ('The sparsity level of this data is ' +  str(sparsity*100) + '%')

The sparsity level of this data is 98.7%


In [7]:
#Get the business user rating table
rating_table = df[['user_id', 'business_id', 'stars']]
rating_table_df = rating_table.pivot_table(index='user_id', columns='business_id', values='stars')

In [8]:
type(rating_table_df)

pandas.core.frame.DataFrame

In [9]:
list(rating_table_df)

['--9e1ONYQuAa-CB_Rrw7Tw',
 '-1vfRrlnNnNJ5boOVghMPA',
 '-3zffZUHoY8bQjGfPSoBKQ',
 '-8R_-EkGpUhBk55K9Dd4mg',
 '-9YyInW1wapzdNZrhQJ9dg',
 '-AD5PiuJHgdUcAK-Vxao2A',
 '-BS4aZAQm9u41YnB9MUASA',
 '-Bf8BQ3yMk8U2f45r2DRKw',
 '-BmqghX1sv7sgsxOIS2yAg',
 '-Bv-HHUs8aHzDrdWcZHn8w',
 '-C8sSrFqaCxp51pyo-fQLQ',
 '-CQokjildrY7UZezXCdEBw',
 '-CbDQXiuKzPQ0_jiUz03aw',
 '-FNquqGseSCVMWo7KbK-Tg',
 '-FcZY7a7qgxTUlTvwuyJnQ',
 '-G7MPSNBpxRJmtrJxdwt7A',
 '-IWsoxH7mLJTTpU5MmWY4w',
 '-JFVxwdVQfX207554dx1nw',
 '-JfncbVURtvuzBbof40o1w',
 '-K82LBrI3H0FVuhTbNDpRA',
 '-LMycE26AortJDsbc8oXOg',
 '-Le6cwbZL4tDZwNHwipfKg',
 '-LzUYsR54hd8cCiYXTquRA',
 '-N_agk8RUHvHk3GVDsyunQ',
 '-NjZ5HZApXjRJaTVZeSO_w',
 '-OEIW0dO96-492qa_luxaw',
 '-P8dGzSVhJi-5oZ-8U2y0w',
 '-PGsEXB6DFTVKa1eDOlzWA',
 '-PbM4ksxhGZVpgCpRakcgA',
 '-QNQ4R1VgRoL4b6lC8Anrg',
 '-Qkx7W0itbAApcG5lJuMFQ',
 '-RJ216TTIghZshCkUlD1WQ',
 '-U7tvCtaraTQ9b0zBhpBMA',
 '-Uix-n4Jqo4W7ERagC5qAA',
 '-WLrZPzjKfrftLWaCi1QZQ',
 '-YCd2_DdJUeZx3RRcmIZ_g',
 '-YWr1wm_NaAlgEf9TZzsMQ',
 

In [10]:
train_data, test_data = cv.train_test_split(rating_table_df, test_size=0.25)

In [11]:
train_data_matrix = train_data.values
test_data_matrix = test_data.values

In [12]:
train_data_matrix = np.nan_to_num(train_data_matrix)
test_data_matrix = np.nan_to_num(test_data_matrix)

In [13]:
train_data_matrix

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 2.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  3.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [14]:
test_data_matrix.shape

(751L, 4087L)

In [15]:
train_data_matrix.shape

(2251L, 4087L)

#### Calculate the user based similarity and item-based similarity

In [16]:
# Calculate cosine similarity
user_similarity = pdist(train_data_matrix, metric='cosine')
item_similarity = pdist(train_data_matrix.T, metric='cosine')

In [17]:
# Similarity values between users
def user_filtering(ratings, similarity):
    mean_user_rating = ratings.mean(axis=1)
    ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
    prediction = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    return prediction


In [18]:
# Similarity values for items
def item_filtering(ratings, similarity):
    prediction = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return prediction

### Predict on test data

In [19]:
item_prediction = item_filtering(train_data_matrix, item_similarity)
user_prediction = user_filtering(train_data_matrix, user_similarity)

In [20]:
# Function to calculate RMSE values
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

### The item based approach gives an RMSE of 3.83 whereas the user based approach gives a slightly smaller RMSE 3.74 as calculated below

In [21]:
print ('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))

User-based CF RMSE: 3.73812045763


In [22]:
print ('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

Item-based CF RMSE: 3.82821984379


### Now let's check the recommendations using memory based approaches

In [23]:
from surprise import Reader, Dataset
from surprise import GridSearch
from surprise import Reader, Dataset
from surprise import GridSearch
from surprise import accuracy, evaluate, SVD, KNNBasic,KNNWithMeans
import random

In [24]:
# We will be reusing the df that was created at the start of this code with User, Business and Rating information only and 
# load it to dataset from surprise package which is used for memory based methods
df.head()

Unnamed: 0,user_id,business_id,stars
0,bLbSNkLggFnqwNNzzq-Ijw,dKdApYVFDSNYsNOso6NYlA,4
1,w0Yyvns5BCIL5s_7JuFrug,dKdApYVFDSNYsNOso6NYlA,3
2,VYOQLKuR0Ugy9lU-QiOuLw,dKdApYVFDSNYsNOso6NYlA,4
3,qlaVQkSxKcqfHJhoZU3rKg,dKdApYVFDSNYsNOso6NYlA,2
4,8OeTLey-p-WaL9ErNEci1Q,dKdApYVFDSNYsNOso6NYlA,2


In [25]:
# For the surprise Dataset, we need to initialize the rating scale in the reader as below
reader = Reader(rating_scale=(1,5))
# Load the data from the file using the reader format
data = Dataset.load_from_df(df,reader)
raw_ratings = data.raw_ratings
#random shuffle the data
random.shuffle(raw_ratings)
# Split the data into train and test
t = int(0.75*len(raw_ratings))
train_raw_ratings = raw_ratings[:t]
test_raw_ratings = raw_ratings[t:]
# Set the data to training
data.raw_ratings = train_raw_ratings 
data.split(n_folds =3) # cross validation


### Using SVD for recommendations

In [26]:
# Train on the training data created above
trainset = data.build_full_trainset()
algo = SVD() # SVD algorithm
algo.train(trainset)

In [27]:
# predict on the testset
testset = data.construct_testset(test_raw_ratings)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 1.0221


1.0220563619640592

### Item based using KNNBasic and KNNWithMeans algorithm

In [30]:
## KNN Basic Item based - using cosine similarity
sim_options_1 = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
algo_1 = KNNBasic(sim_options=sim_options_1)
# training knn basic
algo_1.train(trainset)
#predict on  test set
predictions_1 = algo_1.test(testset)
accuracy.rmse(predictions_1)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0958


1.0957770511833034

In [31]:
## KNN Basic Item based - using pearson similarity
sim_options_2 = {'name': 'pearson',
               'user_based': False  # compute  similarities between items
               }

algo_2 =  KNNBasic(sim_options=sim_options_2)
# training knn basic
algo_2.train(trainset)
#predict on  test set
predictions_2 = algo_2.test(testset)
accuracy.rmse(predictions_2)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.1566


1.1566202106964447

In [32]:
## KNN means Item based - cosine similarity
sim_options_3 = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
algo_3 = KNNWithMeans(sim_options=sim_options_3)
# training knn basic
algo_3.train(trainset)
#predict on  test set
predictions_3 = algo_3.test(testset)
accuracy.rmse(predictions_3)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0283


1.0282643916733905

In [33]:
## KNN means Item based - pearson similarity
sim_options_4 = {'name': 'pearson',
               'user_based': False  # compute  similarities between items
               }
algo_4 = KNNWithMeans(sim_options=sim_options_4)
# training knn basic
algo_4.train(trainset)
#predict on  test set
predictions_4 = algo_4.test(testset)
accuracy.rmse(predictions_4)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0844


1.0843867006365935

### User based using KNN Basic and KNN Means

In [34]:
## KNN Basic User based - using cosine similarity
sim_options_5 = {'name': 'cosine',
               'user_based': True  # compute  similarities between users
               }
algo_5 = KNNBasic(sim_options=sim_options_5)
# training knn basic
algo_5.train(trainset)
#predict on  test set
predictions_5 = algo_5.test(testset)
accuracy.rmse(predictions_5)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0799


1.0799172555500904

In [35]:
## KNN Basic User based - using pearson similarity
sim_options_6 = {'name': 'pearson',
               'user_based': True  # compute  similarities between users
               }
algo_6 = KNNBasic(sim_options=sim_options_6)
# training knn basic
algo_6.train(trainset)
#predict on  test set
predictions_6 = algo_6.test(testset)
accuracy.rmse(predictions_6)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.1392


1.1392014387250191

In [37]:
## KNN means User based - cosine similarity
sim_options_7 = {'name': 'cosine',
               'user_based': True  # compute similarities between users
               }
algo_7 = KNNWithMeans(sim_options=sim_options_7)
# training knn basic
algo_7.train(trainset)
#predict on  test set
predictions_7 = algo_7.test(testset)
accuracy.rmse(predictions_7)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0370


1.0370283747157898

In [38]:
## KNN means User based - pearson similarity
sim_options_8 = {'name': 'pearson',
               'user_based': True  # compute similarities between users
               }
algo_8 = KNNWithMeans(sim_options=sim_options_8)
# training knn basic
algo_8.train(trainset)
#predict on  test set
predictions_8 = algo_8.test(testset)
accuracy.rmse(predictions_8)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0868


1.0868231334842784