In [13]:
#import seaborn as sns
import numpy as np
import pandas as pd

from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from sklearn.model_selection import train_test_split
from surprise import SVD

from sklearn.metrics import pairwise_distances

import pymongo
from pymongo import MongoClient

In [14]:
# Initialize mongoDB client
client = MongoClient()
db = client.yelp

In [15]:
# Gather businesses data from DB
# create new collection with split categories
db.temp_businesses.drop()
agr = [{'$project': {'business_id':1,'name':1,'address':1,'city':1,'state':1,'postal_code':1,'latitude':1,'longitude':1,'stars':1,'review_count':1,'is_open':1,'attributes':1,'categories': {'$split':["$categories",","]},'hours':1}},{'$out': 'temp_businesses'}]
businesses_agr = db.businesses.aggregate(agr)
businesses_collection = db.temp_businesses
businesses_df = pd.DataFrame(list(businesses_collection.find({'review_count': {'$gt': 1}})))
businesses_count = businesses_df.shape[0]

In [20]:
# Obtain random business from businesses dataframe
random_number = np.random.randint(1,businesses_count)
business = businesses_df[businesses_df['business_id'] == 'rqcOZePlVvJP9EtzldIz0w'].squeeze()
business

_id                                      5e96645b6f14ef773b7ada5c
business_id                                rqcOZePlVvJP9EtzldIz0w
name                                               Desert Storage
address                                           2402 Atchley Dr
city                                                    Henderson
state                                                          NV
postal_code                                                 89052
latitude                                                   35.952
longitude                                                -115.093
stars                                                           5
review_count                                                    5
is_open                                                         0
attributes                 {'BusinessAcceptsCreditCards': 'True'}
hours           {'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...
categories                        [Local Services,  Self Storage]
Name: 18, 

In [21]:
# Get categories array
categories = business.get('categories')

In [22]:
# Get Businesses in categories ($all for all in categories, $in for any in categories) and filter by city to accomplish recommendation by context
businesses_in_categories_df = pd.DataFrame(list(businesses_collection.find({ "categories": {"$all": categories}})))
business_ids_in_categories = businesses_in_categories_df['business_id'].tolist()

In [24]:
# Now obtain all useful (greater than zero) reviews from the group of businesses_in_categories_df
reviews_df = pd.DataFrame(list(db.reviews.find({'business_id': {'$in': business_ids_in_categories}})))
reviews_df.stars = reviews_df.stars.astype('int')

In [25]:
# group reviews dataframe by number of reviews and remove those businesses that have just one review
reviews_group = reviews_df.groupby('business_id')['stars'].agg({'count'}).sort_values(by='count', ascending=False)
reviews_group = reviews_group[reviews_group['count'] < 2].reset_index()
reviews_df_filtered = reviews_df.merge(reviews_group, how="inner", on="business_id").drop('count', axis=1)
reviews_df_filtered = pd.concat([reviews_df,reviews_df_filtered])
reviews_df_filtered = reviews_df_filtered.drop_duplicates(keep=False)

In [27]:
# Create Surprise data
reader = Reader( rating_scale= (1,5))
train_df, test_df = train_test_split(reviews_df_filtered[['business_id','user_id','stars']], test_size=.2, stratify=reviews_df_filtered['business_id'])
train_set = Dataset.load_from_df(train_df, reader)
train_set = train_set.build_full_trainset()

In [28]:
# Setup algorithm using SVD
svd = SVD( n_factors = 60, n_epochs = 200, biased = True, lr_all = 0.01, reg_all = 0, init_mean = 0, init_std_dev = 0.01, verbose = False )
svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1626f42e0>

In [29]:
# Obtain pu matrix and get our business vector
pu = svd.pu
business_vector_innerid = train_set.to_inner_uid(business.get('business_id'))
business_vector = pu[business_vector_innerid]

In [30]:
# Calculate the euclidean distance between our business and everything else
distances = pairwise_distances(pu,business_vector.reshape(1, -1),'euclidean')
# Obtain an indirect array with the indices of the sorted distances array
ordered_distances_index = np.argsort(distances.flatten())

In [80]:
# If number of neighbours is greater than number_of_neighbours, return first number_of_neighbours, otherwise return what we have
number_of_neighbours = 10
neighbour_indices = []
if ordered_distances_index.shape[0] > number_of_neighbours:
    neighbour_indices = ordered_distances_index[1:number_of_neighbours+1]
else:
    neighbour_indices = ordered_distances_index

In [81]:
# Get neighbour ids of selected business
business_neighbours_ids = (train_set.to_raw_uid(rid) for rid in neighbour_indices)
business_neighbours = []
for neighbour in business_neighbours_ids:
    business_neighbours.append(neighbour)

In [82]:
# Query business ids to obtain dataframe of neighbours
businesses_neighbours_df = pd.DataFrame(list(businesses_collection.find({ "business_id": {"$in": business_neighbours} })))
businesses_neighbours_df

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,hours,categories
0,5e9c73d6d2d1710a29bdec5a,oKjd9T-8Lxr_TWQZiN2BRA,Julia Grambo - Cornel Realty,"5575 S Durango, Ste 107",Las Vegas,NV,89113,36.087726,-115.279776,5.0,4,1,"{'ByAppointmentOnly': 'False', 'BusinessAccept...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ...","[Home Services, Real Estate, Real Estate Age..."
1,5e9c73d6d2d1710a29be0cfd,ypXftd9bRXqFVHyud9UbjQ,Jack Burns - Realty One Group,"3530 S Val Vista Dr, Ste 114",Gilbert,AZ,85297,33.286087,-111.75729,5.0,4,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...","[Home Services, Real Estate Agents, Real Est..."
2,5e9c73d6d2d1710a29be2da7,2DfvzJN4SAiRdEw3gVxBBw,Arizona SOLD Sisters,"8388 E Hartford Drive, Ste 100",Scottsdale,AZ,85255,33.642005,-111.899244,5.0,5,1,"{'ByAppointmentOnly': 'False', 'BusinessAccept...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Home Services, Real Estate, Real Estate Age..."
3,5e9c73d7d2d1710a29be9fd3,ze5kafJm7b-BP6p7tsN3-g,Danilo Martinez - King Realty Group,"8 Sampson Mews, Suite 201",Toronto,ON,M3C 0H5,43.735056,-79.344129,5.0,3,1,,"{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...","[Home Services, Real Estate Agents, Real Est..."
4,5e9c73d7d2d1710a29beaaf9,XYGW3626I4SuSnVPRrZ6Sg,Amy Koch - Kreate Property Group,22912 N 39th Ter,Phoenix,AZ,85050,33.694832,-111.985482,5.0,4,1,{'ByAppointmentOnly': 'True'},"{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ...","[Home Services, Real Estate, Real Estate Age..."
5,5e9c73d9d2d1710a29bf6644,xV9fOn7haAXpklW8Rmsp_g,Dustin Dufour - Berkshire Hathaway Homeservices,"8850 W Sunset Rd, Ste 200",Las Vegas,NV,89148,36.071782,-115.284717,5.0,3,1,"{'BusinessAcceptsBitcoin': 'False', 'BusinessA...","{'Monday': '9:0-17:30', 'Tuesday': '9:0-17:30'...","[Home Services, Real Estate Services, Real E..."
6,5e9c73d9d2d1710a29bfa43c,5PGwo7RuTWfn-66G1JNE6g,Pam Carlson - Realty Executives,"1528 E Williams Field Rd, Ste 106",Gilbert,AZ,85295,33.307702,-111.757237,5.0,4,1,{'BusinessAcceptsCreditCards': 'False'},"{'Monday': '7:0-21:0', 'Tuesday': '7:0-21:0', ...","[Home Services, Real Estate, Real Estate Age..."
7,5e9c73dad2d1710a29bff51a,hLcJwsQPyGpGUf-Wzn_ZDw,Tucker Blalock - The Brokery,4546 N 40th St,Phoenix,AZ,85018,33.504221,-111.995695,5.0,16,1,"{'BusinessAcceptsCreditCards': 'False', 'Busin...","{'Monday': '6:30-21:0', 'Tuesday': '6:30-21:0'...","[Home Services, Property Management, Real Es..."
8,5e9c73dbd2d1710a29c03602,ll5LOr-RYkeEPeAjnrz5zA,Ken Ramsay - Harvey Kalles Real Estate,2145 Avenue Road,Toronto,ON,M5M 4B2,43.739673,-79.421201,5.0,3,1,{'ByAppointmentOnly': 'False'},"{'Monday': '8:0-0:0', 'Tuesday': '8:0-0:0', 'W...","[Home Services, Real Estate Agents, Real Est..."
9,5e9c73dcd2d1710a29c0f2bb,42WkEwAf8zoKDC9xFlIE8w,Chris Dowlen - Keller Williams Realty,"2230 Corporate Cir, Ste 250",Henderson,NV,89074,36.02816,-115.088428,5.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'BusinessA...","{'Monday': '7:0-19:0', 'Tuesday': '7:0-19:0', ...","[Home Services, Real Estate, Real Estate Age..."


## Export busines records to line separated json

In [83]:
businesses_records = businesses_neighbours_df.astype({'_id': 'str'})
businesses_records

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,hours,categories
0,5e9c73d6d2d1710a29bdec5a,oKjd9T-8Lxr_TWQZiN2BRA,Julia Grambo - Cornel Realty,"5575 S Durango, Ste 107",Las Vegas,NV,89113,36.087726,-115.279776,5.0,4,1,"{'ByAppointmentOnly': 'False', 'BusinessAccept...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ...","[Home Services, Real Estate, Real Estate Age..."
1,5e9c73d6d2d1710a29be0cfd,ypXftd9bRXqFVHyud9UbjQ,Jack Burns - Realty One Group,"3530 S Val Vista Dr, Ste 114",Gilbert,AZ,85297,33.286087,-111.75729,5.0,4,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...","[Home Services, Real Estate Agents, Real Est..."
2,5e9c73d6d2d1710a29be2da7,2DfvzJN4SAiRdEw3gVxBBw,Arizona SOLD Sisters,"8388 E Hartford Drive, Ste 100",Scottsdale,AZ,85255,33.642005,-111.899244,5.0,5,1,"{'ByAppointmentOnly': 'False', 'BusinessAccept...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Home Services, Real Estate, Real Estate Age..."
3,5e9c73d7d2d1710a29be9fd3,ze5kafJm7b-BP6p7tsN3-g,Danilo Martinez - King Realty Group,"8 Sampson Mews, Suite 201",Toronto,ON,M3C 0H5,43.735056,-79.344129,5.0,3,1,,"{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...","[Home Services, Real Estate Agents, Real Est..."
4,5e9c73d7d2d1710a29beaaf9,XYGW3626I4SuSnVPRrZ6Sg,Amy Koch - Kreate Property Group,22912 N 39th Ter,Phoenix,AZ,85050,33.694832,-111.985482,5.0,4,1,{'ByAppointmentOnly': 'True'},"{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ...","[Home Services, Real Estate, Real Estate Age..."
5,5e9c73d9d2d1710a29bf6644,xV9fOn7haAXpklW8Rmsp_g,Dustin Dufour - Berkshire Hathaway Homeservices,"8850 W Sunset Rd, Ste 200",Las Vegas,NV,89148,36.071782,-115.284717,5.0,3,1,"{'BusinessAcceptsBitcoin': 'False', 'BusinessA...","{'Monday': '9:0-17:30', 'Tuesday': '9:0-17:30'...","[Home Services, Real Estate Services, Real E..."
6,5e9c73d9d2d1710a29bfa43c,5PGwo7RuTWfn-66G1JNE6g,Pam Carlson - Realty Executives,"1528 E Williams Field Rd, Ste 106",Gilbert,AZ,85295,33.307702,-111.757237,5.0,4,1,{'BusinessAcceptsCreditCards': 'False'},"{'Monday': '7:0-21:0', 'Tuesday': '7:0-21:0', ...","[Home Services, Real Estate, Real Estate Age..."
7,5e9c73dad2d1710a29bff51a,hLcJwsQPyGpGUf-Wzn_ZDw,Tucker Blalock - The Brokery,4546 N 40th St,Phoenix,AZ,85018,33.504221,-111.995695,5.0,16,1,"{'BusinessAcceptsCreditCards': 'False', 'Busin...","{'Monday': '6:30-21:0', 'Tuesday': '6:30-21:0'...","[Home Services, Property Management, Real Es..."
8,5e9c73dbd2d1710a29c03602,ll5LOr-RYkeEPeAjnrz5zA,Ken Ramsay - Harvey Kalles Real Estate,2145 Avenue Road,Toronto,ON,M5M 4B2,43.739673,-79.421201,5.0,3,1,{'ByAppointmentOnly': 'False'},"{'Monday': '8:0-0:0', 'Tuesday': '8:0-0:0', 'W...","[Home Services, Real Estate Agents, Real Est..."
9,5e9c73dcd2d1710a29c0f2bb,42WkEwAf8zoKDC9xFlIE8w,Chris Dowlen - Keller Williams Realty,"2230 Corporate Cir, Ste 250",Henderson,NV,89074,36.02816,-115.088428,5.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'BusinessA...","{'Monday': '7:0-19:0', 'Tuesday': '7:0-19:0', ...","[Home Services, Real Estate, Real Estate Age..."


In [84]:
businesses_records.to_json('../data/businesses_neighbours.json', orient='records', lines=True)

## Export reviews of the selected businesses

In [85]:
#businesses_neighbours_reviews_df = pd.DataFrame(list(db.reviews.find({ "business_id": {"$in": business_neighbours}, 'useful': {'$gt': 0} })))
businesses_neighbours_reviews_df = pd.DataFrame(list(db.reviews.find({ "business_id": {"$in": business_neighbours} })))
businesses_neighbours_reviews_df

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,5e9249c84042b2eb9804a80e,YlS9L1R6qcHpe9tWZuCjrg,tx1nuFCSwrOwUy5al4sygQ,2DfvzJN4SAiRdEw3gVxBBw,5.0,0,0,0,I've given Arizona Sold Sisters the highest re...,2019-03-30 19:09:52
1,5e9249cd4042b2eb9807052d,6MwgkVUKm89bXYXv1UZ49g,PTpCPqx91Ot0s8k8QbNYYg,2DfvzJN4SAiRdEw3gVxBBw,5.0,0,0,0,Wendy and Jill helped me with the sale of a pr...,2019-04-04 15:02:14
2,5e9249ce4042b2eb980797f1,V07d2e8uKUnu0tUf0QnoWg,liD61ZD1DRXJDu6AW-aW2A,2DfvzJN4SAiRdEw3gVxBBw,5.0,0,0,0,AZ sold sisters made the process of selling my...,2019-04-11 01:46:43
3,5e9249ce4042b2eb9807b8f5,xbcKyYiUUFqC86L-l4aoXg,GohTl07M4AFcLL6mHNQDQA,2DfvzJN4SAiRdEw3gVxBBw,5.0,0,0,0,Wendy and Jill helped us for the second time! ...,2019-03-31 05:25:12
4,5e9249ce4042b2eb9807e666,ygYZIM7kjvTyTEcW9UUCGQ,6lSZwA8iX683BPc7f-bIgA,2DfvzJN4SAiRdEw3gVxBBw,5.0,0,0,0,HIRE THEM! Wendy and Jill were absolutely incr...,2019-07-03 19:07:54
5,5e924a894042b2eb986378df,E4MTf5qRWHcDDH4_h5a5sA,wmoODifgWZhc3fANRufUkw,42WkEwAf8zoKDC9xFlIE8w,5.0,0,0,0,"If it wasn't for this team, we wouldn't be liv...",2018-08-31 01:15:07
6,5e924a8b4042b2eb9864812b,7B_jfe_0pfNQcy48RzNLQw,5rA0Yglr1ZPoG7EkXQOnYA,42WkEwAf8zoKDC9xFlIE8w,5.0,0,0,0,"Friendly, professional, efficient and fast. We...",2017-02-02 19:44:39
7,5e924a934042b2eb98681556,uEgd6ZyEK9KVkbK8zlcUyg,Sc53RPpOtXdAn0AAgZGziQ,42WkEwAf8zoKDC9xFlIE8w,5.0,0,0,0,Chris and his team are everything you want/exp...,2018-01-04 21:48:00
8,5e924a964042b2eb9869800c,VvN8nwzFCesLadRIi21B3w,_zK6XJ6tjdAGTfp2NIijMQ,42WkEwAf8zoKDC9xFlIE8w,5.0,0,0,0,"Amazing experience, couldn't have been better ...",2016-12-13 19:30:02
9,5e924a2e4042b2eb9837d782,BzZKiD5pQji8hlq7iEJV6g,tdO_P1NEKkxj9012xiejdg,5PGwo7RuTWfn-66G1JNE6g,5.0,0,1,0,If you're looking for a realtor in the Phoenix...,2014-05-16 14:56:44


In [86]:
businesses_neighbours_reviews_records = businesses_neighbours_reviews_df.astype({'_id': 'str'})
businesses_neighbours_reviews_records.to_json('../data/businesses_neighbours_reviews.json', orient='records', lines=True)