In [64]:
#import seaborn as sns
import numpy as np
import pandas as pd

from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import SVD

from sklearn.metrics import pairwise_distances

import pymongo
from pymongo import MongoClient

In [65]:
# Connect to MongoDB with the driver's default settings and select the `yelp` database
client = MongoClient()
db = client['yelp']

In [66]:
# Gather businesses data from DB.
# Rebuild the helper collection `temp_businesses`: a copy of `businesses` whose
# comma-separated `categories` string is stored as an array of category names, so it
# can be queried with array operators ($all / $in).
db.temp_businesses.drop()
agr = [
    {'$project': {
        'business_id': 1,
        'name': 1,
        'address': 1,
        'city': 1,
        'state': 1,
        'postal_code': 1,
        'latitude': 1,
        'longitude': 1,
        'stars': 1,
        'review_count': 1,
        'is_open': 1,
        'attributes': 1,
        # Splitting on "," alone leaves a leading space on every category except the
        # first one, so the same category compares unequal depending on where it sits
        # in the string. Trim each piece so names match regardless of position.
        # ($trim needs MongoDB 4.0+; a null `categories` still yields null.)
        'categories': {'$map': {
            'input': {'$split': ["$categories", ","]},
            'as': 'category',
            'in': {'$trim': {'input': '$$category'}},
        }},
        'hours': 1,
    }},
    {'$out': 'temp_businesses'},
]
# Running the pipeline writes the projected documents to `temp_businesses` via $out.
businesses_agr = db.businesses.aggregate(agr)
businesses_collection = db.temp_businesses
# Load the whole helper collection into memory as a dataframe.
businesses_df = pd.DataFrame(list(businesses_collection.find()))
businesses_count = businesses_df.shape[0]

In [67]:
# Obtain a random business from the businesses dataframe.
# randint's upper bound is exclusive, so drawing from [0, businesses_count) covers every
# positional index. The previous lower bound of 1 could never select the first row and
# raised ValueError when the frame held a single business.
random_number = np.random.randint(0, businesses_count)
business = businesses_df.iloc[random_number]
business

_id                                      5e9664656f14ef773b7c5212
business_id                                Y7al_T_o06-AmCdpJfcNfA
name                                       American Executive Inn
address                                            1554 W Main St
city                                                         Mesa
state                                                          AZ
postal_code                                                 85201
latitude                                                  33.4151
longitude                                                -111.864
stars                                                           2
review_count                                                   23
is_open                                                         1
attributes      {'RestaurantsPriceRange2': '1', 'WiFi': 'u'fre...
hours                                                        None
categories      [Hotels,  Event Planning & Services,  Hotels &...
Name: 9620

In [68]:
# Category list of the selected business (None when the field is absent)
categories = business.get('categories', None)

In [69]:
# Businesses that carry every category of the selected business.
# ($all requires all listed categories; switch to $in to accept any one of them.)
businesses_in_categories_df = pd.DataFrame(
    [doc for doc in businesses_collection.find({"categories": {"$all": categories}})]
)
business_ids_in_categories = businesses_in_categories_df['business_id'].to_list()

In [70]:
# Reviews of the category-matched businesses, restricted to those voted useful at least once
reviews_df = pd.DataFrame(
    list(
        db.reviews.find(
            {
                'business_id': {'$in': business_ids_in_categories},
                'useful': {'$gt': 0},
            }
        )
    )
)
# Store the star ratings as integers
reviews_df['stars'] = reviews_df.stars.astype('int')

In [71]:
# Create Surprise data.
# Surprise reads the three columns as (user, item, rating). `business_id` is passed
# first, so businesses take the "user" role: their latent factors end up in `svd.pu`
# and their ids are resolved with `to_inner_uid` further down.
reader = Reader(rating_scale=(1, 5))
surprise_dataset = Dataset.load_from_df(reviews_df[['business_id', 'user_id', 'stars']], reader)
# Train on every rating. With a random 80/20 split, all ratings of the selected business
# could land in the held-out fold, in which case it is unknown to the trainset and
# `train_set.to_inner_uid(...)` raises ValueError.
train_set = surprise_dataset.build_full_trainset()
# `test_set` is kept so existing references keep working. NOTE: it now overlaps with
# `train_set`, so it must not be used to score a model fitted on `train_set`.
test_set = train_test_split(surprise_dataset, test_size=.2)[1]

In [72]:
# Configure and train the SVD matrix-factorisation model on the training set
svd = SVD(
    n_factors=60,       # size of each latent vector
    n_epochs=200,
    biased=True,
    lr_all=0.01,
    reg_all=0,          # no regularisation
    init_mean=0,
    init_std_dev=0.01,
    verbose=False,
)
svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1670393a0>

In [73]:
# Obtain pu matrix and get our business vector.
# `business_id` was loaded into the dataset's first (user) column, so the rows of
# `svd.pu` are indexed by the inner ids that `to_inner_uid` returns for business ids.
pu = svd.pu
# Map the selected business's raw id to its inner id in the trainset
# (presumably raises if the business has no rating in `train_set` - TODO confirm).
business_vector_innerid = train_set.to_inner_uid(business.get('business_id'))
# Latent factor vector of the selected business
business_vector = pu[business_vector_innerid]

In [74]:
# Euclidean distance from the selected business's vector to every row of `pu`;
# the result has one row per `pu` row and a single column.
distances = pairwise_distances(pu, business_vector[np.newaxis, :], metric='euclidean')
# Row indices of `pu` ordered from the closest to the farthest vector
ordered_distances_index = distances.ravel().argsort()

In [75]:
# Keep at most `number_of_neighbours` of the closest businesses, never the selected
# business itself.
number_of_neighbours = 10
# Exclude the selected business by its inner id rather than assuming it sorts first.
# Previously, when there were `number_of_neighbours` candidates or fewer, the whole
# index array was returned, so the business was listed as its own neighbour.
# Slicing already copes with arrays shorter than `number_of_neighbours`.
neighbour_indices = ordered_distances_index[
    ordered_distances_index != business_vector_innerid
][:number_of_neighbours]

In [76]:
# Translate the neighbours' inner ids back into raw business ids
business_neighbours_ids = (train_set.to_raw_uid(inner_id) for inner_id in neighbour_indices)
# Materialise the lazy generator into a list so it can be reused in several queries
business_neighbours = list(business_neighbours_ids)

In [77]:
# Fetch the full business documents of the neighbours into a dataframe.
# No sort is requested, so rows are not guaranteed to follow the distance ranking.
businesses_neighbours_df = pd.DataFrame(
    [doc for doc in businesses_collection.find({"business_id": {"$in": business_neighbours}})]
)
businesses_neighbours_df

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,hours,categories
0,5e96645c6f14ef773b7b1149,5FVBaD1poRZQeqUDxbWWQA,Sleep Inn Phoenix North I-17,18235 N 27th Ave,Phoenix,AZ,85053,33.65265,-112.116148,3.0,31,1,"{'RestaurantsPriceRange2': '2', 'WiFi': 'u'fre...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Event Planning & Services, Hotels &..."
1,5e96645e6f14ef773b7b63b7,uNjAp7k2_ydqZY3W1SWgYQ,Americas Best Value Inn,424 W Van Buren St,Phoenix,AZ,85003,33.451565,-112.080081,4.0,28,0,"{'BusinessAcceptsCreditCards': 'True', 'WiFi':...","{'Monday': '1:0-1:0', 'Tuesday': '1:0-1:0', 'W...","[Hotels, Event Planning & Services, Hotels &..."
2,5e9664646f14ef773b7c256c,NqGtKSw1EnnQ2pbCN9VrfQ,Quality Inn,6347 E. Southern Ave.,Mesa,AZ,85206,33.392346,-111.693275,2.0,3,0,"{'WiFi': ''free'', 'BusinessAcceptsCreditCards...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Hotels & Travel, Event Planning & S..."
3,5e9664666f14ef773b7c69aa,8_QKutB_FqV05o38ySEG7A,Gateway Motel,928 Las Vegas Blvd S,Las Vegas,NV,89101,36.159194,-115.14768,1.5,20,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...",,"[Hotels, Hotels & Travel, Event Planning & S..."
4,5e9664676f14ef773b7c8534,GK_c37NNNGB13syjiaPNoA,Sonesta ES Suites Cleveland Airport,17525 Rosbough Dr,Middleburg Heights,OH,44130,41.374085,-81.822052,3.5,28,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Event Planning & Services, Hotels &..."
5,5e9664686f14ef773b7cb2b4,AF51nM-fjYjBVJRo6cR_Ow,Hampton Inn Phoenix-Midtown-Downtown Area,160 W Catalina Dr,Phoenix,AZ,85013,33.482432,-112.077399,4.0,60,1,"{'WiFi': 'u'free'', 'BusinessAcceptsBitcoin': ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Hotels & Travel, Event Planning & S..."
6,5e9664686f14ef773b7cca51,PdKoeg9Rh_1BefgUXTDIbg,SpringHill Suites West Mifflin,1000 Regis Ave,Pittsburgh,PA,15236,40.347161,-79.951463,3.0,18,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Hotels & Travel, Event Planning & S..."
7,5e9664696f14ef773b7ceb62,H03TuN-fMr4lwQP3wJobYA,Days Inn-Calgary South,3828 MacLeod Trl SE,Calgary,AB,T2G 2R2,51.02037,-114.060787,1.5,10,1,"{'WiFi': 'u'free'', 'RestaurantsPriceRange2': ...",,"[Hotels, Hotels & Travel, Event Planning & S..."
8,5e9664696f14ef773b7cf104,_Dz7a9gMBJgo_JTO0PPuxw,Clarion Hotel and Conference Centre,2120 16 Ave NE,Calgary,AB,T2E 1L4,51.067627,-114.010285,3.0,18,1,"{'RestaurantsPriceRange2': '2', 'WiFi': 'u'fre...",,"[Hotels, Hotels & Travel, Event Planning & S..."
9,5e96646e6f14ef773b7daac9,gjR5bIyL4ETB9CtWgNL64A,Skyline Hotel,1741 N Boulder Hwy,Henderson,NV,89011,36.062463,-115.008362,4.5,21,1,"{'WiFi': 'u'free'', 'BusinessAcceptsCreditCard...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Hotels & Travel, Event Planning & S..."


## Export business records to line-separated JSON

In [78]:
# Copy of the neighbours frame with the Mongo `_id` column cast to string for the JSON export
businesses_records = businesses_neighbours_df.astype(dtype={'_id': 'str'})
businesses_records

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,hours,categories
0,5e96645c6f14ef773b7b1149,5FVBaD1poRZQeqUDxbWWQA,Sleep Inn Phoenix North I-17,18235 N 27th Ave,Phoenix,AZ,85053,33.65265,-112.116148,3.0,31,1,"{'RestaurantsPriceRange2': '2', 'WiFi': 'u'fre...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Event Planning & Services, Hotels &..."
1,5e96645e6f14ef773b7b63b7,uNjAp7k2_ydqZY3W1SWgYQ,Americas Best Value Inn,424 W Van Buren St,Phoenix,AZ,85003,33.451565,-112.080081,4.0,28,0,"{'BusinessAcceptsCreditCards': 'True', 'WiFi':...","{'Monday': '1:0-1:0', 'Tuesday': '1:0-1:0', 'W...","[Hotels, Event Planning & Services, Hotels &..."
2,5e9664646f14ef773b7c256c,NqGtKSw1EnnQ2pbCN9VrfQ,Quality Inn,6347 E. Southern Ave.,Mesa,AZ,85206,33.392346,-111.693275,2.0,3,0,"{'WiFi': ''free'', 'BusinessAcceptsCreditCards...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Hotels & Travel, Event Planning & S..."
3,5e9664666f14ef773b7c69aa,8_QKutB_FqV05o38ySEG7A,Gateway Motel,928 Las Vegas Blvd S,Las Vegas,NV,89101,36.159194,-115.14768,1.5,20,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...",,"[Hotels, Hotels & Travel, Event Planning & S..."
4,5e9664676f14ef773b7c8534,GK_c37NNNGB13syjiaPNoA,Sonesta ES Suites Cleveland Airport,17525 Rosbough Dr,Middleburg Heights,OH,44130,41.374085,-81.822052,3.5,28,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Event Planning & Services, Hotels &..."
5,5e9664686f14ef773b7cb2b4,AF51nM-fjYjBVJRo6cR_Ow,Hampton Inn Phoenix-Midtown-Downtown Area,160 W Catalina Dr,Phoenix,AZ,85013,33.482432,-112.077399,4.0,60,1,"{'WiFi': 'u'free'', 'BusinessAcceptsBitcoin': ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Hotels & Travel, Event Planning & S..."
6,5e9664686f14ef773b7cca51,PdKoeg9Rh_1BefgUXTDIbg,SpringHill Suites West Mifflin,1000 Regis Ave,Pittsburgh,PA,15236,40.347161,-79.951463,3.0,18,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Hotels & Travel, Event Planning & S..."
7,5e9664696f14ef773b7ceb62,H03TuN-fMr4lwQP3wJobYA,Days Inn-Calgary South,3828 MacLeod Trl SE,Calgary,AB,T2G 2R2,51.02037,-114.060787,1.5,10,1,"{'WiFi': 'u'free'', 'RestaurantsPriceRange2': ...",,"[Hotels, Hotels & Travel, Event Planning & S..."
8,5e9664696f14ef773b7cf104,_Dz7a9gMBJgo_JTO0PPuxw,Clarion Hotel and Conference Centre,2120 16 Ave NE,Calgary,AB,T2E 1L4,51.067627,-114.010285,3.0,18,1,"{'RestaurantsPriceRange2': '2', 'WiFi': 'u'fre...",,"[Hotels, Hotels & Travel, Event Planning & S..."
9,5e96646e6f14ef773b7daac9,gjR5bIyL4ETB9CtWgNL64A,Skyline Hotel,1741 N Boulder Hwy,Henderson,NV,89011,36.062463,-115.008362,4.5,21,1,"{'WiFi': 'u'free'', 'BusinessAcceptsCreditCard...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...","[Hotels, Hotels & Travel, Event Planning & S..."


In [79]:
# Write one JSON object per line (JSON Lines), one line per neighbour business
businesses_records.to_json(
    path_or_buf='../data/businesses_neighbours.json',
    orient='records',
    lines=True,
)

## Export reviews of the selected businesses

In [80]:
# All reviews of the neighbour businesses. Unlike the training query, no `useful` > 0
# filter is applied here, so reviews without useful votes are included too.
businesses_neighbours_reviews_df = pd.DataFrame(
    [doc for doc in db.reviews.find({"business_id": {"$in": business_neighbours}})]
)
businesses_neighbours_reviews_df

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,5e9664963b612dc2f0e488c0,-86eJuYugapSTs5dvCZKtw,qjX40bTubCboIQ6S4mu54A,5FVBaD1poRZQeqUDxbWWQA,1.0,3,0,0,Checked in with my family of five for two room...,2013-12-30 14:52:18
1,5e9664973b612dc2f0e4b13d,j57XFIDuzcpBrcptbrJSwA,USljZmbtpya0BEKocSLbUw,5FVBaD1poRZQeqUDxbWWQA,1.0,1,0,0,AT first it seemed like a good hotel. I'm not ...,2013-01-28 19:03:36
2,5e9664973b612dc2f0e4c04b,Hb_kJxRWevasbqHfD_YBQw,mEXLSpAc0A5zdW5iZEosSg,5FVBaD1poRZQeqUDxbWWQA,1.0,1,0,0,I have stayed here 7 times in the past few mon...,2017-03-15 22:17:31
3,5e9664983b612dc2f0e511eb,qpdvJX2M17LZrrtamrpgjA,YcdgoZzc9yuGN1pBBxevRQ,5FVBaD1poRZQeqUDxbWWQA,5.0,0,0,0,Check-in war sehr zügig und der Mitarbeiter ex...,2013-10-06 15:22:27
4,5e9664993b612dc2f0e5448a,ivzJHezSj4594qkg6DbS9g,cpOqABn6YiktZqMrJUB8Aw,5FVBaD1poRZQeqUDxbWWQA,4.0,2,0,0,"the waffle factor:\n\n i don't work, haven't f...",2012-11-08 15:29:18
...,...,...,...,...,...,...,...,...,...,...
237,5e9665cd3b612dc2f0519561,mfC8GXG3j38ip_KErwe1dg,fCExFIeu2BT35FSx2Yz2aA,gjR5bIyL4ETB9CtWgNL64A,5.0,4,0,2,So most everyone from 'round these parts are v...,2018-02-10 05:36:37
238,5e9665cd3b612dc2f051c269,LPNViYqzq4N7iwMEFvFN_g,A2P6QP1T0An81jDnB9OnzQ,gjR5bIyL4ETB9CtWgNL64A,5.0,0,0,1,Great stay. Clean updated rooms. Comfortable b...,2019-03-20 17:42:25
239,5e9665ce3b612dc2f0520816,-J6g1v3wGtT8VDxlhoPk8w,Z5_2D0duy6vBnMVHooEwEQ,gjR5bIyL4ETB9CtWgNL64A,1.0,3,0,0,DO NOT BOOK HERE. At least not if you are plan...,2018-09-02 11:12:09
240,5e9665ce3b612dc2f0521b0b,TcAUyiFCd_oAxNjTEKEL_A,qOlt6hHXxUijPJ8iwlo6hA,gjR5bIyL4ETB9CtWgNL64A,5.0,0,0,0,Staff was incredible and the room was at a ver...,2019-06-29 03:47:43


In [81]:
# Cast the Mongo `_id` column to string, then write the reviews as JSON Lines
businesses_neighbours_reviews_records = businesses_neighbours_reviews_df.astype(dtype={'_id': 'str'})
businesses_neighbours_reviews_records.to_json(
    path_or_buf='../data/businesses_neighbours_reviews.json',
    orient='records',
    lines=True,
)