In [1]:
#import seaborn as sns
import numpy as np
import pandas as pd

from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import SVD

from sklearn.metrics import pairwise_distances

import pymongo
from pymongo import MongoClient

In [2]:
# Initialize mongoDB client
client = MongoClient()
db = client.yelp

In [5]:
# Gather businesses data from DB
# create new collection with split categories
db.temp_businesses.drop()
agr = [{'$project': {'business_id':1,'name':1,'address':1,'city':1,'state':1,'postal_code':1,'latitude':1,'longitude':1,'stars':1,'review_count':1,'is_open':1,'attributes':1,'categories': {'$split':["$categories",","]},'hours':1}},{'$out': 'temp_businesses'}]
businesses_agr = db.businesses.aggregate(agr)
businesses_collection = db.temp_businesses
businesses_df = pd.DataFrame(list(businesses_collection.find()))
businesses_count = businesses_df.shape[0]

In [6]:
# Obtain random business from businesses dataframe
random_number = np.random.randint(1,businesses_count)
business = businesses_df.iloc[random_number]
business

_id                                      5e9664676f14ef773b7c8847
business_id                                ITRBrOOVQJEgc5dbxV1Bbw
name                                         La Flor De Michoacan
address                                       9550 W Van Buren St
city                                                     Tolleson
state                                                          AZ
postal_code                                                 85353
latitude                                                  33.4509
longitude                                                -112.265
stars                                                         3.5
review_count                                                   26
is_open                                                         0
attributes      {'BikeParking': 'False', 'RestaurantsReservati...
hours           {'Monday': '6:0-22:0', 'Tuesday': '6:0-22:0', ...
categories                                [Restaurants,  Mexican]
Name: 1100

In [7]:
# Get categories array
categories = business.get('categories')

In [8]:
# Get Businesses in categories ($all for all in categories, $in for any in categories)
businesses_in_categories_df = pd.DataFrame(list(businesses_collection.find({ "categories": {"$all": categories} })))
business_ids_in_categories = businesses_in_categories_df['business_id'].tolist()

In [9]:
# Now obtain all useful (greater than zero) reviews from the group of businesses_in_categories_df
reviews_df = pd.DataFrame(list(db.reviews.find({'business_id': {'$in': business_ids_in_categories}, 'useful': {'$gt': 0}})))
reviews_df.stars = reviews_df.stars.astype('int')

In [10]:
# Create Surprise data
reader = Reader( rating_scale= (1,5))
surprise_dataset = Dataset.load_from_df(reviews_df[['business_id','user_id','stars']], reader)
train_set, test_set = train_test_split(surprise_dataset, test_size=.2)

In [11]:
# Setup algorithm using SVD
svd = SVD( n_factors = 5, n_epochs = 200, biased = True, lr_all = 0.005, reg_all = 0, init_mean = 0, init_std_dev = 0.01, verbose = False )
svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x177e26eb0>

In [12]:
# Obtain pu matrix and get our business vector
pu = svd.pu
business_vector_innerid = train_set.to_inner_uid(business.get('business_id'))
business_vector = pu[business_vector_innerid]

In [13]:
# Calculate the euclidean distance between our business and everything else
distances = pairwise_distances(pu,business_vector.reshape(1, -1),'euclidean')
# Obtain an indirect array with the indices of the sorted distances array
ordered_distances_index = np.argsort(distances.flatten())

In [14]:
# If number of neighbours is greater than number_of_neighbours, return first number_of_neighbours, otherwise return what we have
number_of_neighbours = 10
neighbour_indices = []
if ordered_distances_index.shape[0] > number_of_neighbours:
    neighbour_indices = ordered_distances_index[1:number_of_neighbours+1]
else:
    neighbour_indices = ordered_distances_index

In [15]:
# Get neighbour ids of selected business
business_neighbours_ids = (train_set.to_raw_uid(rid) for rid in neighbour_indices)
business_neighbours = []
for neighbour in business_neighbours_ids:
    business_neighbours.append(neighbour)

In [44]:
# Query business ids to obtain dataframe of neighbours
businesses_neighbours_df = pd.DataFrame(list(businesses_collection.find({ "business_id": {"$in": business_neighbours} })))
businesses_neighbours_df

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,hours,categories
0,5e96645d6f14ef773b7b3e43,lJh661dhjD5BDz5QrBEUeQ,Filiberto's Mexican Food,"3855 W. Ray Rd, Ste 8",Chandler,AZ,85226,33.319328,-111.907414,3.0,44,1,"{'NoiseLevel': 'u'average'', 'BusinessParking'...","{'Monday': '6:30-22:0', 'Tuesday': '6:30-22:0'...","[Restaurants, Mexican]"
1,5e96645e6f14ef773b7b5844,luOWGdtAfsbrZ5kC_cjuYA,Ayala's AZ Kitchen Mexican Cafe,8427 W Peoria Ave,Peoria,AZ,85345,33.581088,-112.240497,4.0,35,0,"{'Caters': 'True', 'BikeParking': 'True', 'Out...","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'...","[Restaurants, Mexican]"
2,5e9664616f14ef773b7baec5,fZzgTEHDElm_R6gsYxmeBw,San Diego Bay Restaurant,"9201 S Avenida Del Yaqui, Ste 6",Guadalupe,AZ,85283,33.363353,-111.962867,4.0,102,1,"{'WiFi': ''no'', 'BikeParking': 'True', 'Resta...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ...","[Restaurants, Mexican, Seafood]"
3,5e9664676f14ef773b7c8a4d,UhyDVz7npxKUl9BfMpfmBw,Frijoles and Frescas Grilled Tacos,4300 Meadows Ln,Las Vegas,NV,89107,36.172544,-115.195724,3.5,34,1,"{'BikeParking': 'True', 'OutdoorSeating': 'Fal...","{'Monday': '10:30-21:0', 'Tuesday': '10:30-21:...","[Restaurants, Mexican, Food Court]"
4,5e9664676f14ef773b7c9dba,036SeqOKaKx89Liw5RDKqA,Francisco's Mexican Food,"3233 E Chandler Blvd, Ste 6-A",Phoenix,AZ,85048,33.304277,-112.012534,3.5,40,1,"{'Ambience': '{'touristy': False, 'hipster': F...","{'Monday': '8:0-20:0', 'Tuesday': '8:0-20:0', ...","[Restaurants, Mexican]"
5,5e9664686f14ef773b7ccf13,ThCqEavXwVOQu7t5LsITdw,Migos,1821 W Baseline Rd,Guadalupe,AZ,85283,33.377776,-111.966816,3.5,21,0,"{'RestaurantsReservations': 'True', 'Ambience'...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...","[Restaurants, Mexican]"
6,5e9664696f14ef773b7cd3ba,L_X5eAd4moE40AsX6Havhg,1900 Mexican Grill,5110-1E Park Rd,Charlotte,NC,28209,35.161133,-80.849282,3.5,112,1,"{'Alcohol': 'u'full_bar'', 'BikeParking': 'Tru...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...","[Restaurants, Mexican]"
7,5e9664696f14ef773b7cefba,SVu6fPieP5noUcCWRUdfag,Maskadores,"1229 S Power Rd, Ste 102",Mesa,AZ,85206,33.392657,-111.683878,4.5,112,1,"{'RestaurantsTakeOut': 'True', 'Alcohol': ''no...","{'Monday': '10:0-20:0', 'Tuesday': '10:0-20:0'...","[Restaurants, Mexican, Tacos]"
8,5e96646f6f14ef773b7dd4c6,uhroo5AV1dcCHpI6ZQIj8w,Rubio's Coastal Grill,"884 W Warner Rd, Ste B-8",Gilbert,AZ,85233,33.335917,-111.808875,3.5,90,1,"{'WiFi': ''no'', 'GoodForKids': 'True', 'Resta...","{'Monday': '10:0-21:30', 'Tuesday': '10:0-21:3...","[Restaurants, Salad, Caterers, Mexican, Se..."
9,5e96646f6f14ef773b7df1a1,nSD-uxCMb745bH3dxoko9Q,Taco Bell,2453 Hwy 160,Tega Cay,SC,29708,35.04967,-80.99063,2.5,37,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsGoo...","{'Monday': '7:0-1:0', 'Tuesday': '7:0-1:0', 'W...","[Restaurants, Fast Food, Tex-Mex, Mexican, ..."


## Export busines records to line separated json

In [47]:
businesses_records = businesses_neighbours_df.astype({'_id': 'str'})
businesses_records

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,hours,categories
0,5e96645d6f14ef773b7b3e43,lJh661dhjD5BDz5QrBEUeQ,Filiberto's Mexican Food,"3855 W. Ray Rd, Ste 8",Chandler,AZ,85226,33.319328,-111.907414,3.0,44,1,"{'NoiseLevel': 'u'average'', 'BusinessParking'...","{'Monday': '6:30-22:0', 'Tuesday': '6:30-22:0'...","[Restaurants, Mexican]"
1,5e96645e6f14ef773b7b5844,luOWGdtAfsbrZ5kC_cjuYA,Ayala's AZ Kitchen Mexican Cafe,8427 W Peoria Ave,Peoria,AZ,85345,33.581088,-112.240497,4.0,35,0,"{'Caters': 'True', 'BikeParking': 'True', 'Out...","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'...","[Restaurants, Mexican]"
2,5e9664616f14ef773b7baec5,fZzgTEHDElm_R6gsYxmeBw,San Diego Bay Restaurant,"9201 S Avenida Del Yaqui, Ste 6",Guadalupe,AZ,85283,33.363353,-111.962867,4.0,102,1,"{'WiFi': ''no'', 'BikeParking': 'True', 'Resta...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ...","[Restaurants, Mexican, Seafood]"
3,5e9664676f14ef773b7c8a4d,UhyDVz7npxKUl9BfMpfmBw,Frijoles and Frescas Grilled Tacos,4300 Meadows Ln,Las Vegas,NV,89107,36.172544,-115.195724,3.5,34,1,"{'BikeParking': 'True', 'OutdoorSeating': 'Fal...","{'Monday': '10:30-21:0', 'Tuesday': '10:30-21:...","[Restaurants, Mexican, Food Court]"
4,5e9664676f14ef773b7c9dba,036SeqOKaKx89Liw5RDKqA,Francisco's Mexican Food,"3233 E Chandler Blvd, Ste 6-A",Phoenix,AZ,85048,33.304277,-112.012534,3.5,40,1,"{'Ambience': '{'touristy': False, 'hipster': F...","{'Monday': '8:0-20:0', 'Tuesday': '8:0-20:0', ...","[Restaurants, Mexican]"
5,5e9664686f14ef773b7ccf13,ThCqEavXwVOQu7t5LsITdw,Migos,1821 W Baseline Rd,Guadalupe,AZ,85283,33.377776,-111.966816,3.5,21,0,"{'RestaurantsReservations': 'True', 'Ambience'...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...","[Restaurants, Mexican]"
6,5e9664696f14ef773b7cd3ba,L_X5eAd4moE40AsX6Havhg,1900 Mexican Grill,5110-1E Park Rd,Charlotte,NC,28209,35.161133,-80.849282,3.5,112,1,"{'Alcohol': 'u'full_bar'', 'BikeParking': 'Tru...","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...","[Restaurants, Mexican]"
7,5e9664696f14ef773b7cefba,SVu6fPieP5noUcCWRUdfag,Maskadores,"1229 S Power Rd, Ste 102",Mesa,AZ,85206,33.392657,-111.683878,4.5,112,1,"{'RestaurantsTakeOut': 'True', 'Alcohol': ''no...","{'Monday': '10:0-20:0', 'Tuesday': '10:0-20:0'...","[Restaurants, Mexican, Tacos]"
8,5e96646f6f14ef773b7dd4c6,uhroo5AV1dcCHpI6ZQIj8w,Rubio's Coastal Grill,"884 W Warner Rd, Ste B-8",Gilbert,AZ,85233,33.335917,-111.808875,3.5,90,1,"{'WiFi': ''no'', 'GoodForKids': 'True', 'Resta...","{'Monday': '10:0-21:30', 'Tuesday': '10:0-21:3...","[Restaurants, Salad, Caterers, Mexican, Se..."
9,5e96646f6f14ef773b7df1a1,nSD-uxCMb745bH3dxoko9Q,Taco Bell,2453 Hwy 160,Tega Cay,SC,29708,35.04967,-80.99063,2.5,37,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsGoo...","{'Monday': '7:0-1:0', 'Tuesday': '7:0-1:0', 'W...","[Restaurants, Fast Food, Tex-Mex, Mexican, ..."


In [55]:
businesses_records.to_json('../data/businesses_neighbours.json', orient='records', lines=True)

## Export reviews of the selected businesses

In [56]:
businesses_neighbours_reviews_df = pd.DataFrame(list(db.reviews.find({ "business_id": {"$in": business_neighbours} })))
businesses_neighbours_reviews_df

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,5e9664b23b612dc2f0f009d3,KJ7I6unUOtO4Af8DRGBvCw,8vFdPfwvSrRriEzuRHLQDg,luOWGdtAfsbrZ5kC_cjuYA,3.0,1,1,0,Nice clean atmosphere set in s newly remodeled...,2014-10-27 01:51:18
1,5e9664b33b612dc2f0f00b87,YI8yUhoTTGbHyyhqtOry7w,NMQnFpYlj1I2z8niP0Td2w,lJh661dhjD5BDz5QrBEUeQ,1.0,0,0,0,I have lived in this area for 20 years and hav...,2014-08-10 20:34:33
2,5e9664b33b612dc2f0f03b58,q9DUuox-jmfAr80_tAjeTQ,C1uo5b6avxRDnppB1oDlzg,luOWGdtAfsbrZ5kC_cjuYA,5.0,2,0,1,"Oh my gosh, the chimi w/ baja is the best ever...",2012-12-05 20:16:53
3,5e9664b53b612dc2f0f0f453,pyhQXENH7i05M2_PdAN-LA,qibGLHABNReGeJr2w4_8yQ,lJh661dhjD5BDz5QrBEUeQ,3.0,3,2,3,"Not the best filbertos, but not the worst. I c...",2011-10-29 22:42:23
4,5e9664b53b612dc2f0f11232,QgCNyWwqo81lOAiPiYTzDQ,tA9wWXliXSWVEW9BgyxzVQ,lJh661dhjD5BDz5QrBEUeQ,4.0,1,0,1,I have been craving Horchata ever since we mov...,2012-03-04 02:19:10
...,...,...,...,...,...,...,...,...,...,...
644,5e9665ec3b612dc2f05e17b1,nvAoF1AITqDvU2KDuz2D4g,kD6-NcPbdQmRps1LY8ZLug,nSD-uxCMb745bH3dxoko9Q,1.0,0,0,0,Rude and lazy employees that will claim their ...,2019-09-16 04:40:58
645,5e9665ec3b612dc2f05e3db9,TGlPnSNo5g8MTgNsaqLY1g,EgA6sMKj6xobNMoPrbiJOQ,nSD-uxCMb745bH3dxoko9Q,1.0,1,1,0,I go to this Taco Bell in Tega Cay because it'...,2018-09-08 01:58:42
646,5e9665ec3b612dc2f05e4f6f,wc3clQgMkjRSJikSCxiYbw,qK-CX0G2JEd8jWFNsVlsNw,uhroo5AV1dcCHpI6ZQIj8w,3.0,1,0,1,I am not a fan of Mexican food but in general ...,2017-04-22 18:51:09
647,5e9665ec3b612dc2f05e543f,esGRpWSmYJuIhPb95UpsPQ,-pTu1s6kecSICn96u2EAqA,nSD-uxCMb745bH3dxoko9Q,5.0,0,0,0,This Taco Bell went from the worst Taco Bell I...,2018-12-30 01:57:27


In [57]:
businesses_neighbours_reviews_records = businesses_neighbours_reviews_df.astype({'_id': 'str'})
businesses_neighbours_reviews_records.to_json('../data/businesses_neighbours_reviews.json', orient='records', lines=True)