In [284]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

from sklearn.metrics import jaccard_similarity_score
from scipy.stats import pearsonr

In [285]:
bookmark_url = 'https://www.yelp.com/user_details_bookmarks?userid='
review_url = 'https://www.yelp.com/user_details_reviews_self?userid='
friends_url = 'https://www.yelp.com/user_details_friends?userid='
self_id = 'ShHBKjuJbQAVBLs7DgA95A'

In [286]:
def first_page_friends(user_id):
    """function to get the User IDs of the first page of friends"""

    f_page = requests.get(friends_url + user_id)
    f_soup = BeautifulSoup(f_page.content, 'html.parser')
    f_source = f_soup.find_all('a', class_='user-display-name js-analytics-click')
    
    friend_list = []
    for friend in range(0, len(f_source)):
        friend_id = f_source[friend]['href'].split('=')[1]
        friend_list.append(friend_id)
    
    return friend_list

In [287]:
def get_all_friends(user_id):
    """function to get the User IDs of all friends"""
    
    # get the max page number of bookmarks
    friendpage = requests.get(friends_url+user_id)
    fr_soup = BeautifulSoup(friendpage.content, 'html.parser')
    pages = fr_soup.find_all('div', class_='page-of-pages arrange_unit arrange_unit--fill')
    max_page = pages[0].get_text().split('of ')[1].split('\n')[0]
    friend_list = []   
    #get friends
    for page_num in range(0,48*int(max_page), 48):
        f_page = requests.get(friends_url + user_id+ '&start=' + str(page_num))
        f_soup = BeautifulSoup(f_page.content, 'html.parser')
        f_source = f_soup.find_all('a', class_='user-display-name js-analytics-click')
    
        for friend in range(0, len(f_source)):
            friend_id = f_source[friend]['href'].split('=')[1]
            friend_list.append(friend_id)

    return friend_list

In [341]:
def get_bookmarks(user_id):
    """get all bookmarks for user_id"""

    user_bookmarks = []
     
    try: # not all accounts have the bookmarks as public data       
   
        # get the max page number of bookmarks
        bookmarks = requests.get(bookmark_url+user_id)
        bm_soup = BeautifulSoup(bookmarks.content, 'html.parser')
        pages = bm_soup.find_all('div', class_='page-of-pages arrange_unit arrange_unit--fill')
        max_page = pages[0].get_text().split('of ')[1].split('\n')[0]

        # loop through all the pages to get bookmarks
        for page_num in range(0, 50*int(max_page), 50):
            bookmarks = requests.get(bookmark_url + user_id + '&start=' + str(page_num))
            bm_soup = BeautifulSoup(bookmarks.content, 'html.parser')
            bm_name = bm_soup.find_all('a', class_='biz-name js-analytics-click')

            for mark in range(0, len(bm_name)):
                user_bookmarks.append(bm_name[mark]['href'])
        
        return user_bookmarks
    
    except: 
        return [np.nan]
       


In [289]:
def get_reviews(user_id):
    """get all reviews for user_id"""
    
    # get the max page number of bookmarks
    reviews = requests.get(review_url+user_id)
    re_soup = BeautifulSoup(reviews.content, 'html.parser')
    pages = re_soup.find_all('div', class_='page-of-pages arrange_unit arrange_unit--fill')
    max_page = pages[0].get_text().split('of ')[1].split('\n')[0]
    
    user_reviews = {}
    # loop through all the pages to get bookmarks
    for page_num in range(0, 10*int(max_page), 10):
        rev = requests.get(review_url + user_id + '&rec_pagestart=' + str(page_num))
        rev_soup = BeautifulSoup(rev.content, 'html.parser')
        rating = rev_soup.find_all('div', class_=re.compile('i-stars i-stars--regular-*'))
        biz_name = rev_soup.find_all('a', class_='biz-name js-analytics-click')
        
        for mark in range(0, len(biz_name)):
            user_reviews[biz_name[mark]['href']] = int(rating[mark]['title'][0])
       
    return user_reviews

In [290]:
def normalize(row):
    mean = row.sum() / row.count()
    row = row - mean
    return row

# Purpose: 
#### The purpose is to build a recommendation system for Yelp using bookmarks and review ratings

To build a Yelp recommender engine, I will be looking at a Yelp Elite account. The data will be scraped from www.Yelp.com and used to create a recommendation engine based on collaborative filtering. I will scrape the list of friends and for each friend, get the list of their bookmarks and their review ratings. Once that is obtained, I can compare the users to find the ones that are most similar to my account and look at those items to generate a set of recommendations

## Getting the data - Bookmarks

In [291]:
# restore all stored variables
%store -r

First, I will need to obtain a list of my friend's user IDs. Instead of retreiving all friends (total of 144), I will only be retrieving 48 (only the first page) so that my processing times will be cut down. 

I also determined that the first page of friends should be sufficient as it appears that Yelp's algorithm doesn't order friends by alphabetical order but rather will sort all your Yelp Elite friends on the first page. Since that appears to be the case, I am more comfortable with only using the first page of friends since in order to be Yelp Elite, you must have many reviews.

Note: if you'd like to recreate this recommendation but with using all friends, you can use the function `get_all_friends()` instead of `first_page_friends()`. 

In [292]:
# run function to get first page of friends
source_friends = first_page_friends(self_id)

In [293]:
%store source_friends

Stored 'source_friends' (list)


In [294]:
len(source_friends)

48

Once I have a list of my friend's user IDs, I will need to obtain their lists of  bookmarks and my own list of bookmarks.

In [295]:
# list for my own bookmarks
self_bookmarks = get_bookmarks(self_id)
self_bookmarks

['/biz/three-twins-ice-cream-san-francisco-5',
 '/biz/ben-thai-cafe-san-francisco',
 '/biz/ijji-sushi-san-francisco',
 '/biz/koi-palace-daly-city',
 '/biz/robin-san-francisco',
 '/biz/maria-catita-restaurante-e-loja-regional-lisboa',
 '/biz/juns-beauty-salon-san-francisco',
 '/biz/bowld-acai-san-francisco-2',
 '/biz/china-bee-san-mateo',
 '/biz/bobos-san-francisco-14',
 '/biz/grand-lake-kitchen-oakland',
 '/biz/nabe-san-francisco-5',
 '/biz/dumpling-kitchen-san-francisco',
 '/biz/nojo-ramen-tavern-san-francisco-2',
 '/biz/el-milagro-los-angeles-2',
 '/biz/five-happiness-san-francisco',
 '/biz/tadich-grill-san-francisco',
 '/biz/mahalo-bowl-sunnyvale-2',
 '/biz/hinata-san-francisco',
 '/biz/apres-winter-lounge-east-palo-alto',
 '/biz/xiao-long-bao-kitchen-south-san-francisco',
 '/biz/benkyodo-co-san-francisco',
 '/biz/mission-pie-san-francisco',
 '/biz/neighbor-bakehouse-san-francisco',
 '/biz/ginto-izakaya-japonaise-san-francisco',
 '/biz/chile-pies-baking-co-san-francisco',
 '/biz/goc

In [296]:
# store to recall later
%store self_bookmarks

Stored 'self_bookmarks' (list)


In [297]:
len(self_bookmarks)

249

In [342]:
# create dictionary of users and their bookmarks
all_bookmarks = {i:[] for i in source_friends}

for i in source_friends:
    all_bookmarks[i] = get_bookmarks(i)

In [343]:
%store all_bookmarks

Stored 'all_bookmarks' (dict)


In [344]:
# get number of unique bookmarks from all friends

bm_vocab = [item for sl in all_bookmarks.values() for item in sl]
bm_set = list(set(bm_vocab))
len(bm_set)

10652

In [346]:
# create vectors for all friends and bookmarks - 1 for if they bookmarked and 0 if they did not
all_usr_vector = []

for k,v in all_bookmarks.items():
    usr_vector = []
    for bm in bm_set:
        if bm in v:
            usr_vector.extend([1])
        else:
            usr_vector.extend([0])
    all_usr_vector.append(usr_vector)

In [347]:
# create dataframe
bm_df = pd.DataFrame(all_usr_vector, columns=bm_set, index=all_bookmarks.keys())

bm_df

Unnamed: 0,nan,/biz/ben-thai-cafe-san-francisco,/biz/sushi-shibucho-costa-mesa,/biz/caffe-roma-coffee-roasting-san-francisco,/biz/pippin-vintage-jewelry-new-york,/biz/pok-pok-noi-portland,/biz/kiku-sushi-berkeley,/biz/carrera-cafe-los-angeles,/biz/handley-cellars-philo-2,/biz/coffee-nature-costa-mesa,...,/biz/lombard-street-san-francisco-3,/biz/polished-nest-san-francisco-2,/biz/swensens-ice-cream-san-francisco,/biz/cha-cafe-west-covina-2,/biz/shanghai-dumpling-shop-millbrae,/biz/chateau-boswell-st-helena,/biz/mckinney-farms-brentwood,/biz/la-boucherie-los-angeles-4,/biz/alta-new-york,/biz/vien-huong-restaurant-oakland
7IV3JXTXufasWNSkDEdrcQ,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
T1ZDOuZxeBK-SaowZqYl_w,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
oLh6PxTxdMAufjerdyZz5w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
ipxRiIT-aL-M2nA-iaWOWA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e0zzk2kQHN2ghZHEGM-ytg,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ycztD9CWXLfDk4bPmA_tnw,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5RifcJP_Lf-MzojTHybBNw,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
NmdKCZlUVJGnqDxhXl2ofw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
QjLiYeQLeMIqYu-ncmkUXg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tdowJfZymyZNJymLmBfZeA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [348]:
# add my bookmarks
my_bm_comp = []
for i in bm_df.columns:
    if i in self_bookmarks:
        my_bm_comp.append(1)
    else:
        my_bm_comp.append(0)

mbm = pd.Series(my_bm_comp).to_frame(self_id).T
mbm.columns = bm_df.columns
mbm

Unnamed: 0,nan,/biz/ben-thai-cafe-san-francisco,/biz/sushi-shibucho-costa-mesa,/biz/caffe-roma-coffee-roasting-san-francisco,/biz/pippin-vintage-jewelry-new-york,/biz/pok-pok-noi-portland,/biz/kiku-sushi-berkeley,/biz/carrera-cafe-los-angeles,/biz/handley-cellars-philo-2,/biz/coffee-nature-costa-mesa,...,/biz/lombard-street-san-francisco-3,/biz/polished-nest-san-francisco-2,/biz/swensens-ice-cream-san-francisco,/biz/cha-cafe-west-covina-2,/biz/shanghai-dumpling-shop-millbrae,/biz/chateau-boswell-st-helena,/biz/mckinney-farms-brentwood,/biz/la-boucherie-los-angeles-4,/biz/alta-new-york,/biz/vien-huong-restaurant-oakland
ShHBKjuJbQAVBLs7DgA95A,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [349]:
bookmark_df = pd.concat([bm_df, mbm])
bookmark_df

Unnamed: 0,nan,/biz/ben-thai-cafe-san-francisco,/biz/sushi-shibucho-costa-mesa,/biz/caffe-roma-coffee-roasting-san-francisco,/biz/pippin-vintage-jewelry-new-york,/biz/pok-pok-noi-portland,/biz/kiku-sushi-berkeley,/biz/carrera-cafe-los-angeles,/biz/handley-cellars-philo-2,/biz/coffee-nature-costa-mesa,...,/biz/lombard-street-san-francisco-3,/biz/polished-nest-san-francisco-2,/biz/swensens-ice-cream-san-francisco,/biz/cha-cafe-west-covina-2,/biz/shanghai-dumpling-shop-millbrae,/biz/chateau-boswell-st-helena,/biz/mckinney-farms-brentwood,/biz/la-boucherie-los-angeles-4,/biz/alta-new-york,/biz/vien-huong-restaurant-oakland
7IV3JXTXufasWNSkDEdrcQ,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
T1ZDOuZxeBK-SaowZqYl_w,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
oLh6PxTxdMAufjerdyZz5w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
ipxRiIT-aL-M2nA-iaWOWA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e0zzk2kQHN2ghZHEGM-ytg,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ycztD9CWXLfDk4bPmA_tnw,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5RifcJP_Lf-MzojTHybBNw,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
NmdKCZlUVJGnqDxhXl2ofw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
QjLiYeQLeMIqYu-ncmkUXg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tdowJfZymyZNJymLmBfZeA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# create a mapping for index number of bookmarks dataframe and user ID to be used later
index_map = pd.DataFrame({'user_id':bookmark_df.index})
index_map.index = index_map.index +1
index_map

In [350]:
# checking to see if there is overlap
l1 = bookmark_df.loc[self_id,:][bookmark_df.loc[self_id,:]==1].index.values
l2 = self_bookmarks

a = set(l1)
b = set(l2)
print('There is overlap: '+ str(len(bm_set) > len(b.difference(a))))
print(str(len(b.difference(a))) + ' out of ' + str(len(self_bookmarks)) + ' of my bookmarks are not bookmarked by others')

There is overlap: True
70 out of 249 of my bookmarks are not bookmarked by others


## Calculate Similarity - Bookmarks

Now that I have created the dataframe and confirmed that there is overlap in my bookmarks and my friend's bookmarks, I can calculate the similarities between myself and my friends.



In [352]:
sim_score = {}
for i in bookmark_df.index:
    ss = pearsonr(bookmark_df.loc[self_id,:], bookmark_df.loc[i,:])
    sim_score.update({i: ss[0]})

bm_sim = pd.Series(sim_score).to_frame('bm_similarity')
bm_sim_sorted = bm_sim.sort_values('bm_similarity', ascending=False)
bm_sim_sorted

Unnamed: 0,bm_similarity
ShHBKjuJbQAVBLs7DgA95A,1.0
Zw-0Lo01W0QdkHT60DYg4g,0.096375
UaKdT4twgZ4DguHJhT6vPw,0.083417
oLh6PxTxdMAufjerdyZz5w,0.080619
aqC54BcscB12jMCQcyga1g,0.078492
-OKmukwdCrHq6bkF2_gSwQ,0.077621
yDy1xHRcFG4LWU3rvlOxUQ,0.072269
NmdKCZlUVJGnqDxhXl2ofw,0.064376
MzC1_5kXxGw336fMYSrsdg,0.05833
7IV3JXTXufasWNSkDEdrcQ,0.046444


## Getting the data - Reviews

In a similar fashion, I scraped www.yelp.com for my own reviews and my friend's reviews. Once I have the data, I will calculate the similarity between myself and my friends

In [303]:
# scrape my own reviews
my_reviews = get_reviews(self_id)
my_reviews

{'/biz/ace-wasabi-rock-n-roll-sushi-san-francisco': 5,
 '/biz/adventure-cat-sailing-charters-san-francisco-2': 5,
 '/biz/aji-peruvian-cuisine-long-beach-3': 4,
 '/biz/appethaizing-portland': 4,
 '/biz/baohaus-new-york-2': 4,
 '/biz/be-fresh-la-jolla': 5,
 '/biz/beni-tora-los-angeles-3': 4,
 '/biz/cafe-21-san-diego-2': 5,
 '/biz/caffe-centro-san-francisco': 3,
 '/biz/champa-garden-san-francisco-2': 3,
 '/biz/cherry-blossom-bakery-san-francisco-2': 5,
 '/biz/china-stix-restaurant-santa-clara': 5,
 '/biz/chouchou-new-york-2': 3,
 '/biz/cupertino-iphone-repair-cupertino-5': 2,
 '/biz/delicious-food-corner-monterey-park': 2,
 '/biz/elmers-restaurant-portland-3': 3,
 '/biz/em-lash-and-brow-san-jose': 5,
 '/biz/farallon-san-francisco': 3,
 '/biz/fuji-sukiyaki-san-mateo': 3,
 '/biz/furlong-vision-correction-medical-center-san-jose': 5,
 '/biz/genwa-korean-bbq-beverly-hills': 5,
 '/biz/gogigo-korean-bbq-cupertino': 3,
 '/biz/hinata-san-francisco': 5,
 '/biz/ifixers-iphone-repair-san-jose-18': 5

In [304]:
%store my_reviews

Stored 'my_reviews' (dict)


In [305]:
# create dictionary of users and their reviews
all_reviews = {i:[] for i in source_friends}

for i in source_friends:
    all_reviews[i] = get_reviews(i)

In [306]:
%store all_reviews

Stored 'all_reviews' (dict)


In [307]:
len(all_reviews)

48

In [308]:
rev_vocab = [item for sl in list(all_reviews.values()) for item in sl.keys()]
rev_set = list(set(rev_vocab))
len(rev_set)

4331

Creating the dataframe for the reviews is slightly different. Instead of having 0 or 1 for whether or not there was a review, I am inputting the actual rating the user gave for the review.

In [309]:
all_reviews_vector = []

for usr,rating in all_reviews.items():
    for key,val in rating.items():
        usr_vector = []
        for rev in rev_set:
            if rev in key:
                usr_vector.append(all_reviews[usr][key])
            else:
                usr_vector.append(np.nan)
    all_reviews_vector.append(usr_vector)


In [310]:
%store all_reviews_vector

Stored 'all_reviews_vector' (list)


In [311]:
rev_df = pd.DataFrame(all_reviews_vector, columns=rev_set, index=all_reviews.keys())

my_rev_comp = []
for i in rev_df.columns:
    if i in my_reviews:
        my_rev_comp.append(my_reviews[i])
    else:
        my_rev_comp.append(np.nan)

mrev = pd.Series(my_rev_comp).to_frame(self_id).T
mrev.columns = rev_df.columns

review_df = pd.concat([rev_df, mrev])

# normalize the ratings
review_df = review_df.apply(lambda row: normalize(row), axis=1)
review_df = review_df.fillna(0)
review_df.tail()

Unnamed: 0,/biz/iza-ramen-soma-san-francisco-2,/biz/ben-thai-cafe-san-francisco,/biz/coffeebar-truckee-5,/biz/adamsons-french-dip-sunnyvale,/biz/elite-week-sparkcycle-san-diego,/biz/san-diego-superior-court-traffic-division-san-diego,/biz/lake-miramar-san-diego,/biz/evergreen-panda-san-jose,/biz/bandit-san-francisco,/biz/luxe-buffet-carlsbad,...,/biz/le-boulanger-campbell-2,/biz/shanghai-dumpling-shop-millbrae,/biz/%E4%B9%9C%E5%98%9C%E9%9B%9E%E8%9B%8B%E4%BB%94-%E9%A6%99%E6%B8%AF,/biz/tatami-sushi-and-seafood-buffet-cupertino-2,/biz/health-hut-los-angeles,/biz/mr-bings-san-francisco,/biz/enterprise-rent-a-car-seatac-3,/biz/past%C3%A9is-de-bel%C3%A9m-lisboa-6,/biz/yokohama-redwood-city,/biz/lord-stanley-san-francisco
y8mTD1BMn8Y0nR1ca_sPLw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ilgZlytsBSXGa14RVnQg_Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FraRROMnu9ro_RACmHafDQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
j-IxKzypD5KYgPwB-8kb-A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ShHBKjuJbQAVBLs7DgA95A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [312]:
review_df.shape

(49, 4331)

In [313]:
l1 = review_df.iloc[-1,:][review_df.iloc[-1,:]==1].index.values
l2 = my_reviews

a = set(l1)
b = set(l2)
print('There is overlap: '+ str(len(rev_set) > len(b.difference(a))))
print(str(len(b.difference(a))) + ' out of ' + str(len(my_reviews)) + ' of my reviews are not reviewed by others')

There is overlap: True
67 out of 67 of my reviews are not reviewed by others


Based on the above, it appears that none of the friends on the first page, which includes Yelp Elite friends, did not review the same location as I did. If I had the time and computing power to scrape all 144 friends and get their reviews, there would hopefully be a higher likelihood of overlap. Nonetheless, we will continue with the recommender.

## Calculate Similarity - Reviews

In [354]:
rev_sim_score = {}
for i in review_df.index:
    ss = pearsonr(review_df.loc[self_id,:], review_df.loc[i,:])
    rev_sim_score.update({i: ss[0]})
    
rev_sim = pd.Series(rev_sim_score).to_frame('rev_similarity')
rev_sim_sorted = rev_sim.sort_values('rev_similarity', ascending=False)
rev_sim_sorted

  r = r_num / r_den


Unnamed: 0,rev_similarity
ShHBKjuJbQAVBLs7DgA95A,1.0
-OKmukwdCrHq6bkF2_gSwQ,
5RifcJP_Lf-MzojTHybBNw,
6sDNWHF_jKE9rIjbc5ZfOw,
7IV3JXTXufasWNSkDEdrcQ,
7XdR2OvFXBoOmQ05pALzmg,
D7O2Euol8WGTGFWF6klNqg,
FOdWCgLWE0b7QMcu3Q9MYQ,
FraRROMnu9ro_RACmHafDQ,
FxXqn273QfaKXOLyKHhJVw,


## Combining Bookmarks & Reviews Similarities

Now that we have the similarities for both bookmarks and reviews, we can use both of them to determine the user that is most like us by simply adding the two similarities together. Based on my own experience and discussing with friends, some people will remove bookmarks once they have visited the establishment and then may leave a review if they are an active reviewer. As such, I believe that simply adding the two similarities to give us a final similarity is appropriate.

Note: In this case, we have no overlap in reviews so it is all NaN

In [355]:
both_sim = bm_sim.join(rev_sim)
both_sim['total'] = both_sim.sum(axis=1)
both_sim = both_sim.sort_values('total', ascending=False)
both_sim

Unnamed: 0,bm_similarity,rev_similarity,total
ShHBKjuJbQAVBLs7DgA95A,1.0,1.0,2.0
Zw-0Lo01W0QdkHT60DYg4g,0.096375,,0.096375
UaKdT4twgZ4DguHJhT6vPw,0.083417,,0.083417
oLh6PxTxdMAufjerdyZz5w,0.080619,,0.080619
aqC54BcscB12jMCQcyga1g,0.078492,,0.078492
-OKmukwdCrHq6bkF2_gSwQ,0.077621,,0.077621
yDy1xHRcFG4LWU3rvlOxUQ,0.072269,,0.072269
NmdKCZlUVJGnqDxhXl2ofw,0.064376,,0.064376
MzC1_5kXxGw336fMYSrsdg,0.05833,,0.05833
7IV3JXTXufasWNSkDEdrcQ,0.046444,,0.046444


## Generate Recommendations

Now I'm ready to identify recommendations by concatenating the reviews and the bookmarks togethers, look at the top 3 similar users to myself, and then further look at places where at least 2 of the 3 users have also either bookmarked or reviewed.

In [367]:
# get top 3 similar users
top = both_sim.index[:4]

book_recs = bookmark_df.loc[top,:][bookmark_df.loc[top,:]==1].fillna(0).T
review_recs = review_df.loc[top,:][review_df.loc[top,:]==1].fillna(0).T
all_recs = pd.concat([book_recs, review_recs])

# subset dataframe to only have items that I have not already bookmarked
temp = all_recs[all_recs[self_id]==0].copy()
str_recs = temp.iloc[:,:-1].copy()

# subset further to get bookmarks that more than 2 friends have also bookmarked
str_recs[str_recs.sum(axis=1)>1]

Unnamed: 0,ShHBKjuJbQAVBLs7DgA95A,Zw-0Lo01W0QdkHT60DYg4g,UaKdT4twgZ4DguHJhT6vPw
/biz/good-mong-kok-bakery-san-francisco,0.0,1.0,1.0
/biz/pho-huynh-hiep-2-kevins-noodle-house-san-francisco,0.0,1.0,1.0
/biz/zero-zero-san-francisco,0.0,1.0,1.0
/biz/skool-san-francisco,0.0,1.0,1.0
/biz/thanh-long-san-francisco,0.0,1.0,1.0
/biz/pho-2000-san-francisco,0.0,1.0,1.0
/biz/hai-ky-noodles-san-francisco,0.0,1.0,1.0
/biz/j%C5%AB-ni-san-francisco-5,0.0,1.0,1.0
/biz/la-taqueria-san-francisco-2,0.0,1.0,1.0
/biz/petite-provence-portland-2,0.0,1.0,1.0


# Results

Based on the resulting recommendations, I can confirm that these indeed look like places that I would enjoy eating at. There are some places where I have already been to and can confirm that I enjoyed it, which brings me to the limitation of the data. I did not bookmark every place I went to so some of these recommendations are places that I have already visited and would be a useless recommendation. If the account was a regular check-in user, then we could remove places that had already been visited. Further improvements could be layering in Content-based filtering with the types of restaurants I generally bookmark (cuisine type/atmosphere/pricing/and other various characteristics featured on Yelp.com). 