In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

from sklearn.metrics import jaccard_similarity_score
from scipy.stats import pearsonr

In [None]:
# Base URLs of a Yelp user's bookmarks, reviews and friends pages.
# The scraping functions below append a user ID (and optional paging offset) to these.
bookmark_url = 'https://www.yelp.com/user_details_bookmarks?userid='
review_url = 'https://www.yelp.com/user_details_reviews_self?userid='
friends_url = 'https://www.yelp.com/user_details_friends?userid='
# User ID of the account the analysis is run for ("my own" bookmarks/reviews below).
self_id = 'ShHBKjuJbQAVBLs7DgA95A'

In [None]:
def first_page_friends(user_id):
    """Return the user IDs found on the first page of a user's friends list.

    Args:
        user_id: Yelp user ID; appended to ``friends_url``.

    Returns:
        list of str: one ID per friend link on the page, in page order.
        Empty when the page contains no friend links.

    Raises:
        requests.HTTPError: if the friends page answers with a 4xx/5xx status.
    """
    f_page = requests.get(friends_url + user_id)
    # Fail loudly on HTTP errors instead of parsing an error page into an empty list.
    f_page.raise_for_status()
    f_soup = BeautifulSoup(f_page.content, 'html.parser')
    f_source = f_soup.find_all('a', class_='user-display-name js-analytics-click')

    # The ID is the href text after the first '='
    # (assumes hrefs of the form '...?userid=<id>' -- TODO confirm).
    return [link['href'].split('=')[1] for link in f_source]

In [None]:
def get_all_friends(user_id):
    """Return the user IDs of all friends of a user, across every results page.

    Args:
        user_id: Yelp user ID; appended to ``friends_url``.

    Returns:
        list of str: friend IDs in page order (duplicates are not removed).
        Empty when no friend links are found.

    Raises:
        requests.HTTPError: if any friends page answers with a 4xx/5xx status.
    """
    # Read the pager on the first page to learn how many friend pages exist.
    friendpage = requests.get(friends_url + user_id)
    friendpage.raise_for_status()
    fr_soup = BeautifulSoup(friendpage.content, 'html.parser')
    pages = fr_soup.find_all('div', class_='page-of-pages arrange_unit arrange_unit--fill')
    # Without a pager element, scan a single page instead of failing on pages[0].
    max_page = int(pages[0].get_text().split('of ')[1].split('\n')[0]) if pages else 1

    friend_list = []
    # The '&start=' offset advances by 48 for each page.
    for page_num in range(0, 48 * max_page, 48):
        f_page = requests.get(friends_url + user_id + '&start=' + str(page_num))
        f_page.raise_for_status()
        f_soup = BeautifulSoup(f_page.content, 'html.parser')
        f_source = f_soup.find_all('a', class_='user-display-name js-analytics-click')

        # The ID is the href text after the first '='
        # (assumes hrefs of the form '...?userid=<id>' -- TODO confirm).
        friend_list.extend(link['href'].split('=')[1] for link in f_source)

    return friend_list

In [None]:
def get_bookmarks(user_id):
    """Return the distinct businesses a user has bookmarked.

    Args:
        user_id: Yelp user ID; appended to ``bookmark_url``.

    Returns:
        list of str: de-duplicated ``href`` values of the business links
        found on the user's bookmark pages, in no particular order.
        Empty when no business links are found.

    Raises:
        requests.HTTPError: if any bookmarks page answers with a 4xx/5xx status.
    """
    # Read the pager on the first page to learn how many bookmark pages exist.
    bookmarks = requests.get(bookmark_url + user_id)
    bookmarks.raise_for_status()
    bm_soup = BeautifulSoup(bookmarks.content, 'html.parser')
    pages = bm_soup.find_all('div', class_='page-of-pages arrange_unit arrange_unit--fill')
    # Without a pager element, scan a single page instead of failing on pages[0];
    # this keeps one such user from aborting a loop over many users.
    max_page = int(pages[0].get_text().split('of ')[1].split('\n')[0]) if pages else 1

    user_bookmarks = []
    # The '&start=' offset advances by 50 for each page.
    for page_num in range(0, 50 * max_page, 50):
        bookmarks = requests.get(bookmark_url + user_id + '&start=' + str(page_num))
        bookmarks.raise_for_status()
        bm_soup = BeautifulSoup(bookmarks.content, 'html.parser')
        bm_name = bm_soup.find_all('a', class_='biz-name js-analytics-click')

        user_bookmarks.extend(link['href'] for link in bm_name)

    # Collapse repeated links into one entry per business.
    return list(set(user_bookmarks))

In [None]:
def get_reviews(user_id):
    """Return the star ratings a user has given, keyed by business link.

    Args:
        user_id: Yelp user ID; appended to ``review_url``.

    Returns:
        dict: maps each business link ``href`` to an ``int`` rating. If the
        same href appears more than once, the last occurrence wins.
        Empty when no business links are found.

    Raises:
        requests.HTTPError: if any reviews page answers with a 4xx/5xx status.
        IndexError: if a page lists more business links than rating elements.
    """
    # Read the pager on the first page to learn how many review pages exist.
    reviews = requests.get(review_url + user_id)
    reviews.raise_for_status()
    re_soup = BeautifulSoup(reviews.content, 'html.parser')
    pages = re_soup.find_all('div', class_='page-of-pages arrange_unit arrange_unit--fill')
    # Without a pager element, scan a single page instead of failing on pages[0];
    # this keeps one such user from aborting a loop over many users.
    max_page = int(pages[0].get_text().split('of ')[1].split('\n')[0]) if pages else 1

    user_reviews = {}
    # The '&rec_pagestart=' offset advances by 10 for each page.
    for page_num in range(0, 10 * max_page, 10):
        rev = requests.get(review_url + user_id + '&rec_pagestart=' + str(page_num))
        rev.raise_for_status()
        rev_soup = BeautifulSoup(rev.content, 'html.parser')
        rating = rev_soup.find_all('div', class_=re.compile('i-stars i-stars--regular-*'))
        biz_name = rev_soup.find_all('a', class_='biz-name js-analytics-click')

        # Pair the n-th business link with the n-th rating element on the page.
        for mark, biz in enumerate(biz_name):
            # The rating is the first character of the element's title
            # (presumably titles look like '4.0 star rating' -- TODO confirm).
            user_reviews[biz['href']] = int(rating[mark]['title'][0])

    return user_reviews

# Friends

In [None]:
# Collect friend IDs from the first friends page only (first_page_friends, not get_all_friends).
source_friends = first_page_friends(self_id)
# Display how many friend IDs were collected.
len(source_friends)

In [None]:
# Persist source_friends via IPython's %store so it can be restored with `%store -r` instead of re-scraping.
%store source_friends

# Bookmarks

1. Find all of my own bookmarks

In [None]:
# Scrape the distinct business links bookmarked by my own account.
self_bookmarks = get_bookmarks(self_id)
# Display how many distinct bookmarks were found.
len(self_bookmarks)

In [None]:
# Persist self_bookmarks via IPython's %store so it can be restored with `%store -r` instead of re-scraping.
%store self_bookmarks

2. Get all bookmarks for friends

In [None]:
# create dictionary of users and their bookmarks
# (one scrape per friend; keys follow the order of source_friends)

all_bookmarks = {friend_id: get_bookmarks(friend_id) for friend_id in source_friends}

In [None]:
# Persist all_bookmarks via IPython's %store so it can be restored with `%store -r` instead of re-scraping.
%store all_bookmarks

3. Create utility matrix

In [None]:
# create list of all unique places from all users

bm_vocab = [item for sl in all_bookmarks.values() for item in sl]
# Sorted so the column order of the utility matrix is identical on every run;
# the iteration order of a set of strings can differ between kernel sessions.
bm_set = sorted(set(bm_vocab))

In [None]:
# Build one 0/1 row per friend: 1 where the friend bookmarked the business at
# the matching position of bm_set, 0 otherwise. Rows follow the iteration
# order of all_bookmarks.
all_usr_vector = []

for v in all_bookmarks.values():
    # Set lookup instead of rescanning the friend's bookmark list for every column.
    marked = set(v)
    all_usr_vector.append([1 if bm in marked else 0 for bm in bm_set])

In [None]:
# Utility matrix: one row per friend (labelled by user ID), one column per business link.
bm_df = pd.DataFrame(
    data=all_usr_vector,
    index=list(all_bookmarks),
    columns=bm_set,
)
bm_df.head()

In [None]:
# add my own bookmarks to dataframe

# Set lookup instead of rescanning the self_bookmarks list for every column.
my_marked = set(self_bookmarks)
my_bm_comp = [1 if col in my_marked else 0 for col in bm_df.columns]

# Single-row frame labelled with my user ID, with the same columns as bm_df.
mbm = pd.Series(my_bm_comp).to_frame(self_id).T
mbm.columns = bm_df.columns

# My row is appended last; later cells rely on it being the final row.
bookmark_df = pd.concat([bm_df, mbm])
# Show only the tail (which includes my row) rather than dumping the whole matrix.
bookmark_df.tail()

4. Calculate similarity

In [None]:
# Sanity check: list any of my bookmarks that are NOT flagged in my row of
# the matrix (i.e. businesses that none of the scraped friends bookmarked).
my_row = bookmark_df.iloc[-1, :]
l1 = my_row[my_row == 1].index.values
l2 = self_bookmarks

a, b = set(l1), set(l2)
b - a

In [None]:
# Pearson correlation between my row (the last one) and every row of the
# matrix, keyed by row position. My own row is included in the comparison.
my_row = bookmark_df.iloc[-1, :]

sim_score = {}
for i in range(len(bookmark_df)):
    ss = pearsonr(my_row, bookmark_df.iloc[i, :])
    sim_score[i] = ss[0]  # keep the correlation coefficient, drop the p-value

sf = pd.Series(sim_score).to_frame('similarity')
sf.sort_values(by='similarity', ascending=False)

5. Look at the top 4 similar users

In [None]:
# Number of most-similar friends to draw recommendations from.
n_top = 3

# `sf` is indexed by row position in bookmark_df. My own vector is the last
# row, so exclude it before ranking; nlargest also skips NaN similarities.
# Derived from `sf` instead of hard-coded positions so a re-scrape still works.
self_pos = len(bookmark_df) - 1
top_pos = sf['similarity'].drop(self_pos).nlargest(n_top).index.tolist()

# Keep my own row last: it is used as a filter below and then dropped.
picked = bookmark_df.iloc[top_pos + [self_pos], :]
all_recs = picked[picked == 1].fillna(0).T

# Keep only places I have not bookmarked myself, then drop my own column.
temp = all_recs[all_recs[self_id] == 0].copy()
str_recs = temp.iloc[:, :-1].copy()

# look at places where at least 2 ppl also bookmarked
str_recs[str_recs.sum(axis=1) > 1]

# Reviews

1. Get my own reviews

In [None]:
# Scrape my own reviews as a {business link: star rating} dict.
my_reviews = get_reviews(self_id)
# Display the scraped ratings.
my_reviews

In [None]:
# Persist my_reviews via IPython's %store so it can be restored with `%store -r` instead of re-scraping.
%store my_reviews

2. Get friends' reviews

In [None]:
# create dictionary of users and their reviews
# (each value is the {business link: rating} dict returned by get_reviews)
all_reviews = {friend_id: get_reviews(friend_id) for friend_id in source_friends}

In [None]:
# Persist all_reviews via IPython's %store so it can be restored with `%store -r` instead of re-scraping.
%store all_reviews

3. Create utility matrix

In [None]:
# Every reviewed business link across all friends (with repeats), then the
# unique links. Sorted so the resulting order is identical on every run; the
# iteration order of a set of strings can differ between kernel sessions.
rev_vocab = [item for sl in all_reviews.values() for item in sl]
rev_set = sorted(set(rev_vocab))