In [1]:
import os
import requests
import bs4
import re
import time
import csv
from datetime import datetime as dt
from urllib.request import urlopen
from fuzzywuzzy import fuzz

import numpy as np
import pandas as pd

In [51]:
# Read the interim restaurant list; the path is relative to the notebook's working directory.
filepath = os.path.join(os.curdir, 'data', '2_wiki-interim', 'restaurants.csv')
rest_df = pd.read_csv(filepath_or_buffer=filepath)

# Get TripAdvisor URLs for Restaurants

In [52]:
# Request headers sent with the TripAdvisor search requests (passed to get_ta_url).
ta_headers = dict([
    ('authority', 'www.tripadvisor.com'),
    ('accept', 'text/html, */*'),
    ('x-requested-with', 'XMLHttpRequest'),
    ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/79.0.3945.130 Safari/537.36'),
    ('content-type', 'Application/json; charset=utf-8'),
    ('sec-fetch-site', 'same-origin'),
    ('sec-fetch-mode', 'cors'),
    ('referer', 'https://www.tripadvisor.com'),
    # 'accept-encoding': 'gzip, deflate, br' is left disabled
    ('accept-language', 'en-US,en;q=0.9'),
])

In [62]:
# Mapping of location name -> TripAdvisor geo id; the id is sent as the
# search's `geo` parameter to limit results to the desired area.
ta_geo_ids = dict([
    ('california', '28926'),
    ('chicago', '28934'),
    ('washington dc', '28970'),
    ('new york', '60763'),
])

In [60]:
def get_ta_url(rest_name, location, ta_geo_ids, headers):
    """Search TripAdvisor for a restaurant and return its best-matching result.

    The search is restricted to the geo id registered for `location` and the
    word "michelin" is appended to the query. Every result title is compared
    with `rest_name` using `fuzz.ratio` (after lower-casing and dropping the
    word "restaurant"); the highest-scoring result wins.

    Parameters
    ----------
    rest_name : str
        Restaurant name to search for.
    location : str
        Key into `ta_geo_ids`.
    ta_geo_ids : dict
        Mapping of location name -> TripAdvisor geo id (string).
    headers : dict
        HTTP headers sent with the search request.

    Returns
    -------
    str
        '<url>__<score>__<name>'. `<url>` is 'n/a' when the lookup failed
        (unknown location, request/parsing error, or no search results);
        `<score>` is -1 and `<name>` is 'n/a' when no result was scored.
        NOTE(review): a URL or result name that itself contains '__' would
        confuse callers that split this string on '__'.
    """
    from urllib.parse import quote_plus

    best_name = 'n/a'
    best_score = -1
    rest_url = 'n/a'
    try:
        geo_id = ta_geo_ids[location]
        # Encode the free-text part of the query: names containing '&', '#',
        # apostrophes or accents would otherwise corrupt the query string.
        query = quote_plus(rest_name + ' michelin')
        search_url = ('https://www.tripadvisor.com/Search?redirect'
                      + '&uiOrigin=MASTHEAD'
                      + '&singleSearchBox=true'
                      + '&supportedSearchTypes=find_near_stand_alone_query'
                      + '&geo=' + geo_id
                      + '&q=' + query
                      + '&blockRedirect=true'
                      + '&ssrc=e'
                      + '&rf=10')
        resp = requests.get(search_url, headers=headers, timeout=5)
        page = bs4.BeautifulSoup(resp.text)
        time.sleep(0.1)

        results = page.find_all('div', class_='result-title')

        # Loop-invariant: clean the searched name once.
        rest_name_clean = rest_name.lower().replace('restaurant', '').strip()
        score_list = [
            fuzz.ratio(
                result.find('span').get_text().lower().replace('restaurant', '').strip(),
                rest_name_clean,
            )
            for result in results
        ]

        # No results: keep the 'n/a' / -1 placeholders instead of relying on
        # np.argmax raising on an empty list.
        if score_list:
            best_match_index = int(np.argmax(score_list))
            best_score = score_list[best_match_index]
            best_result = results[best_match_index]
            best_name = best_result.find('span').get_text()
            # The result's onclick handler carries the page path as its 4th argument.
            rest_url_suffix = best_result['onclick'].split(', ')[3].strip("'")
            rest_url = 'https://www.tripadvisor.com' + rest_url_suffix
    except Exception:
        # Best-effort lookup: any failure yields the 'n/a' URL placeholder
        # (KeyboardInterrupt / SystemExit are no longer swallowed).
        rest_url = 'n/a'

    return rest_url + '__' + str(best_score) + '__' + best_name

In [61]:
# San Francisco rows are looked up under the 'california' location key;
# every other row uses its city name unchanged.
is_san_francisco = rest_df['city_name'] == 'san francisco'
rest_df['location'] = np.where(is_san_francisco, 'california', rest_df['city_name'])

# Pack name and location into a single '<rest_name>__<location>' string per row.
query_prefix = rest_df['rest_name'] + '__'
rest_df['query'] = query_prefix + rest_df['location']

In [63]:
def _lookup_ta_url(query):
    """Unpack a '<rest_name>__<location>' query string and run the TripAdvisor lookup for it."""
    rest_name = query.split('__')[0]
    location = query.split('__')[1]
    return get_ta_url(rest_name=rest_name,
                      location=location,
                      ta_geo_ids=ta_geo_ids,
                      headers=ta_headers)


# One search request per restaurant; the result is the packed '<url>__<score>__<name>' string.
rest_df['ta_url__score'] = rest_df['query'].map(_lookup_ta_url)

In [64]:
# Split the packed '<url>__<score>__<name>' strings once, then spread the parts over three columns.
ta_parts = rest_df['ta_url__score'].map(lambda packed: packed.split('__'))
rest_df['ta_url'] = ta_parts.map(lambda parts: parts[0])
rest_df['match_score'] = ta_parts.map(lambda parts: int(parts[1]))
rest_df['ta_name'] = ta_parts.map(lambda parts: parts[2])

# Correct mistakes in TripAdvisor URLs
Manually check the matched URLs of every restaurant whose fuzzy match score is below 100, and record the incorrect ones.

In [186]:
# (rest_name, corrected TripAdvisor URL) pairs collected during the manual review below
mistakes = list()
# names of restaurants that have no TripAdvisor page
no_ta_pg = list()

In [187]:
# Review band: 90 <= match_score < 100
rest_df.loc[
    rest_df['match_score'].ge(90) & rest_df['match_score'].lt(100),
    ['location', 'rest_name', 'ta_name', 'match_score'],
]

Unnamed: 0,location,rest_name,ta_name,match_score
82,new york,Café Boulud,Cafe Boulud,91
83,new york,Café China,Cafe China,90
90,new york,Chef's Table at Brooklyn Fare,Chefs Table at Brooklyn Fare,98
121,new york,Gunter Seeger,Gunter Seeger NY,90
141,new york,L'Atelier de Joël Robuchon,L'Atelier de Joel Robuchon,96
155,new york,M. Wells Steakhouse,M Wells Steakhouse,97
179,new york,Rhong-Tiam,Rhong Tiam,90
200,new york,Tamarind Tribeca,Tamarind - Tribeca,94
211,new york,The River Café,The River Cafe,93
214,new york,Torrisi Italian Specialities,Torrisi Italian Specialties,98


In [188]:
# Review band: 80 <= match_score < 90
rest_df.loc[
    rest_df['match_score'].ge(80) & rest_df['match_score'].lt(90),
    ['location', 'rest_name', 'ta_name', 'match_score'],
]

Unnamed: 0,location,rest_name,ta_name,match_score
3,chicago,Avenues,Avenue M,80
49,chicago,Yūgen,Yugen,80
55,new york,Adour,L'Adour,83
81,new york,Bâtard,Batard,83
105,new york,Delaware & Hudson,Delaware and Hudson,89
119,new york,Gotham Bar and Grill,Gotham Bar & Grill,89
172,new york,Perry Street,One Perry Street Restaurant,86
222,new york,Wallsé,Wallse,83
241,california,Baumé,Baume,80
249,california,Campton Place,Taj Campton Place Restaurant,87


In [189]:
# Wrong match: record the correct TripAdvisor page for this restaurant.
mistakes.append(
    ('Perry Street',
     'https://www.tripadvisor.com/Restaurant_Review-g60763-d590536-Reviews-Perry_St-New_York_City_New_York.html')
)

# Restaurants with no TripAdvisor page.
no_ta_pg += ['Avenues', 'Adour']

In [190]:
# Review band: 70 <= match_score < 80
rest_df.loc[
    rest_df['match_score'].ge(70) & rest_df['match_score'].lt(80),
    ['location', 'rest_name', 'ta_name', 'match_score'],
]

Unnamed: 0,location,rest_name,ta_name,match_score
21,chicago,Kikko,Kumiko,73
31,chicago,Omakase Yume,Omakase,74
45,chicago,The Lobby,Lobby,71
54,new york,A Voce Madison,Marche Madison,79
103,new york,Danube,Blue Danube Restaurant,71
115,new york,Fleur de Sel,Flor de Sol,78
118,new york,Gordon Ramsay at The London,Gordon Ramsay Steak,70
148,new york,Le Grill de Joël Robuchon,L'Atelier de Joel Robuchon,75
151,new york,Lever House,Porter House,70
152,new york,Lincoln,Lincoln Cafe,74


In [191]:
# Wrong match: record the correct TripAdvisor page for this restaurant.
mistakes.append(
    ('Lincoln',
     'https://www.tripadvisor.com/Restaurant_Review-g60763-d2707130-Reviews-Lincoln_Ristorante-New_York_City_New_York.html')
)

# Restaurants with no TripAdvisor page.
no_ta_pg += [
    'Omakase Yume',
    'A Voce Madison',
    'Danube',
    'Fleur de Sel',
    'Gordon Ramsay at The London',
    'Le Grill de Joël Robuchon',
    'Lever House',
    'Rubicon',
    'The Dining Room at the Ritz-Carlton',
]

In [192]:
# Review band: 60 <= match_score < 70
rest_df.loc[
    rest_df['match_score'].ge(60) & rest_df['match_score'].lt(70),
    ['location', 'rest_name', 'ta_name', 'match_score'],
]

Unnamed: 0,location,rest_name,ta_name,match_score
84,new york,Café Gray,Dorian Gray,60
109,new york,Eighty-One,Benito One,60
122,new york,Hakkasan,Hakkasan New York,64
125,new york,Ichimura at Brushstroke,Brushstroke,65
126,new york,Ichimura at Uchū,Ichimura,67
153,new york,Lo Scalco,Los Abuelos,60
157,new york,March,Marea,60
176,new york,Pok Pok NY,Toriko NY,63
203,new york,Tetsu Basement,The Basement Lounge,67
286,california,Martini House,Acacia House,64


In [193]:
# Wrong matches: record the correct TripAdvisor page for each restaurant.
mistakes += [
    ('Pok Pok NY',
     'https://www.tripadvisor.com/Restaurant_Review-g60827-d3396685-Reviews-Pok_Pok_NY-Brooklyn_New_York.html'),
    ('Rasa',
     'https://www.tripadvisor.com/Restaurant_Review-g32124-d7277633-Reviews-Rasa_Contemporary_Indian-Burlingame_California.html'),
]

# Restaurants with no TripAdvisor page.
no_ta_pg += [
    'Café Gray',
    'Eighty-One',
    'Ichimura at Uchū',
    'Lo Scalco',
    'March',
    'Tetsu Basement',
    'Martini House',
]

In [194]:
# Review band: 50 <= match_score < 60
rest_df.loc[
    rest_df['match_score'].ge(50) & rest_df['match_score'].lt(60),
    ['location', 'rest_name', 'ta_name', 'match_score'],
]

Unnamed: 0,location,rest_name,ta_name,match_score
29,chicago,NoMI,NoMI Kitchen,50
58,new york,Alain Ducasse at the Essex House,Essex House,51
72,new york,Bar Uchū,Bar Boulud,56
154,new york,Luksus at Torst,Luksus,57
169,new york,Oxalis,oxalis restaurant Brooklyn,55
238,california,Aubergine,Tamarine Restaurant,59
303,california,Range,Manresa,50
350,washington dc,minibar,Barmini,57


In [195]:
# Wrong matches: record the correct TripAdvisor page for each restaurant.
mistakes += [
    ('Bar Uchū',
     'https://www.tripadvisor.com/Restaurant_Review-g60763-d12434865-Reviews-Uchu-New_York_City_New_York.html'),
    ('Aubergine',
     'https://www.tripadvisor.com/Restaurant_Review-g32172-d1367785-Reviews-Aubergine_at_L_Auberge_Carmel-Carmel_Monterey_County_California.html'),
    ('minibar',
     'https://www.tripadvisor.com/Restaurant_Review-g28970-d3861014-Reviews-Minibar_Washington_D_C-Washington_DC_District_of_Columbia.html'),
]

# Restaurants with no TripAdvisor page.
no_ta_pg += ['Alain Ducasse at the Essex House', 'Range']

In [196]:
# Review band: match_score < 50
rest_df.loc[
    rest_df['match_score'].lt(50),
    ['location', 'rest_name', 'ta_name', 'match_score'],
]

Unnamed: 0,location,rest_name,ta_name,match_score
7,chicago,Bonsoirée,"Table, Donkey and Stick",31
41,chicago,Smyth,Smyth + The Loyalist,40
71,new york,Babbo,Bocuse Restaurant,36
95,new york,Cote,Atera,44
96,new york,Country,Hadlyme Country Market,48
112,new york,Etats-Unis,Bar Uni,47
117,new york,Gilt,Kingsley,33
138,new york,Kurumazushi,Sasabune,21
150,new york,Le Restaurant,Rebelle,44
184,new york,Saul,Saul at the Brooklyn Museum,26


In [197]:
# Wrong matches: record the correct TripAdvisor page for each restaurant.
mistakes += [
    ('Babbo',
     'https://www.tripadvisor.com/Restaurant_Review-g60763-d423266-Reviews-Babbo_Ristorante_e_Enoteca-New_York_City_New_York.html'),
    ('Cote',
     'https://www.tripadvisor.com/Restaurant_Review-g60763-d12672159-Reviews-Cote_Korean_Steakhouse-New_York_City_New_York.html'),
    ('Seäsonal',
     'https://www.tripadvisor.com/Restaurant_Review-g60763-d1390757-Reviews-Seasonal_Restaurant_and_Weinbar-New_York_City_New_York.html'),
    ('Kurumazushi',
     'https://www.tripadvisor.com/Restaurant_Review-g60763-d424528-Reviews-Kuruma_Zushi-New_York_City_New_York.html'),
]

# Restaurants with no TripAdvisor page.
no_ta_pg += ['Bonsoirée', 'Country', 'Etats-Unis', 'Gilt', 'Le Restaurant', 'Trevese', 'Vong']

In [198]:
# fix urls for mistakes, and replace urls with nan for restaurants that don't have a TripAdvisor page

def fix_mistakes(rest_df, corrections=None):
    """Overwrite `ta_url` for restaurants whose automatic match was wrong.

    Parameters
    ----------
    rest_df : pandas.DataFrame
        Must contain a 'rest_name' column; 'ta_url' is updated in place.
    corrections : iterable of (rest_name, ta_url) pairs, optional
        Defaults to the module-level `mistakes` list. When a name appears
        more than once, the last pair wins.

    Returns
    -------
    pandas.DataFrame
        The same `rest_df` object, for call chaining.
    """
    if corrections is None:
        corrections = mistakes

    # Name -> corrected URL; rows whose name is not listed map to NaN.
    corrected = rest_df['rest_name'].map(dict(corrections))
    needs_fix = corrected.notna()

    # Boolean-mask assignment works for any index, unlike label lookups with
    # range(len(rest_df)), which require a default RangeIndex.
    if needs_fix.any():
        rest_df.loc[needs_fix, 'ta_url'] = corrected[needs_fix].to_numpy()
    return rest_df

def remove_rests_with_no_ta_pg(rest_df, names=None):
    """Set `ta_url` to NaN for restaurants that have no TripAdvisor page.

    Parameters
    ----------
    rest_df : pandas.DataFrame
        Must contain a 'rest_name' column; 'ta_url' is updated in place.
    names : iterable of str, optional
        Restaurant names to blank out. Defaults to the module-level
        `no_ta_pg` list.

    Returns
    -------
    pandas.DataFrame
        The same `rest_df` object, for call chaining.
    """
    if names is None:
        names = no_ta_pg

    # Boolean-mask assignment works for any index, unlike label lookups with
    # range(len(rest_df)), which require a default RangeIndex.
    has_no_page = rest_df['rest_name'].isin(list(names))
    if has_no_page.any():
        rest_df.loc[has_no_page, 'ta_url'] = np.nan
    return rest_df

# Apply the manual URL corrections first, then blank out restaurants without a TripAdvisor page.
rest_df = remove_rests_with_no_ta_pg(fix_mistakes(rest_df))

### need to clean this up

In [540]:
# Save the corrected restaurant table to a temporary CSV in the working directory (index not written).
temp_csv_path = os.path.join(os.curdir, 'rest_df_temp.csv')
rest_df.to_csv(temp_csv_path, index=False)

# Scrape Review Data

In [471]:
# Request headers used when fetching restaurant review pages.
ta_review_headers = dict([
    ('authority', 'www.tripadvisor.com'),
    ('accept', 'text/html, */*'),
    ('x-requested-with', 'XMLHttpRequest'),
    ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/79.0.3945.130 Safari/537.36'),
    ('sec-fetch-site', 'none'),
    ('sec-fetch-mode', 'navigate'),
    ('accept-language', 'en-US,en;q=0.9'),
])

# Form fields posted together with each review-page request.
data = dict(
    filterLang='ALL',
    filterSafety='FALSE',
    waitTime=11,
    changeSet='REVIEW_LIST',
)

In [454]:
def get_page(session, url, data, headers, retries=5, wait=5):
    """POST to `url` and return the response body parsed with BeautifulSoup.

    Parameters
    ----------
    session : object with a requests-style ``post`` method
        Session used to send the request.
    url : str
        Address to post to.
    data : dict
        Form fields sent in the request body.
    headers : dict
        HTTP headers sent with the request.
    retries : int, optional
        Maximum number of attempts (default 5).
    wait : float, optional
        Seconds to sleep between failed attempts (default 5).

    Returns
    -------
    bs4.BeautifulSoup or None
        The parsed page, or None when every attempt raised an exception.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = session.post(url, data=data, headers=headers, timeout=10)
            page = bs4.BeautifulSoup(resp.text)
            time.sleep(.5)  # brief pause after every successful request
            return page
        except Exception as err:
            # Report the URL being fetched; the previous message referenced an
            # undefined name, which turned every failure into a NameError.
            print('attempt ' + str(attempt) + ' failed for ' + url + ': ' + repr(err))
            if attempt < retries:
                time.sleep(wait)  # back off before trying again
    return None

In [None]:
ajax_url = 'https://www.tripadvisor.com/OverlayWidgetAjax'
session = requests.Session()
# Greedy match of everything up to and including the last 'Reviews-' of a
# restaurant URL; the page offset ('or10-', 'or20-', ...) is inserted right
# after this prefix to address later review pages.
regex = re.compile(r'(.)*d(.)*Reviews-')

# (class string of the rating bubble, star rating), checked in this order
rating_classes = [
    ('ui_bubble_rating bubble_50', 5),
    ('ui_bubble_rating bubble_40', 4),
    ('ui_bubble_rating bubble_30', 3),
    ('ui_bubble_rating bubble_20', 2),
    ('ui_bubble_rating bubble_10', 1),
]


def _parse_review(review):
    """Extract the per-review fields from one expanded 'reviewSelector' div.

    Returns [screenname, member_id, review_language, user_loc, rating,
    review_date, review_quote, review_text]. Missing user details are
    filled with '(null)' and a missing rating with 0; a missing date,
    quote or text raises, as no fallback is defined for them.
    """
    try:
        user_profile = review.find('span', class_='taLnk ')
        screenname = user_profile['data-screenname']
        member_id = user_profile['data-memberid']
        review_language = user_profile['data-language']
    except Exception:
        # Fallback markup: name and member id are read from other elements.
        try:
            screenname = next(review.find('div', class_='info_text pointer_cursor').children).text
            member_id = review.find('div', class_='memberOverlayLink clickable')['id'].replace('-', '_').split('_')[1]
            review_language = '(null)'
        except Exception:
            screenname = '(null)'
            member_id = '(null)'
            review_language = '(null)'

    # get location
    user_loc_div = review.find('div', class_='userLoc')
    user_loc = '(null)' if user_loc_div is None else user_loc_div.text

    # get rating: the first bubble class present decides, 0 when none is found
    rating = 0
    for class_name, stars in rating_classes:
        if review.find('span', class_=class_name) is not None:
            rating = stars
            break

    review_date = review.find('span', class_='ratingDate')['title']
    review_quote = review.find('div', class_='quote').find('span', class_='noQuotes').text
    review_text = review.find('p', class_='partial_entry').text

    return [screenname, member_id, review_language, user_loc, rating,
            review_date, review_quote, review_text]


start = time.time()
print('Start: ' + time.ctime())

# export
export_folder = os.path.join('.', 'data', '3_tripadvisor-raw')
# makedirs also creates a missing parent folder and tolerates an existing one
os.makedirs(export_folder, exist_ok=True)

filename = os.path.join(export_folder, 'reviews.csv')

# create file and write headers
# newline='' is what the csv module expects of its file object; utf-8 keeps
# the output independent of the platform's default encoding.
with open(filename, 'w', newline='', encoding='utf-8') as export_csv:
    wr = csv.writer(export_csv, quoting=csv.QUOTE_ALL)
    wr.writerow(['city_name', 'rest_name', 'screenname', 'member_id', 'review_language',
                 'user_loc', 'rating', 'review_date', 'review_quote', 'review_text'])

    # scrape reviews for each restaurant (iterating values, so any index works)
    for city_name, rest_name, url in zip(rest_df['city_name'], rest_df['rest_name'], rest_df['ta_url']):
        # Skip rows without a usable URL: NaN (no TripAdvisor page) as well as
        # the 'n/a' placeholder left behind by a failed lookup.
        mo = regex.search(url) if isinstance(url, str) else None
        if mo is None:
            continue

        url_begin = mo.group()
        url_end = url.split(url_begin)[-1]

        page = get_page(session, url, data=data, headers=ta_review_headers)
        if page is None:
            print('Could not load ' + url + '; skipping ' + rest_name)
            continue

        # Get last page of reviews from page. Without a pagination link there
        # is still the page that was just loaded, so count one page.
        last_page_link = page.find('a', class_='pageNum last ')
        if last_page_link:
            last_page_num = int(last_page_link.text)
        else:
            last_page_num = 1

        # create list to store review ids
        review_id_list = []

        # for each page, store review ids
        for page_num in range(0, last_page_num * 10, 10):
            if page_num > 0:
                # offset 0 is the page already loaded above; only later pages are fetched
                page_num_code = "or" + str(page_num) + "-"
                url = url_begin + page_num_code + url_end
                page = get_page(session, url, data=data, headers=ta_review_headers)
                if page is None:
                    print('Could not load ' + url + '; page skipped')
                    continue

            for review in page.find_all('div', class_='review-container'):
                review_id_list.append(review.find('div', class_='reviewSelector')['data-reviewid'])

        if review_id_list:
            review_id_str = ','.join(review_id_list)

            # create data and headers for ajax request
            expand_rev_data = {
                'Mode': 'EXPANDED_HOTEL_REVIEWS_RESP',
                'reviews': review_id_str,
                'contextChoice': 'DETAIL',
                'loadMtHeader': 'true',
            }

            expand_rev_headers = {
                'authority': 'www.tripadvisor.com',
                'accept': 'text/html, */*; q=0.01',
                'x-requested-with': 'XMLHttpRequest',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
                'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'sec-fetch-site': 'same-origin',
                'sec-fetch-mode': 'cors',
                'referer': url,
                'accept-language': 'en-US,en;q=0.9',
            }

            expanded = get_page(session, ajax_url, data=expand_rev_data, headers=expand_rev_headers)
            if expanded is None:
                print('Could not load expanded reviews for ' + rest_name)
            else:
                for review in expanded.find_all('div', class_="reviewSelector"):
                    # write row to file
                    wr.writerow([city_name, rest_name] + _parse_review(review))

        # message
        print('Scraped reviews for ' + rest_name
              + ', lastpage: ' + str(last_page_num) + '. '
              + str(round((time.time() - start) / 60, 1)) + ' min elapsed.')

Start: Wed Feb  5 21:01:33 2020
Scraped reviews for 42 Grams, lastpage: 3. 0.1 min elapsed.
Scraped reviews for Acadia, lastpage: 18. 0.4 min elapsed.
