# Scraper Feature
## Scraping TripAdvisor Reviews for further analysis and processing

This feature takes the URL of a TripAdvisor venue's page as input, scrapes all the reviews for that venue, and stores the data for further processing.

Instructions:
1. Update the `dataset_name_stem` in the Inputs & Setup section below to the name the dataset should be referred to by throughout the tool. For example, if you are scraping and analysing reviews for the Three Broomsticks venue in Diagon Alley, name the dataset `3b` and update the `dataset` variable in the `config.toml` file before doing further processing. Make sure the folder for the dataset is set up with the proper name and contains an appropriate `line-config.toml` file as well.
2. Update `start_urls` to include the URLs that should be scraped. Either a single venue URL or multiple URLs may be listed.

## Inputs & Setup

In [186]:
import csv
import io
import os
import re
import time
import webbrowser

import requests
from bs4 import BeautifulSoup

# Setup Variables

# Short name of the dataset. The scraping cell uses it as the output
# directory name and as the stem of the reviews file name.
dataset_name_stem = "3b"

# Review pages to scrape, one entry per URL.
start_urls = [
    "https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida.html",
]

# Passed to scrape(), which appends it to each URL as `filterLang`.
lang = "en"


## TripAdvisor Scraping with BeautifulSoup

In [187]:
def display(content, filename='output.html'):
    """Save raw page bytes to *filename* and open that file in a web browser.

    Args:
        content: Bytes to write (for example ``response.content``).
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'wb') as f:
        f.write(content)

    # Open the browser only after the ``with`` block has closed the file, so
    # the buffered bytes are flushed to disk before the page is loaded.
    webbrowser.open(filename)

In [188]:
def get_soup(session, url, show=False):
    """Download *url* with a GET request and parse the body as HTML.

    Args:
        session: Object exposing ``get(url)``; ``scrape`` passes a
            ``requests.Session``.
        url: Address of the page to fetch.
        show: When true, the raw response bytes are handed to ``display``
            (saved as ``temp.html``) before the status code is checked.

    Returns:
        A ``BeautifulSoup`` tree when the status code is 200. For any other
        status the code is printed and ``None`` is returned.
    """
    response = session.get(url)

    if show:
        display(response.content, 'temp.html')

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    # not OK
    print('[get_soup] status code:', response.status_code)
    return None

In [189]:
def post_soup(session, url, params, show=False):
    """Send *params* to *url* with a POST request and parse the body as HTML.

    Args:
        session: Object exposing ``post(url, data=...)``.
        url: Address to post to.
        params: Form payload, forwarded as the ``data`` argument.
        show: When true, the raw response bytes are handed to ``display``
            (saved as ``temp.html``) before the status code is checked.

    Returns:
        A ``BeautifulSoup`` tree when the status code is 200. For any other
        status the code is printed and ``None`` is returned.
    """
    response = session.post(url, data=params)

    if show:
        display(response.content, 'temp.html')

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    # not OK
    print('[post_soup] status code:', response.status_code)
    return None

In [190]:
def scrape(url, lang='ALL'):
    """Scrape one start URL and return whatever ``parse`` collects for it.

    Args:
        url: Review page URL to start from.
        lang: Value appended to the URL as the ``filterLang`` query
            parameter.

    Returns:
        The result of ``parse`` for the language-filtered URL.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    }

    # One session for the whole scrape keeps cookies (etc.) between requests.
    session = requests.Session()
    session.headers.update(browser_headers)

    filtered_url = url + '?filterLang=' + lang
    return parse(session, filtered_url)

In [191]:
def parse(session, url):
    '''Collect the reviews from every paginated subpage of a venue.

    The first page is fetched to report the review count. Subpages are then
    requested with increasing ``-or<offset>`` suffixes until a subpage
    yields nothing or fewer than 10 reviews.

    Args:
        session: Session object handed through to ``get_soup`` and
            ``parse_reviews``.
        url: Review page URL; its ``.html`` is rewritten to
            ``-or{}.html`` to build the subpage URLs.

    Returns:
        A list of review items, or ``None`` when the first page could not
        be fetched.
    '''
    soup = get_soup(session, url)

    if not soup:
        print('[parse] No Soup:', url)
        return None

    # The count is only printed for tracking, so a missing or unparsable
    # header element must not abort the scrape.
    count_tag = soup.find('span', class_='reviews_header_count')
    if count_tag is None:
        print('[parse] Number of Reviews: unknown (count element not found)')
    else:
        # Drop the first and last character (presumably surrounding
        # parentheses -- confirm against the live page) and the thousands
        # separators before converting.
        count_text = count_tag.text[1:-1].replace(',', '')
        try:
            num_reviews = int(count_text)
        except ValueError:
            print('[parse] Number of Reviews: unknown (could not parse %r)'
                  % count_tag.text)
        else:
            print('[parse] Number of Reviews:', num_reviews)

    url_template = url.replace('.html', '-or{}.html')
    print('[parse] URL Template:', url_template)

    items = []
    offset = 0

    while True:
        subpage_url = url_template.format(offset)

        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break

        items += subpage_items

        # A short page is treated as the last one.
        if len(subpage_items) < 10:
            break

        offset += 10  # This changes how far ahead you skip for reviews

    return items

In [192]:
def get_reviews_ids(soup):
    """Return the review ids found on a page.

    Every ``div`` carrying a ``data-reviewid`` attribute is looked up and
    every second id, starting with the first, is kept.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        A list of id strings, or ``None`` when no matching ``div`` exists.
    """
    tagged_divs = soup.find_all('div', attrs={'data-reviewid': True})
    if not tagged_divs:
        return None

    all_ids = []
    for div in tagged_divs:
        all_ids.append(div.attrs['data-reviewid'])

    # Keep positions 0, 2, 4, ...
    reviews_ids = all_ids[0::2]
    print('[get_reviews_ids] data-reviewid:', reviews_ids)
    return reviews_ids

In [193]:
def get_more(session, reviews_ids):
    """Request the expanded versions of the given reviews in one POST.

    Initial Source: Furas,
    https://stackoverflow.com/questions/47856273/scraping-reviews-from-tripadvisor/47858268#47858268

    Args:
        session: Session object handed through to ``post_soup``.
        reviews_ids: Iterable of review id strings; they are sent
            comma-joined in the ``reviews`` form field.

    Returns:
        Whatever ``post_soup`` returns for the overlay-widget endpoint.
    """
    endpoint = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'

    form_fields = dict([
        ('reviews', ','.join(reviews_ids)),
        ('widgetChoice', 'EXPANDED_HOTEL_REVIEW_HSX'),
        ('haveJses', 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents'),
        ('haveCsses', 'apg-Hotel_Review-in'),
        ('Action', 'install'),
    ])

    return post_soup(session, endpoint, form_fields)

In [194]:
def parse_reviews(session, url):
    '''Get all reviews from one page.

    The page at *url* is fetched to read the venue heading and the review
    ids. The expanded reviews are then requested through ``get_more`` and
    each ``div.reviewSelector`` in that response becomes one item.

    Args:
        session: Session object handed through to ``get_soup`` and
            ``get_more``.
        url: Address of the review subpage.

    Returns:
        A list of dicts with the keys ``venue_name``, ``review_id``,
        ``review_title``, ``review_body``, ``review_date``, ``rating``,
        ``helpful_vote``, ``contributions``, ``uid`` and ``user_location``.
        ``None`` is returned when either request fails or the page holds no
        review ids.
    '''
    print('Parsing Reviews -- url:', url)

    soup = get_soup(session, url)
    if not soup:
        # Error
        print('Parsing Reviews -- no soup:', url)
        return None

    # A page without the heading should not abort the whole scrape.
    heading = soup.find('h1', id='HEADING')
    venue_name = heading.text if heading is not None else ''

    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return None

    soup = get_more(session, reviews_ids)
    if not soup:
        # Error
        print('Parsing Reviews -- no soup:', url)
        return None

    return [
        _parse_review(review, venue_name)
        for review in soup.find_all('div', class_='reviewSelector')
    ]


def _tag_text(tag, default=''):
    '''Return ``tag.text``, or *default* when the lookup found nothing.'''
    return tag.text if tag is not None else default


def _parse_review(review, venue_name):
    '''Build the item dict for one ``div.reviewSelector`` element.

    Fields whose element is absent fall back to ``''`` (``'0'`` for the two
    badge counters), so one incomplete review cannot raise and discard the
    reviews collected so far.
    '''
    # The first badge is stored as contributions, the second as helpful votes.
    badges = review.find_all('span', class_='badgetext')
    contributions = badges[0].text if len(badges) > 0 else '0'
    helpful_vote = badges[1].text if len(badges) > 1 else '0'

    # User Location
    user_loc = _tag_text(review.select_one('div.userLoc strong'))

    # User id: second piece of the element id once split on '_' or '-'.
    uid_tag = review.find(id=re.compile("UID_"))
    uid = re.split(r'[_-]', uid_tag['id'])[1] if uid_tag is not None else ''

    # Review Rating: suffix after the last '_' of the element's second class.
    rating_tag = review.select_one('span.ui_bubble_rating')
    rating_classes = rating_tag.get('class', []) if rating_tag is not None else []
    if len(rating_classes) > 1:
        bubble_rating = rating_classes[1].split('_')[-1]
    else:
        bubble_rating = ''

    # Review ID and title
    review_id = review.attrs.get('data-reviewid', '')
    title_tag = review.select_one('span.noQuotes')
    review_title = title_tag.string if title_tag is not None else ''

    # 'ratingDate' instead of 'relativeDate'
    date_tag = review.find('span', class_='ratingDate')
    review_date = date_tag.get('title', '') if date_tag is not None else ''

    return {
        'venue_name': venue_name,
        'review_id': review_id,
        'review_title': review_title,
        'review_body': _tag_text(review.find('p', class_='partial_entry')),
        'review_date': review_date,
        'rating': bubble_rating,
        'helpful_vote': helpful_vote,
        'contributions': contributions,
        'uid': uid,
        'user_location': user_loc,
    }

## Write out Reviews & Related Metadata

In [195]:
def write_reviews(items, filename='results.dat',
                  headers=('review_body',),
                  mode='w'):
    """Write the selected fields of each item as one tab-separated row.

    No header row is written. Keys of an item that are not listed in
    *headers* are ignored.

    Args:
        items: Iterable of dicts, one per review.
        filename: Path of the output file.
        headers: Field names to write, in column order.
        mode: File mode passed to ``io.open`` (``'w'`` to overwrite,
            ``'a'`` to append).
    """
    print('--- Review Write Start ---')

    # newline='' lets the csv writer control line endings itself; without
    # it, platforms that translate '\n' would emit '\r\r\n'.
    with io.open(filename, mode, encoding="utf-8", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, headers, delimiter='\t',
                                extrasaction='ignore')
        writer.writerows(items)

    print('--- Review Write Finish ---')
    
def write_metas(items, filename='results_meta.dat',
                headers=('venue_name', 'review_id', 'review_title',
                         'review_date', 'rating', 'helpful_vote',
                         'contributions', 'uid', 'user_location'),
                mode='w'):
    """Write the metadata fields of each item as one tab-separated row.

    No header row is written. Keys of an item that are not listed in
    *headers* are ignored.

    Args:
        items: Iterable of dicts, one per review.
        filename: Path of the output file.
        headers: Field names to write, in column order.
        mode: File mode passed to ``io.open`` (``'w'`` to overwrite,
            ``'a'`` to append).
    """
    print('--- Meta Write Start ---')

    # newline='' lets the csv writer control line endings itself; without
    # it, platforms that translate '\n' would emit '\r\r\n'.
    with io.open(filename, mode, encoding="utf-8", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, headers, delimiter='\t',
                                extrasaction='ignore')
        writer.writerows(items)

    print('--- Meta Write Finish ---')


## Actual Scraping

The progress of the scrape is printed below for tracking purposes, and the resulting datasets are stored in their respective directories.

In [196]:
# All start URLs feed the same pair of output files. Only the first
# successful write may truncate them; later URLs append, otherwise each URL
# would overwrite the reviews of the previous one.
output_dir = os.path.join('.', dataset_name_stem)
wrote_any = False

for url in start_urls:

    # get all reviews for 'url' and 'lang'
    items = scrape(url, lang)

    if not items:
        print('No reviews')
        continue

    # write in CSV
    filename = dataset_name_stem
    print('Dataset Name:', filename)

    # Create the dataset folder if needed so a finished scrape is not lost
    # to a missing directory.
    os.makedirs(output_dir, exist_ok=True)

    write_mode = 'a' if wrote_any else 'w'
    write_reviews(items, os.path.join(output_dir, filename + '.dat'), mode=write_mode)
    write_metas(items, os.path.join(output_dir, 'metadata.dat'), mode=write_mode)
    wrote_any = True

[parse] Number of Reviews: 2288
[parse] URL Template: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or{}.html?filterLang=en
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or0.html?filterLang=en
[get_reviews_ids] data-reviewid: ['639770583', '639760552', '639025165', '638955967', '638433354', '638251093', '638020113', '636869474', '636324853', '635714212']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or10.html?filterLang=en
[get_reviews_ids] data-reviewid: ['634372105', '633230165', '632955188', '632944061', '632800780', '632407017', '631387080', '631380543', '630676746', '630605099']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or20.html?filterLang=en
[get_reviews_ids] data-reviewid: ['

[get_reviews_ids] data-reviewid: ['530062465', '529697610', '529258226', '529153845', '529130386', '528289261', '528126634', '528058597', '527912688', '527346966']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or260.html?filterLang=en
[get_reviews_ids] data-reviewid: ['526772844', '526555938', '526085505', '526029920', '522963552', '522153581', '522094646', '521903268', '521557262', '521112452']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or270.html?filterLang=en
[get_reviews_ids] data-reviewid: ['519259032', '518890047', '518820435', '518573588', '518543088', '518536385', '518126723', '517722242', '517554570', '516473801']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or280.html?filterLang=en
[get_reviews_ids] data-reviewid: ['516333791', '5

[get_reviews_ids] data-reviewid: ['451040003', '450548306', '450489548', '450476554', '450125812', '449376469', '449371009', '449320175', '449191216', '448876183']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or520.html?filterLang=en
[get_reviews_ids] data-reviewid: ['448718748', '448301146', '448233676', '448164364', '447654874', '447623624', '447340588', '447213596', '446563778', '445982834']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or530.html?filterLang=en
[get_reviews_ids] data-reviewid: ['445965065', '445961829', '445885898', '445815218', '445432520', '444485463', '443184506', '442977827', '442965537', '442950454']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or540.html?filterLang=en
[get_reviews_ids] data-reviewid: ['442756469', '4

[get_reviews_ids] data-reviewid: ['382709766', '382558647', '382282202', '382255836', '382017633', '382003402', '381578151', '381195879', '381050395', '380822312']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or780.html?filterLang=en
[get_reviews_ids] data-reviewid: ['380742494', '380714234', '380556032', '380229121', '379949457', '379398624', '379315145', '379135782', '379100649', '379045783']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or790.html?filterLang=en
[get_reviews_ids] data-reviewid: ['379038358', '378821669', '378657987', '378467890', '378396616', '378093763', '377921344', '377760538', '377586661', '377443341']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or800.html?filterLang=en
[get_reviews_ids] data-reviewid: ['376825417', '3

[get_reviews_ids] data-reviewid: ['333397309', '333368250', '333297618', '333259504', '333064254', '332735422', '332669482', '332665828', '332390140', '332160179']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1040.html?filterLang=en
[get_reviews_ids] data-reviewid: ['331760360', '331655611', '331336889', '331313606', '331213253', '330910620', '330489216', '330437212', '329966213', '329757159']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1050.html?filterLang=en
[get_reviews_ids] data-reviewid: ['329616068', '329453648', '329050996', '328892824', '328886663', '328465165', '328284172', '328283965', '328268981', '328252936']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1060.html?filterLang=en
[get_reviews_ids] data-reviewid: ['327982646',

[get_reviews_ids] data-reviewid: ['284554202', '284530129', '284496294', '284485679', '284470928', '284469269', '284440692', '284022698', '284018332', '283333912']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1300.html?filterLang=en
[get_reviews_ids] data-reviewid: ['283202345', '283195896', '283149781', '282719250', '282400913', '282244001', '282202279', '282131416', '281970832', '281938805']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1310.html?filterLang=en
[get_reviews_ids] data-reviewid: ['281887790', '281800875', '281705585', '281688500', '281485829', '281442933', '281422081', '281350792', '281172200', '281161167']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1320.html?filterLang=en
[get_reviews_ids] data-reviewid: ['281153015',

[get_reviews_ids] data-reviewid: ['240465292', '240155687', '240025682', '239922600', '239821696', '239688438', '239664231', '239577992', '239552926', '239366317']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1560.html?filterLang=en
[get_reviews_ids] data-reviewid: ['239116875', '238829859', '238134744', '238063720', '238025420', '237951997', '237905628', '237163822', '237159461', '236988690']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1570.html?filterLang=en
[get_reviews_ids] data-reviewid: ['236939430', '236537803', '235732613', '234897188', '234633807', '234433762', '234432099', '234155905', '233647647', '232419475']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1580.html?filterLang=en
[get_reviews_ids] data-reviewid: ['232039566',

[get_reviews_ids] data-reviewid: ['149514639', '149244551', '149018552', '148856441', '148402463', '148399064', '148345362', '148152734', '148049335', '147639348']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1820.html?filterLang=en
[get_reviews_ids] data-reviewid: ['147636736', '147307024', '147231475', '146335740', '146116732', '145406664', '144936978', '144894572', '144790077', '144644653']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1830.html?filterLang=en
[get_reviews_ids] data-reviewid: ['144577924', '144551294', '143468313', '143365752', '143320478', '143185117', '143135427', '143060322', '142678229', '142639546']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or1840.html?filterLang=en
[get_reviews_ids] data-reviewid: ['142630805',

[get_reviews_ids] data-reviewid: ['413961637', '413651291', '412404399', '412301910', '411325197', '410293706', '409516751', '409238377', '408718837', '405351168']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or2080.html?filterLang=en
[get_reviews_ids] data-reviewid: ['405290780', '404954405', '400846784', '400360528', '399576988', '398727088', '397291284', '392750083', '392229477', '388151796']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or2090.html?filterLang=en
[get_reviews_ids] data-reviewid: ['387960412', '387649772', '386547364', '385997944', '385533825', '385354015', '384232761', '382868911', '380019267', '378037772']
Parsing Reviews -- url: https://www.tripadvisor.com/Restaurant_Review-g34515-d1974070-Reviews-Three_Broomsticks-Orlando_Florida-or2100.html?filterLang=en
[get_reviews_ids] data-reviewid: ['374979953',

##### 