# In [None]:
"""
Run this script from the terminal, passing the restaurant page URL as the only argument: python tripadvisor_restaurant.py "https://www.tripadvisor.com/Restaurant_Review-g293986-d7267482-Reviews-Brisket-Amman_Amman_Governorate.html"
(replace the URL with the page of each restaurant you want to scrape)

Right now the scraped info is returned as nested dictionaries and saved to tripadvisor_restaurant_scraped_data.json; it can easily be converted into a dataframe.
"""


from lxml import html
import requests
from collections import OrderedDict
import pprint
import json
import argparse

def _join_text(fragments, sep=''):
    """Join XPath text fragments with *sep* and strip; None if nothing matched."""
    return sep.join(fragments).strip() if fragments else None


def _squash_text(fragments):
    """Join fragments with single spaces, collapsing whitespace runs; None if nothing matched."""
    return ' '.join(' '.join(fragments).split()) if fragments else None


def _collect_counts(items, label_xpath, count_xpath, first_word_only):
    """Build an ordered label -> count mapping from a list of ``<li>`` elements.

    Newlines are removed from both label and count.  When *first_word_only*
    is true the label is cut at its first space.
    """
    counts = OrderedDict()
    for item in items:
        label = ''.join(item.xpath(label_xpath)).replace('\n', '')
        if first_word_only:
            # NOTE(review): labels that share a first word overwrite each
            # other here -- confirm against the live page markup.
            label = label.split(' ')[0]
        counts[label] = ''.join(item.xpath(count_xpath)).replace('\n', '')
    return counts


def parse(url, timeout=30):
    """Download a Tripadvisor restaurant page and extract its details.

    Args:
        url: Address of the restaurant review page.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys ``address`` (itself a dict with
        ``street_address``, ``locality``, ``zipcode`` and ``country``),
        ``ratings``, ``travelers``, ``times of year``, ``languages``
        (ordered label -> count mappings), ``official_description``,
        ``rating``, ``review_count``, ``name``, ``rank``, ``details`` and
        ``location``.  Scalar fields are ``None`` when their XPath matched
        nothing.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    print("Fetching "+url)
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page into a
    # result full of None values.
    response.raise_for_status()
    tree = html.fromstring(response.text)

    XPATH_RATING = '//div[@id="ratingFilter"]//ul//li'
    XPATH_TRAVELER = '//div[@class="col segment extraWidth"]//ul//li'
    XPATH_TIME = '//div[@class="col season extraWidth"]//ul/li'
    XPATH_LANGUAGE = '//div[@class="col language extraWidth"]//ul/li'
    XPATH_DETAILS = '//div[contains(@class,"details_tab")]//div[contains(@class, "table_section")]//text()'
    XPATH_NAME = '//h1[@property="name"]//text()'
    XPATH_HOTEL_RATING = '//span[@property="ratingValue"]//@content'
    XPATH_REVIEWS = '//a[@property="reviewCount"]/@content'
    XPATH_RANK = '//div[@class="slim_ranking"]//text()'
    XPATH_STREET_ADDRESS = "//div[@class='header_container']//span[@class='street-address']//text()"
    XPATH_LOCALITY = '//div[@class="header_container"]//span[@property="addressLocality"]//text()'
    XPATH_ZIP = '//div[@class="header_container"]//span[@property="postalCode"]//text()'
    XPATH_COUNTRY = '//div[@class="header_container"]//span[@property="addressCountry"]//@content'
    XPATH_OFFICIAL_DESCRIPTION = '//div[contains(@class,"additional_info")]//span[contains(@class,"tabs_descriptive_text")]//text()'
    XPATH_LOCATION = '//div[@class="mapContainer"]//text()'

    # Relative paths evaluated against each <li> of a filter list.
    XPATH_RATING_KEY = './/div[@class="row_label"]//text()'
    XPATH_RATING_VALUE = './/span[@class="row_bar"]/following-sibling::span//text()'
    XPATH_FILTER_LABEL = './/label//text()'
    XPATH_FILTER_COUNT = './/span//text()'

    # Rating labels are kept whole; the other filter labels keep only their
    # first space-separated word.
    ratings_dict = _collect_counts(
        tree.xpath(XPATH_RATING), XPATH_RATING_KEY, XPATH_RATING_VALUE,
        first_word_only=False)
    travelers_dict = _collect_counts(
        tree.xpath(XPATH_TRAVELER), XPATH_FILTER_LABEL, XPATH_FILTER_COUNT,
        first_word_only=True)
    times_dict = _collect_counts(
        tree.xpath(XPATH_TIME), XPATH_FILTER_LABEL, XPATH_FILTER_COUNT,
        first_word_only=True)
    languages_dict = _collect_counts(
        tree.xpath(XPATH_LANGUAGE), XPATH_FILTER_LABEL, XPATH_FILTER_COUNT,
        first_word_only=True)

    address = {
        'street_address': _join_text(tree.xpath(XPATH_STREET_ADDRESS), ' '),
        'locality': _join_text(tree.xpath(XPATH_LOCALITY), ' '),
        'zipcode': _join_text(tree.xpath(XPATH_ZIP)),
        'country': _join_text(tree.xpath(XPATH_COUNTRY), ' '),
    }

    # Key order is preserved in the JSON output, so keep it stable.
    data = {
        'address': address,
        'ratings': ratings_dict,
        'travelers': travelers_dict,
        'times of year': times_dict,
        'languages': languages_dict,
        'official_description': _squash_text(tree.xpath(XPATH_OFFICIAL_DESCRIPTION)),
        'rating': _join_text(tree.xpath(XPATH_HOTEL_RATING)),
        'review_count': _join_text(tree.xpath(XPATH_REVIEWS)),
        'name': _join_text(tree.xpath(XPATH_NAME)),
        'rank': _join_text(tree.xpath(XPATH_RANK)),
        'details': _squash_text(tree.xpath(XPATH_DETAILS)),
        'location': _squash_text(tree.xpath(XPATH_LOCATION)),
    }

    return data

if __name__ == '__main__':
    # Command-line entry point: scrape the restaurant page given as the single
    # positional argument and store the result as indented JSON in the
    # current working directory.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('url', help='Tripadvisor restaurant url')
    restaurant_url = arg_parser.parse_args().url

    result = parse(restaurant_url)

    with open('tripadvisor_restaurant_scraped_data.json', 'w') as out_file:
        json.dump(result, out_file, indent=4)