In [2]:
import math
import sys
import fileinput
from lxml import html  
import unicodecsv as csv
import requests
from time import sleep
import re
import argparse
import pandas as pd
import numpy as np
from urllib.parse import quote, unquote
import re, urllib

In [3]:
def GetParser(url):
    """Download ``url`` and return the page parsed by ``html.fromstring``.

    The request is issued with ``requests.get`` and the response body
    (``.text``) is handed to the parser unchanged; the HTTP status code is
    not inspected.
    """
    page_source = requests.get(url).text
    return html.fromstring(page_source)

In [25]:
def BusinessInfoScrapper(parser):
    """Extract the business details from a parsed Yelp business page.

    Parameters
    ----------
    parser : object exposing ``xpath(query)``
        The parsed page (for example the element returned by
        ``html.fromstring``).

    Returns
    -------
    dict
        ``working_hours``, ``info`` and ``ratings_histogram`` are lists of
        single-entry dicts; the remaining keys are ``name``, ``phone``,
        ``ratings``, ``address``, ``health_rating``, ``price_range``,
        ``claimed_status``, ``reviews``, ``category``, ``website``,
        ``latitude``, ``longitude``, ``neighborhood`` and
        ``permanently_closed`` (``1`` when the closure alert is present,
        otherwise ``0``).  A field that is missing from the page, or whose
        text cannot be parsed, falls back to ``''`` (``0`` for ``ratings``)
        instead of raising.
    """
    raw_name = parser.xpath("//h1[contains(@class,'page-title')]//text()")
    raw_claimed = parser.xpath("//span[contains(@class,'claim-status_icon--claimed')]/parent::div/text()")
    raw_reviews = parser.xpath("//div[contains(@class,'biz-main-info')]//span[contains(@class,'review-count rating-qualifier')]//text()")
    raw_category  = parser.xpath('//div[contains(@class,"biz-page-header")]//span[@class="category-str-list"]//a/text()')
    hours_table = parser.xpath("//table[contains(@class,'hours-table')]//tr")
    details_table = parser.xpath("//div[@class='short-def-list']//dl")
    raw_map_link = parser.xpath("//a[@class='biz-map-directions']/img/@src")
    raw_phone = parser.xpath(".//span[@class='biz-phone']//text()")
    raw_address = parser.xpath('//div[@class="mapbox-text"]//div[contains(@class,"map-box-address")]//text()')
    raw_wbsite_link = parser.xpath("//span[contains(@class,'biz-website')]/a/@href")
    raw_price_range = parser.xpath("//dd[contains(@class,'price-description')]//text()")
    raw_health_rating = parser.xpath("//dd[contains(@class,'health-score-description')]//text()")
    rating_histogram = parser.xpath("//table[contains(@class,'histogram')]//tr[contains(@class,'histogram_row')]")
    raw_ratings = parser.xpath("//div[contains(@class,'biz-page-header')]//div[contains(@class,'rating')]/@title")
    raw_neighborhood = parser.xpath("//div[@class='map-box-address u-space-l4']/span[@class='neighborhood-str-list']//text()")
    report = parser.xpath('//p[contains(@class,"alert-message text-centered")]/b/text()')

    def text_of(fragments):
        # Concatenate the text nodes of one element and trim the padding.
        return ''.join(fragments).strip()

    # One {day: opening hours} entry per row of the hours table.
    working_hours = []
    for hours in hours_table:
        day = text_of(hours.xpath(".//th//text()"))
        timing = text_of(hours.xpath("./td//text()"))
        working_hours.append({day: timing})

    # One {label: value} entry per definition list in the details box.
    info = []
    for details in details_table:
        description_key = text_of(details.xpath('.//dt//text()'))
        description_value = text_of(details.xpath('.//dd//text()'))
        info.append({description_key: description_value})

    # One {stars: count} entry per histogram row.  Rows whose label does not
    # start with a digit, or whose count holds no digits, are skipped rather
    # than aborting the whole scrape.
    ratings_histogram = []
    for histogram_row in rating_histogram:
        rating_key = text_of(histogram_row.xpath(".//th//text()"))
        rating_value = text_of(histogram_row.xpath(".//td[@class='histogram_count']//text()"))
        count_digits = re.sub(r"[^0-9]", "", rating_value)  # tolerates "1,234"
        if not re.match(r"[0-9]", rating_key) or not count_digits:
            continue
        ratings_histogram.append({int(rating_key[0]): int(count_digits)})

    name = text_of(raw_name)
    phone = text_of(raw_phone)
    address = ' '.join(' '.join(raw_address).split())  # collapse runs of whitespace
    health_rating = text_of(raw_health_rating)
    price_range = text_of(raw_price_range)
    claimed_status = text_of(raw_claimed)
    category = ','.join(raw_category)
    cleaned_ratings = text_of(raw_ratings)
    neighborhood = text_of(raw_neighborhood)

    # The website anchor is a redirect link; the real URL is its ``url``
    # query parameter.
    website = ''
    if raw_wbsite_link:
        decoded_raw_website_link = urllib.parse.unquote(raw_wbsite_link[0])
        redirect = re.search(r"biz_redir\?url=(.*)&website_link", decoded_raw_website_link)
        if redirect:
            website = redirect.group(1)

    # The coordinates are read from the "<lat>,<lon>" pair embedded in the
    # map image URL.
    latitude = ''
    longitude = ''
    if raw_map_link:
        decoded_map_url = urllib.parse.unquote(raw_map_link[0])
        coordinates = re.search(r"([+-]?\d+\.\d+),([+-]?\d+\.\d+)", decoded_map_url)
        if coordinates:
            latitude = float(coordinates.group(1))
            longitude = float(coordinates.group(2))

    # Accepts "4.5 star rating", "5 star rating" and a comma decimal ("4,5").
    ratings = 0
    if raw_ratings:
        rating_match = re.search(r"\d+(?:[.,]\d+)?", cleaned_ratings)
        if rating_match:
            ratings = float(rating_match.group().replace(',', '.'))

    # "123 reviews", "1 review" and "1,234 reviews" all reduce to an int.
    reviews = ''
    if raw_reviews:
        count_match = re.search(r"\d[\d,]*", ''.join(raw_reviews))
        if count_match:
            reviews = int(count_match.group().replace(',', ''))

    permanently_closed = 0 if report == [] else 1

    data={'working_hours':working_hours,
        'info':info,
        'ratings_histogram':ratings_histogram,
        'name':name,
        'phone':phone,
        'ratings':ratings,
        'address':address,
        'health_rating':health_rating,
        'price_range':price_range,
        'claimed_status':claimed_status,
        'reviews':reviews,
        'category':category,
        'website':website,
        'latitude':latitude,
        'longitude':longitude,
        'neighborhood': neighborhood,
        'permanently_closed': permanently_closed
         }
    return data

In [44]:
def GetAllReivews(parser):
    '''Given the parsed first page of a restaurant on Yelp, return all of its reviews.

    Parameters
    ----------
    parser : object exposing ``xpath(query)``
        The parsed first review page.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (converted with ``pd.to_datetime``), ``star``
        (float) and ``text``; rows are in page order with a fresh 0..n-1
        index.

    Pagination is followed for as long as the pager text contains ``Next``:
    the next page is fetched with ``GetParser`` followed by a one-second
    pause.  The pages are collected in a loop and concatenated once, instead
    of recursing and calling ``DataFrame.append`` (removed in pandas 2.0).
    '''
    pages = []
    while True:
        review_dict = {'date': [], 'star': [], 'text': []}

        review_dates = parser.xpath("//div[@class='review-content']//span[@class='rating-qualifier']")
        for d in review_dates:
            # Only the first line of the element's text is kept as the date.
            date = ''.join(d.xpath(".//text()")).strip().split('\n')[0]
            review_dict['date'].append(date)

        review_stars = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/div[@class='biz-rating biz-rating-large clearfix']")
        for s in review_stars:
            # The title attribute reads like "4.0 star rating".
            star = float(''.join(s.xpath(".//@title")).strip().replace(' star rating',''))
            review_dict['star'].append(star)

        review_texts = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/p")
        for t in review_texts:
            text = ' '.join(t.xpath(".//text()"))
            review_dict['text'].append(text)

        # Building the frame per page keeps the original check that the three
        # columns of a page have the same length (pandas raises otherwise).
        review = pd.DataFrame(review_dict)
        review['date'] = pd.to_datetime(review['date'])
        pages.append(review)

        review_pages_section = parser.xpath("//div[@class='arrange arrange--stack arrange--baseline arrange--6']//text()")
        review_pages = [item for item in [e.replace('\n','').replace(' ','') for e in review_pages_section] if item != '' ]
        if 'Next' not in review_pages:
            break

        next_links = parser.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href')
        if not next_links:
            # The pager mentions "Next" but carries no link: stop here
            # rather than raising IndexError.
            break
        parser = GetParser(next_links[0])
        sleep(1)

    if len(pages) == 1:
        return pages[0]
    return pd.concat(pages, ignore_index=True)

In [27]:
# Display the current working directory of the kernel.
import os
os.getcwd()

'/Users/cysung/GA/projects/project_capstone/part_02/Gayot'

In [37]:
def GetAllRestaurants(path='./',listfile='./gayot_list_nochain.txt', start=497, end=None, GetReviews=True):
    """Scrape the Yelp page of every restaurant named in ``listfile``.

    Parameters
    ----------
    path : str
        Prefix prepended verbatim to every output file name (so a directory
        must end with a path separator).
    listfile : str
        Text file with one restaurant name per line.
    start, end : int or None
        Slice of the list to process; ``end=None`` means "to the end".  The
        position in the full list is used as the file-name prefix.
    GetReviews : bool
        When true, also download every review page via ``GetAllReivews``.

    For each restaurant the name is turned into a Yelp slug, the page
    ``https://www.yelp.com/biz/<slug>-chicago`` is requested, and on HTTP 200
    the scraped info is written to ``<path><n>_<slug>.csv`` (plus
    ``<path><n>_<slug>_review.csv`` when reviews are requested).  Any other
    status is appended to ``did_not_work.txt`` in the working directory; that
    log is emptied first when ``start == 0``.
    """
    import random
    yelp_url = 'https://www.yelp.com/biz/'
    # Read the list through a context manager so the handle is closed.
    with open(listfile) as list_handle:
        restaurants = [line.strip() for line in list_handle]
    if end is None:
        end = len(restaurants)
    if start == 0:
        # A run from the beginning starts with an empty failure log.
        with open("did_not_work.txt", "w+"):
            pass
    for number, r in enumerate(restaurants[start:end], start=start):
        keyword = r.lower().replace('\'','').replace('&', 'and').replace(',','').replace(' ','-')
        url = yelp_url + keyword + '-chicago'
        response = requests.get(url)
        sleep(random.choice([1,2]))  # pause 1-2 s between requests
        if response.status_code == 200:
            parsed = html.fromstring(response.text)
            data = BusinessInfoScrapper(parsed)
            # Wrap each value in a list so the dict becomes a one-row frame.
            restaurant = pd.DataFrame({k:[v] for k, v in data.items()})
            restaurant['url'] = url
            restaurant.to_csv(path+str(number)+'_'+keyword+'.csv', index=False)
            if GetReviews:
                reviews = GetAllReivews(parsed).sort_values(by='date', ascending=False).reset_index(drop=True)
                reviews.to_csv(path+str(number)+'_'+keyword+'_review.csv', index=False)
            print('got '+str(number)+': '+url)
        else:
            with open("did_not_work.txt","a+") as file:
                file.write(str(number)+': '+keyword+'\n')
            print('didn\'t get '+str(number)+': '+url)

In [33]:
# Process the list from entry 497 to the end, saving business info only (no reviews).
GetAllRestaurants(path='./',listfile='./gayot_list_nochain.txt', start=497, end=None, GetReviews=False)

didn't get 497: https://www.yelp.com/biz/sweet-water-chicago
got 498: https://www.yelp.com/biz/sweets-and-savories-chicago
got 499: https://www.yelp.com/biz/szechwan-east-chicago
got 500: https://www.yelp.com/biz/takashi-chicago
got 501: https://www.yelp.com/biz/tallulah-chicago
got 502: https://www.yelp.com/biz/the-tasting-room-chicago
didn't get 503: https://www.yelp.com/biz/tasty-dog-chicago
got 504: https://www.yelp.com/biz/tavernita-chicago
didn't get 505: https://www.yelp.com/biz/taylor-rose-chicago
didn't get 506: https://www.yelp.com/biz/technicolor-kitchen-chicago
got 507: https://www.yelp.com/biz/telegraph-chicago
got 508: https://www.yelp.com/biz/tepatulco-chicago
didn't get 509: https://www.yelp.com/biz/thai-classic-chicago
didn't get 510: https://www.yelp.com/biz/302-west-chicago
didn't get 511: https://www.yelp.com/biz/thyme-chicago
didn't get 512: https://www.yelp.com/biz/tibet-cafe-chicago
got 513: https://www.yelp.com/biz/timo-chicago
didn't get 514: https://www.yelp.c

In [41]:
# Load the list of restaurants to retry, one stripped line per entry.
# A context manager is used so the file handle is closed after reading.
with open('updated_list.txt') as updated_list_file:
    updated_restaurants = [line.strip() for line in updated_list_file]

In [56]:
import random
# ``listdir`` is imported here because this cell uses it before the notebook's
# later ``from os import listdir`` cell; without it a fresh
# Restart & Run All stops with a NameError.
from os import listdir

yelp_url = 'https://www.yelp.com/biz/'
# Review files already present in the working directory are not fetched again.
reviewfiles = [f for f in listdir('./') if '_review.csv' in f]

# Each entry is "<number>: <name>"; processing resumes at entry 55.
for r in updated_restaurants[55:]:
    fields = r.split(': ')
    if len(fields) < 2:
        # Blank or malformed line: report it instead of failing on fields[1].
        print('Skipped malformed entry:', repr(r))
        continue
    number = fields[0]
    keyword = fields[1].lower().replace(' ', '-').replace('\'','').replace('&', 'and').replace(',','')
    url = yelp_url + keyword + '-chicago'
    response = requests.get(url)
    sleep(1)
    if response.status_code == 200:
        parsed = html.fromstring(response.text)
        data = BusinessInfoScrapper(parsed)
        # Wrap each value in a list so the dict becomes a one-row frame.
        restaurant = pd.DataFrame({k:[v] for k, v in data.items()})
        restaurant.to_csv(str(number)+'_'+keyword+'.csv', index=False)
        if str(number)+'_'+keyword+'_review.csv' not in reviewfiles:
            reviews = GetAllReivews(parsed).sort_values(by='date', ascending=False).reset_index(drop=True)
            reviews.to_csv(str(number)+'_'+keyword+'_review.csv', index=False)
        print('Got ', number, ':', keyword)
    else:
        print('Didn\'t get ', number, ':', keyword)

Got  296 : kyoto-sushi-steak-seafood
Got  302 : la-donna-italian-cuisine
Got  305 : la-strada-ristorante
Got  308 : la-vita-restaurant
Got  310 : landmark-grill
Got  322 : lindo-mexico-restauant
Got  332 : magnums-prime-steakhouse
Got  346 : mi-peru-restaurant-and-club
Got  352 : mk-restaurant
Got  358 : moon-palace-restaurant
Got  360 : moti-mahal-indian-restaurant
Got  368 : cafe-nhu-hoa
Got  371 : rylons-smokehouse
Didn't get  375 : nuevo-leon-restaurant-chicago-2
Got  383 : one-six-one
Got  388 : leons-bar-b-q
Got  399 : papagus-greek-taverna
Got  404 : the-parthenon
Got  417 : p-j-clarkes
Got  427 : province
Got  455 : sabor-a-cuba-restaurant
Got  481 : terragusto
Got  486 : pump-room
Got  496 : swank-frank-inc
Got  506 : veggie-bite
Got  509 : thai-classic-restaurant
Got  511 : thyme-restaurant
Got  519 : tournesol-french-bistro
Got  522 : treat-restaurant
Got  526 : tsunami-japanese-restaurant
Got  527 : tuscany-restaurant
Got  545 : x-o-chicago
Got  551 : thong-thai
Got  552 : 

In [57]:
def GetRestaurant(number=None, keyword=None, postfix=''):
    """Scrape a single restaurant's Yelp page and reviews into CSV files.

    Parameters
    ----------
    number : int or str
        Prefix used in the output file names.
    keyword : str
        Yelp slug of the restaurant, without the ``-chicago`` suffix.
    postfix : str or None
        Extra text appended after ``-chicago`` (for example ``'-2'``).
        ``None`` is treated like ``''``; previously it raised ``TypeError``
        while the URL was being assembled.

    On HTTP 200 the business info is written to ``<number>_<keyword>.csv`` and
    the reviews, newest first, to ``<number>_<keyword>_review.csv`` in the
    working directory.  Any other status only prints a message.
    """
    yelp_url = 'https://www.yelp.com/biz/'
    url = yelp_url + keyword + '-chicago' + (postfix or '')
    response = requests.get(url)
    sleep(1)
    if response.status_code == 200:
        parsed = html.fromstring(response.text)
        data = BusinessInfoScrapper(parsed)
        # Wrap each value in a list so the dict becomes a one-row frame.
        restaurant = pd.DataFrame({k:[v] for k, v in data.items()})
        restaurant.to_csv(str(number)+'_'+keyword+'.csv', index=False)
        reviews = GetAllReivews(parsed).sort_values(by='date', ascending=False).reset_index(drop=True)
        reviews.to_csv(str(number)+'_'+keyword+'_review.csv', index=False)
        print('Got ', number, ':', keyword)
    else:
        print('Didn\'t get ', number, ':', keyword)

In [13]:
# No URL suffix is needed for this entry.  Pass '' rather than None: the
# GetRestaurant defined above concatenates postfix onto the URL string.
GetRestaurant(number=371, keyword='rylons-smokehouse', postfix='')

Got  371 : rylons-smokehouse


In [59]:
# Fetch entry 375 with '-2' appended to the URL.
GetRestaurant(number=375, keyword='nuevo-leon-restaurant', postfix='-2')

Got  375 : nuevo-leon-restaurant


In [60]:
# Fetch entry 20 with '-2' appended to the URL.
GetRestaurant(number=20, keyword='ambria', postfix='-2')

Got  20 : ambria


In [64]:
# Fetch entry 482 with no extra URL suffix.
GetRestaurant(number=482, keyword='spring-restaurant', postfix='')

Got  482 : spring-restaurant


In [65]:
# Fetch entry 198 with '-2' appended to the URL.
GetRestaurant(number=198, keyword='fahrenheit', postfix='-2')

Got  198 : fahrenheit


In [45]:
from os import listdir
from os.path import isfile, join

In [48]:
# Business-info files: names in the working directory that contain '.csv'
# but not '_review.csv'.
infofiles = [
    entry
    for entry in listdir('./')
    if '.csv' in entry and '_review.csv' not in entry
]

In [49]:
# Display the collected business-info file names.
infofiles

['0_2-sparrows.csv',
 '100_coq-d-or-restaurant-and-lounge.csv',
 '101_cafe-laguardia.csv',
 '103_cafe-luciano.csv',
 '104_cafe-matou.csv',
 '105_cafe-chien.csv',
 '106_cafe-selmarie.csv',
 '107_caffe-baci.csv',
 '108_caliterra.csv',
 '109_cambridge-house-ltd.csv',
 '10_adobo-grill.csv',
 '110_cannellas-on-grand.csv',
 '113_cantina-1910.csv',
 '114_calo-ristorante.csv',
 '115_filippos-ristorante.csv',
 '116_carmichaels-chicago-steakhouse.csv',
 '117_carriage-house.csv',
 '118_centro.csv',
 '119_cereality-cereal-bar-and-cafe.csv',
 '11_aigre-doux.csv',
 '120_ceres-table.csv',
 '121_cerise.csv',
 '122_chalkboard.csv',
 '123_charlies-ale-house.csv',
 '124_charlies-on-leavitt.csv',
 '125_charlie-trotters.csv',
 '126_chicago-flat-sammies.csv',
 '127_chickpea.csv',
 '128_chilapan.csv',
 '129_chilpancingo.csv',
 '12_aja.csv',
 '130_china-grill.csv',
 '131_chizakaya-japanese-pub.csv',
 '133_cibo-matto.csv',
 '134_cicchetti.csv',
 '135_city-tavern.csv',
 '137_nouveau-tavern.csv',
 '139_club-roya

In [50]:
# Review CSVs available in the sibling '../Gayot_copy/' directory.
reviewfiles = [
    entry
    for entry in listdir('../Gayot_copy/')
    if '_review.csv' in entry
]

In [51]:
# Display the review file names found in '../Gayot_copy/'.
reviewfiles

['0_2-sparrows_review.csv',
 '100_coq-d-or-restaurant-and-lounge_review.csv',
 '101_cafe-laguardia_review.csv',
 '103_cafe-luciano_review.csv',
 '104_cafe-matou_review.csv',
 '105_cafe-chien_review.csv',
 '106_cafe-selmarie_review.csv',
 '107_caffe-baci_review.csv',
 '10_adobo-grill_review.csv',
 '110_cannellas-on-grand_review.csv',
 '113_cantina-1910_review.csv',
 '114_calo-ristorante_review.csv',
 '116_carmichaels-chicago-steakhouse_review.csv',
 '117_carriage-house_review.csv',
 '118_centro_review.csv',
 '119_cereality-cereal-bar-and-cafe_review.csv',
 '11_aigre-doux_review.csv',
 '120_ceres-table_review.csv',
 '121_cerise_review.csv',
 '122_chalkboard_review.csv',
 '123_charlies-ale-house_review.csv',
 '124_charlies-on-leavitt_review.csv',
 '125_charlie-trotters_review.csv',
 '126_chicago-flat-sammies_review.csv',
 '127_chickpea_review.csv',
 '128_chilapan_review.csv',
 '129_chilpancingo_review.csv',
 '12_aja_review.csv',
 '130_china-grill_review.csv',
 '131_chizakaya-japanese-pub_

In [55]:
import shutil

# For every info file '<n>_<slug>.csv', copy the matching
# '<n>_<slug>_review.csv' from '../Gayot_copy/' into the working directory
# when it exists there.  A set gives constant-time membership checks instead
# of rescanning the list for each info file.
available_reviews = set(reviewfiles)
for file in infofiles:
    review_file = file[:-4]+'_review.csv'  # drop '.csv', add the review suffix
    if review_file in available_reviews:
        source_path = '../Gayot_copy/'+review_file
        shutil.copy2(source_path, './')
        print('want to copy '+source_path)

want to copy ../Gayot_copy/0_2-sparrows_review.csv
want to copy ../Gayot_copy/100_coq-d-or-restaurant-and-lounge_review.csv
want to copy ../Gayot_copy/101_cafe-laguardia_review.csv
want to copy ../Gayot_copy/103_cafe-luciano_review.csv
want to copy ../Gayot_copy/104_cafe-matou_review.csv
want to copy ../Gayot_copy/105_cafe-chien_review.csv
want to copy ../Gayot_copy/106_cafe-selmarie_review.csv
want to copy ../Gayot_copy/107_caffe-baci_review.csv
want to copy ../Gayot_copy/10_adobo-grill_review.csv
want to copy ../Gayot_copy/110_cannellas-on-grand_review.csv
want to copy ../Gayot_copy/113_cantina-1910_review.csv
want to copy ../Gayot_copy/114_calo-ristorante_review.csv
want to copy ../Gayot_copy/116_carmichaels-chicago-steakhouse_review.csv
want to copy ../Gayot_copy/117_carriage-house_review.csv
want to copy ../Gayot_copy/118_centro_review.csv
want to copy ../Gayot_copy/119_cereality-cereal-bar-and-cafe_review.csv
want to copy ../Gayot_copy/11_aigre-doux_review.csv
want to copy ../Gay

want to copy ../Gayot_copy/319_leopold_review.csv
want to copy ../Gayot_copy/31_il-fiasco_review.csv
want to copy ../Gayot_copy/321_lexis_review.csv
want to copy ../Gayot_copy/323_little-bucharest_review.csv
want to copy ../Gayot_copy/326_luna_review.csv
want to copy ../Gayot_copy/327_macello_review.csv
want to copy ../Gayot_copy/32_august-moon_review.csv
want to copy ../Gayot_copy/331_mado_review.csv
want to copy ../Gayot_copy/334_maison-brasserie_review.csv
want to copy ../Gayot_copy/335_mambo-grill_review.csv
want to copy ../Gayot_copy/336_marche_review.csv
want to copy ../Gayot_copy/337_marigold_review.csv
want to copy ../Gayot_copy/338_marysol_review.csv
want to copy ../Gayot_copy/339_mas_review.csv
want to copy ../Gayot_copy/343_menagerie_review.csv
want to copy ../Gayot_copy/344_meritage-cafe-and-wine-bar_review.csv
want to copy ../Gayot_copy/345_meztiso-latin-bistro-and-wine-bar_review.csv
want to copy ../Gayot_copy/34_avenue-m_review.csv
want to copy ../Gayot_copy/359_mortons-