In [18]:
import argparse
import fileinput
import math
import random
import re
import sys
import urllib
from time import sleep
from urllib.parse import quote, unquote, urljoin

import numpy as np
import pandas as pd
import requests
import unicodecsv as csv
from lxml import html

In [19]:
def GetParser(url):
    """Download ``url`` and return its body parsed into an lxml HTML tree.

    The response text is handed to ``lxml.html.fromstring`` as-is; the HTTP
    status code is not inspected.
    """
    page_source = requests.get(url).text
    return html.fromstring(page_source)

In [20]:
def _JoinedText(fragments):
    """Concatenate the text fragments returned by an XPath query and trim the result."""
    return ''.join(fragments).strip()


def _KeyValueRows(rows, key_xpath, value_xpath):
    """Turn every row into a one-entry ``{key_text: value_text}`` dict."""
    return [
        {_JoinedText(row.xpath(key_xpath)): _JoinedText(row.xpath(value_xpath))}
        for row in rows
    ]


def BusinessInfoScrapper(parser, url=None):
    """Extract the business details from a parsed Yelp business page.

    Parameters
    ----------
    parser : object with an ``xpath`` method
        The parsed page (for example the tree returned by
        ``lxml.html.fromstring``).
    url : str, optional
        Address the page was fetched from; stored unchanged under ``'url'``.
        When omitted, a module-level variable named ``url`` is used if one
        exists (this is what the function used to read implicitly),
        otherwise an empty string.

    Returns
    -------
    dict
        Keys: ``working_hours``, ``info``, ``ratings_histogram`` (lists of
        one-entry dicts), ``name``, ``phone``, ``ratings`` (float, ``0`` when
        absent), ``address``, ``health_rating``, ``price_range``,
        ``claimed_status``, ``reviews`` (int, ``''`` when absent),
        ``category`` (comma-joined), ``website``, ``latitude``, ``longitude``
        (floats, ``''`` when absent), ``neighborhood``, ``url`` and
        ``permanently_closed`` (``1`` when the page shows an alert message,
        else ``0``).

    Raises
    ------
    IndexError, ValueError
        If a rating-histogram row or the review count is not in the expected
        numeric form.
    """
    if url is None:
        # Backward compatibility: callers that do not pass the address keep
        # getting whatever global ``url`` the notebook currently holds.
        url = globals().get('url', '')

    raw_name = parser.xpath("//h1[contains(@class,'page-title')]//text()")
    raw_claimed = parser.xpath("//span[contains(@class,'claim-status_icon--claimed')]/parent::div/text()")
    raw_reviews = parser.xpath("//div[contains(@class,'biz-main-info')]//span[contains(@class,'review-count rating-qualifier')]//text()")
    raw_category = parser.xpath('//div[contains(@class,"biz-page-header")]//span[@class="category-str-list"]//a/text()')
    hours_table = parser.xpath("//table[contains(@class,'hours-table')]//tr")
    details_table = parser.xpath("//div[@class='short-def-list']//dl")
    raw_map_link = parser.xpath("//a[@class='biz-map-directions']/img/@src")
    raw_phone = parser.xpath(".//span[@class='biz-phone']//text()")
    raw_address = parser.xpath('//div[@class="mapbox-text"]//div[contains(@class,"map-box-address")]//text()')
    raw_wbsite_link = parser.xpath("//span[contains(@class,'biz-website')]/a/@href")
    raw_price_range = parser.xpath("//dd[contains(@class,'price-description')]//text()")
    raw_health_rating = parser.xpath("//dd[contains(@class,'health-score-description')]//text()")
    rating_histogram = parser.xpath("//table[contains(@class,'histogram')]//tr[contains(@class,'histogram_row')]")
    raw_ratings = parser.xpath("//div[contains(@class,'biz-page-header')]//div[contains(@class,'rating')]/@title")
    raw_neighborhood = parser.xpath("//div[@class='map-box-address u-space-l4']/span[@class='neighborhood-str-list']//text()")
    # Query the page that was passed in, not a leftover global named `parsed`.
    report = parser.xpath('//p[contains(@class,"alert-message text-centered")]/b/text()')

    working_hours = _KeyValueRows(hours_table, ".//th//text()", "./td//text()")
    info = _KeyValueRows(details_table, './/dt//text()', './/dd//text()')

    ratings_histogram = []
    for row in rating_histogram:
        rating_key = _JoinedText(row.xpath(".//th//text()"))
        rating_value = _JoinedText(row.xpath(".//td[@class='histogram_count']//text()"))
        # The first character of the row label is taken as the star level.
        ratings_histogram.append({int(rating_key[0]): int(rating_value)})

    name = _JoinedText(raw_name)
    phone = _JoinedText(raw_phone)
    address = ' '.join(' '.join(raw_address).split())  # collapse all whitespace runs
    health_rating = _JoinedText(raw_health_rating)
    price_range = _JoinedText(raw_price_range)
    claimed_status = _JoinedText(raw_claimed)
    category = ','.join(raw_category)
    cleaned_ratings = _JoinedText(raw_ratings)

    # The website link is a redirect; the real target sits in its `url` parameter.
    website = ''
    if raw_wbsite_link:
        decoded_raw_website_link = urllib.parse.unquote(raw_wbsite_link[0])
        website_match = re.search(r"biz_redir\?url=(.*)&website_link", decoded_raw_website_link)
        if website_match:
            website = website_match.group(1)

    # Coordinates are read from the first "<lat>,<lon>" pair in the map image URL.
    latitude = ''
    longitude = ''
    if raw_map_link:
        decoded_map_url = urllib.parse.unquote(raw_map_link[0])
        coordinates_match = re.search(r"([+-]?\d+\.\d+),([+-]?\d+\.\d+)", decoded_map_url)
        if coordinates_match:
            latitude = float(coordinates_match.group(1))
            longitude = float(coordinates_match.group(2))

    # Accept "4.5", "4,5" and whole numbers such as "5".
    ratings = 0
    if raw_ratings:
        ratings_match = re.search(r"\d+(?:[.,]\d+)?", cleaned_ratings)
        if ratings_match:
            ratings = float(ratings_match.group().replace(',', '.'))

    neighborhood = _JoinedText(raw_neighborhood) if raw_neighborhood else ''

    if raw_reviews:
        reviews = int(_JoinedText(raw_reviews).replace(' reviews', '').replace(' review', ''))
    else:
        reviews = ''

    permanently_closed = 0 if report == [] else 1

    data = {
        'working_hours': working_hours,
        'info': info,
        'ratings_histogram': ratings_histogram,
        'name': name,
        'phone': phone,
        'ratings': ratings,
        'address': address,
        'health_rating': health_rating,
        'price_range': price_range,
        'claimed_status': claimed_status,
        'reviews': reviews,
        'category': category,
        'website': website,
        'latitude': latitude,
        'longitude': longitude,
        'neighborhood': neighborhood,
        'url': url,
        'permanently_closed': permanently_closed,
    }
    return data

In [21]:
def _ReviewsOnPage(parser):
    """Collect the reviews shown on one parsed page into a DataFrame (date, star, text)."""
    review_dict = {'date': [], 'star': [], 'text': []}

    review_dates = parser.xpath("//div[@class='review-content']//span[@class='rating-qualifier']")
    for d in review_dates:
        # Only the first line of the element's text is kept as the date.
        review_dict['date'].append(''.join(d.xpath(".//text()")).strip().split('\n')[0])

    review_stars = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/div[@class='biz-rating biz-rating-large clearfix']")
    for s in review_stars:
        review_dict['star'].append(float(''.join(s.xpath(".//@title")).strip().replace(' star rating','')))

    review_texts = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/p")
    for t in review_texts:
        review_dict['text'].append(' '.join(t.xpath(".//text()")))

    page_reviews = pd.DataFrame(review_dict)
    page_reviews['date'] = pd.to_datetime(page_reviews['date'])
    return page_reviews


def GetAllReivews(parser):
    '''Given the parsed first webpage of a restaurant on yelp, return all reviews of that restaurant.

    Starting from ``parser``, the reviews of each page are collected and the
    "Next" pagination link is followed (via ``GetParser``, pausing 1-2 seconds
    after each fetch) until a page has no "Next" entry or no next-page anchor.

    Returns a DataFrame with columns ``date`` (datetime), ``star`` (float) and
    ``text`` (str), in page order, with a fresh 0..n-1 index.
    '''
    pages = []
    while True:
        pages.append(_ReviewsOnPage(parser))

        review_pages_section = parser.xpath("//div[@class='arrange arrange--stack arrange--baseline arrange--6']//text()")
        review_pages = [e.replace('\n','').replace(' ','') for e in review_pages_section]
        if 'Next' not in review_pages:
            break

        next_links = parser.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href')
        if not next_links:
            # "Next" label without a link: nothing further to follow.
            break

        # urljoin leaves absolute links untouched and resolves relative ones.
        parser = GetParser(urljoin('https://www.yelp.com', next_links[0]))
        sleep(random.choice([1,2]))

    # Concatenate once at the end (DataFrame.append no longer exists in
    # pandas 2.x); empty pages are left out so they cannot affect dtypes.
    filled_pages = [page for page in pages if not page.empty]
    if not filled_pages:
        return pages[0]
    return pd.concat(filled_pages, ignore_index=True)
    

In [30]:
# Show the current working directory; the relative paths used by the scraping
# cells (e.g. './' and 'updated_list.txt') are resolved against it.
import os
os.getcwd()

'/Users/cysung/GA/projects/project_capstone/part_02/Gayot_copy'

In [28]:
def GetAllRestaurants(path='./',listfile='./gayot_list_nochain.txt', start=0, end=559, GetReviews=True):
    """Scrape the Yelp page of every restaurant named in ``listfile``.

    Each non-blank line is turned into a Yelp slug (lower-cased, apostrophes
    and commas removed, '&' -> 'and', spaces -> '-') with '-chicago' appended.
    For every page answered with HTTP 200 the business info is written to
    ``<path><index>_<slug>.csv`` and, when ``GetReviews`` is true, the reviews
    (newest first) to ``<path><index>_<slug>_review.csv``. A status line is
    printed for every request.

    Parameters
    ----------
    path : str
        Prefix prepended as-is to the output file names.
    listfile : str
        Text file with one restaurant name per line.
    start, end : int
        Slice of the list to process; ``index`` in the file names is the
        position within the whole list.
    GetReviews : bool
        Also download and save all reviews.
    """
    yelp_url = 'https://www.yelp.com/biz/'
    # Close the list file as soon as it has been read.
    with open(listfile) as list_file:
        restaurants = [line.strip() for line in list_file]

    for position, r in enumerate(restaurants[start:end], start=start):
        if not r:
            # A blank line has no name to look up; its position is still
            # counted so the numbering of the other entries is unchanged.
            continue
        keyword = r.lower().replace('\'','').replace('&', 'and').replace(',','').replace(' ','-')
        url = yelp_url + keyword + '-chicago'
        response = requests.get(url)
        sleep(random.choice([1,2]))
        if response.status_code == 200:
            parsed = html.fromstring(response.text)
            data = BusinessInfoScrapper(parsed)
            # BusinessInfoScrapper is only handed the parsed page, so record
            # the address that was actually requested for this restaurant.
            data['url'] = url
            restaurant = pd.DataFrame({k:[v] for k, v in data.items()})
            restaurant.to_csv(path+str(position)+'_'+keyword+'.csv', index=False)
            if GetReviews:
                reviews = GetAllReivews(parsed).sort_values(by='date', ascending=False).reset_index(drop=True)
                reviews.to_csv(path+str(position)+'_'+keyword+'_review.csv', index=False)
            print('got '+str(position)+': '+url)
        else:
            print('didn\'t get '+str(position)+': '+url)

In [29]:
# Scrape business info only (GetReviews=False) for list positions 0-558;
# one CSV per successfully fetched restaurant is written under './'.
GetAllRestaurants(path='./',listfile='./gayot_list_nochain.txt', start=0, end=559, GetReviews=False)

got 0: https://www.yelp.com/biz/2-sparrows-chicago
got 1: https://www.yelp.com/biz/33-club-chicago
got 2: https://www.yelp.com/biz/42-grams-chicago
got 3: https://www.yelp.com/biz/a-la-turka-turkish-kitchen-chicago
got 4: https://www.yelp.com/biz/a-mano-chicago
didn't get 5: https://www.yelp.com/biz/a-milano-italian-grill-chicago
got 6: https://www.yelp.com/biz/abbey-pub-chicago
didn't get 7: https://www.yelp.com/biz/abril-chicago
didn't get 8: https://www.yelp.com/biz/addis-abeba-chicago
got 9: https://www.yelp.com/biz/adesso-chicago
got 10: https://www.yelp.com/biz/adobo-grill-chicago
got 11: https://www.yelp.com/biz/aigre-doux-chicago
got 12: https://www.yelp.com/biz/aja-chicago
got 13: https://www.yelp.com/biz/al-chulas-american-grill-chicago
didn't get 14: https://www.yelp.com/biz/al-dente-cafe-and-lounge-chicago
got 15: https://www.yelp.com/biz/al-primo-canto-chicago
didn't get 16: https://www.yelp.com/biz/alberts-cafe-and-patisserie-chicago
got 17: https://www.yelp.com/biz/aldin

UnboundLocalError: local variable 'bypass' referenced before assignment

In [9]:
# Resume the full scrape (business info + reviews) from list position `offset`.
# Everything the loop needs is defined here so the cell also runs on a fresh
# kernel instead of relying on variables left over from earlier sessions.
yelp_url = 'https://www.yelp.com/biz/'
# NOTE(review): assumed to be the same list GetAllRestaurants reads by default - confirm.
with open('./gayot_list_nochain.txt') as list_file:
    restaurants = [line.strip() for line in list_file]
# Names and URLs of the restaurants whose page did not answer with HTTP 200.
not_found = {'restaurants': [], 'url': []}

offset = 381
for i, r in enumerate(restaurants[offset:]):
    keyword = r.lower().replace('\'','').replace('&', 'and').replace(',','').replace(' ','-')
    url = yelp_url + keyword + '-chicago'
    response = requests.get(url)
    sleep(1)
    if response.status_code == 200:
        parsed = html.fromstring(response.text)
        data = BusinessInfoScrapper(parsed)
        restaurant = pd.DataFrame({k:[v] for k, v in data.items()})
        restaurant.to_csv(str(i+offset)+'_'+keyword+'.csv', index=False)
        reviews = GetAllReivews(parsed).sort_values(by='date', ascending=False).reset_index(drop=True)
        reviews.to_csv(str(i+offset)+'_'+keyword+'_review.csv', index=False)
    else:
        not_found['restaurants'].append(r)
        not_found['url'].append(url)

In [6]:
# Load the corrected entries (one per line), closing the file once it is read.
with open('updated_list.txt') as updated_file:
    updated_restaurants = [line.strip() for line in updated_file]

In [11]:
# Re-scrape the corrected entries, skipping the first 20 of the list. Each
# entry is "<number> <name words...>": the number prefixes the output files
# and the remaining words form the Yelp slug.
yelp_url = 'https://www.yelp.com/biz/'  # defined here so the cell runs on a fresh kernel
for r in updated_restaurants[20:]:
    fields = r.split()
    if not fields:
        # Blank line: there is no number or name to work with.
        continue
    number = fields[0]
    keyword = '-'.join(fields[1:]).lower().replace('\'','').replace('&', 'and').replace(',','')
    url = yelp_url + keyword + '-chicago'
    response = requests.get(url)
    sleep(1)
    if response.status_code == 200:
        parsed = html.fromstring(response.text)
        data = BusinessInfoScrapper(parsed)
        restaurant = pd.DataFrame({k:[v] for k, v in data.items()})
        restaurant.to_csv(str(number)+'_'+keyword+'.csv', index=False)
        reviews = GetAllReivews(parsed).sort_values(by='date', ascending=False).reset_index(drop=True)
        reviews.to_csv(str(number)+'_'+keyword+'_review.csv', index=False)
        print('Got ', number, ':', keyword)
    else:
        print('Didn\'t get ', number, ':', keyword)

Got  140 : coast-sushi-bar
Got  167 : de-la-costa
Got  171 : dillmans
Got  178 : the-drawing-room
Got  180 : d-vine-restaurant
Got  188 : elis-the-place-for-steak
Got  192 : entre-nous-restaurant
Got  201 : fattoush-restaurant
Got  206 : flo
Got  223 : gennaros-restaurant
Got  224 : gandhi-india-restaurant
Got  231 : grace-o-malleys
Got  240 : the-grotto-on-state
Got  241 : hacienda-tecalitlan-restaurant
Got  247 : hillarys-urban-eatery
Got  249 : hong-min-restaurant
Got  263 : ing-restaurant
Got  272 : japonais-by-morimoto
Got  276 : joey-buonas-pizzareria
Got  284 : kabocha-japanese-brasserie
Got  289 : kaze-sushi
Got  296 : kyoto-sushi-steak-seafood
Got  302 : la-donna-italian-cuisine
Got  305 : la-strada-ristorante
Got  308 : la-vita-restaurant
Got  310 : landmark-grill
Got  322 : lindo-mexico-restauant
Got  332 : magnums-prime-steakhouse
Got  346 : mi-peru-restaurant-and-club
Got  352 : mk-restaurant
Got  358 : moon-palace-restaurant
Got  360 : moti-mahal-indian-restaurant
Got  36

In [15]:
def GetRestaurant(number=None, keyword=None, postfix=None):
    """Scrape a single restaurant whose Yelp slug is known.

    The page ``https://www.yelp.com/biz/<keyword>-chicago<postfix>`` is
    requested; on HTTP 200 the business info is saved to
    ``<number>_<keyword>.csv`` and the reviews (newest first) to
    ``<number>_<keyword>_review.csv``. A status line is printed either way.

    Parameters
    ----------
    number : int or str
        Prefix used in the output file names.
    keyword : str
        Slug of the restaurant without the '-chicago' part.
    postfix : str, optional
        Extra suffix of the slug such as '-2'; ``None`` means no suffix.

    Raises
    ------
    TypeError
        If ``keyword`` is not given.
    """
    if keyword is None:
        raise TypeError('keyword is required')
    yelp_url = 'https://www.yelp.com/biz/'
    # postfix defaults to None, which cannot be concatenated to a string.
    url = yelp_url + keyword + '-chicago' + (postfix or '')
    response = requests.get(url)
    sleep(1)
    if response.status_code == 200:
        parsed = html.fromstring(response.text)
        data = BusinessInfoScrapper(parsed)
        # BusinessInfoScrapper is only handed the parsed page, so record the
        # address that was actually requested.
        data['url'] = url
        restaurant = pd.DataFrame({k:[v] for k, v in data.items()})
        restaurant.to_csv(str(number)+'_'+keyword+'.csv', index=False)
        reviews = GetAllReivews(parsed).sort_values(by='date', ascending=False).reset_index(drop=True)
        reviews.to_csv(str(number)+'_'+keyword+'_review.csv', index=False)
        print('Got ', number, ':', keyword)
    else:
        print('Didn\'t get ', number, ':', keyword)

In [13]:
# Fetch entry 371 individually; no extra slug suffix is supplied (postfix=None).
GetRestaurant(number=371, keyword='rylons-smokehouse', postfix=None)

Got  371 : rylons-smokehouse


In [16]:
# Fetch entry 375 individually; '-2' is appended after '-chicago' in the slug.
GetRestaurant(number=375, keyword='nuevo-leon-restaurant', postfix='-2')

Got  375 : nuevo-leon-restaurant
