In [21]:
import math
import sys
import fileinput
from lxml import html  
import unicodecsv as csv
import requests
#from exceptions import ValueError
from time import sleep
import re
import argparse
import pandas as pd
import numpy as np
from urllib.parse import quote, unquote
import re, urllib

In [22]:
def GetParser(url):
    response = requests.get(url).text
    parser = html.fromstring(response)
    return parser

In [23]:
def BusinessInfoScrapper(parser):
    raw_name = parser.xpath("//h1[contains(@class,'page-title')]//text()")
    raw_claimed = parser.xpath("//span[contains(@class,'claim-status_icon--claimed')]/parent::div/text()")
    raw_reviews = parser.xpath("//div[contains(@class,'biz-main-info')]//span[contains(@class,'review-count rating-qualifier')]//text()")
    raw_category  = parser.xpath('//div[contains(@class,"biz-page-header")]//span[@class="category-str-list"]//a/text()')
    hours_table = parser.xpath("//table[contains(@class,'hours-table')]//tr")
    details_table = parser.xpath("//div[@class='short-def-list']//dl")
    raw_map_link = parser.xpath("//a[@class='biz-map-directions']/img/@src")
    raw_phone = parser.xpath(".//span[@class='biz-phone']//text()")
    raw_address = parser.xpath('//div[@class="mapbox-text"]//div[contains(@class,"map-box-address")]//text()')
    raw_wbsite_link = parser.xpath("//span[contains(@class,'biz-website')]/a/@href")
    raw_price_range = parser.xpath("//dd[contains(@class,'price-description')]//text()")
    raw_health_rating = parser.xpath("//dd[contains(@class,'health-score-description')]//text()")
    rating_histogram = parser.xpath("//table[contains(@class,'histogram')]//tr[contains(@class,'histogram_row')]")
    raw_ratings = parser.xpath("//div[contains(@class,'biz-page-header')]//div[contains(@class,'rating')]/@title")
    raw_neighborhood = parser.xpath("//div[@class='map-box-address u-space-l4']/span[@class='neighborhood-str-list']//text()")
    report = parsed.xpath('//p[contains(@class,"alert-message text-centered")]/b/text()')
    working_hours = []
    
    for hours in hours_table:
        raw_day = hours.xpath(".//th//text()")
        raw_timing = hours.xpath("./td//text()")
        day = ''.join(raw_day).strip()
        timing = ''.join(raw_timing).strip()
        working_hours.append({day:timing})
    info = []
    for details in details_table:
        raw_description_key = details.xpath('.//dt//text()')
        raw_description_value = details.xpath('.//dd//text()')
        description_key = ''.join(raw_description_key).strip()
        description_value = ''.join(raw_description_value).strip()
        info.append({description_key:description_value})

    ratings_histogram = [] 
    for ratings in rating_histogram:
        raw_rating_key = ratings.xpath(".//th//text()")
        raw_rating_value = ratings.xpath(".//td[@class='histogram_count']//text()")
        rating_key = ''.join(raw_rating_key).strip()
        rating_value = ''.join(raw_rating_value).strip()
        ratings_histogram.append({int(rating_key[0]):int(rating_value)})

    name = ''.join(raw_name).strip()
    phone = ''.join(raw_phone).strip()
    address = ' '.join(' '.join(raw_address).split())
    health_rating = ''.join(raw_health_rating).strip()
    price_range = ''.join(raw_price_range).strip()
    claimed_status = ''.join(raw_claimed).strip()
    category = ','.join(raw_category)
    cleaned_ratings = ''.join(raw_ratings).strip()

    if raw_wbsite_link:
        #pass
        decoded_raw_website_link = urllib.parse.unquote(raw_wbsite_link[0])
        website = re.findall("biz_redir\?url=(.*)&website_link",decoded_raw_website_link)[0]
    else:
        website = ''

    if raw_map_link:
        decoded_map_url =  urllib.parse.unquote(raw_map_link[0])
        map_coordinates = re.findall("([+-]?\d+.\d+,[+-]?\d+\.\d+)",decoded_map_url)[0].split(',')
        latitude = float(map_coordinates[0])
        longitude = float(map_coordinates[1])
    else:
        latitude = ''
        longitude = ''

    if raw_ratings:
        ratings = float(re.findall("\d+[.,]?\d+",cleaned_ratings)[0])
    else:
        ratings = 0

    if raw_neighborhood:
        neighborhood = ''.join(raw_neighborhood).strip()
    else:
        neighborhood = ''

    if raw_reviews:
        reviews = int(''.join(raw_reviews).strip().replace(' reviews','').replace(' review',''))
    else:
        reviews = ''

    if report == []:
        permanently_closed = 0
    else:
        permanently_closed = 1

    data={'working_hours':working_hours,
        'info':info,
        'ratings_histogram':ratings_histogram,
        'name':name,
        'phone':phone,
        'ratings':ratings,
        'address':address,
        'health_rating':health_rating,
        'price_range':price_range,
        'claimed_status':claimed_status,
        'reviews':reviews,
        'category':category,
        'website':website,
        'latitude':latitude,
        'longitude':longitude,
        'neighborhood': neighborhood,  
        'url':url,
        'permanently_closed': permanently_closed 
         }
    return data

In [24]:
def GetAllReivews(parser):
    '''Given the parsed first webpage of a restaurant on yelp, return all reviews of that restaurants'''
    review_dict = {'date': [], 'star': [], 'text': []}
    
    review_dates = parser.xpath("//div[@class='review-content']//span[@class='rating-qualifier']")
    for d in review_dates:
        date = ''.join(d.xpath(".//text()")).strip().split('\n')[0]
        review_dict['date'].append(date)    

    review_stars = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/div[@class='biz-rating biz-rating-large clearfix']")
    for s in review_stars:
        star = float(''.join(s.xpath(".//@title")).strip().replace(' star rating',''))
        review_dict['star'].append(star)
        
    review_texts = parser.xpath("//div[@class='review review--with-sidebar']/div[@class='review-wrapper']/div[@class='review-content']/p")
    for t in review_texts:
        text = ' '.join(t.xpath(".//text()"))
        review_dict['text'].append(text)
    
    review = pd.DataFrame(review_dict)
    review['date'] =  pd.to_datetime(review['date'])
    
    review_pages_section = parser.xpath("//div[@class='arrange arrange--stack arrange--baseline arrange--6']//text()")     
    review_pages = [item for item in [e.replace('\n','').replace(' ','') for e in review_pages_section] if item != '' ]

    if 'Next' not in review_pages:
        return review
    else:
        nextpage = parser.xpath('//a[@class="u-decoration-none next pagination-links_anchor"]/@href')[0]
        nextparser = GetParser(nextpage)
        sleep(1)
        return review.append(GetAllReivews(nextparser), ignore_index=True)    

In [25]:
restaurants = [line.strip() for line in open('TribuneList.txt')]
yelp_url = 'https://www.yelp.com/biz/'
not_found = {'restaurants':[], 'url':[]}

In [26]:
#restaurants_dict = {}
offset = 5
end = 6 #len(restaurants)
for i, r in enumerate(restaurants[offset:end]):
    keyword = r.lower().replace('\'','').replace('&', 'and').replace(',','').replace(' ','-')
    url = yelp_url + keyword + '-chicago'
    #dict = {}
    #restaurants_dict['url'].append(url)
    response = requests.get(url)
    sleep(1)
    if response.status_code == 200:    
        parsed = html.fromstring(response.text)
        data = BusinessInfoScrapper(parsed)
        restaurant = pd.DataFrame({k:[v] for k, v in data.items()})
        restaurant.to_csv(str(i+offset)+'_'+keyword+'.csv', index=False)
        reviews = GetAllReivews(parsed).sort_values(by='date', ascending=False).reset_index(drop=True)
        reviews.to_csv(str(i+offset)+'_'+keyword+'_review.csv', index=False)        
    else:
        not_found['restaurants'].append(r)
        not_found['url'].append(url)

ConnectionError: HTTPSConnectionPool(host='www.yelp.com', port=443): Max retries exceeded with url: /biz/apart-pizza-company-chicago (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x10ef8bb70>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))

In [32]:
restaurants = [line.strip() for line in open('TribuneList.txt')]
yelp_url = 'https://www.yelp.com/biz/'
restaurants_dict = {'resturants':restaurants, 'keyword':[]} #, 'parsed':[]}

for r in restaurants:
    keyword = r.lower().replace('\'','').replace('&', 'and').replace(',','').replace(' ','-')
    url = yelp_url + keyword + '-chicago'
    restaurants_dict['keyword'].append(keyword)
Restaurants = pd.DataFrame(restaurants_dict)
Restaurants.reset_index(inplace=True)

In [33]:
Restaurants

Unnamed: 0,index,keyword,resturants
0,0,25-degrees,25 Degrees
1,1,a-bakers-tale,A Baker's Tale
2,2,act-one-pub,Act One Pub
3,3,ale-syndicate,Ale Syndicate
4,4,analogue,Analogue
5,5,apart-pizza-company,Apart Pizza Company
6,6,arcade-brewery,Arcade Brewery
7,7,authentaco,Authentaco
8,8,b-bim-asian-eatery,B Bim Asian Eatery
9,9,baffo,Baffo


In [35]:
from os import listdir
from os.path import isfile, join
files = [f for f in listdir('./') if isfile(f)]

def FindFile(x, feature='', filelist=None):
    listcopy = filelist.copy()
    filename = str(x['index'])+'_'+x['keyword']+feature+'.csv'
    if filename in listcopy:
        listcopy.remove(filename)
        return filename
    else:
        return 'MISSING'
    
Restaurants['info_file'] = Restaurants[['index','keyword']].apply(lambda x: FindFile(x, feature='', filelist=files), axis=1)
Restaurants['review_file'] = Restaurants[['index','keyword']].apply(lambda x: FindFile(x, feature='_review', filelist=files), axis=1)

In [36]:
Restaurants

Unnamed: 0,index,keyword,resturants,info_file,review_file
0,0,25-degrees,25 Degrees,0_25-degrees.csv,0_25-degrees_review.csv
1,1,a-bakers-tale,A Baker's Tale,1_a-bakers-tale.csv,1_a-bakers-tale_review.csv
2,2,act-one-pub,Act One Pub,2_act-one-pub.csv,2_act-one-pub_review.csv
3,3,ale-syndicate,Ale Syndicate,3_ale-syndicate.csv,3_ale-syndicate_review.csv
4,4,analogue,Analogue,4_analogue.csv,4_analogue_review.csv
5,5,apart-pizza-company,Apart Pizza Company,5_apart-pizza-company.csv,5_apart-pizza-company_review.csv
6,6,arcade-brewery,Arcade Brewery,6_arcade-brewery.csv,6_arcade-brewery_review.csv
7,7,authentaco,Authentaco,7_authentaco.csv,7_authentaco_review.csv
8,8,b-bim-asian-eatery,B Bim Asian Eatery,8_b-bim-asian-eatery.csv,8_b-bim-asian-eatery_review.csv
9,9,baffo,Baffo,9_baffo.csv,9_baffo_review.csv


In [37]:
Problematic = Restaurants[(Restaurants['info_file'] == 'MISSING') | (Restaurants['review_file'] == 'MISSING')]                                                                          

In [38]:
Problematic

Unnamed: 0,index,keyword,resturants,info_file,review_file
30,30,costellos-sandwiches,Costello's Sandwiches,MISSING,MISSING
44,44,finch-kitchen,Finch Kitchen,MISSING,MISSING
50,50,georges-lounge,George's Lounge,MISSING,MISSING
53,53,hard-water-bar,Hard Water Bar,MISSING,MISSING
56,56,jerrys-wicker-park,Jerry's Wicker Park,MISSING,MISSING
61,61,letizias-fiore,Letizia's Fiore,MISSING,MISSING
68,68,mash-craft-kitchen-and-patio,Mash Craft Kitchen and Patio,MISSING,MISSING
81,81,paladino’s-pizza-house-no.-1647,Paladino’s Pizza House No. 1647,MISSING,MISSING
96,96,rezas,Reza's,MISSING,MISSING


30 Costellos Sandwich & Sides
44 The Finch Kitchen
53 Hard Water Bar and Grill
56 Jerrys
61 Letizias Fiore Ristorante and Wine Shoppe
68 Mash
81 Pizza House 1647
96 Rezas Restaurant

In [39]:
updated_restaurants = [line.strip() for line in open('updated_list.txt')]

In [40]:
def GetRestaurant(number=None, keyword=None, postfix=None):
    yelp_url = 'https://www.yelp.com/biz/'
    url = yelp_url + keyword + '-chicago' + postfix
    response = requests.get(url)
    sleep(1)
    if response.status_code == 200:    
        parsed = html.fromstring(response.text)
        data = BusinessInfoScrapper(parsed)
        restaurant = pd.DataFrame({k:[v] for k, v in data.items()})
        restaurant.to_csv(str(number)+'_'+keyword+'.csv', index=False)
        reviews = GetAllReivews(parsed).sort_values(by='date', ascending=False).reset_index(drop=True)
        reviews.to_csv(str(number)+'_'+keyword+'_review.csv', index=False)        
        print('Got ', number, ':', keyword)
    else:
        print('Didn\'t get ', number, ':', keyword)    

In [42]:
for r in updated_restaurants:
    number = r.split()[0]
    keyword = '-'.join(r.split()[1:]).lower().replace('\'','').replace('&', 'and').replace(',','')
    GetRestaurant(number=number, keyword=keyword, postfix='')

Got  30 : costellos-sandwich-and-sides
Got  44 : the-finch-kitchen
Got  53 : hard-water-bar-and-grill
Got  56 : jerrys
Got  61 : letizias-fiore-ristorante-and-wine-shoppe
Got  68 : mash
Got  81 : pizza-house-1647
Got  96 : rezas-restaurant
