In [1]:
import pandas as pd
from collections import Counter

import glob # For opening and closing files

# Webpage Stuff
import bs4
from urllib.request import urlopen
import requests
import time
import random
from fake_useragent import UserAgent

import re
import usaddress
import ast # For parsing HTML

# Json stuff
import json
from pandas.io.json import json_normalize
import dask.bag as db

import warnings # Turn off warnings
warnings.filterwarnings('ignore')

In [2]:
def collect_business_results(page):
    """Extract one business record from a parsed Yelp business page.

    Parameters
    ----------
    page : bs4.BeautifulSoup
        Parsed HTML of a business page. Only ``find``, ``find_all`` and
        ``text`` are used on it.

    Returns
    -------
    dict
        Keys: ``business_id``, ``link_id``, ``name``, ``address``, ``city``,
        ``state``, ``postal_code``, ``telephone``, ``latitude``,
        ``longitude``, ``stars``, ``review_count``, ``is_open``,
        ``attributes``, ``categories``, ``hours``.

        * ``review_count`` is 0 and ``stars`` is ``None`` when the page has
          no rating information.
        * ``telephone`` is ``None`` when the page does not list one.
        * ``is_open`` is 1 (open), 0 (closed) or ``None`` when the
          ``biz_closed`` flag cannot be found in the page text.
        * ``attributes`` / ``hours`` are ``None`` when the corresponding
          section is absent; ``categories`` is ``''`` when absent.

    Raises
    ------
    IndexError
        If the page has no ``application/ld+json`` script tag.
    AttributeError
        If the ``lightbox-map hidden`` div is missing.
    KeyError
        If the embedded data lacks ``name``, ``address`` or the map markers.
    ValueError
        If the embedded business info or map state is not valid JSON.
    """
    # Collect business info. The last ld+json script is the one used here.
    business_info_script = page.find_all("script", {"type": "application/ld+json"})[-1]
    # Parse as JSON directly: rewriting true/false/null and feeding the result
    # to ast.literal_eval corrupts any string value containing those substrings.
    business_info_dict = json.loads(business_info_script.get_text().strip())

    # Collect map data (stored as JSON in a data attribute of the map div)
    map_div = page.find("div", {"class": "lightbox-map hidden"})
    map_data_dict = json.loads(map_div.get("data-map-state"))
    # NOTE(review): index 1 is hard-coded; presumably the business's own
    # marker -- verify against a live page.
    business_marker = map_data_dict['markers'][1]

    # Collect rating info; the block may be missing entirely
    rating_info = business_info_dict.get('aggregateRating') or {}
    review_count = rating_info.get('reviewCount', 0)

    # Get whether the business is open or not. The captured group is the
    # *string* "True"/"False", so it must be compared, not truth-tested.
    biz_closed_match = re.search(r'"biz_closed": \[[0-9], "(True|False)"\]', page.text)
    if biz_closed_match is None:
        is_open = None  # flag not present: status unknown
    elif biz_closed_match.group(1) == "True":
        is_open = 0
    else:
        is_open = 1

    # Get attributes (key/value definition list)
    attribute_section = page.find("div", {"class": "short-def-list"})
    if attribute_section is not None:
        attribute_keys = [dt.get_text().strip()
                          for dt in attribute_section.find_all("dt", {"class": "attribute-key"})]
        attribute_values = [dd.get_text().strip()
                            for dd in attribute_section.find_all("dd")]
        attribute_dict = dict(zip(attribute_keys, attribute_values))
    else:
        attribute_dict = None

    # Get categories as a single comma-separated string
    category_span = page.find("span", {"class": "category-str-list"})
    category_links = category_span.find_all("a") if category_span is not None else []
    categories = ", ".join(link.get_text() for link in category_links).strip()

    # Get hours
    hours_table = page.find("table", {"class": "table table-simple hours-table"})
    if hours_table is not None:
        day_cells = hours_table.find_all("th")
        # Every second <td> is used as the hours text for a day.
        # Assumes two <td> cells per day row -- TODO confirm.
        hour_cells = hours_table.find_all("td")[::2]
        hours_dict = {day.get_text(): hours.get_text().strip()
                      for day, hours in zip(day_cells, hour_cells)}
    else:
        hours_dict = None

    address_info = business_info_dict['address']

    data_dict = {'business_id': map_div.get("data-business-id"),
                 'link_id': business_marker['url'].replace('/biz/', ''),
                 'name': business_info_dict['name'],
                 'address': address_info['streetAddress'].replace('\n', ' '),
                 'city': address_info['addressLocality'],
                 'state': address_info['addressRegion'],
                 'postal_code': address_info['postalCode'],
                 'telephone': business_info_dict.get('telephone'),
                 'latitude': business_marker['location']['latitude'],
                 'longitude': business_marker['location']['longitude'],
                 'stars': rating_info.get('ratingValue'),
                 'review_count': review_count,
                 'is_open': is_open,
                 'attributes': attribute_dict,
                 'categories': categories,
                 'hours': hours_dict}

    return data_dict

In [3]:
# Read the list of business-page URLs, one per line.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(r'data/url-list.txt') as f:
    url_list = f.read().splitlines()

# Index -> URL lookup covering every URL in the list, then show the first entry
ref_dict = dict(enumerate(url_list))
ref_dict[0]

'https://www.yelp.com/biz/aesthetic-climbing-gym-lake-forest'

In [4]:
# Accumulator for the scraped results; the scraping loop appends one dict per business page
data_json = list()

In [35]:
# Download each business page in url_list, extract its record with
# collect_business_results and append it to data_json.
# Monitor the loop in case it breaks.

# Index to resume from after an interrupted run; set to 0 when starting
# with an empty data_json on a fresh kernel.
start_index = 27
request_timeout = 30  # seconds; without a timeout a stalled connection hangs the loop forever

ua = UserAgent()  # built once; a fresh random user-agent string is drawn per request below

for url_index in range(start_index, len(url_list)):

    URL = url_list[url_index]

    user_agent = {'User-agent': ua.random}
    response = requests.get(URL, headers=user_agent, timeout=request_timeout)
    response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
    # Name the parser explicitly so results do not depend on which parsers are installed
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    num_reviews = soup.find('span', {"class": "review-count rating-qualifier"}).get_text().strip() # Get total number of reviews
    business_id = soup.find("div", {"class": "lightbox-map hidden"}).get("data-business-id")

    print(*[URL, business_id, 'Index: '+str(url_index), num_reviews], sep="\n") # Quality Control

    data_dict = collect_business_results(soup) # Function defined above
    data_json.append(data_dict) # Collect data

    # Random 10-20 second pause between requests
    time.sleep(10+10*random.random())

data_json

https://www.yelp.com/biz/mesa-rim-climbing-and-fitness-center-san-diego-3
ShWan26H3hBpmvfE9_nTPw
Index: 27
96 reviews
https://www.yelp.com/biz/mesa-rim-climbing-and-fitness-reno-2
QiNYgFrT6ySPok_6ePFRqw
Index: 28
26 reviews
https://www.yelp.com/biz/metalmark-climbing-and-fitness-fresno
7n-LZwcoUizVGH5lDgMpVA
Index: 29
41 reviews
https://www.yelp.com/biz/mission-cliffs-climbing-and-fitness-san-francisco
yR2AbNAOuzqpJfmGwV0O3g
Index: 30
475 reviews
https://www.yelp.com/biz/outback-climbing-center-san-diego
PrDF2ycpOHJq1pot0RY1zA
Index: 31
2 reviews
https://www.yelp.com/biz/pacific-edge-laguna-beach
pPJLCarPr3kmvnV0OvmZYQ
Index: 32
591 reviews
https://www.yelp.com/biz/pacific-edge-rock-climbing-gym-santa-cruz
dLSkedZvkE1xldE5cwvquA
Index: 33
64 reviews
https://www.yelp.com/biz/planet-granite-belmont-2
OJ1Pd42-T1oFIm2sqvy_hA
Index: 34
225 reviews
https://www.yelp.com/biz/planet-granite-portland-portland
6hOQqQa5yK9son4ZKUKoFg
Index: 35
66 reviews
https://www.yelp.com/biz/planet-granite-san

[{'business_id': '3J-Q42JoMUPqRHfUckA2bw',
  'link_id': 'aesthetic-climbing-gym-lake-forest',
  'name': 'Aesthetic Climbing Gym',
  'address': '26794 Vista Ter',
  'city': 'Lake Forest',
  'state': 'CA',
  'postal_code': '92630',
  'telephone': '+19497167116',
  'latitude': 33.6649859,
  'longitude': -117.663927,
  'stars': 4.5,
  'review_count': 66,
  'is_open': 1,
  'attributes': {'Accepts Credit Cards': 'Yes',
   'Parking': 'Private Lot',
   'Bike Parking': 'Yes',
   'Good for Kids': 'Yes'},
  'categories': 'Climbing, Venues & Event Spaces, Gyms',
  'hours': {'Mon': '12:00 pm - 11:00 pm',
   'Tue': '12:00 pm - 11:00 pm',
   'Wed': '12:00 pm - 11:00 pm',
   'Thu': '12:00 pm - 11:00 pm',
   'Fri': '12:00 pm - 11:00 pm',
   'Sat': '10:00 am - 10:00 pm',
   'Sun': '10:00 am - 6:00 pm'}},
 {'business_id': 'e8qcVa63akVtgO-FFieb3Q',
  'link_id': 'berkeley-ironworks-climbing-and-fitness-club-berkeley',
  'name': 'Berkeley Ironworks Climbing & Fitness Club',
  'address': '800 Potter St',
  '

In [52]:
# Number of business records collected so far
len(data_json)

71

In [54]:
# Save the collected business records as JSON Lines (one JSON object per line).
# Iterates data_json, the list filled by the scraping loop; the previous name
# (climbing_review_json) is not defined anywhere in this notebook.
filename = "climbing-gym-business"
with open('data/{}.json'.format(filename), 'w') as f:
    for record in data_json:
        json.dump(record, f)
        f.write('\n')
print("Complete")

Complete


In [8]:
# Reload the saved file later: each line of text is decoded with json.loads
business_full_bag = (
    db.read_text("data/climbing-gym-business.json")  # one bag element per line of the file
    .map(json.loads)
)
# Pull up to 10000 entries out of the bag as a tuple
business_tuple = business_full_bag.take(10000)
len(business_tuple)

71

In [9]:
# Display the records loaded back from disk
business_tuple

({'business_id': '3J-Q42JoMUPqRHfUckA2bw',
  'link_id': 'aesthetic-climbing-gym-lake-forest',
  'name': 'Aesthetic Climbing Gym',
  'address': '26794 Vista Ter',
  'city': 'Lake Forest',
  'state': 'CA',
  'postal_code': '92630',
  'telephone': '+19497167116',
  'latitude': 33.6649859,
  'longitude': -117.663927,
  'stars': 4.5,
  'review_count': 66,
  'is_open': 1,
  'attributes': {'Accepts Credit Cards': 'Yes',
   'Parking': 'Private Lot',
   'Bike Parking': 'Yes',
   'Good for Kids': 'Yes'},
  'categories': 'Climbing, Venues & Event Spaces, Gyms',
  'hours': {'Mon': '12:00 pm - 11:00 pm',
   'Tue': '12:00 pm - 11:00 pm',
   'Wed': '12:00 pm - 11:00 pm',
   'Thu': '12:00 pm - 11:00 pm',
   'Fri': '12:00 pm - 11:00 pm',
   'Sat': '10:00 am - 10:00 pm',
   'Sun': '10:00 am - 6:00 pm'}},
 {'business_id': 'e8qcVa63akVtgO-FFieb3Q',
  'link_id': 'berkeley-ironworks-climbing-and-fitness-club-berkeley',
  'name': 'Berkeley Ironworks Climbing & Fitness Club',
  'address': '800 Potter St',
  '