In [2]:
import pandas as pd
from collections import Counter

import glob # For opening and closing files

# Webpage Stuff
import bs4
from urllib.request import urlopen
import requests
import re
import time
import random
from fake_useragent import UserAgent

# Json stuff
import json
from pandas.io.json import json_normalize
import dask.bag as db

import warnings # Turn off warnings
warnings.filterwarnings('ignore')

In [3]:
def get_vote(review):
    
    vote_type = review.findAll("span", {"class": "vote-type"})
    vote_type_list = list(map(lambda x: x.get_text().strip(), vote_type))
    vote_count = review.findAll("span", {"class": "count"})
    vote_count_text = list(map(lambda x: x.get_text(), vote_count))
    vote_count_list = [0 if x == '' else int(x) for x in vote_count_text]
    vote_dict = dict(zip(vote_type_list, vote_count_list))
    
    return vote_dict



def collect_results(review_content):
    """
    For loop that loops through each page and returns a list of tuples
    Uses the vote_dict() function
    """
    data_list = []
    
    for i in range(1, len(review_content)):
        
        vote_dict = get_vote(review_content[i]) # Get useful, funny, and cool attributes
        date = review_content[i].find('span',{"class":"rating-qualifier"}).get_text().replace("\n", "").strip() # Get date
        date_match = re.search('^\d{1,2}\/\d{1,2}\/\d{4}', date) # Use regex to extract text

        data_dict = {'review_id':review_content[i].get("data-review-id"),
                     'user_id': review_content[i].get("data-signup-object").replace("user_id:", ""),
                     'user_name': review_content[i].find('li', {'class': 'user-name'}).get_text().strip(),
                     'business_id': business_id,
                     'stars': int(float(review_content[i].find('div',{"class":"biz-rating biz-rating-large clearfix"}).div.div.get("title").replace(" star rating", ""))),
                     'useful': vote_dict['Useful'],
                     'funny': vote_dict['Funny'],
                     'cool': vote_dict['Cool'],
                     'text': review_content[i].findAll('div',{"class":"review-content"})[0].find('p').get_text(),
                     'date': date_match.group(0)}
        
        data_list.append(data_dict)
    
    return data_list

In [4]:
# Show which files are completed
filenames_list = glob.glob("Data/Climbing Gym Reviews/*.json") # Grab a list of filenames
finished_files_list = list(map(lambda x: x.replace("Data/Climbing Gym Reviews/", "").replace(".json",""), sorted(filenames_list)))
finished_files_list # Show list

['aesthetic_climbing_gym_lake_forest',
 'berkeley_ironworks_climbing_and_fitness_club_berkeley',
 'blue_granite_climbing_gym_south_lake_tahoe',
 'boulderdash_indoor_rock_climbing_thousand_oaks',
 'boulderdash_sfv_chatsworth_6',
 'boulderdash_ventura_ventura',
 'bridges_rock_gym_el_cerrito',
 'cliffs_of_id_culver_city',
 'crux_climbing_center_san_luis_obispo',
 'desert_rocks_indoor_climbing_gym_north_palm_springs',
 'diablo_rock_gym_concord_3',
 'dogpatch_boulders_san_francisco',
 'far_north_climbing_arcata',
 'gold_crush_grass_valley',
 'granite_arch_climbing_center_rancho_cordova',
 'great_western_power_company_oakland',
 'grotto_climbing_and_yoga_san_diego',
 'hangar_18_arcadia_arcadia',
 'hangar_18_east_riverside_riverside',
 'hangar_18_hawthorne_hawthorne',
 'hangar_18_long_beach_signal_hill_2',
 'hangar_18_mission_viejo_mission_viejo',
 'hangar_18_riverside_riverside_2',
 'hangar_18_upland_upland',
 'hollywood_boulders_los_angeles',
 'la_boulders_los_angeles',
 'mesa_rim_climbing_

In [6]:
with open(r'/home/harrisonized/Github/Metis_DS_Projects_Backup/Project 4/Data/url_list.txt') as f: # Grab API Key
    url_list = f.read().splitlines()
    f.close()

In [8]:
# Show the first element of the list
ref_list = dict(zip(range(72), url_list))
ref_list[0]

'https://www.yelp.com/biz/aesthetic-climbing-gym-lake-forest'

In [6]:
# Run this code block by itself
# Monitor this so in case you get blocked by Captchas
# If you send more than 900 requests, you'll get the following error

"""
ConnectionError: HTTPSConnectionPool(host='www.yelp.com', port=443):
Max retries exceeded with url: /biz/planet-granite-san-francisco-2?start=260
(Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fb9a31254e0>:
Failed to establish a new connection: [Errno -2] Name or service not known',)
"""

# To be safe, do 10 at a time

for url_index in range(5, 6):

    URL = url_list[url_index]
    
    ua = UserAgent()
    user_agent = {'User-agent': ua.random}
    response  = requests.get(URL, headers = user_agent)
    soup = bs4.BeautifulSoup(response.text)    
    
    # Get number of pages
    page_text = soup.find('div',{"class":"page-of-pages arrange_unit arrange_unit--fill"}).get_text().strip() # Get 'Page <current> of <total>'
    num_pages = int(re.search('(?<=of) [0-9]*', page_text).group(0).strip()) # Get total number of pages
    num_reviews = soup.find('span',{"class":"review-count rating-qualifier"}).get_text().strip() # Get total number of reviews
    business_id = soup.find("div", {"class": "lightbox-map hidden"}).get("data-business-id")

    # Generate list of webpages
    start = 20
    end = num_pages*20
    page_list = [URL]
    while (start < end):
        page_list.append(URL + '?start=' + str(start))
        start += 20
    data_json = [] # Collect results
    print(*[URL, business_id, 'Index: '+str(url_index), str(num_pages) + ' pages', num_reviews], sep="\n") # Quality Control
    
    # Grab webpages:
    for page in page_list[0:len(page_list)]:

        time.sleep(10+10*random.random()) # So Yelp doesn't block IP

        # Generate random user to make it seem like the task is distributed across multiple computers
        ua = UserAgent()
        user_agent = {'User-agent': ua.random}
        response  = requests.get(page, headers = user_agent)
        soup = bs4.BeautifulSoup(response.text)

        review_content = soup.findAll('div',{"class":"review--with-sidebar"}) # Get reviews

        # Collect results and add it to the data_json file
        data_list = collect_results(review_content)
        for i in range(len(data_list)):
            data_json.append(data_list[i])

        print("Collected page: ", page_list.index(page)+1)

    print(data_json) # Print final results
    
    print(len(data_json))
    
    # Save final data as actual JSON
    filename = url_list[url_index].replace("https://www.yelp.com/biz/", "").replace("-", "_")
    with open('Climbing Gym Data/{}.json'.format(filename), 'w') as f:
        for j in range(len(data_json)):
            json.dump(data_json[j], f)
            f.write('\n')
    print("Complete")

https://www.yelp.com/biz/boulderdash-ventura-ventura
lmVDhvvk7bwoRf4YwUslVw
Index: 5
1 pages
3 reviews
Collected page:  1
[{'review_id': '4VKBNDSZ9gTwdGUq40b9Lg', 'user_id': 'KkiGd_-z0X4xl957qwB2TA', 'user_name': 'Courtney K.', 'business_id': 'lmVDhvvk7bwoRf4YwUslVw', 'stars': 5, 'useful': 2, 'funny': 0, 'cool': 0, 'text': "We had an AMAZING experience today. My four year old daughter wanted to rock climb for the first time, and we went to Boulderdash not knowing what to do or expect but wanting to help her to try something she was interested in. The extremely professional staff accommodated us on the fly. My daughter became afraid once it was time to climb. But Cat took time to quickly and gently build a rapport with my daughter, and ever-patiently worked as her belay to help her make it to the top. My daughter left feeling accomplished and more confident thanks to Cat's positivity and encouragement. The facility (including the bathroom) is extremely clean. Plenty of space, great ligh

In [23]:
# Below is code for running manually
# The purpose of this cell is to prevent the notebook from running below this code block
break

SyntaxError: 'break' outside loop (<ipython-input-23-0e516dbbe0b3>, line 5)

In [14]:
# Partially manual collection

url_index = 3 # Initialize URL, change the url index manually to prevent getting blocked by Captchas
URL = url_list[url_index]
soup = bs4.BeautifulSoup(urlopen(URL))

# Get number of pages
page_text = soup.find('div',{"class":"page-of-pages arrange_unit arrange_unit--fill"}).get_text().strip() # Get 'Page <current> of <total>'
num_pages = int(re.search('(?<=of) [0-9]*', page_text).group(0).strip()) # Get total number of pages
num_reviews = soup.find('span',{"class":"review-count rating-qualifier"}).get_text().strip() # Get total number of reviews
business_id = soup.find("div", {"class": "lightbox-map hidden"}).get("data-business-id")

# Generate list of webpages
start = 20
end = num_pages*20
page_list = [URL]
while (start < end):
    page_list.append(URL + '?start=' + str(start))
    start += 20
    
data_json = [] # Collect results

print(*[URL, business_id, num_pages, num_reviews], sep="\n") # Quality Control

https://www.yelp.com/biz/boulderdash-indoor-rock-climbing-thousand-oaks
PDbuxryXvx3usOLVNMOF0Q
5
99 reviews


In [None]:
# Grab webpages:

for page in page_list[0:len(page_list)]:
    
    time.sleep(10+10*random.random()) # So Yelp doesn't block IP
    
    # Generate random user to make it seem like the task is distributed across multiple computers
    ua = UserAgent()
    user_agent = {'User-agent': ua.random}
    response  = requests.get(page, headers = user_agent)
    soup = bs4.BeautifulSoup(response.text)
    
    review_content = soup.findAll('div',{"class":"review--with-sidebar"}) # Get reviews
    
    # For each review, collect results
    for i in range(1, len(review_content)):
     
        vote_dict = get_vote(review_content[i]) # Get useful, funny, and cool attributes
        date = review_content[1].find('span',{"class":"rating-qualifier"}).get_text().replace("\n", "").strip() # Get date
        date_match = re.search('^\d{1,2}\/\d{1,2}\/\d{4}', date) # Use regex to extract text
        
        data_dict = {'review_id':review_content[i].get("data-review-id"),
                     'user_id': review_content[i].get("data-signup-object").replace("user_id:", ""),
                     'user_name': review_content[i].find('li', {'class': 'user-name'}).get_text().strip(),
                     'business_id': business_id,
                     'stars': int(float(review_content[i].find('div',{"class":"biz-rating biz-rating-large clearfix"}).div.div.get("title").replace(" star rating", ""))),
                     'useful': vote_dict['Useful'],
                     'funny': vote_dict['Funny'],
                     'cool': vote_dict['Cool'],
                     'text': review_content[i].findAll('div',{"class":"review-content"})[0].find('p').get_text(),
                     'date': date_match.group(0)}
        
        data_json.append(data_dict) # Collect results
        
    print("collected: ", page_list.index(page)+1)
    
data_json # Print results

In [11]:
len(data_json)

132

In [12]:
# Change index manually
filename = url_list[url_index].replace("https://www.yelp.com/biz/", "").replace("-", "_")
with open('Climbing Gym Data/{}.json'.format(filename), 'w') as f:
    for j in range(len(data_json)):
        json.dump(data_json[j], f)
        f.write('\n')
print("Complete")

Complete


In [15]:
# Inspect
review_full_bag = db.read_text("Data/Climbing Gym Reviews/diablo_rock_gym_concord_3.json").map(json.loads) # Loads the json file as a dask bag
review_tuple = review_full_bag.take(10000) # Takes the first 10000 entries of the dask bag and stores as a tuple
len(review_tuple)

143

In [16]:
review_tuple

({'review_id': 'ejMAXZqxwFM85mX1QT5ZYw',
  'user_id': 'QtW4RvmEts6jIvl6eXX8FA',
  'user_name': 'Jorge T.',
  'business_id': 'ImFFWa6UEu_fhfODG1yBiQ',
  'stars': 5,
  'useful': 0,
  'funny': 0,
  'cool': 0,
  'text': 'My wife took me to go rock climbing for the 1st time and we had a blast. \xa0We took the beginner class and they taught us everything we need to get ready to be let lose on the walls. Ended up spend the whole day their until our arms were to tired to continue climbing. Definitely going back soon.',
  'date': '3/3/2019'},
 {'review_id': 'tA9muCjhOV2J_1lR5y8pvQ',
  'user_id': 'Ld0HLICQ0GisBQ-P7YO0lw',
  'user_name': 'Elizabeth R.',
  'business_id': 'ImFFWa6UEu_fhfODG1yBiQ',
  'stars': 2,
  'useful': 0,
  'funny': 0,
  'cool': 0,
  'text': "I love the staff here and the rock climbing is great. Prob is the gym is antiquated and poorly stocked. They dont have the most basic equipment most gyms have..Except in the crossfit area, which costs an additional $20 a month on top of th