In [1]:
import json
import random
import re
import time
from glob import glob
from pathlib import Path

import bs4
import dask.bag as db
import requests
from fake_useragent import UserAgent
from tqdm.notebook import tqdm

from functions.scrape import get_review_data

In [2]:
# Switch for the cells that write files: they only write when this is True
save = True

# Set Params

In [3]:
# Read the business URLs to scrape, one per line.
# Blank lines and surrounding whitespace are dropped: an empty or padded
# entry is not a valid URL and would make the request for it fail.
with open('data/urls.txt') as f:
    urls = [line.strip() for line in f.read().splitlines() if line.strip()]

# Get Review Data

In [4]:
# Warning! Monitor the loop for Captchas: a page that lacks the elements looked
# up below stops the cell with a RuntimeError that names the URL.

# If you send more than 900 requests, you'll get the following error:
# ConnectionError: HTTPSConnectionPool(host='www.yelp.com', port=443):
# Max retries exceeded with url: /biz/planet-granite-san-francisco-2?start=260
# (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fb9a31254e0>:
# Failed to establish a new connection: [Errno -2] Name or service not known',)

REVIEWS_PER_PAGE = 20  # Step of the `?start=` offset between consecutive review pages
REQUEST_TIMEOUT = 30   # Seconds; without a timeout a stalled connection blocks the loop forever

ua = UserAgent()  # Built once; `ua.random` still returns a randomly chosen agent on every access


def fetch_soup(page_url):
    """Download `page_url` with a random User-Agent and return the parsed HTML.

    Raises `requests.HTTPError` on a 4xx/5xx answer so that an error page is
    never parsed as if it contained reviews.
    """
    # Random user agent to make it seem like the task is distributed across multiple computers
    response = requests.get(page_url, headers={'User-agent': ua.random}, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Name the parser explicitly; otherwise bs4 picks whichever parser happens
    # to be installed, so the parse could differ between machines.
    return bs4.BeautifulSoup(response.text, 'html.parser')


def find_required(soup, tag, css_class, page_url):
    """Return the first `tag` element whose class attribute is `css_class`.

    Raises `RuntimeError` naming `page_url` when the element is absent,
    instead of failing later with an AttributeError on None.
    """
    element = soup.find(tag, {"class": css_class})
    if element is None:
        raise RuntimeError(
            f"<{tag} class={css_class!r}> not found on {page_url} (Captcha or changed layout?)"
        )
    return element


for url in tqdm(urls):

    # Base page: provides the page count, the business id and the first page of reviews
    base_soup = fetch_soup(url)

    page_text = find_required(
        base_soup, 'div', "page-of-pages arrange_unit arrange_unit--fill", url
    ).get_text().strip()  # 'Page <current> of <total>'
    pages_match = re.search(r'of\s+(\d+)', page_text)
    if pages_match is None:
        raise RuntimeError(f"Cannot read the number of pages from {page_text!r} ({url})")
    num_pages = int(pages_match.group(1))  # Total number of pages
    # Total number of reviews as displayed; not used below, kept for inspection
    num_reviews = find_required(base_soup, 'span', "review-count rating-qualifier", url).get_text().strip()
    business_id = find_required(base_soup, 'div', "lightbox-map hidden", url).get("data-business-id")

    # One address per review page: the base URL, then ?start=20, ?start=40, ...
    pages = [url] + [
        f'{url}?start={offset}'
        for offset in range(REVIEWS_PER_PAGE, num_pages * REVIEWS_PER_PAGE, REVIEWS_PER_PAGE)
    ]

    # NOTE(review): reset for every URL, so after the loop `data_json` only
    # holds the reviews of the URL processed last.
    data_json = []  # Collect results

    for page in tqdm(pages):

        if page == url:
            # The first page was already downloaded above; reusing it saves one
            # request per business.
            soup = base_soup
        else:
            time.sleep(10+10*random.random())  # So Yelp doesn't block IP
            soup = fetch_soup(page)

        review = soup.find_all('div', {"class": "review--with-sidebar"})
        data_list = get_review_data(review, business_id)
        data_json.extend(data_list)  # Collect data

In [5]:
# Save JSON (one review per line)
if save:
    # Name the file after `url`, the URL the scrape loop processed last: that is
    # the business whose reviews `data_json` holds. (`urls[0]` is a different
    # business whenever urls.txt lists more than one URL.)
    # The slug stops at '/', '?' or '#' so query strings never end up in the filename.
    slug_match = re.match(r'https://www\.yelp\.com/biz/(?P<filename>[^/?#]+)', url)
    if slug_match is None:
        raise ValueError(f"Not a Yelp business URL: {url!r}")
    filename = slug_match['filename'].replace('-', '_')
    with open('data/reviews/{}.json'.format(filename), 'w') as f:
        for review in data_json:
            json.dump(review, f)
            f.write('\n')
    print('Complete')

Complete


# Check

In [6]:
# Inspect
review_full_bag = db.read_text("data/reviews/diablo_rock_gym_concord_3.json").map(json.loads) # Loads the json file as a dask bag
# compute() materialises every entry of the bag. take(10000) would cap the
# result at 10000 entries and, by default, only read the first partition.
review_tuple = tuple(review_full_bag.compute())
len(review_tuple)

143

# Concat

In [7]:
# Show the files in the folder
filenames_list = glob("data/reviews/*.json") # Grab a list of filenames
# Path(...).stem drops the directory part and the trailing ".json" only.
# Chained str.replace calls would also remove those substrings from the middle
# of a name and assume "/" as the path separator.
files = [Path(path).stem for path in sorted(filenames_list)]
files[0:5]

['aesthetic_climbing_gym_lake_forest',
 'berkeley_ironworks_climbing_and_fitness_club_berkeley',
 'blue_granite_climbing_gym_south_lake_tahoe',
 'boulderdash_indoor_rock_climbing_thousand_oaks',
 'boulderdash_sfv_chatsworth_6']

In [8]:
# Concat: gather the reviews of every file into one list
climbing_reviews = []
for file in tqdm(files):
    review_full_bag = db.read_text("data/reviews/{}.json".format(file)).map(json.loads) # Loads the json file as a dask bag
    # compute() loads every entry; take(10000) would silently drop anything
    # beyond the first 10000 entries (or beyond the first partition).
    review_tuple = tuple(review_full_bag.compute())
    climbing_reviews.extend(review_tuple)

print(len(climbing_reviews))

HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))


8317


In [9]:
# Save the combined reviews, one JSON document per line
if save:
    filename = "reviews-copy"
    with open(f'data/{filename}.json', 'w') as f:
        for review in climbing_reviews:
            # Serialise first, then write the record and its newline in one call
            f.write(json.dumps(review) + '\n')
    print("Complete")

Complete
