In [1]:
import re
import pandas as pd
import bs4
import requests
import time
import random
import warnings
# Turn off warnings: installs a blanket "ignore" filter, so every warning
# category raised after this line is suppressed for the rest of the session.
# NOTE(review): this also hides library diagnostics (e.g. pandas or bs4
# warnings) that can point at real problems — consider narrowing the filter
# to specific categories once the notebook runs cleanly.
warnings.filterwarnings('ignore')

In [2]:
# Header label shown on a route's stats page -> DataFrame column that stores its count.
_STAT_COLUMNS = {
    "Star Ratings": "star_ratings",
    "Suggested Ratings": "suggested_ratings",
    "On To-Do Lists": "on_to_do_lists",
    "Ticks": "ticks",
}


def _first_count(header_text):
    """Return the first whitespace-separated integer token in header_text, or None.

    Accepts plain digits ("890") as well as comma-grouped thousands ("1,234").
    """
    for token in header_text.split():
        if re.fullmatch(r"\d+|\d{1,3}(?:,\d{3})+", token):
            return int(token.replace(",", ""))
    return None


def collect_data(df):
    """
    Uses the link provided in the CSV file to open and grab data from the following categories: StarRatings, SuggestedRatings, OnToDoLists, and Ticks

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "url" column holding Mountain Project route links of the
        form "https://www.mountainproject.com/route/<id>/<name>".

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is modified in place), with the
        integer columns "star_ratings", "suggested_ratings", "on_to_do_lists"
        and "ticks" added. A count stays 0 when the stats page has no
        "route-stats" section or the matching header carries no number.

    Raises
    ------
    requests.HTTPError
        If a stats page answers with an HTTP error status.
    """
    # Add empty columns to df
    for column in _STAT_COLUMNS.values():
        df[column] = 0
    column_pos = {column: df.columns.get_loc(column) for column in _STAT_COLUMNS.values()}

    # Grab data from stats page and write it into df.
    # Delay is 1.5 +/- 1.0 s, so code should run for about 10-15 min for 400 rows
    for row_pos, route_url in enumerate(df["url"]):
        # ".../route/<id>/<name>" -> ".../route/stats/<id>/<name>"
        stats_url = route_url.replace("/route/", "/route/stats/", 1)

        response = requests.get(stats_url, timeout=30)
        # Fail loudly on HTTP errors (e.g. rate limiting) instead of recording zeros.
        response.raise_for_status()

        # Name the parser explicitly so results do not depend on which optional
        # parsers happen to be installed.
        page = bs4.BeautifulSoup(response.text, "html.parser")
        stats_section = page.find("div", {"id": "route-stats"})

        if stats_section is not None:
            # Each <h3> inside the stats section holds a label followed by its count.
            for header in stats_section.find_all("h3"):
                header_text = header.text
                for label, column in _STAT_COLUMNS.items():
                    if label not in header_text:
                        continue
                    count = _first_count(header_text)
                    if count is not None:
                        # Positional scalar write: no chained indexing, and it
                        # works whatever index the frame carries.
                        df.iat[row_pos, column_pos[column]] = count

        time.sleep(.5 + 2 * random.random())  # Random 0.5-2.5 s pause (mean 1.5 s) between requests

    return df

In [4]:
def _scrape_area(clean_csv, scraped_csv):
    """Load one area's cleaned CSV, scrape its route stats, and save the result.

    Parameters
    ----------
    clean_csv : str
        Path of the input CSV; it is passed to collect_data, which reads its "url" column.
    scraped_csv : str
        Path the scraped table is written to (without the index column).

    Returns
    -------
    tuple
        (clean_df, scrape_df). collect_data adds its columns to the frame it
        receives and returns that frame, so both names refer to one DataFrame.
    """
    clean_df = pd.read_csv(clean_csv)
    scrape_df = collect_data(clean_df)
    scrape_df.to_csv(scraped_csv, index=False)
    return clean_df, scrape_df


# One scrape per climbing area; the module-level names are kept so that any
# other cell can keep referring to the individual frames.
buttermilks_clean_df, buttermilks_scrape_df = _scrape_area(
    "data/clean-data/buttermilks-clean.csv",
    "data/scraped-data/buttermilks-scrape.csv",
)

druid_stones_clean_df, druid_stones_scrape_df = _scrape_area(
    "data/clean-data/druid_stones-clean.csv",
    "data/scraped-data/druid_stones-scrape.csv",
)

happy_boulders_clean_df, happy_boulders_scrape_df = _scrape_area(
    "data/clean-data/happy_boulders-clean.csv",
    "data/scraped-data/happy_boulders-scrape.csv",
)

sad_boulders_clean_df, sad_boulders_scrape_df = _scrape_area(
    "data/clean-data/sad_boulders-clean.csv",
    "data/scraped-data/sad_boulders-scrape.csv",
)

# NOTE(review): this input is read from data/downloads/ while every other area
# reads from data/clean-data/ — confirm the path is intentional.
joshua_tree_clean_df, joshua_tree_scrape_df = _scrape_area(
    "data/downloads/joshua_tree-clean.csv",
    "data/scraped-data/joshua_tree-scrape.csv",
)