In [1]:
import re
import pandas as pd
import bs4
import requests
import time
import random
import warnings # Turn off warnings
warnings.filterwarnings('ignore')

In [2]:
def _first_count(text):
    """Return the first whitespace-separated integer in ``text``, or None.

    Thousands separators are accepted, so "1,234" is read as 1234.
    """
    for token in text.split():
        digits = token.replace(",", "")
        if digits.isdigit():
            return int(digits)
    return None


def collect_data(df, delay_range=(0.5, 2.5), timeout=30):
    """
    Scrape Mountain Project route statistics for every row of ``df``.

    For each row, the link in the ``URL`` column is rewritten to the matching
    ``/route/stats`` page, the page is downloaded, and the counts found in the
    ``<h3>`` headings of the page's ``route-stats`` section are written into
    four columns: StarRatings, SuggestedRatings, OnToDoLists and Ticks.
    ``df`` is modified in place.

    Rows whose URL is not a string, or whose stats page has no ``route-stats``
    section, keep a count of 0 in all four columns and a message is printed,
    so one bad row does not abort a long scrape.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``URL`` column. The first 37 characters of each URL are
        dropped (the length of "https://www.mountainproject.com/route"), so
        the links are assumed to start with that prefix -- TODO confirm
        against the downloaded CSV files.
    delay_range : tuple of (float, float), optional
        Minimum and maximum pause, in seconds, after each request. The default
        (0.5, 2.5) averages 1.5 s, i.e. about 10 minutes of waiting for 400
        rows, plus download time.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` so that a stalled
        connection cannot hang the whole run.

    Returns
    -------
    None
    """
    # Heading text on the stats page -> column that receives its count.
    stat_columns = {
        "Star Ratings": "StarRatings",
        "Suggested Ratings": "SuggestedRatings",
        "On To-Do Lists": "OnToDoLists",
        "Ticks": "Ticks",
    }

    # Add the count columns, defaulting to 0.
    for column in stat_columns.values():
        df[column] = 0
    column_positions = {
        column: df.columns.get_loc(column) for column in stat_columns.values()
    }

    low, high = delay_range

    # Work by row position so the function does not depend on the index labels.
    for row, url in enumerate(df["URL"].tolist()):
        if not isinstance(url, str):
            print("Row {}: URL is missing; leaving counts at 0".format(row))
            continue

        stats_url = "https://www.mountainproject.com/route/stats" + url[37:]
        response = requests.get(stats_url, timeout=timeout)
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        page = bs4.BeautifulSoup(response.text, "html.parser")
        stats_section = page.find("div", {"id": "route-stats"})

        if stats_section is None:
            print("No route-stats section at {}; leaving counts at 0".format(stats_url))
        else:
            for heading in stats_section.find_all("h3"):
                heading_text = heading.text
                count = _first_count(heading_text)
                if count is None:
                    continue
                for label, column in stat_columns.items():
                    if label in heading_text:
                        # Single-step positional write; chained indexing
                        # (df[col][i] = ...) may silently modify a copy.
                        df.iat[row, column_positions[column]] = count

        # Pause between requests to avoid hammering the site.
        time.sleep(random.uniform(low, high))

In [4]:
# Bishop, combine the following four files after cleaning

def _scrape_to_csv(download_path, scraped_path):
    """Load one downloaded area CSV, scrape its route stats, save it, and return the frame."""
    area_df = pd.read_csv(download_path)
    collect_data(area_df)  # fills the stats columns of area_df in place
    area_df.to_csv(scraped_path)
    return area_df


buttermilks_df = _scrape_to_csv(
    r"Data/Downloads/buttermilks.csv",
    r"Data/Scraped Data/buttermilks_df.csv",
)

druid_stones_df = _scrape_to_csv(
    r"Data/Downloads/druid_stones.csv",
    r"Data/Scraped Data/druid_stones_df.csv",
)

happy_boulders_df = _scrape_to_csv(
    r"Data/Downloads/happy_boulders.csv",
    r"Data/Scraped Data/happy_boulders_df.csv",
)

sad_boulders_df = _scrape_to_csv(
    r"Data/Downloads/sad_boulders.csv",
    r"Data/Scraped Data/sad_boulders_df.csv",
)

In [5]:
# Joshua Tree: load the downloaded route list, scrape its stats, save the result.

joshua_tree_download_path = r"Data/Downloads/joshua_tree.csv"
joshua_tree_scraped_path = r"Data/Scraped Data/joshua_tree_df.csv"

joshua_tree_df = pd.read_csv(filepath_or_buffer=joshua_tree_download_path)
collect_data(joshua_tree_df)  # adds the scraped count columns to joshua_tree_df in place
joshua_tree_df.to_csv(path_or_buf=joshua_tree_scraped_path)