In [1]:
from bs4 import BeautifulSoup
import requests
import time
import random

In [2]:
# Download a single Mountain Project route page ("quo-vadis"); the parsing
# helpers in the cells below are developed against this page.
url = 'https://www.mountainproject.com/route/106683500/quo-vadis'

# Plain GET: no custom headers and no timeout are passed here.
response = requests.get(url)

In [3]:
# Check that the URL was found: a status code of 200 means success.
response.status_code

200

In [4]:
# Keep the response body (the page's HTML) as a string.
page = response.text

In [5]:
# Parse the HTML with BeautifulSoup using the lxml parser. The resulting
# `soup` object is what every helper function below takes as its argument.
soup = BeautifulSoup(page, "lxml")

In [6]:
import re

## Scrape features

In [7]:
#Find route's name
def route_name(soup):
    """Return the text of the page's first <h1> with whitespace normalised.

    Runs of whitespace (including newlines) inside the heading collapse to a
    single space, and leading/trailing whitespace is dropped.
    """
    heading_text = soup.find('h1').text
    return ' '.join(heading_text.split())

# Try the helper on the sample page parsed into `soup` above.
route_name(soup)

'Quo Vadis'

In [8]:
#Find route's rating
def rating_value(soup):
    """Return the route's average star rating as a float.

    Reads the text of the first element whose id matches 'starsWithAvgText',
    keeps what precedes the first 'from', takes the piece after the first
    ':' in that, and converts its first word with float().
    """
    stars_id = re.compile('starsWithAvgText')
    summary = soup.find(id=stars_id).text
    before_from = summary.split('from')[0]
    after_colon = before_from.split(':')[1]
    return float(after_colon.split()[0])

# Average star rating parsed from the sample page.
rating_value(soup)

4.0

In [9]:
#Find number of votes
def vote_value(soup):
    """Return the number of votes behind the star rating as an int.

    Reads the text of the first element whose id matches 'starsWithAvgText',
    takes the first word after the first 'from', removes any commas
    (thousands separators) and converts the result with int().
    """
    stars_id = re.compile('starsWithAvgText')
    summary = soup.find(id=stars_id).text
    after_from = summary.split('from')[1]
    first_word = after_from.split()[0]
    return int(first_word.replace(',', ''))

# Number of votes parsed from the sample page.
vote_value(soup)

1

In [10]:
#Find route's grade. It's a string to take V grades into account.
def grade_value(soup):
    """Return the route's grade as a string, or None when it is unavailable.

    The grade is the first whitespace-separated word of the first element
    with class 'rateYDS'. It is kept as a string so that boulder "V" grades
    are representable alongside grades such as '5.9+'.

    Returns None when the page has no such element, or when the element
    contains no text at all.
    """
    # Look the element up once and reuse it instead of searching twice.
    grade_tag = soup.find(class_='rateYDS')
    if grade_tag is None:
        return None
    words = grade_tag.text.split()
    if not words:
        # Element present but empty: report "no grade" rather than raising.
        return None
    return words[0]

# Grade string parsed from the sample page.
grade_value(soup)

'5.9+'

In [11]:
#Find climbing type
def type_value(soup):
    """Return the list of climbing types named in the 'Type:' cell.

    The element following the 'Type:' label is rendered to a string and split
    on whitespace. Commas are stripped from every token, and tokens equal to
    one of the recognised type names are returned in page order.
    """
    known_types = {'Trad', 'Sport', 'TR', 'Boulder', 'Aid', 'Ice', 'Snow', 'Alpine'}
    raw_cell = str(soup.find(text='Type:').findNext())
    cleaned_tokens = (token.replace(',', '') for token in raw_cell.split())
    return [token for token in cleaned_tokens if token in known_types]

# Climbing types parsed from the sample page.
type_value(soup)

['Trad', 'Aid']

In [12]:
#Find route's height in feet
def height_value(soup):
    """Return the route height in feet as an int, or None if it is not listed.

    The element following the 'Type:' label is rendered to a string. A height
    is reported only when that string has more than three whitespace-separated
    tokens and one of them is exactly 'ft'. The value is the last word before
    the first occurrence of 'ft' in the string.
    """
    raw_cell = str(soup.find(text='Type:').findNext())
    tokens = raw_cell.split()
    if len(tokens) > 3 and 'ft' in tokens:
        before_ft = raw_cell.split('ft')[0]
        return int(before_ft.split()[-1])
    return None
    
# Height in feet parsed from the sample page.
height_value(soup)

3000

In [13]:
#Find number of pitches
def pitches(soup):
    """Return the number of pitches as an int, defaulting to 1.

    The element following the 'Type:' label is rendered to a string. If none
    of its whitespace-separated tokens is 'pitches' or 'pitches,', the route
    is counted as a single pitch. Otherwise the value is the last word before
    the first occurrence of 'pitches' in the string.
    """
    raw_cell = str(soup.find(text='Type:').findNext())
    tokens = raw_cell.split()
    if 'pitches,' in tokens or 'pitches' in tokens:
        before_pitches = raw_cell.split('pitches')[0]
        return int(before_pitches.split()[-1])
    return 1

# Pitch count parsed from the sample page.
pitches(soup)

22

In [14]:
#Find Safety Rating
def safe_value(soup):
    """Return the safety rating ('R', 'PG13', 'G' or 'X'), or None.

    Looks only at the last whitespace-separated word of the first element
    with class "inline-block mr-2". Any other last word yields None.
    """
    words = soup.find(class_="inline-block mr-2").text.split()
    last_word = words[-1]
    return last_word if last_word in ('R', 'PG13', 'G', 'X') else None

# No output follows this cell; the combined scrape_features output further
# down lists 'safety': None for this page.
safe_value(soup)

In [15]:
#Find Commitment Rating
def commitment_value(soup):
    """Return the commitment grade (e.g. 'VI') from the 'Type:' cell, or None.

    The element following the 'Type:' label is rendered to a string and split
    on whitespace. The result is the token immediately after the first token
    that is exactly 'Grade', with commas removed.

    Returns None when there is no 'Grade' token, or when 'Grade' is the last
    token and nothing follows it.
    """
    tokens = str(soup.find(text='Type:').findNext()).split()
    if 'Grade' not in tokens:
        return None
    # Index the token list directly. Parsing the list's printed form would
    # pick up "Grade" as a substring of an earlier token.
    following = tokens[tokens.index('Grade') + 1:]
    if not following:
        return None
    return following[0].replace(',', '')

# Commitment grade parsed from the sample page.
commitment_value(soup)

'VI'

In [16]:
#Find State
def state_value(soup):
    """Return the second entry of the '>'-separated location breadcrumb.

    The breadcrumb is the text of the first element with class
    "mb-half small text-warm". The entry's internal whitespace is collapsed
    to single spaces.
    """
    breadcrumb = soup.find(class_="mb-half small text-warm").text
    pieces = breadcrumb.split('>')
    return ' '.join(pieces[1].split())

# State parsed from the sample page's breadcrumb.
state_value(soup)

'California'

In [17]:
#Find smallest sub-area
def sub_area_value(soup):
    """Return the second-to-last breadcrumb entry, ignoring the first entry.

    The breadcrumb is the text of the first element with class
    "mb-half small text-warm", split on '>'. The first entry is discarded
    before counting from the end, and the chosen entry's internal whitespace
    is collapsed to single spaces.
    """
    breadcrumb = soup.find(class_="mb-half small text-warm").text
    after_first = breadcrumb.split('>')[1:]
    return ' '.join(after_first[-2].split())

# Sub-area parsed from the sample page's breadcrumb.
sub_area_value(soup)

'B. El Capitan'

In [18]:
#Find number of photos
def photo_value(soup):
    """Return the number of photos on the page.

    Counts every "img-container position-relative" element found inside the
    <a class="card-with-photo photo-card"> elements. If an element with id
    "more-photos-button" exists, the last word before 'More' in its text is
    converted with int() and added, to account for photos not displayed.
    """
    photo_cards = soup.find_all('a', class_='card-with-photo photo-card')
    displayed = sum(
        len(card.find_all(class_="img-container position-relative"))
        for card in photo_cards
    )
    more_button = soup.find(id="more-photos-button")
    if not more_button:
        return displayed
    before_more = more_button.text.split('More')[0]
    return displayed + int(before_more.split()[-1])
    
# Photo count parsed from the sample page.
photo_value(soup)

4

In [19]:
#Find number of comments
def comment_value(soup):
    """Return the number of comments as an int.

    The count is the first whitespace-separated word of the first element
    with class 'comment-count'.
    """
    counter_text = soup.find(class_='comment-count').text
    first_word = counter_text.split()[0]
    return int(first_word)

# Comment count parsed from the sample page.
comment_value(soup)

3

In [20]:
#Find list of grades of nearby routes
def nearby_grades(soup):
    """Return the grades of the other routes listed on the page.

    Collects the text of every 'rateYDS' element found inside elements whose
    class matches 'max-height max-height-md-0 max-height-xs-400'. If this
    route's own grade (from grade_value) is in that list, one occurrence of
    it is removed.

    Returns a list of grade strings in page order; duplicates are kept.
    """
    container_class = re.compile('max-height max-height-md-0 max-height-xs-400')
    # One pass over the matching containers; the result list is built directly.
    list_nearby_grades = [
        grade.text
        for container in soup.find_all(class_=container_class)
        for grade in container.find_all(class_='rateYDS')
    ]
    this_grade = grade_value(soup)
    if this_grade in list_nearby_grades:
        # list.remove drops only the first matching entry.
        list_nearby_grades.remove(this_grade)
    return list_nearby_grades

# Grades of the other routes listed on the sample page.
nearby_grades(soup)
    

['5.8',
 '5.10',
 '5.11',
 '5.13a',
 '5.13a',
 '5.9',
 '5.13b',
 'V10',
 '5.7',
 '5.14a',
 '5.9',
 '5.13',
 '5.10',
 '5.9',
 '5.9',
 '5.9',
 '5.8',
 '5.9',
 '5.9',
 '5.11c',
 '5.10+']

In [21]:
#Find aid grade
def aid_value(soup):
    """Return the aid grade token (e.g. 'A4+'), or None.

    Splits the text of the first element with class "inline-block mr-2" on
    whitespace and inspects only the last two words: the second-to-last is
    checked first, then the last. The first one starting with 'A' is
    returned. Fewer than two words, or no match, yields None.
    """
    # NOTE(review): any word starting with 'A' qualifies here, not only
    # aid grades -- confirm that no other kind of rating can start with 'A'.
    words = soup.find(class_="inline-block mr-2").text.split()
    if len(words) < 2:
        return None
    for candidate in (words[-2], words[-1]):
        if candidate[0] == 'A':
            return candidate
    return None

# Aid grade parsed from the sample page.
aid_value(soup)

'A4+'

## Single function for scraping

In [22]:
def scrape_features(link, timeout=30):
    """
    Download one Mountain Project route page and extract its features.

    Parameters
    ----------
    link : str
        URL of the route page.
    timeout : float or tuple, optional
        Passed to ``requests.get``: seconds to wait for the server before
        giving up. Defaults to 30 so that a stalled connection cannot hang
        a long crawl indefinitely.

    Returns
    -------
    list of dict
        A one-element list. The dict has the keys 'name', 'rating', 'votes',
        'grade', 'type', 'height', 'pitches', 'safety', 'commitment',
        'state', 'sub_area', 'photos', 'comments', 'near_grades' and
        'aid_grade', in that order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx/5xx).
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    # Scrape the web
    response = requests.get(link, timeout=timeout)
    # Fail here with a clear HTTP error instead of parsing an error page.
    response.raise_for_status()

    # Turn the html code into a string and parse it using BeautifulSoup
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    # Extract features; the keys are inserted in the order listed above.
    mp_dict = {
        'name': route_name(soup),
        'rating': rating_value(soup),
        'votes': vote_value(soup),
        'grade': grade_value(soup),
        'type': type_value(soup),
        'height': height_value(soup),
        'pitches': pitches(soup),
        'safety': safe_value(soup),
        'commitment': commitment_value(soup),
        'state': state_value(soup),
        'sub_area': sub_area_value(soup),
        'photos': photo_value(soup),
        'comments': comment_value(soup),
        'near_grades': nearby_grades(soup),
        'aid_grade': aid_value(soup),
    }

    # Callers expect a list wrapping the single feature dictionary.
    return [mp_dict]

# End-to-end check on the same sample route: downloads the page again and
# returns a one-element list holding the feature dictionary.
scrape_features('https://www.mountainproject.com/route/106683500/quo-vadis')



[{'name': 'Quo Vadis',
  'rating': 4.0,
  'votes': 1,
  'grade': '5.9+',
  'type': ['Trad', 'Aid'],
  'height': 3000,
  'pitches': 22,
  'safety': None,
  'commitment': 'VI',
  'state': 'California',
  'sub_area': 'B. El Capitan',
  'photos': 4,
  'comments': 3,
  'near_grades': ['5.8',
   '5.10',
   '5.11',
   '5.13a',
   '5.13a',
   '5.9',
   '5.13b',
   'V10',
   '5.7',
   '5.14a',
   '5.9',
   '5.13',
   '5.10',
   '5.9',
   '5.9',
   '5.9',
   '5.8',
   '5.9',
   '5.9',
   '5.11c',
   '5.10+'],
  'aid_grade': 'A4+'}]

## Put all data in a dictionary

In [23]:
import pandas as pd

In [None]:
# Joshua Tree route-finder results, type=boulder.
# NOTE: this rebinds `url`, `response` and `page` from the first cells, so
# re-running the earlier `soup = BeautifulSoup(page, "lxml")` cell after this
# one would parse the route-finder page instead of the sample route.
url = 'https://www.mountainproject.com/route-finder?selectedIds=105720495&type=boulder&diffMinrock=800&diffMinboulder=20000&diffMinaid=74500&diffMinice=30000&diffMinmixed=50000&diffMaxrock=12400&diffMaxboulder=21700&diffMaxaid=75260&diffMaxice=38500&diffMaxmixed=60000&is_trad_climb=1&is_sport_climb=1&is_top_rope=1&stars=0&pitches=0&sort1=area&sort2=rating'

response = requests.get(url)
page = response.text

# Parsed separately from `soup` so the sample route page stays available.
soup2 = BeautifulSoup(page,"lxml")

# NOTE(review): `soup2.head` is a tag, and in BeautifulSoup calling a tag is
# shorthand for find_all() -- confirm that listing the tags inside <head> is
# the intended preview.
soup2.head()

In [None]:
# List of links to the routes: for every <tr class="route-row">, keep the
# href of every 4th <a> in that row, starting with the first one.
# NOTE(review): the step of 4 presumably depends on how many links each
# result row contains -- confirm against the page markup.
list_links = []
for tr in soup2.find_all('tr', class_='route-row'):
    for link in tr.find_all('a')[::4]:
        list_links.append(link.get("href"))
# Displays the whole list as the cell output.
list_links

In [None]:
# Dictionary of routes and their features, keyed by route name.
# NOTE(review): because the key is the route name, a later route with the
# same name replaces the earlier entry -- confirm names are unique here.
link_dict = {}
count = 0

# NOTE(review): the slice starts at index 1, so the first link is skipped,
# and it stops before index 250 -- confirm both bounds are intended.
for link in list_links[1:250]:
    print(count, ': ', link)
    features = scrape_features(link) #features is a list with dictionary
    features = features[0] #features is a dictionary
    link_dict[features['name']] = features
    count = count + 1

# Prints the full nested dictionary as plain text.
print(link_dict)

In [None]:
# Turn into data frame: link_dict maps route name -> feature dict, so the
# frame is transposed to get one row per route, then indexed by 'name'.
california_routes_info = pd.DataFrame(link_dict).T.set_index('name')

california_routes_info

## Single function to input list of links

In [24]:
from fake_useragent import UserAgent

def scrape_links_all(url, new_filename, max_links=1000):
    """Scrape every route listed on a route-finder page and save them as CSV.

    Parameters
    ----------
    url : str
        URL of the route-finder results page listing the routes.
    new_filename : str
        Path of the CSV file to write.
    max_links : int or None, optional
        Upper bound on the number of route links followed (default 1000).
        None follows every link found.

    Returns
    -------
    None
        The result is written to ``new_filename``, one row per scraped link,
        indexed by route name.
    """
    # Put into proper Beautiful Soup format, sending a random User-Agent
    ua = UserAgent()
    user_agent = {'User-agent': ua.random}
    response = requests.get(url, headers = user_agent)
    page = response.text
    soup2 = BeautifulSoup(page,"lxml")

    # List of links to the routes: every 4th <a> of each result row
    list_links = [
        link.get("href")
        for tr in soup2.find_all('tr', class_='route-row')
        for link in tr.find_all('a')[::4]
    ]

    # One record per link. A list is used rather than a dict keyed by route
    # name, so routes that share a name are all kept.
    records = []
    for count, link in enumerate(list_links[:max_links]):
        print("count: ", count, "link: ", link)
        records.append(scrape_features(link)[0])  # unwrap the one-element list
        # Random pause of 0.5 to 2.5 seconds between page requests
        time.sleep(.5+2*random.random())

    # Turn into data frame; dtype=object keeps each value as scraped, with no
    # int -> float conversion in columns that contain None.
    california_routes_info = pd.DataFrame(records, dtype=object).set_index('name')

    # Save it as a csv file
    california_routes_info.to_csv(new_filename)

In [None]:
# Yosemite: one crawl for type=rock and one for type=boulder. Each call
# scrapes the routes listed on the route-finder page and writes them to the
# named CSV file.
scrape_links_all('https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21700&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=0&selectedIds=105833381&sort1=area&sort2=rating&stars=0&type=rock&viewAll=1', 'yosemite_rock.csv')
scrape_links_all('https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21700&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=0&selectedIds=105833381&sort1=area&sort2=rating&stars=0&type=boulder&viewAll=1', 'yosemite_boulder.csv')


In [None]:
# Remaining crawls. Each call scrapes one route-finder result page and writes
# its routes to the named CSV file. The URLs differ only in `selectedIds`,
# `type` and `diffMinaid`.
# NOTE(review): diffMinaid is 70000 in some URLs and 74500 in others --
# confirm the difference is intentional.

# Red Rocks
# Rocks
scrape_links_all('https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21700&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=0&selectedIds=105731932&sort1=area&sort2=rating&stars=0&type=rock&viewAll=1', 'redrocks_rock.csv')
# Boulders
scrape_links_all('https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21700&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=0&selectedIds=105731932&sort1=area&sort2=rating&stars=0&type=boulder&viewAll=1', 'redrocks_boulder.csv')

# California Aid Routes (type=aid)
scrape_links_all('https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21700&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=0&selectedIds=105708959&sort1=area&sort2=rating&stars=0&type=aid&viewAll=1', 'ca_aid.csv')

# Sierra East
# Boulders
scrape_links_all('https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21700&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=74500&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=0&selectedIds=105798288&sort1=area&sort2=rating&stars=0&type=boulder&viewAll=1', 'sierra_east_boulder.csv')
# Rocks
scrape_links_all('https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21700&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=74500&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=0&selectedIds=105798288&sort1=area&sort2=rating&stars=0&type=rock&viewAll=1', 'sierra_east_rock.csv')

# Joshua Tree
# Boulders
scrape_links_all('https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21700&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=74500&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=0&selectedIds=105720495&sort1=area&sort2=rating&stars=0&type=boulder&viewAll=1', 'jtree_boulder.csv')
# Rocks
scrape_links_all('https://www.mountainproject.com/route-finder?diffMaxaid=75260&diffMaxboulder=21700&diffMaxice=38500&diffMaxmixed=60000&diffMaxrock=12400&diffMinaid=70000&diffMinboulder=20000&diffMinice=30000&diffMinmixed=50000&diffMinrock=800&is_sport_climb=1&is_top_rope=1&is_trad_climb=1&pitches=0&selectedIds=105720495&sort1=area&sort2=rating&stars=0&type=rock&viewAll=1', 'jtree_rock.csv')