In [1]:
#!/usr/bin/env python3

import requests
import re
import sys
import argparse
import altair
import pandas

sys.setrecursionlimit(100000)

parser = argparse.ArgumentParser()
parser.add_argument("num_climbs",help="Specifiy the number of climbs to index here")
args = parser.parse_args()

num_climbs = int(args.num_climbs)

boulders = []
sports = []
trads = []
topropes = []
alpines = []

# Set up a session
s = requests.session()

# Get links from a html page
def get_links(content):
  links = re.findall('''<a\s+(?:[^>]*?\s+)?href="([^"]*)"''', content)  
  return links

# Get boulders from an HTML page
def get_climbs(content):
  info = re.findall('''@type.*\s*\"name\": \"([^\"]*).*\s*\"description\": \"([^\"]*)''',content)
  rating = re.findall('''&nbsp;Avg: (...)''',content)
  if len(rating) > 0 and len(info) > 0:
    rating = rating[0]
    if rating[-1] == 'f':
      rating = rating[0]
    name = info[0][0]
    type = info[0][1]
    type = type.split(" ")
    grade = type[0]
    type = type[1:]
    #print("Type: " + str(type))
    if 'Sport' in type:
      sports.append((grade,rating))
    if 'Boulder' in type:
      boulders.append((grade,rating))
    if 'TR' in type:
      topropes.append((grade,rating))
    if 'Trad' in type:
      trads.append((grade,rating))
    if 'Alpine' in type:
      alpines.append((grade,rating))
    print(name + " " + str(type) + " " + grade + " " + rating + "/4 stars")

# Set of seen links
visited = set()

# Types of link we don't want to follow...
ignore = ['google','facebook','login','user','help','edit','share','upload','add','photo','forum','stats','print','map']

# Crawl a link
def crawl(url):
  if len(boulders) + len(sports) + len(trads) + len(topropes) + len(alpines) < num_climbs:
    response = s.get(url)
    if response == None:
      return

    page_content = response.content.decode('latin-1')
   
    # Get all links on the page
    links = get_links(page_content)

    # Add the climbs on the page
    get_climbs(page_content)

    # Remove links which we dont want to follow
    for word in ignore:
      links = [link for link in links if not word in link]

    # Only keep route and are links, this will get us all climbs
    links = [link for link in links if 'mountainproject' in link and ('area' in link or 'route' in link)]

    # Add the new links to our set so we don't visit them again
    for link in links:
      if not link in visited:
        visited.add(link)
        if len(visited) % 500 == 0:
          print("\n***\n" + str(len(visited)) + " pages searched... " + str(len(boulders)) + " boulders found... " + str(len(sports)) + " sport routes found... " + str(len(trads)) + " trad routes found... " + str(len(topropes)) + " toprope routes found... \n***\n")
        crawl(link)

# Start crawling!        
crawl('https://www.mountainproject.com')



# Define sorting orders for boulders and routes
boulder_order = ['V-easy','VB','V-','V0','V0+','V0-1','V1-','V1','V1+','V1-2','V2-','V2','V2+','V2-3','V3-','V3','V3+','V3-4','V4-','V4','V4+','V4-5','V5-','V5','V5+','V5-6','V6-','V6','V6+','V6-7','V7-','V7','V7+','V7-8','V8-','V8','V8+','V8-9','V9-','V9','V9+','V9-10','V10-','V10','V10+','V10-11','V11-','V11','V11+','V11-12','V12-','V12','V12+','V12-13','V13-','V13','V13+','V13-14','V14-','V14']

route_order = ['5.4','5.5-','5.5','5.5+','5.6-','5.6','5.6+','5.7-','5.7','5.7+','5.8-','5.8','5.8+','5.9-','5.9','5.9+','5.10-','5.10a','5.10-','5.10b','5.10','5.10c','5.10+','5.10d','5.11a','5.11-','5.11b','5.11','5.11c','5.11+','5.11d','5.12a','5.12-','5.12b','5.12','5.12c','5.12+','5.12d','5.13a','5.13-','5.13b','5.13','5.13c','5.13+','5.13d','5.14a','5.14-','5.14b','5.14','5.14c','5.14+','5.14d']

route_order2 = ['5.4','5.5-','5.5','5.5+','5.6-','5.6','5.6+','5.7-','5.7','5.7+','5.8-','5.8','5.8+','5.9-','5.9','5.9+','5.10a','5.10b','5.10c','5.10d','5.11a','5.11b','5.11c','5.11d','5.12a','5.12b','5.12c','5.12d','5.13a','5.13b','5.13c','5.13d','5.14a','5.14b','5.14c','5.14d']

# Sort and pint lists upon completion
boulders = [boulder for grade in boulder_order for boulder in boulders if boulder[0] == grade]
sports = [sport for grade in route_order for sport in sports if sport[0] == grade]
trads = [trad for grade in route_order for trad in trads if trad[0] == grade]
topropes = [toprope for grade in route_order for toprope in topropes if toprope[0] == grade]
alpines = sorted(alpines, key=lambda x: (x[0],x[1]))
#print('CLIMBS: \n\n')

#print('\n\nBOULDERS: ')
#for boulder in boulders:
#  print(boulder[0] + ", " + boulder[1])
#print('\n\nSPORT ROUTES: ')
#for sport in sports:
#  print(sport[0] + ", " + sport[1])
#print('\n\nTRAD ROUTES: ')
#for trad in trads:
#  print(trad[0] + ", " + trad[1])
#print('\n\nTOPROPE ROUTES: ')
#for toprope in topropes:
#  print(toprope[0] + ", " + toprope[1])
#print('\n\nALPINE ROUTES: ')
#for alpine in alpines:
#  print(alpine[0] + ", " + alpine[1])
#print('\n\n\n')

def print_meta(climbs,order,type):
  data = []
  for grade in order:
    sum = 0
    count = 0
    for climb in climbs:
      if climb[0] == grade:
        count += 1
        sum += float(climb[1])
    if count > 0:
      avg = float(float(sum) / float(count))
      data.append((grade,avg))
      print("Average stars for (" + str(count) + ") " + grade + " " + type + ": " + str(round(avg,2)))
  return data


# Print meta info
print('\n\n\n\n*******************\n* RATING AVERAGES *\n*******************\n')
print('\n\n\nBOULDER AVERAGES\n')
boulder_data = print_meta(boulders,boulder_order,'boulders')
print('\n\n\nSPORT CLIMB AVERAGES\n')
sport_data = print_meta(sports,route_order,'sport climbs')
print('\n\n\nTRAD CLIMB AVERAGES\n')
trad_data = print_meta(trads,route_order,'trad climbs')
print('\n\n\nTOPROPE CLIMB AVERAGES\n')
toprope_data = print_meta(topropes,route_order,'toprope climbs')
print('\n\n\n')


source = pandas.DataFrame({
  'a' : [x[0] for x in boulder_data],
  'b' : [x[1] for x in boulder_data]
})

altair.Chart(source).mark_bar().encode(x='a',y='b')

usage: ipykernel_launcher.py [-h] num_climbs
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
python3 mp_crawler 20


SyntaxError: invalid syntax (<ipython-input-2-058cce66d0d3>, line 1)

In [3]:
./mp_crawler 20

SyntaxError: invalid syntax (<ipython-input-3-e7c85b361f4a>, line 1)

In [4]:
!python3 mp_crawler 20

Exum Ridge ['Trad,', 'Alpine'] 5.5 3.6/4 stars
8 Hour Bliss ['X', 'TR,', 'Alpine'] 5.7 1/4 stars
Barely There ['Sport,', 'Alpine'] 5.9 1/4 stars
Misfire ['PG13', 'Sport,', 'Alpine'] 5.10b\/c 3/4 stars
Runnin' on Empty ['Sport,', 'Alpine'] 5.7 1/4 stars
Slim Pickin's ['Sport,', 'Alpine'] 5.10b 2/4 stars
Standard Route, Southwest Face ['R', 'Trad,', 'Alpine'] 5.7 4/4 stars
Garden Creek Waterfall ['Trad,', 'Ice'] WI2 1.5/4 stars
Bee Wayne ['Boulder'] V4 4/4 stars
Divertido ['Boulder'] V2 4/4 stars
Dulceria ['Boulder'] V2-3 4/4 stars
Pinata ['Boulder'] V2 4/4 stars
Raghorn ['Boulder'] V3+ 3/4 stars
Ankle Biter ['Boulder'] V5 3/4 stars
Blue ['Boulder'] V0 4/4 stars
Fat Boy ['Boulder'] V3-4 3/4 stars
Midnight Blues ['Boulder'] V8 4/4 stars
Pocket Protector ['Boulder'] V1 3/4 stars
Ruffed ['Boulder'] V0 4/4 stars
SharpTailed ['Boulder'] V0 4/4 stars
Two at a Time ['Boulder'] V3 3/4 stars




*******************
* RATING AVERAGES *
*******************




BOULDER AVERAGES

Average stars for (3