## Getting Mountain Information from Summit Post

This notebook scrapes the name, location, and cover image URL of each mountain listed on the top N pages of mountains (sorted by descending number of hits) on https://summitpost.org, a crowd-sourced resource for mountaineering and hiking information.

In [16]:
import requests
import time
import random
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import cssutils


First, I get the unique URL for each mountain. Each listing page shows 24 mountains.

In [2]:
# Module-level list of mountain page URLs; filled in place by mtns_top_hits()
# and read by later cells.
mtn_urls = []

def mtns_top_hits(num_pages, timeout=30):
    """Collect mountain page URLs from the most-viewed listing pages.

    Fetches listing pages 1..``num_pages`` (sorted by hits, descending) and
    extracts one URL per mountain from every ``div.item-data`` element.

    The module-level ``mtn_urls`` list is overwritten *in place* with the
    result, so calling this function again (e.g. re-running the cell) does not
    append duplicates, and existing references to ``mtn_urls`` stay valid.

    Parameters
    ----------
    num_pages : int
        Number of listing pages to fetch, starting at page 1.
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    list of str
        The module-level ``mtn_urls`` list.
    """
    found = []
    for page in range(1, num_pages + 1):
        top_url = f'https://www.summitpost.org/mountain/rock/?object_type=1&search_select_1=name_only&contributor_id=&order_type_1=DESC&object_name_1=&sort_select_1=hits&page={page}'
        response = requests.get(top_url, timeout=timeout)
        if response.status_code != 200:
            # Report the failed page and keep going with the remaining pages.
            print(f'Response code error: {response.status_code}')
            continue

        soup = BeautifulSoup(response.text, "lxml")
        for div in soup.find_all('div', attrs={'class': 'item-data'}):
            links = div.find_all('a')
            # pulling every other link because every other link is a "parent"
            for a in links[::2]:
                found.append("http://www.summitpost.org" + a['href'])

    # Slice-assign so the shared list object is reused rather than rebound.
    mtn_urls[:] = found
    return mtn_urls

In [3]:
# Scrape the top 20 listing pages from the site (24 mountains per page -> 480 URLs).
# The call is the cell's last expression, so the returned list is displayed.
mtns_top_hits(20)

['http://www.summitpost.org/mount-whitney/150227',
 'http://www.summitpost.org/mount-rainier/150291',
 'http://www.summitpost.org/mount-shasta/150188',
 'http://www.summitpost.org/mount-hood/150189',
 'http://www.summitpost.org/denali/150199',
 'http://www.summitpost.org/mount-elbert/150325',
 'http://www.summitpost.org/katahdin/150219',
 'http://www.summitpost.org/aconcagua/150197',
 'http://www.summitpost.org/mount-adams/150198',
 'http://www.summitpost.org/grand-teton/150312',
 'http://www.summitpost.org/longs-peak/150310',
 'http://www.summitpost.org/matterhorn-monte-cervino/150235',
 'http://www.summitpost.org/mont-blanc/150245',
 'http://www.summitpost.org/eiger/150228',
 'http://www.summitpost.org/mount-mansfield/150938',
 'http://www.summitpost.org/humphreys-peak/150241',
 'http://www.summitpost.org/hatu-peak/154227',
 'http://www.summitpost.org/wheeler-peak-nm/150429',
 'http://www.summitpost.org/mt-timpanogos-ut/151365',
 'http://www.summitpost.org/mount-baker/150195',
 'http

Now that I have all the URLs, I go to each URL and get the relevant information.

In [4]:
# Test URL to get location data: fetch a single mountain page and check that
# the location selector works before running it over every URL.

response = requests.get('https://www.summitpost.org/mont-blanc/150245')
soup = BeautifulSoup(response.text, "lxml")
        
# The location text is read from the first <span> inside the div with class "location".
test_loc = soup.find('div',attrs={'class':'location'}).find('span').text

test_loc

'Haute-Savoie/Aosta (Mont Blanc), France/Italy, Europe'

In [19]:
# Test URL to get the cover image URL: fetch a single mountain page and check
# that the image URL can be pulled out of the cover div's inline style.

response = requests.get('https://www.summitpost.org/y-mountain/224762')
soup = BeautifulSoup(response.text, "lxml")
        
# The image is not an <img> tag; it is read from the `style` attribute of the
# div with class "cover-image".
div_style = soup.find('div',attrs={'class':'cover-image'})['style']

style = cssutils.parseStyle(div_style)

# The background-image value is wrapped as url(...); remove the wrapper.
# NOTE(review): .replace(')', '') removes every ')' in the value, not just the
# closing one - confirm that image URLs never contain parentheses.
test_imgurl = style['background-image']
test_imgurl = test_imgurl.replace('url(', '').replace(')', '')
test_imgurl

'https://sp-images.summitpost.org/224771.jpg?auto=format&fit=max&h=800&ixlib=php-2.1.1&q=35&s=0d8c9513b6f1d46e331b5b7025a2c7d7'

In [23]:
def get_mtn_info(urls, timeout=30):
    """Scrape the location, name and cover-image URL of each mountain page.

    Results are appended to the module-level lists ``locations``,
    ``mountains`` and ``img_urls``, which must exist before this function is
    called. One entry is added to each list per URL, in the order of ``urls``.
    When a field cannot be parsed, the page URL is stored in its place so the
    row can be found and fixed manually later.

    Parameters
    ----------
    urls : iterable of str
        Mountain page URLs to fetch.
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    None
    """
    for url in urls:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, "lxml")

        # Each field falls back to the page URL when parsing fails.
        # ``except Exception`` is used instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long scrape.

        # get location
        try:
            loc = soup.find('div', attrs={'class': 'location'}).find('span').text
        except Exception:
            loc = url

        # get mountain name
        try:
            mountain = soup.find('div', attrs={'class': 'content-title'}).text
        except Exception:
            mountain = url

        # get cover image URL from the inline style of the cover-image div
        try:
            div_style = soup.find('div', attrs={'class': 'cover-image'})['style']
            style = cssutils.parseStyle(div_style)
            img_url = style['background-image']
            # NOTE(review): this removes every ')' in the value, not only the
            # closing one - confirm image URLs never contain parentheses.
            img_url = img_url.replace('url(', '').replace(')', '')
        except Exception:
            img_url = url

        # Append the three fields together so the lists always stay the same
        # length, even if the loop is interrupted part-way through a page.
        locations.append(loc)
        mountains.append(mountain)
        img_urls.append(img_url)

        # Random pause of 0.5-2.5 s between requests.
        time.sleep(.5 + 2 * random.random())

In [24]:
# Result lists that get_mtn_info() appends to; re-created here so re-running
# this cell starts from empty lists.
mountains = []
locations = []
img_urls = []

# Scrape every mountain page collected in mtn_urls, then preview the first
# ten entries of each result list.
get_mtn_info(mtn_urls)
print(mountains[:10])
print(locations[:10])
print(img_urls[:10])

['Mount Whitney', 'Mount Rainier', 'Mount Shasta', 'Mount Hood', 'Denali', 'Mount Elbert', 'Katahdin', 'Aconcagua', 'Mount Adams', 'Grand Teton']
['California, United States, North America', 'Washington, United States, North America', 'California, United States, North America', 'Oregon, United States, North America', 'Alaska, United States, North America', 'Colorado, United States, North America', 'Maine, United States, North America', 'Mendoza, Argentina, South America', 'Washington, United States, North America', 'Wyoming, United States, North America']
['https://sp-images.summitpost.org/469726.JPG?auto=format&fit=max&h=800&ixlib=php-2.1.1&q=35&s=7225b40bb8b15a1e3df43067fc820375', 'https://sp-images.summitpost.org/457178.jpg?auto=format&fit=max&h=800&ixlib=php-2.1.1&q=35&s=2dfab48d42c4250081216e0e7b8c6859', 'https://sp-images.summitpost.org/878331.jpg?auto=format&fit=max&h=800&ixlib=php-2.1.1&q=35&s=08e7844750df801767744669e8332445', 'https://sp-images.summitpost.org/5175.jpg?auto=fo

In [25]:
# Combine the three scraped lists into one DataFrame (one row per mountain).
df_mtn_loc = pd.DataFrame({'mountain': mountains, 'location': locations, 'img_url': img_urls})
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after the summary.
df_mtn_loc.info()
# Display a random sample of rows as a spot check.
df_mtn_loc.sample(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 3 columns):
mountain    480 non-null object
location    480 non-null object
img_url     480 non-null object
dtypes: object(3)
memory usage: 11.3+ KB
None


Unnamed: 0,mountain,location,img_url
354,Big Slide Mountain,"New York, United States, North America",https://sp-images.summitpost.org/72729.jpg?aut...
452,Tofana di Rozes,"Dolomites, Italy, Europe",https://sp-images.summitpost.org/781979.JPG?au...
230,Mount Sopris,"Colorado, United States, North America",https://sp-images.summitpost.org/790153.JPG?au...
161,Fuji-san,"Shizuoka, Japan, Asia",https://sp-images.summitpost.org/52675.jpg?aut...
206,Black Mesa,"Oklahoma, United States, North America",https://sp-images.summitpost.org/370289.JPG?au...
165,Tryfan,"Snowdonia, Wales, Europe",https://sp-images.summitpost.org/397254.jpg?au...
181,Mauna Loa,"Hawaii, United States, Australia/Oceana",https://sp-images.summitpost.org/14856.jpg?aut...
366,Aiguille d'Argentière,"Haute Savoie (Mont Blanc), France, Europe",https://sp-images.summitpost.org/597767.jpg?au...
112,Muztagh Ata,"Xinjiang, China, Asia",https://sp-images.summitpost.org/121947.jpg?au...
355,Mount Magazine-Signal Hill,"Arkansas, United States, North America",https://sp-images.summitpost.org/791685.jpg?au...


In [26]:
# Raise the display limit to 500 rows so the whole frame can be scrolled
# through and checked by eye. This changes a global pandas option, so it also
# affects every later DataFrame display in this kernel.
pd.options.display.max_rows = 500
df_mtn_loc

Unnamed: 0,mountain,location,img_url
0,Mount Whitney,"California, United States, North America",https://sp-images.summitpost.org/469726.JPG?au...
1,Mount Rainier,"Washington, United States, North America",https://sp-images.summitpost.org/457178.jpg?au...
2,Mount Shasta,"California, United States, North America",https://sp-images.summitpost.org/878331.jpg?au...
3,Mount Hood,"Oregon, United States, North America",https://sp-images.summitpost.org/5175.jpg?auto...
4,Denali,"Alaska, United States, North America",https://sp-images.summitpost.org/831080.JPG?au...
5,Mount Elbert,"Colorado, United States, North America",https://sp-images.summitpost.org/844972.JPG?au...
6,Katahdin,"Maine, United States, North America",https://sp-images.summitpost.org/74769.jpg?aut...
7,Aconcagua,"Mendoza, Argentina, South America",https://sp-images.summitpost.org/146685.jpg?au...
8,Mount Adams,"Washington, United States, North America",https://sp-images.summitpost.org/80003.jpg?aut...
9,Grand Teton,"Wyoming, United States, North America",https://sp-images.summitpost.org/844255.jpg?au...


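Scrolling through the dataframe, there are a few mountains where the information didn't pull in correctly, including
indices 97, 184, and 318 (The Brothers has latitude/longitude coordinates instead of a location).  Let's go through and manually correct these, along with the other rows whose location needs fixing (147, 226, 278, 306, and 473).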

In [27]:
# Manual corrections for rows whose scraped name and/or location were wrong.
#
# Each fix is a single .loc[row, column] assignment. The chained form
# (df.iloc[row][col] = value) writes to an intermediate Series that pandas may
# return as a copy, so the DataFrame is not guaranteed to be updated.
# df_mtn_loc has the default RangeIndex, so row labels equal row positions.
df_mtn_loc.loc[97, 'mountain'] = 'Kangchenjunga'
df_mtn_loc.loc[97, 'location'] = 'India/Nepal, Asia'

df_mtn_loc.loc[147, 'location'] = 'Tibet/Khumbu, China/Nepal, Asia'

df_mtn_loc.loc[184, 'mountain'] = 'Mount Tyndall'
df_mtn_loc.loc[184, 'location'] = 'California, United States, North America'

df_mtn_loc.loc[226, 'location'] = 'Italy/Switzerland, Europe'

df_mtn_loc.loc[278, 'location'] = 'Nepal, Asia'

df_mtn_loc.loc[306, 'location'] = 'Pangnirtung Baffin Island/Nunavut, Canada, North America'

df_mtn_loc.loc[318, 'location'] = 'Washington, United States, North America'

df_mtn_loc.loc[473, 'location'] = 'Wyoming, United States, North America'

In [28]:
# Display the frame again to check the manual corrections.
df_mtn_loc

Unnamed: 0,mountain,location,img_url
0,Mount Whitney,"California, United States, North America",https://sp-images.summitpost.org/469726.JPG?au...
1,Mount Rainier,"Washington, United States, North America",https://sp-images.summitpost.org/457178.jpg?au...
2,Mount Shasta,"California, United States, North America",https://sp-images.summitpost.org/878331.jpg?au...
3,Mount Hood,"Oregon, United States, North America",https://sp-images.summitpost.org/5175.jpg?auto...
4,Denali,"Alaska, United States, North America",https://sp-images.summitpost.org/831080.JPG?au...
5,Mount Elbert,"Colorado, United States, North America",https://sp-images.summitpost.org/844972.JPG?au...
6,Katahdin,"Maine, United States, North America",https://sp-images.summitpost.org/74769.jpg?aut...
7,Aconcagua,"Mendoza, Argentina, South America",https://sp-images.summitpost.org/146685.jpg?au...
8,Mount Adams,"Washington, United States, North America",https://sp-images.summitpost.org/80003.jpg?aut...
9,Grand Teton,"Wyoming, United States, North America",https://sp-images.summitpost.org/844255.jpg?au...


In [29]:
# Export the cleaned data to CSV in the current working directory.
# index=False leaves the row index out of the file.

df_mtn_loc.to_csv('./mtn_locations.csv', index=False)