# Scraping for Data

## Importing Python Packages

In [33]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

## Extracting Hiking Details

The first function `extract_html` extracts the HTML code that lies behind a given URL. This function returns two results:

1. A BeautifulSoup object that can easily be searched using find commands
1. The same information in text format where we can use regular expressions to search for our features

The consideration for having both options was to select the easiest one depending on the data I was aiming to retrieve.

In [34]:
def extract_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html5lib')
    return soup, soup.prettify()

Let's give this function a test ride by looking at the first 1000 characters in the html file.

In [35]:
little_si_url = 'https://www.wta.org/go-hiking/hikes/little-si'
soup, html = extract_html(little_si_url)
print(html[:1000])

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="en" xml:lang="en" xmlns:xlink="http://www.w3.org/1999/xlink">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <!--
        /* @license
         * MyFonts Webfont Build ID 2229066, 2012-04-12T15:53:09-0400
         *
         * The fonts listed in this notice are subject to the End User License
         * Agreement(s) entered into by the website owner. All other parties are
         * explicitly restricted from using the Licensed Webfonts(s).
         *
         * You may obtain a valid license at the URLs below.
         *
         * Webfont: URW Egyptienne Narrow Medium by URW++
         * URL: http://www.myfonts.com/fonts/urw/egyptienne/urw-egyptienne-t-medium-narrow/
         * Copyright: Copyright 2010 URW++ Design &amp; Development Hamburg
         * Licensed pageviews: 10,000,000
         *
         *
         * Lic

Success! Now that we have the HTML data, our next goal would be to parse the results into a format that we could easily access for our analysis. The best format for analysis would be to use Python's `pandas` package. We can transform our data into a DataFrame by extracting all the information into a dictionary. Now that we have a plan, let us define a couple helper functions. 

The first function, `regex_finder`, will give us the ability to use regular expression patterns to gather features with just one line of code in our final function. The second function was developed after  

In [36]:
def regex_finder(html, pattern):
    ### Use regular expressions to 
    matcher = re.compile(pattern)
    matches = matcher.findall(html)
    if matches:
        return matches[0]
    
def record_features(soup):
    feature_list = []
    features = soup.findAll("div", {"class": "feature"})
    if features:
        feature_list = [f['data-title'] for f in features]
        return feature_list

In [37]:
def collect_hiking_details(url):
    soup, html = extract_html(url)
    name = regex_finder(html, r'documentFirstHeading".\n\s*([-\w\s.\':]*)\n')
    location = regex_finder(html, r'Location[\\n\s]*<\/h4>[\\n\s]*<div>[\\n\s]*([-\w\s.\/\']*)\n')
    distance = regex_finder(html, r'distance["<>\s\\nspan]*([\d.]*)')
    hike_type = regex_finder(html, r'distance["<>\s\\nspan]*[\d.]*\smiles,\s([-\w]*)\n')
    if hike_type not in ['one-way', 'roundtrip']:
        hike_type = regex_finder(html, r'distance["<>\s\\nspan]*[\d.]*\smiles\s([-\w\s]*)\n')
    gain = regex_finder(html, r'Gain:[\\n\s*<span>]*([\d.]*)\n')
    highest_point = regex_finder(html, r'Point:[\\n\s<>span]*([\d.]*)')
    current_rating = regex_finder(html, r'current-rating["\s\w=:.%>]*\n\s*([\d.]*)\sout')
    rating_count = regex_finder(html, r'rating-count[">\\n\s(]*(\d*)')
    parking_pass_entry_fee = regex_finder(html, r'Entry\sFee\n\s*<\/h4>\n\s*<[\w\s=":\/.-]*>\n\s*([-\w\s]*)\n')
    permit = regex_finder(html, r'Permits\sRequired\n\s*<\/h4>\n\s*<[-\w="\s:\/.?]*>\n\s*([\w\s.()]*)\n')
    latlong = regex_finder(html, r'Co-ordinates:[\\n\s*<span>]*([\d.]*)[\\n<>\s\/\w,]*([-\d.]*)')
    if not latlong:
        latlong = [None, None]

    hike_dict = {
        'name': name,
        'link': url,
        'location': location,
        'distance': distance, 
        'hike_type': hike_type,
        'gain': gain,
        'highest_point': highest_point,
        'current_rating': current_rating,
        'rating_count': rating_count, 
        'parking_pass/entry_fee': parking_pass_entry_fee,
        'permit': permit,
        'latitude': latlong[0],
        'longitude': latlong[1],
        'features': record_features(soup)
    }
    
    return hike_dict

In [38]:
collect_hiking_details(little_si_url)

{'name': 'Little Si',
 'link': 'https://www.wta.org/go-hiking/hikes/little-si',
 'location': 'Snoqualmie Region -- North Bend Area',
 'distance': '3.7',
 'hike_type': 'roundtrip',
 'gain': '1300',
 'highest_point': '1550',
 'current_rating': '3.58',
 'rating_count': '165',
 'parking_pass/entry_fee': 'Discover Pass',
 'permit': None,
 'latitude': '47.4867',
 'longitude': '-121.7535',
 'features': ['Mountain views', 'Dogs allowed on leash', 'Good for kids']}

In [None]:
def extract_links(html):
    hike_link_pattern = r'class="listitem-title" href="([\w:\/.-]*)"'
    hike_link_matcher = re.compile(hike_link_pattern)
    hike_links = hike_link_matcher.findall(html)
    return hike_links

In [None]:
def scroll_through_links(soup):
    next_page = soup.find("li", {"class": "next"})
    if next_page:
        next_url = next_page.find("a", href=True)
        if next_url:
            url = next_url['href']
            return url

In [None]:
def collect_links(url, MAX_PAGES = None):
    if max_pages:
        assert isinstance(MAX_PAGES, int), 'max_pages needs to be an integer'
        assert MAX_PAGES >= 1, 'max_pages needs to be >= 1'
    
    links = []
    current_page = 1
    while url:
        print('Collecting URLs from {}'.format(url))
        soup, html = extract_html(url)
        links.extend(extract_links(html))
        
        current_page += 1
        if not max_pages or (current_page <= max_pages):
            url = scroll_through_links(soup)
        else:
            break
        
    return list(set(links))

In [None]:
def create_hiking_csv(hiking_links):
    assert isinstance(hiking_links, list), "Argument must be a list of hiking links"
    hiking_collection = []
    counter = 1
    for url in hiking_links:
        if counter%10 == 0:
            print('Working on {} of {}'.format(counter, len(hiking_links)))
        hiking_collection.append(collect_hiking_details(url))
        counter += 1
    pd.DataFrame(hiking_collection).to_csv('WTA_Hiking_1.csv')
    return hiking_collection

In [15]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def extract_html(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html5lib')
    return soup, soup.prettify()

def extract_links(html):
    hike_link_pattern = r'class="listitem-title" href="([\w:\/.-]*)"'
    hike_link_matcher = re.compile(hike_link_pattern)
    hike_links = hike_link_matcher.findall(html)
    return hike_links

def scroll_through_links(soup):
    next_page = soup.find("li", {"class": "next"})
    if next_page:
        next_url = next_page.find("a", href=True)
        if next_url:
            url = next_url['href']
            return url
    return

def collect_links(url, max_pages = None):
    if max_pages:
        assert isinstance(max_pages, int), 'max_pages needs to be an integer'
        assert max_pages >= 1, 'max_pages needs to be >= 1'
    
    links = []
    current_page = 1
    while url:
        print('Collecting URLs from {}'.format(url))
        soup, html = extract_html(url)
        links.extend(extract_links(html))
        
        current_page += 1
        if not max_pages or (current_page <= max_pages):
            url = scroll_through_links(soup)
        else:
            break
        
    return list(set(links))

In [16]:
def regex_finder(html, pattern):
    matcher = re.compile(pattern)
    matches = matcher.findall(html)
    if matches:
        return matches[0]

def record_features(soup):
    feature_list = []
    features = soup.findAll("div", {"class": "feature"})
    if features:
        feature_list = [f['data-title'] for f in features]
        return feature_list

def collect_hiking_details(url):
    soup, html = extract_html(url)
    name = regex_finder(html, r'documentFirstHeading".\n\s*([-\w\s.\':]*)\n')
    location = regex_finder(html, r'Location[\\n\s]*<\/h4>[\\n\s]*<div>[\\n\s]*([-\w\s.\/\']*)\n')
    distance = regex_finder(html, r'distance["<>\s\\nspan]*([\d.]*)')
    hike_type = regex_finder(html, r'distance["<>\s\\nspan]*[\d.]*\smiles,\s([-\w]*)\n')
    if hike_type not in ['one-way', 'roundtrip']:
        hike_type = regex_finder(html, r'distance["<>\s\\nspan]*[\d.]*\smiles\s([-\w\s]*)\n')
    gain = regex_finder(html, r'Gain:[\\n\s*<span>]*([\d.]*)\n')
    highest_point = regex_finder(html, r'Point:[\\n\s<>span]*([\d.]*)')
    current_rating = regex_finder(html, r'current-rating["\s\w=:.%>]*\n\s*([\d.]*)\sout')
    rating_count = regex_finder(html, r'rating-count[">\\n\s(]*(\d*)')
    parking_pass_entry_fee = regex_finder(html, r'Entry\sFee\n\s*<\/h4>\n\s*<[\w\s=":\/.-]*>\n\s*([-\w\s]*)\n')
    permit = regex_finder(html, r'Permits\sRequired\n\s*<\/h4>\n\s*<[-\w="\s:\/.?]*>\n\s*([\w\s.()]*)\n')
    latlong = regex_finder(html, r'Co-ordinates:[\\n\s*<span>]*([\d.]*)[\\n<>\s\/\w,]*([-\d.]*)')
    if not latlong:
        latlong = [None, None]

    hike_dict = {
        'name': name,
        'link': url,
        'location': location,
        'distance': distance, 
        'hike_type': hike_type,
        'gain': gain,
        'highest_point': highest_point,
        'current_rating': current_rating,
        'rating_count': rating_count, 
        'parking_pass/entry_fee': parking_pass_entry_fee,
        'permit': permit,
        'latitude': latlong[0],
        'longitude': latlong[1],
        'features': record_features(soup)
    }
    
    return hike_dict

def create_hiking_csv(hiking_links):
    assert isinstance(hiking_links, list), "Argument must be a list of hiking links"
    hiking_collection = []
    counter = 1
    for url in hiking_links:
        if counter%10 == 0:
            print('Working on {} of {}'.format(counter, len(hiking_links)))
        hiking_collection.append(collect_hiking_details(url))
        counter += 1
    pd.DataFrame(hiking_collection).to_csv('WTA_Hiking_1.csv')
    return hiking_collection

In [98]:
df = create_hiking_csv(collect_links(URL))

Collecting URLs from https://www.wta.org/go-outside/hikes
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=30
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=60
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=90
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=120
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=150
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=180
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=210
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=240
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=270
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=300
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=330
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=360
Collecting URLs from https://www.wta.org/go-outside/hikes

Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3300
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3330
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3360
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3390
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3420
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3450
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3480
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3510
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3540
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3570
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3600
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3630
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=3660
Collecting URLs from http

Working on 2820 of 3902
Working on 2830 of 3902
Working on 2840 of 3902
Working on 2850 of 3902
Working on 2860 of 3902
Working on 2870 of 3902
Working on 2880 of 3902
Working on 2890 of 3902
Working on 2900 of 3902
Working on 2910 of 3902
Working on 2920 of 3902
Working on 2930 of 3902
Working on 2940 of 3902
Working on 2950 of 3902
Working on 2960 of 3902
Working on 2970 of 3902
Working on 2980 of 3902
Working on 2990 of 3902
Working on 3000 of 3902
Working on 3010 of 3902
Working on 3020 of 3902
Working on 3030 of 3902
Working on 3040 of 3902
Working on 3050 of 3902
Working on 3060 of 3902
Working on 3070 of 3902
Working on 3080 of 3902
Working on 3090 of 3902
Working on 3100 of 3902
Working on 3110 of 3902
Working on 3120 of 3902
Working on 3130 of 3902
Working on 3140 of 3902
Working on 3150 of 3902
Working on 3160 of 3902
Working on 3170 of 3902
Working on 3180 of 3902
Working on 3190 of 3902
Working on 3200 of 3902
Working on 3210 of 3902
Working on 3220 of 3902
Working on 3230 

In [25]:
# %%timeit -r 4 
import concurrent.futures
URL = 'https://www.wta.org/go-outside/hikes'
# with concurrent.futures.ProcessPoolExecutor() as executor:
#     ### Get a list of files to process
#     image_files = glob.glob("*.jpg")

#     ### Process the list of files, but split the work across the process pool to use all CPUs
#     ### Loop through all jpg files in the current folder 
#     ### Resize each one to size 600x600
#     fast_links_test = executor.map(collect_links, URL)
# ThreadPoolExecutor
with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
    # Start the load operations and mark each future with its URL
    future_to_lurl = executor.map(collect_links, URL)
#     for future in concurrent.futures.as_completed(future_to_url):
#         test = future_to_url[future]
#         try:
#             data = future.result()

In [17]:
# future_to_lurl.result()
list(future_to_lurl)

[]

In [2]:
import pandas as pd
df = pd.read_csv('WTA_Hiking.csv')

In [6]:
df['link'].values

array(['https://www.wta.org/go-hiking/hikes/bridal-veil-falls',
       'https://www.wta.org/go-hiking/hikes/lba-park',
       'https://www.wta.org/go-hiking/hikes/white-pass-chinook-pass-pct-200',
       ..., 'https://www.wta.org/go-hiking/hikes/lightning-peak',
       'https://www.wta.org/go-hiking/hikes/boundary-west',
       'https://www.wta.org/go-hiking/hikes/north-trail-saint-edward-state-park'],
      dtype=object)

In [8]:
future_to_url.result()

['https://www.wta.org/go-hiking/hikes/franklin-falls',
 'https://www.wta.org/go-hiking/hikes/grand-ridge-park',
 'https://www.wta.org/go-hiking/hikes/lake-serene',
 'https://www.wta.org/go-hiking/hikes/appleway-trail',
 'https://www.wta.org/go-hiking/hikes/elbo-creek',
 'https://www.wta.org/go-hiking/hikes/pilchuck-tree-farm',
 'https://www.wta.org/go-hiking/hikes/ryan-cabin',
 'https://www.wta.org/go-hiking/hikes/lord-hill-regional-park',
 'https://www.wta.org/go-hiking/hikes/lakemont-park',
 'https://www.wta.org/go-hiking/hikes/alaska-lake',
 'https://www.wta.org/go-hiking/hikes/picture-lake',
 'https://www.wta.org/go-hiking/hikes/dungeness-recreation-area',
 'https://www.wta.org/go-hiking/hikes/ira-spring-memorial',
 'https://www.wta.org/go-hiking/hikes/hurricane-hill',
 'https://www.wta.org/go-hiking/hikes/gray-wolf-river',
 'https://www.wta.org/go-hiking/hikes/west-tiger-3',
 'https://www.wta.org/go-hiking/hikes/iverson-railroad-trail',
 'https://www.wta.org/go-hiking/hikes/sun-mo

In [88]:
# latlong = regex_finder(html, r'Co-ordinates:[\\n\s*<span>]*([\d.]*)[\\n<>\s\/\w,]*([-\d.]*)')
# latlong[1]
collect_hiking_details('https://www.wta.org/go-hiking/hikes/pipe-creek-loop')

{'name': 'Pipe Creek Loop Snowshoe',
 'link': 'https://www.wta.org/go-hiking/hikes/pipe-creek-loop',
 'location': 'Central Cascades -- Blewett Pass',
 'distance': '4.1',
 'hike_type': 'roundtrip',
 'gain': '900',
 'highest_point': None,
 'current_rating': '3.00',
 'rating_count': '1',
 'parking_pass/entry_fee': 'Sno-Parks Permit',
 'permit': None,
 'latitude': '47.3315',
 'longitude': '-120.6120',
 'features': ['Mountain views',
  'Wildlife',
  'Good for kids',
  'Dogs allowed on leash',
  'Rivers']}

In [60]:
import pyperclip

In [81]:
soup, html = extract_html('https://www.wta.org/go-hiking/hikes/shelton-view-forest')
# pyperclip.copy(html)
html

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html lang="en" xml:lang="en" xmlns:xlink="http://www.w3.org/1999/xlink">\n <head>\n  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>\n  <!--\n        /* @license\n         * MyFonts Webfont Build ID 2229066, 2012-04-12T15:53:09-0400\n         *\n         * The fonts listed in this notice are subject to the End User License\n         * Agreement(s) entered into by the website owner. All other parties are\n         * explicitly restricted from using the Licensed Webfonts(s).\n         *\n         * You may obtain a valid license at the URLs below.\n         *\n         * Webfont: URW Egyptienne Narrow Medium by URW++\n         * URL: http://www.myfonts.com/fonts/urw/egyptienne/urw-egyptienne-t-medium-narrow/\n         * Copyright: Copyright 2010 URW++ Design &amp; Development Hamburg\n         * Licensed pageviews: 10,000,000\n         *\n     

In [21]:
next_page = soup.find("div", {"class": "feature alpha"})
next_page['data-title']
# next_url = next_page.find("", href=True)
# if next_page:

In [33]:
next_page = soup.findAll("div", {"class": "feature"})
# next_page['data-title']
next_page[1]['data-title']

'Mountain views'

In [25]:

# next_url

'Wildflowers/Meadows'

In [5]:
URL = 'https://www.wta.org/go-outside/hikes'
# URL = 'https://www.wta.org/go-outside/hikes?b_start:int=3870'
hiking_links = collect_links(URL, 2)

Collecting URLs from https://www.wta.org/go-outside/hikes
Collecting URLs from https://www.wta.org/go-outside/hikes?b_start:int=30


In [13]:
# len(hiking_links)
# collect_hiking_details(hiking_links[0])
soup, html = extract_html(hiking_links[1])
hiking_links[1]

'https://www.wta.org/go-hiking/hikes/cap-sante-park'

In [None]:
URL = 'https://www.wta.org/go-outside/hikes?b_start:int=30'
print(URL)
soup, html = extract_html(URL)
extract_links(html)

In [None]:
Data
#Location<\/h4>\s*<div>(?P<Location>[\w\s-]*)[<\/\w>\s="#-:]*
#distance["<>\s]*span>(?P<Distance>[\d.\s\w,]*)[<\/\w>\s="#-:]*
#Gain:\s<span>(?P<Gain>[\d.]*)[<\/\w>\s="#-:]*
#Point:\s<span>(?P<Highest_Point>[\d.]*)[<\/\w>\s="#-:]*
#current-rating[<\/\w\s="#-:]*>(?P<Rating>[\d.]*)[<\/\w>\s="#-:]*
#rating-count".\((?P<Votes>[\d.]*)[<\/\w>\s="#-:?]*
#Entry Fee<\/h4><[\w\s=":\/.-]*>(?P<Permits>[\w\s]*)[<\/\w\s="#-:<>!@;{}\[\]?’]*
#Co-ordinates: <span>(?P<Latitude>[\d.]*)<\/span>,\s*<span>(?P<Longitude>[-\d.]*)
# Still need title of hike, 

features
#<div class="feature" data-title="Summits">


In [None]:
from multiprocessing import Process, Queue

#Having the function definition here results in
#AttributeError: Can't get attribute 'f' on <module '__main__' (built-in)>

#The solution seems to be importing the function from a separate file.

from web_scraping import *

#Also, the original version of f only had a print statement in it.  
#That doesn't work with Process - in the sense that it prints to the console 
#instead of the notebook.
#The trick is to let f write the string to print into an output-queue.
#When Process is done, the result is retrieved from the queue and printed.

if __name__ == '__main__':    

#     Define an output queue
    output=Queue()
    
    URL = 'https://www.wta.org/go-outside/hikes'
    # Setup a list of processes that we want to run
    p = Process(target=collect_links, args=(URL,1))

    # Run process
    p.start()

    # Exit the completed process
    p.join()

    # Get process results from the output queue
    result = output.get(p)

    print(result)