In [3]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import pymongo
import math
import pandas as pd
from multiprocessing.dummy import Pool
from multiprocessing import cpu_count

In [4]:
# Open a MongoDB client with pymongo's default connection settings, then pick
# the 'reports' database and the 'mt_ellinor' collection used by the scraper.
mc = MongoClient()
db = mc.reports
mt_ellinor_reports = db.mt_ellinor

 WARNING: Run the next cell only if you want to clear the current database. It drops the entire `mt_ellinor` collection.

In [32]:
# Destructive: drops the entire collection and every report stored in it.
# Skip this cell to keep the existing data.
mt_ellinor_reports.drop()

If you need to debug a failure, run `%pdb` to toggle the automatic post-mortem debugger.

In [5]:
def select_text(parent_element, css_selector):
    """Return the text of the first element under ``parent_element`` that
    matches ``css_selector``.

    The lookup uses ``select_one``; the match (or ``None`` when nothing
    matches) is handed to ``get_text_if_not_none``, whose result is returned.
    """
    return get_text_if_not_none(parent_element.select_one(css_selector))

In [6]:
def get_text_if_not_none(element):
    """Return ``element.text``, or ``None`` when ``element`` itself is ``None``."""
    return None if element is None else element.text

In [13]:
def select_date(parent_element, css_selector):
    """Return the date recorded on the first element under ``parent_element``
    that matches ``css_selector``.

    The lookup uses ``select_one``; the match (or ``None`` when nothing
    matches) is handed to ``get_date_if_not_none``, whose result is returned.
    """
    return get_date_if_not_none(parent_element.select_one(css_selector))

In [14]:
def get_date_if_not_none(element):
    """Return the element's ``title`` attribute.

    Gives ``None`` when ``element`` is ``None`` or when it carries no
    ``title`` attribute.
    """
    if element is not None:
        return element.attrs.get('title')
    return None

In [35]:
def parse_trip_report(title,trip_report_div):
    """Build the dictionary stored for a single trip report.

    ``title`` is copied into the record as-is; every other field is looked up
    inside ``trip_report_div`` with a CSS selector via ``select_text`` /
    ``select_date``, so a field is whatever those helpers return when the
    selector matches nothing.
    """
    record = {"Trail": title}
    record["Creator"] = select_text(trip_report_div, 'div.CreatorInfo span a')
    record["Date"] = select_date(trip_report_div, 'span.elapsed-time')
    record["Report"] = select_text(trip_report_div, 'div.show-with-full')
    # The key spelling below is kept exactly as it is written to the database.
    record["Trail_condtions"] = select_text(trip_report_div, 'div.trail-issues')
    record["Votes"] = select_text(trip_report_div, 'span.UpvoteCount')
    return record

In [33]:
def get_trail_report(title, hikeurl, params=None):
    """Scrape one listing page of trip reports for a hike and store them.

    Requests ``hikeurl + '/@@related_tripreport_listing'``, parses every
    ``div#trip-reports div.item`` element with ``parse_trip_report`` and
    inserts each resulting dictionary into the ``mt_ellinor_reports``
    collection (the same collection is used whatever ``title`` is).
    **Input parameters**
    ------------------------------------------------------------------------------
    title: string.  Hike name, stored with every report.
    hikeurl: string. Base URL for the request.
    params: dictionary.  Query parameters to be included in the request
            (e.g. the ``b_start:int`` paging offset).
    **Output**
    ------------------------------------------------------------------------------
    None. Inserts one document per trip report into MongoDB using pymongo.
    **Raises**
    ------------------------------------------------------------------------------
    requests.exceptions.RequestException: the request timed out, failed, or
            returned an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(
        hikeurl + '/@@related_tripreport_listing',
        params=params,
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page as "no reports".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    for trip_report_div in soup.select('div#trip-reports div.item'):
        trip_report = parse_trip_report(title, trip_report_div)
        mt_ellinor_reports.insert_one(trip_report)

In [17]:
def iterate_all_reports(title, hikeurl):
    """Scrape every page of trip reports for one hike.

    Reads the total number of reports from the ``div#count-data`` element of
    the hike's trip-report listing, works out how many listing pages that
    implies, and calls ``get_trail_report`` once per page with the matching
    ``b_start:int`` offset.
    **Input parameters**
    ------------------------------------------------------------------------------
    title: string.  Hike name.
    hikeurl: string. Base URL for the request.
    **Output**
    ------------------------------------------------------------------------------
    None. Appends entries to MongoDB using pymongo (via ``get_trail_report``).
    **Raises**
    ------------------------------------------------------------------------------
    requests.exceptions.RequestException: the request timed out, failed, or
            returned an HTTP error status.
    ValueError: the listing has no ``div#count-data`` element, or its text is
            not a number.
    """
    # Paging step used for both the page count and the b_start offsets.
    # NOTE(review): assumes the site lists 5 reports per page -- confirm.
    reports_per_page = 5

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(hikeurl + '/@@related_tripreport_listing', timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    count_element = soup.find('div', {'id': 'count-data'})
    if count_element is None:
        # Name the URL so a layout change or wrong URL is easy to diagnose.
        raise ValueError(
            "No 'count-data' element found in the trip report listing for "
            + hikeurl
        )

    total_reports = float(count_element.text)
    page_count = int(math.ceil(total_reports / reports_per_page))
    for page_index in range(page_count):
        offset = page_index * reports_per_page
        get_trail_report(title, hikeurl, params={'b_start:int': str(offset)})

In [None]:
def TripReportBuilder(df):
    """Scrape the trip reports of every hike listed in a dataframe.

    Walks the rows of ``df`` in order and calls ``iterate_all_reports`` for
    each row whose 'numReports' value is truthy; rows with a count of 0 are
    skipped.

    **Input parameters**
    ------------------------------------------------------------------------------
    df: pandas dataframe. Dataframe must contain columns entitled 'numReports',
            'hike_name' and 'url'.
    **Output**
    ------------------------------------------------------------------------------
    None. Calls following functions for input of data into MongoDB using Pymongo
    """
    # Iterate by position so the dataframe's index labels do not matter
    # (a filtered or re-indexed frame works too).
    rows = zip(df['numReports'], df['hike_name'], df['url'])
    for num_reports, hike_name, url in rows:
        # NaN is truthy, so a row with a missing count is still scraped.
        if num_reports:
            iterate_all_reports(hike_name, url)

In [38]:
# Scrape all Mount Ellinor trip report pages and store them in MongoDB.
iterate_all_reports(
    title='mt_ellinor',
    hikeurl="https://www.wta.org/go-hiking/hikes/mount-ellinor",
)

In [39]:
# Load every stored report document from the collection into a DataFrame.
df = pd.DataFrame([document for document in mt_ellinor_reports.find()])

In [40]:
# Show only the first rows; displaying the whole frame bloats the notebook.
df.head(10)

Unnamed: 0,Creator,Date,Report,Trail,Trail_condtions,Votes,_id
0,\nMoritzCrackers,"May 05, 2018",Friday evening my boyfriend and I drove from S...,mt_ellinor,\nBeware of:\n snow conditions\n ...,1,5af0caf5acf3d61d9c0f1d46
1,\nJunBug,"May 05, 2018","Found women's shoes and gaiters on May 5th, ar...",mt_ellinor,"\nBeware of:\n road, snow conditi...",1,5af0caf5acf3d61d9c0f1d47
2,\nGKeeffe,"May 04, 2018",started hiking at 7 a.m. and the snow conditio...,mt_ellinor,\nBeware of:\n snow conditions\n ...,12,5af0caf5acf3d61d9c0f1d48
3,\nOld Mountain Man,"Apr 29, 2018",We climbed Ellinor via the winter route. The r...,mt_ellinor,"\nBeware of:\n road, snow & trail...",13,5af0caf5acf3d61d9c0f1d49
4,\nGKeeffe,"Apr 27, 2018",Road to lower trail now completely snow free. ...,mt_ellinor,"\nBeware of:\n snow, trail condit...",28,5af0caf5acf3d61d9c0f1d4a
5,\njasonturnerwa,"Apr 25, 2018",Got a late start today and hit it from the low...,mt_ellinor,\nBeware of:\n snow conditions\n ...,8,5af0caf5acf3d61d9c0f1d4b
6,\nMafHoney,"Apr 22, 2018","Road to the lower trailhead is snow free, but ...",mt_ellinor,\nBeware of:\n snow conditions\n ...,2,5af0caf5acf3d61d9c0f1d4c
7,\nkjmac,"Apr 21, 2018",Wowww such a beautiful place. The views from t...,mt_ellinor,\nBeware of:\n snow conditions\n ...,4,5af0caf5acf3d61d9c0f1d4d
8,\nufda94,"Apr 21, 2018",Driving to the lower trailhead in any vehicle ...,mt_ellinor,"\nBeware of:\n snow, trail condit...",4,5af0caf5acf3d61d9c0f1d4e
9,\njfarias1986,"Apr 21, 2018",Took my step son up here today! Great day for ...,mt_ellinor,"\nBeware of:\n snow, trail condit...",6,5af0caf5acf3d61d9c0f1d4f
