# Get Data from Strava API
I obtain data from Strava on all cycling efforts on the segment [Diablo - North Gate to Summit](https://www.strava.com/segments/656860), a popular Bay Area road cycling segment that ascends the north side of Mount Diablo.

In [4]:
# import requisite libraries
import requests
import time
import pickle
import sys
import os
#import pandas as pd
from strava_api import access_token # my Strava API credentials
#import psycopg2 as pg2
#from postgres_login import secret # my postgreSQL password
import urllib
from bs4 import BeautifulSoup
import csv

## Collect Data
I use Strava's API to obtain data on the Mt Diablo north gate-to-summit segment. See details about Strava's authentication protocol [here](https://strava.github.io/api/v3/oauth/).

For security reasons, I keep my API access token hidden. Anyone wishing to reproduce this analysis should create a file within this working directory called `strava_api.py` and within that file populate a string variable `access_token` with their own access token.

### Parameters

In [359]:
# General settings shared by every Strava API request made below.

# which segment to analyze, and how many efforts to ask for per page
segment_id = 656860 # "Diablo - North Gate to Summit"
per_page = 200 # Strava max

# endpoint templates; the '%d' placeholder is filled in with a segment id
api_base_url = 'https://www.strava.com/api/v3/'
api_segment_url = api_base_url + 'segments/%d'
api_segment_all_efforts_url = api_segment_url + '/all_efforts'

# each request authenticates with the personal access token
extra_headers = {'Authorization' : 'Bearer %s' % access_token}

# pickle files used to cache the raw API responses on disk
segment_fname = 'data/raw_data/segment_%d.p' % (segment_id,)
all_efforts_fname = 'data/raw_data/all_efforts_%d.p' % (segment_id,)

### Segment Data
First, I'll get data on the segment itself.

In [360]:
# Get metadata on the segment itself, using a local pickle file as a cache.
# If the cache file already exists it is unpickled; otherwise the segment is
# requested from Strava's API and the parsed JSON is pickled for later runs.
if os.path.isfile(segment_fname):
    sys.stdout.write('Unpickling segment data %s\n' % segment_fname)
    # NOTE: unpickling is only safe because this file is written by this notebook;
    # never point segment_fname at a file from an untrusted source.
    with open(segment_fname, 'rb') as segment_file:
        segment_r = pickle.load(segment_file)
else:
    segment_response = requests.get(api_segment_url % segment_id, headers=extra_headers)
    # fail loudly on an HTTP error rather than caching the error payload,
    # which would then be reloaded on every later run
    segment_response.raise_for_status()
    segment_r = segment_response.json()
    # save segment data (creating the cache directory on a first run)
    segment_dir = os.path.dirname(segment_fname)
    if segment_dir:
        os.makedirs(segment_dir, exist_ok=True)
    with open(segment_fname, 'wb') as segment_file:
        pickle.dump(segment_r, segment_file)
# how many efforts have been made as of data retrieval date?
n_efforts = segment_r['effort_count']

### Segment Efforts
Strava returns a max of 200 efforts per "page," but there are over 23,000 efforts on this segment. Thus, I iterate through and combine the pages of efforts.

In [361]:
# Collect every effort on the segment, page by page, caching the combined
# list in a pickle file so the API is only hit once.
start = time.time()
all_efforts = []

# do we want feedback on progress and elapsed time?
verbose = 0

# if we already have the data pickled, load the file. otherwise, request effort data from Strava API
if os.path.isfile(all_efforts_fname):
    sys.stdout.write('Unpickling all efforts from %s\n' % all_efforts_fname)
    # NOTE: unpickling is only safe because this file is written by this notebook.
    with open(all_efforts_fname, 'rb') as efforts_file:
        all_efforts = pickle.load(efforts_file)
else:
    # ceil(n_efforts / per_page) pages hold every effort counted in n_efforts;
    # allow one extra page in case efforts were added after n_efforts was read
    last_page = -(-n_efforts // per_page) + 1
    failed_pages = []

    for i in range(1, last_page + 1):
        if verbose:
            sys.stdout.write('Requesting page %d\n' % i)
        r = requests.get(api_segment_all_efforts_url % segment_id, headers=extra_headers,
                         params={'per_page' : per_page, 'page' : i})

        if r.status_code != 200:
            sys.stderr.write('Error, received code %d for summary request %d\n' %
                             (r.status_code, i))
            failed_pages.append(i)
            continue

        page_efforts = r.json()
        if not page_efforts:
            # an empty page means we have run past the last effort
            break
        all_efforts.extend(page_efforts)

    # how much time elapsed (sec) during the loop?
    end = time.time()
    elapsed = end-start
    if verbose:
        sys.stdout.write('\nTotal time elapsed (seconds): %f\n' % elapsed)

    if failed_pages:
        # do not cache an incomplete download: the pickle would be reloaded on
        # every later run and the missing pages would never be retried
        sys.stderr.write('Not saving effort data: %d page request(s) failed\n' %
                         len(failed_pages))
    else:
        # save effort data (creating the cache directory on a first run)
        efforts_dir = os.path.dirname(all_efforts_fname)
        if efforts_dir:
            os.makedirs(efforts_dir, exist_ok=True)
        with open(all_efforts_fname, 'wb') as efforts_file:
            pickle.dump(all_efforts, efforts_file)

# Scrape Weather Data
Let's scrape temperature, precipitation, and wind data from Weather Underground's weather history for Buchanan Airport (nearest Weather Almanac source to Diablo's north gate entrance). These weather features may affect cycling performance and thus may be useful to this analysis.

Define some functions for scraping tasks

In [353]:
# function to return BeautifulSoup object from url
def make_soup(url):
    """Fetch ``url`` and return its content parsed as a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    BeautifulSoup
        The page parsed with the built-in 'html.parser' backend.

    Any exception raised by ``urllib.request.urlopen`` propagates to the caller.
    """
    # a bare `import urllib` does not load the `request` submodule, so import
    # it explicitly instead of relying on another library having done so
    import urllib.request

    # the document is parsed while the response is still open; the connection
    # is closed as soon as parsing finishes
    with urllib.request.urlopen(url) as page:
        return BeautifulSoup(page, 'html.parser')

# function to build timestamp
def make_timestamp(m, d, y):
    """Return the date as a 'YYYYMMDD' string.

    Parameters
    ----------
    m : int or str
        Month; left-padded with a zero to two characters.
    d : int or str
        Day of month; left-padded with a zero to two characters.
    y : int or str
        Year; used as-is.

    Returns
    -------
    str
        ``str(y)`` followed by the zero-padded month and day.

    All three arguments are required. Defaults taken from module-level
    ``m``/``d``/``y`` would be evaluated once, when the function is defined,
    which fails if those names do not exist yet and otherwise freezes
    whatever stale values they happen to hold.
    """
    month_stamp = str(m).zfill(2)
    day_stamp = str(d).zfill(2)
    return str(y) + month_stamp + day_stamp

### Iteratively scrape weather history pages
Iterate through Weather Underground's weather history page for each day within the date range (2006-2017). I use the `BeautifulSoup` library to parse the HTML and find the weather values of interest.

In [354]:
# Scrape one row of daily weather values per calendar day (2006-2017) from
# Weather Underground's history pages for KCCR and write them to a CSV file.
# The scrape is skipped entirely when the output file already exists.
import calendar  # standard library; used for the number of days in each month

output_fname = "/data/raw_data/wu_weather_history.csv"
# resolve the output location once so that the existence check and the write
# refer to the same file
output_path = '/Users/bkhurley/git/diablo_velo' + output_fname

if not os.path.isfile(output_path):

    # let's time this process
    start = time.time()

    # define the critical rows that we want from the table weather history page
    out_cols = ['timestamp','Mean Temperature','Max Temperature','Min Temperature',
                'Precipitation','Wind Speed','Max Wind Speed']
    wu_rows = out_cols[1:]

    # the file is closed even if a request or a conversion fails part-way;
    # newline='' is what the csv module expects for files it writes to
    with open(output_path, "w", newline='') as output_file:
        output_writer = csv.writer(output_file)

        # write feature names as first row in output file
        output_writer.writerow(out_cols)

        # Iterate through year, month, and day
        for y in range(2006, 2018):
            for m in range(1, 13):
                # monthrange knows month lengths and leap years, so every real
                # date (including Feb 29 and Oct 31) is visited exactly once
                days_in_month = calendar.monthrange(y, m)[1]
                for d in range(1, days_in_month + 1):

                    # Build timestamp for this day's data row
                    timestamp = make_timestamp(m=m, d=d, y=y)

                    # Open wunderground.com url
                    url = ("https://www.wunderground.com/history/airport/KCCR/" +
                           str(y)+ "/" + str(m) + "/" + str(d) + "/DailyHistory.html")

                    # get page content
                    soup = make_soup(url)

                    # Walk the table rows of the page and keep the first
                    # non-empty value found for each wanted label
                    day_values = {}
                    for record in soup.find_all('tr'):
                        cells = record.find_all('td')
                        if len(cells) < 2:
                            continue
                        label = cells[0].text
                        if label not in wu_rows or label in day_values:
                            continue
                        tokens = cells[1].text.split()
                        if not tokens:
                            # row carries the label but no value; keep looking
                            continue
                        row_text = tokens[0]
                        # deal with missing data
                        if row_text in ('', '-'):
                            day_values[label] = float('nan')
                        elif row_text == 'T':
                            # WU uses T to denote trace of precipitation.
                            # I will treat that as 0.0 inches
                            day_values[label] = 0.0
                        else:
                            # convert str to numerical value
                            day_values[label] = float(row_text)

                    # emit values in header order; a label missing from the
                    # page becomes nan instead of shifting later columns left
                    row_array = [timestamp]
                    row_array.extend(day_values.get(col, float('nan')) for col in wu_rows)

                    # write a row of data for each day
                    output_writer.writerow(row_array)

                # update status as we work through each month/year
                sys.stdout.write('Scraped weather records for Month: %d, Year: %d\n' % (m, y))

    # Done getting data! The file has been closed by the with-block.
    sys.stdout.write('\n\nFinished scraping weather data! Data written to file: %s\n' % output_path)

    # end the timer
    end = time.time()
    elapsed = end-start