# Analyzing CyclePronto Data

This is a work-in-progress, analyzing the open data from [Pronto Cycle Share's Data Challenge](http://www.prontocycleshare.com/datachallenge).

In [None]:
# One-time data download: uncomment these two lines to fetch the year-one
# open data archive and unpack it into the working directory.
# !curl -O https://s3.amazonaws.com/pronto-data/open_data_year_one.zip
# !unzip open_data_year_one.zip

In [None]:
# List the contents of the working directory.
!ls

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()

In [None]:
# Load the trip records, parsing the start/stop columns as timestamps.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where consistent-format inference is already the default.
trips = pd.read_csv('2015_trip_data.csv',
                    parse_dates=['starttime', 'stoptime'])
trips.head()

## Trend with Time

In [None]:
# Calendar day on which each trip started, stored as midnight timestamps.
# `.dt.normalize()` is vectorized and avoids the unit-less
# `.astype('datetime64')` cast, which pandas >= 2.0 rejects.
trips['date'] = trips.starttime.dt.normalize()

In [None]:
# Number of trips per day, with one column per rider category.
by_date = trips.pivot_table(values='trip_id',
                            index='date',
                            columns='usertype',
                            aggfunc='count')

In [None]:
# One stacked panel per rider category (columns taken in positional order).
fig, ax = plt.subplots(2, figsize=(16, 8))
fig.subplots_adjust(hspace=0.4)
panel_titles = ['Annual Members', 'Short-term Pass']
for column_pos, (panel, panel_title) in enumerate(zip(ax, panel_titles)):
    by_date.iloc[:, column_pos].plot(ax=panel, title=panel_title)

The big spike in April is likely due to the [American Planning Association national conference](http://www.planetizen.com/node/75958/seattle-sets-bikeshare-record-apa-town).

### Trend with Weekday

In [None]:
# Average the daily counts within each day of the week (0 = Monday).
weekday_labels = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
weekday_number = by_date.index.dayofweek
by_weekday = by_date.groupby(weekday_number).mean()
# Drop the "usertype" header so it does not appear as the legend title.
by_weekday.columns.name = None
by_weekday.index = weekday_labels
by_weekday.plot(title='Average Use by Day of Week');

## Trip Durations

In [None]:
# Trip length in minutes (assumes `tripduration` is in seconds — TODO confirm).
trips['minutes'] = trips.tripduration / 60
# One-minute bins from 0 to 60. `density=True` replaces the `normed` keyword,
# which matplotlib removed in version 3.1.
trips.groupby('usertype')['minutes'].hist(bins=np.arange(61), alpha=0.5, density=True);
plt.xlabel('Duration (minutes)')
plt.ylabel('relative frequency')
plt.title('Trip Durations')
plt.legend(['Annual Members', 'Short-term Pass']);

## Trip Distances

In [None]:
stations = pd.read_csv('2015_station_data.csv')

# For below: some trips start and end at the pronto shop
pronto_shop = dict(id=54, name="Pronto shop",
                   terminal="Pronto shop",
                   lat=47.6173156, long=-122.3414776,
                   dockcount=100, online='10/13/2014')
# `DataFrame.append` was removed in pandas 2.0; concatenating a one-row frame
# adds the same record and renumbers the index.
stations = pd.concat([stations, pd.DataFrame([pronto_shop])],
                     ignore_index=True)

stations.head()

https://developers.google.com/maps/documentation/distance-matrix/intro

We have 55 stations, but the free API is limited to 2500 elements per day, 100 elements per request, and 100 elements per 10 seconds.
There are 54x53/2 = 1431 unique station pairs, so that means we can *just barely* get the info we need for free within one 24 hour period, if we do it correctly the first time.
We'll query one (partial) row at a time, waiting 10 seconds between queries.

In [None]:
from time import sleep

def get_distances(stations):
    latlon_list = ['{0},{1}'.format(lat, long)
                   for (lat, long) in zip(stations.lat, stations.long)]

    def create_url(i):
        URL = ('https://maps.googleapis.com/maps/api/distancematrix/json?'
               'origins={origins}&destinations={destinations}&mode=bicycling')
        return URL.format(origins=latlon_list[i],
                          destinations='|'.join(latlon_list[i + 1:]))

    for i in range(len(latlon_list) - 1):
        url = create_url(i)
        filename = "distances_{0}.json".format(stations.terminal.iloc[i])
        print(i, filename)
        !curl "{url}" -o {filename}
        sleep(11)
        
# Note: you can call this function at most ~twice per day!
# get_distances(stations)

# Move all the queried files into a directory, so we don't accidentally overwrite them

#!mkdir queried_distances
#!mv distances* queried_distances

Build and save the station-to-station bike distance matrix

In [None]:
def build_distance_matrix(stations=stations, distdir='queried_distances'):
    """Assemble the symmetric station-to-station distance matrix.

    Reads one ``distances_<terminal>.json`` file per station (all but the
    last) from ``distdir``. Each file supplies the distances from that station
    to every later station, which fills the upper triangle; the matrix is then
    mirrored onto the lower triangle.

    Parameters
    ----------
    stations : DataFrame
        Must provide a ``terminal`` column; its order defines the row and
        column order of the result.
    distdir : str
        Directory containing the saved JSON responses.

    Returns
    -------
    DataFrame
        Square frame indexed and labelled by ``stations.terminal``, holding
        the ``distance.value`` entries of the responses, with a zero diagonal.
    """
    # Imported locally so the function does not depend on a later cell
    # having already imported json.
    import json
    import os

    n_stations = len(stations)
    dist = np.zeros((n_stations, n_stations), dtype=float)
    for i, term in enumerate(stations.terminal[:-1]):
        # Honour `distdir` rather than a hard-coded directory name.
        filename = os.path.join(distdir, 'distances_{0}.json'.format(term))
        with open(filename) as f:
            row = json.load(f)
        dist[i, i + 1:] = [el['distance']['value']
                           for el in row['rows'][0]['elements']]
    # Mirror the upper triangle; build a new array instead of adding a view
    # of the array to itself in place.
    dist = dist + dist.T
    return pd.DataFrame(dist, index=stations.terminal,
                        columns=stations.terminal)

#distances = build_distance_matrix()
#distances.to_csv('station_distances.csv')

Load this distance matrix:

In [None]:
# Read the saved distance matrix, keyed by station terminal on both axes.
distances = pd.read_csv('station_distances.csv').set_index('terminal')
# Preview the top-left 5x5 corner.
distances.iloc[0:5, 0:5]

In [None]:
# Look up the station-to-station distance (in meters) for every trip.
station_pairs = zip(trips.from_station_id, trips.to_station_id)
meters = [distances.loc[origin, destination]
          for origin, destination in station_pairs]
trips['distance'] = meters
trips['distance'] = trips['distance'] / 1609.34  # convert meters to miles

In [None]:
# Distribution of start-to-end station distance for each rider category.
fig, ax = plt.subplots(figsize=(16, 6))
# `density=True` normalizes each histogram so the y-axis really is a relative
# frequency, as labelled, and the two differently sized groups are comparable.
trips.groupby('usertype')['distance'].hist(bins=np.linspace(0, 6.5, 40),
                                           alpha=0.5,
                                           density=True,
                                           ax=ax);
plt.xlabel('distance between start/end (miles)')
plt.ylabel('relative frequency')
plt.title('Minimum Distance of Trip')
plt.legend(['Annual Members', 'Short-term Pass']);

## Speed of Riding

In [None]:
# Lower bound on riding speed in miles per hour: station-to-station distance
# divided by the trip duration (minutes scaled to hours).
trips['speed'] = trips.distance * 60 / trips.minutes
# `density=True` replaces the `normed` keyword, which matplotlib removed in 3.1.
trips.groupby('usertype')['speed'].hist(bins=np.linspace(0, 15, 50), alpha=0.5, density=True);
plt.xlabel('min riding speed (MPH)')
plt.ylabel('relative frequency')
plt.title('Rider Speed Lower Bound (MPH)')
plt.legend(['Annual Members', 'Short-term Pass']);

From Pronto's [website (signin required)](https://secure.prontocycleshare.com/profile/statisticsFormulas):

> Distance traveled is an estimate based on your total usage time with
> an assumed average speed of 7.456 miles per hour: Distance Traveled = [Total usage time] * 7.456

From this plot, it looks like this estimated speed is very close to the actual mean speed!
Still, though, Pronto could quite easily use the distance grid between stations to better estimate the distance the user has traveled.

In [None]:
# Speed versus distance, one panel per rider category.
# `height` is the current name of FacetGrid's former `size` keyword
# (renamed in seaborn 0.9).
g = sns.FacetGrid(trips, col="usertype", hue='usertype', height=6)
g.map(plt.scatter, "distance", "speed", s=4, alpha=0.2)
# Restrict the view to 0-10 miles and 0-25 MPH.
g.axes[0, 0].axis([0, 10, 0, 25]);

In [None]:
# Preview only the first rows (including the derived columns) rather than
# rendering the whole trip table.
trips.head()

In [None]:
# Trips whose station-to-station distance lies strictly between 6.5 and 7 miles.
above_lower = trips.distance > 6.5
below_upper = trips.distance < 7.0
long_trips = trips[above_lower & below_upper]
# Count how often each station is the destination of such trips.
long_trips['to_station_id'].value_counts()

In [None]:
# Distance (meters) between the UD-01 and BT-03 terminals.
ride_distance = distances.loc['UD-01', 'BT-03']
# Select trips whose distance in miles equals that of this station pair.
ride_miles = ride_distance / 1609.34
matches_ride = trips.distance == ride_miles
subset = trips[matches_ride].copy()

In [None]:
# Keep only the time of day of each start, discarding the date.
subset['starttime'] = subset.starttime.dt.time
# Flag trips that end at UD-01.
subset['inbound'] = subset.to_station_id.eq('UD-01')

# Trip duration against time of day, coloured by direction.
start_times = subset.starttime.values
durations = subset.minutes.values
plt.scatter(start_times, durations, c=subset.inbound);

In [None]:
# Keep trips shorter than 50 minutes, then report the date range they span.
under_fifty = subset.minutes < 50
subset = subset.loc[under_fifty]
(subset['date'].min(), subset['date'].max())

In [None]:
# Tally the destination station names among the selected trips.
subset['to_station_name'].value_counts()

Just for reference, here is Google's suggested route between these stations:

![Alt text](gmap-blakely.png)

<small>[(source)](https://www.google.com/maps/dir/47.666145%09-122.301491/47.615829%09-122.348564/@47.6410289,-122.3619496,13z/data=!3m1!4b1!4m10!4m9!1m3!2m2!1d-122.301491!2d47.666145!1m3!2m2!1d-122.348564!2d47.615829!3e1)</small>

Given the pattern here, my guess is that a UW student got a summer internship downtown, and rode Pronto to and from work most days.

## Trend with Elevation

The Elevation API is limited to 2500 requests per day, 512 locations per request, and 10 requests per second.

https://developers.google.com/maps/documentation/elevation/intro

In [None]:
# Get Elevations via the google maps API
def get_station_elevations(stations):
    URL = "https://maps.googleapis.com/maps/api/elevation/json?locations="
    locs = '|'.join(['{0},{1}'.format(lat, long)
                     for (lat, long) in zip(stations.lat, stations.long)])
    URL += locs
    !curl "{URL}" -o elevations.json
    
# get_station_elevations(stations)

In [None]:
import json

def process_station_elevations(filename='elevations.json'):
    """Load a saved Elevation API response into a flat DataFrame.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.

    Returns
    -------
    DataFrame
        One row per entry of the response's ``results`` list. The keys of
        each entry's nested ``location`` dict come first, followed by the
        entry's remaining top-level keys.
    """
    # Use a context manager so the file handle is closed promptly.
    with open(filename) as f:
        response = json.load(f)

    def unnest(item):
        # Build a fresh dict so the parsed response itself is left untouched.
        flat = dict(item['location'])
        flat.update((key, value) for key, value in item.items()
                    if key != 'location')
        return flat

    return pd.DataFrame([unnest(item) for item in response['results']])

elevs = process_station_elevations()

# Sanity check: the coordinates in the response should line up, row for row,
# with the station table (longitude first, then latitude).
for station_column, elevation_column in [('long', 'lng'), ('lat', 'lat')]:
    print(np.allclose(stations[station_column], elevs[elevation_column]))

In [None]:
# Attach each station's elevation; pandas aligns the values on the row index.
stations['elevation'] = elevs.elevation

In [None]:
# Preview the station table with its new elevation column.
stations.head(n=5)

In [None]:
# Elevation lookup table keyed by station terminal.
elev = stations.set_index('terminal')[['elevation']]
# Join the lookup once per trip endpoint.
for endpoint, station_column in [('start', 'from_station_id'),
                                 ('end', 'to_station_id')]:
    joined = trips.join(elev, on=station_column)
    trips['elevation_' + endpoint] = joined['elevation']
# Positive gain means the trip ended higher than it started.
trips['elevation_gain'] = trips.elevation_end - trips.elevation_start

In [None]:
# Histogram of elevation gain, one panel per rider category.
g = sns.FacetGrid(trips, col="usertype", hue='usertype')
g.map(plt.hist, "elevation_gain", bins=np.arange(-145, 150, 10))
g.fig.set_figheight(6)
g.fig.set_figwidth(16)

# Shade nested symmetric triangles behind the bars to guide the eye.
guide_limits = range(60, 150, 20)
for panel in g.axes.flat:
    for lim in guide_limits:
        xs = np.linspace(-lim, lim, 3)
        ys = 100 * (lim - abs(xs))
        panel.fill(xs, ys, color='gray', alpha=0.1, zorder=0)

In [None]:
# Count trips that end lower / higher than they started.
downhill_count = trips.elevation_gain.lt(0).sum()
uphill_count = trips.elevation_gain.gt(0).sum()
print("total downhill trips:", downhill_count)
print("total uphill trips:  ", uphill_count)

## Weather

In [None]:
# Weather observations, indexed by the parsed `Date` column.
weather = pd.read_csv('2015_weather_data.csv',
                      parse_dates=True,
                      index_col='Date')
# Show the available weather fields.
weather.columns

In [None]:
# Trips per (day, rider category), as a Series named "count".
trip_counts = trips.groupby(['date', 'usertype'])['trip_id'].count()
by_date = trip_counts.rename('count')
# Move usertype out of the index so the date-indexed frame can be joined to
# the weather table.
by_date = by_date.reset_index('usertype').join(weather)
# dayofweek 5 and 6 are Saturday and Sunday.
by_date['weekend'] = by_date.index.dayofweek >= 5

In [None]:
# Daily ride counts against mean temperature, split by weekday/weekend.
# `height` is the current name of FacetGrid's former `size` keyword
# (renamed in seaborn 0.9).
g = sns.FacetGrid(by_date, col="weekend", hue='usertype', height=6)
g.map(sns.regplot, "Mean_Temperature_F", "count")
g.add_legend();

In [None]:
# Daily ride counts against precipitation, split by weekday/weekend.
# `height` is the current name of FacetGrid's former `size` keyword
# (renamed in seaborn 0.9).
g = sns.FacetGrid(by_date, col="weekend", hue='usertype', height=6)
# NOTE(review): the trailing space in "Precipitation_In " presumably mirrors
# the CSV header — confirm before "fixing" it.
g.map(sns.regplot, "Precipitation_In ", "count")
g.add_legend();
g.axes[0, 0].set_ylim(-50, 600);