## Data exploration

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import warnings
import datetime
%matplotlib inline

# This package will output the execution time of each cell. Pretty neat!
# Warnings are silenced only around the install step, then a 'default' filter is added back.
# NOTE(review): %install_ext was deprecated in IPython 4 and is absent from later releases —
# confirm the target IPython version, or install the `ipython-autotime` package instead.
warnings.filterwarnings('ignore')
%install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
warnings.filterwarnings('default')
# Activate the extension; the "time: ..." lines under later cells come from it.
%load_ext autotime

Installed autotime.py. To use it, type:
  %load_ext autotime


### Let's grab the data from [June, 2016](https://s3.amazonaws.com/tripdata/201606-citibike-tripdata.zip)

In [2]:
# Load the 201606 (June 2016) trip file; every exploration cell below reads from `rides`.
rides = pd.read_csv('data/201606-citibike-tripdata.csv')

time: 5.3 s


### How does the data look?

In [3]:
# Peek at the first five rows (head() defaults to n=5).
rides.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,1470,6/1/2016 00:00:18,6/1/2016 00:24:48,380,W 4 St & 7 Ave S,40.734011,-74.002939,3236,W 42 St & Dyer Ave,40.758985,-73.9938,19859,Subscriber,1972.0,1
1,229,6/1/2016 00:00:20,6/1/2016 00:04:09,3092,Berry St & N 8 St,40.719009,-73.958525,3103,N 11 St & Wythe Ave,40.721533,-73.957824,16233,Subscriber,1967.0,1
2,344,6/1/2016 00:00:21,6/1/2016 00:06:06,449,W 52 St & 9 Ave,40.764618,-73.987895,469,Broadway & W 53 St,40.763441,-73.982681,22397,Subscriber,1989.0,1
3,1120,6/1/2016 00:00:28,6/1/2016 00:19:09,522,E 51 St & Lexington Ave,40.757148,-73.972078,401,Allen St & Rivington St,40.720196,-73.989978,16231,Subscriber,1991.0,1
4,229,6/1/2016 00:00:53,6/1/2016 00:04:42,335,Washington Pl & Broadway,40.729039,-73.994046,285,Broadway & E 14 St,40.734546,-73.990741,15400,Subscriber,1989.0,1


time: 39.8 ms


### Looks good! How much data do we have?

In [None]:
# Size of the data set and its split by the 'gender' column (code 1 = male, code 2 = female).
num_rides = len(rides)
num_male = int((rides['gender'] == 1).sum())
num_female = int((rides['gender'] == 2).sum())
percent_male = (num_male / num_rides) * 100
percent_female = (num_female / num_rides) * 100
print(
    "Our data set has %d rides, "
    "of which %.2f%% (%d) are female and %.2f%% (%d) are male"
    % (num_rides, percent_female, num_female, percent_male, num_male)
)

In [None]:
# Number of rides whose duration, converted to minutes, exceeds 50.
int(((rides['tripduration'] / 60) > 50).sum())

### What are the most popular starting stations?

In [None]:
# Count rides per start station and keep the five busiest.
start_stations = rides.groupby('start station name')['start station id'].count().reset_index()
start_stations.columns = ['Start Station', 'Count']
start_stations = start_stations.sort_values(by='Count', ascending=False).head()

# create bar graph
plt.figure(figsize=(8, 8))
plt.bar(list(range(len(start_stations))), start_stations['Count'],
        tick_label=start_stations['Start Station'])
plt.xticks(rotation=40, fontsize=15)
plt.xlabel('Stations', fontsize=15)
plt.ylabel('Number of rides', fontsize=15)
# `rides` is loaded from the 201606 file, so the chart is labelled June 2016.
plt.title('Most popular start stations in June 2016', fontsize=15)
# Render the figure explicitly so the cell does not echo the Text object's repr.
plt.show()

### What are the most popular ending stations?

In [None]:
# Count rides per end station and keep the five busiest.
end_stations = rides.groupby('end station name')['end station id'].count().reset_index()
end_stations.columns = ['End Station', 'Count']
end_stations = end_stations.sort_values(by='Count', ascending=False).head()

# create bar graph
plt.figure(figsize=(8, 8))
plt.bar(list(range(len(end_stations))), end_stations['Count'],
        tick_label=end_stations['End Station'])
plt.xticks(rotation=40, fontsize=15)
plt.xlabel('Stations', fontsize=15)
plt.ylabel('Number of rides', fontsize=15)
# `rides` is loaded from the 201606 file, so the chart is labelled June 2016.
plt.title('Most popular end stations in June 2016', fontsize=15)
# Render the figure explicitly so the cell does not echo the Text object's repr.
plt.show()

### Looks like just about the same as the most popular start stations!

### How about, what percentage of rides are one-way vs. round trip?

In [None]:
# A "round trip" here means the ride starts and ends at the same station id.
round_trip = rides[rides['start station id'] == rides['end station id']]
percent_round_trip = (len(round_trip) / len(rides)) * 100
# `rides` holds the 201606 file, so the message names June 2016.
print("Round trips account for %.2f%% of bike rides (in June 2016)" % percent_round_trip)

### What about in other months?

In [None]:
# Summer months for two consecutive years; each variable must read the file matching its name.
june_2015_rides = pd.read_csv('data/201506-citibike-tripdata.csv')
july_2015_rides = pd.read_csv('data/201507-citibike-tripdata.csv')
august_2015_rides = pd.read_csv('data/201508-citibike-tripdata.csv')
june_2016_rides = pd.read_csv('data/201606-citibike-tripdata.csv')
july_2016_rides = pd.read_csv('data/201607-citibike-tripdata.csv')
# August 2016 comes from the 201608 file (not 201508, which is August 2015).
august_2016_rides = pd.read_csv('data/201608-citibike-tripdata.csv')

In [None]:
# Same-station round trips in the June 2015 file.
june_2015_round_trip = june_2015_rides[june_2015_rides['start station id'] == june_2015_rides['end station id']]
june_2015_percent_round_trip = (len(june_2015_round_trip) / len(june_2015_rides)) * 100
# The frame is the 201506 file, so the message names June 2015.
print("Round trips account for %.2f%% of bike rides (in June 2015)" % june_2015_percent_round_trip)
# Only the last expression of a cell is displayed: total number of rides in the month.
len(june_2015_rides)

### Okay, so clearly "round trips" account for a very small portion of rides. Will this be viable to use to classify the data? Maybe not. However, our definition of a round trip requires starting and stopping at the same station, and stations can be very close to each other, especially in the Village:
<img src="data/citi_bike_stations.png" style="height: 800px">

### So a user may make what is basically a "round trip" but then return the bike to a station one block away. We should treat this as a round trip when analyzing our data. We have longitude and latitude data for each station, so let's use that!

In [None]:
from geopy.distance import great_circle

### Let's see how the module works

In [None]:
# geopy reads coordinate tuples as (latitude, longitude), so latitude must come first.
start = (june_2015_rides.iloc[0]['start station latitude'], june_2015_rides.iloc[0]['start station longitude'])
end = (june_2015_rides.iloc[0]['end station latitude'], june_2015_rides.iloc[0]['end station longitude'])
print("Start station: %s" % str(start))
print("End station: %s" % str(end))
print("Distance between: %.4f %s" % (great_circle(start, end).miles, "miles"))

In [None]:
# geopy reads coordinate tuples as (latitude, longitude), so latitude must come first.
start = (june_2015_rides.iloc[1]['start station latitude'], june_2015_rides.iloc[1]['start station longitude'])
end = (june_2015_rides.iloc[1]['end station latitude'], june_2015_rides.iloc[1]['end station longitude'])
print("Start station: %s" % str(start))
print("End station: %s" % str(end))
print("Distance between: %.4f %s" % (great_circle(start, end).miles, "miles"))

## Time for some serious data cleaning. Here we will determine roundtrips for all rows in all 6 months we're interested in (June, July, August in 2015 and 2016). 2015 is training data and 2016 will be used to test. Because this takes so long to process, we're going to add a 'roundtrip' column and then export the data so we only have to do this once

In [4]:
# Mean Earth radius in miles (6371.009 km converted with 1.609344 km per mile).
# NOTE(review): chosen to match the spherical model geopy's great_circle is understood to use — confirm.
_EARTH_RADIUS_MILES = 6371.009 / 1.609344


def _great_circle_miles(lat1, lon1, lat2, lon2):
    """Vectorized haversine distance in miles between points given in decimal degrees."""
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(values, dtype=float)) for values in (lat1, lon1, lat2, lon2))
    half_chord_sq = (np.sin((lat2 - lat1) / 2) ** 2
                     + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    # Clip guards against rounding pushing the value marginally outside [0, 1] before sqrt/arcsin.
    return 2 * _EARTH_RADIUS_MILES * np.arcsin(np.sqrt(np.clip(half_chord_sq, 0, 1)))


def calculate_roundtrips(without_roundtrips, max_distance_miles=0.3, min_duration_seconds=480):
    """Return the 'Subscriber' rides of a Citi Bike month with a 'roundtrip' column added.

    A ride is flagged as a round trip (1) when either
      * it starts and ends at the same station id, or
      * its start and end stations are less than ``max_distance_miles`` apart AND the
        trip lasted more than ``min_duration_seconds``.
    Every other ride is flagged as one way (0).

    Parameters
    ----------
    without_roundtrips : pandas.DataFrame
        Trip data with the columns 'usertype', 'start station id', 'end station id',
        'start/end station latitude', 'start/end station longitude' and 'tripduration'.
        The input frame is not modified.
    max_distance_miles : float, default 0.3
        Station-to-station distance below which a ride can count as a round trip.
    min_duration_seconds : float, default 480
        Trip duration above which a short-distance ride counts as a round trip.

    Returns
    -------
    pandas.DataFrame
        A copy of the subscriber rows (original index preserved) with an integer
        'roundtrip' column: 0 = one way, 1 = roundtrip.
    """
    # to measure how long this computation takes
    start_time = datetime.datetime.now().replace(microsecond=0)
    print("Start time: %s" % start_time)

    # we only care about the 'Subscriber' customers; copy so the caller's frame is untouched
    subscribers = without_roundtrips[without_roundtrips['usertype'] == 'Subscriber'].copy()

    # obvious round trip: start and stop station are the same
    same_station = subscribers['start station id'] == subscribers['end station id']

    # distance between stations, computed for all rows at once (latitude first, then longitude)
    distance_miles = pd.Series(
        _great_circle_miles(
            subscribers['start station latitude'], subscribers['start station longitude'],
            subscribers['end station latitude'], subscribers['end station longitude']),
        index=subscribers.index)

    # bike returned close to where it was picked up after a reasonably long ride
    nearby_return = (distance_miles < max_distance_miles) & (
        subscribers['tripduration'] > min_duration_seconds)

    subscribers['roundtrip'] = (same_station | nearby_return).astype(int)

    # how long did this take?
    end_time = datetime.datetime.now().replace(microsecond=0)
    print("Runtime: %s" % str(end_time - start_time))

    return subscribers

time: 91.6 ms


In [None]:
# Silence pandas warnings emitted while the roundtrip column is computed.
warnings.filterwarnings('ignore')
roundtrips = calculate_roundtrips(june_2015_rides)
percent_roundtrip = (len(roundtrips[roundtrips['roundtrip'] == 1]) / len(roundtrips)) * 100
# Two decimals: round trips are a small share, so integer formatting would hide most of the value.
print("Out of %d rides in June 2015, %.2f%% are round trip" % (len(roundtrips), percent_roundtrip))
# Persist the labelled month so the computation only has to run once.
roundtrips.to_csv('data/june_2015_with_roundtrip.csv', na_rep='NaN', index=False)

In [None]:
roundtrips = calculate_roundtrips(july_2015_rides)
percent_roundtrip = (len(roundtrips[roundtrips['roundtrip'] == 1]) / len(roundtrips)) * 100
# Two decimals: round trips are a small share, so integer formatting would hide most of the value.
print("Out of %d rides in July 2015, %.2f%% are round trip" % (len(roundtrips), percent_roundtrip))
# Persist the labelled month so the computation only has to run once.
roundtrips.to_csv('data/july_2015_with_roundtrip.csv', na_rep='NaN', index=False)

In [None]:
roundtrips = calculate_roundtrips(august_2015_rides)
percent_roundtrip = (len(roundtrips[roundtrips['roundtrip'] == 1]) / len(roundtrips)) * 100
# Two decimals: round trips are a small share, so integer formatting would hide most of the value.
print("Out of %d rides in August 2015, %.2f%% are round trip" % (len(roundtrips), percent_roundtrip))
# Persist the labelled month so the computation only has to run once.
roundtrips.to_csv('data/august_2015_with_roundtrip.csv', na_rep='NaN', index=False)

In [None]:
roundtrips = calculate_roundtrips(june_2016_rides)
percent_roundtrip = (len(roundtrips[roundtrips['roundtrip'] == 1]) / len(roundtrips)) * 100
# Two decimals: round trips are a small share, so integer formatting would hide most of the value.
print("Out of %d rides in June 2016, %.2f%% are round trip" % (len(roundtrips), percent_roundtrip))
# Persist the labelled month so the computation only has to run once.
roundtrips.to_csv('data/june_2016_with_roundtrip.csv', na_rep='NaN', index=False)

In [None]:
roundtrips = calculate_roundtrips(july_2016_rides)
percent_roundtrip = (len(roundtrips[roundtrips['roundtrip'] == 1]) / len(roundtrips)) * 100
# Two decimals: round trips are a small share, so integer formatting would hide most of the value.
print("Out of %d rides in July 2016, %.2f%% are round trip" % (len(roundtrips), percent_roundtrip))
# Persist the labelled month so the computation only has to run once.
roundtrips.to_csv('data/july_2016_with_roundtrip.csv', na_rep='NaN', index=False)

In [None]:
roundtrips = calculate_roundtrips(august_2016_rides)
percent_roundtrip = (len(roundtrips[roundtrips['roundtrip'] == 1]) / len(roundtrips)) * 100
# Two decimals: round trips are a small share, so integer formatting would hide most of the value.
print("Out of %d rides in August 2016, %.2f%% are round trip" % (len(roundtrips), percent_roundtrip))
# Persist the labelled month so the computation only has to run once.
roundtrips.to_csv('data/august_2016_with_roundtrip.csv', na_rep='NaN', index=False)

### Now that our data is exported, let's check it out in roundtrips.ipynb