In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as dt
import collections
%matplotlib inline

from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from scipy import stats
from statsmodels.formula.api import ols

# Raw DataFrame

In [2]:
# read in the Chicagov Divvy Bicycle Sharing Data csv files from Kaggle
pd.set_option('display.max_columns', 30)
filename = 'Divvy_Stations_2014-Q1Q2.csv'
stationdf = pd.read_csv(filename)
stationdf.head()

Unnamed: 0,id,name,latitude,longitude,dpcapacity,online date
0,43,Michigan Ave & Washington St,41.883893,-87.624649,43,6/16/13
1,44,State St & Randolph St,41.88473,-87.627734,27,6/16/13
2,33,State St & Van Buren St,41.877181,-87.627844,27,6/25/13
3,199,Wabash Ave & Grand Ave,41.891738,-87.626937,15,8/10/13
4,51,Clark St & Randolph St,41.884576,-87.63189,31,6/17/13


In [3]:
# read in the Chicagov Divvy Bicycle Sharing Data csv files from Kaggle
pd.set_option('display.max_columns', 30)
filename = 'Divvy_Trips_2014_Q1Q2.csv'
tripdf = pd.read_csv(filename)
tripdf.head()

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,2355134,6/30/2014 23:57,7/1/2014 0:07,2006,604,131,Lincoln Ave & Belmont Ave,303,Broadway & Cornelia Ave,Subscriber,Male,1988.0
1,2355133,6/30/2014 23:56,7/1/2014 0:00,2217,263,282,Halsted St & Maxwell St,22,May St & Taylor St,Subscriber,Male,1992.0
2,2355130,6/30/2014 23:33,6/30/2014 23:35,2798,126,327,Sheffield Ave & Webster Ave,225,Halsted St & Dickens Ave,Subscriber,Male,1993.0
3,2355129,6/30/2014 23:26,7/1/2014 0:24,173,3481,134,Peoria St & Jackson Blvd,194,State St & Wacker Dr,Subscriber,Female,1988.0
4,2355128,6/30/2014 23:16,6/30/2014 23:26,173,638,320,Loomis St & Lexington St,134,Peoria St & Jackson Blvd,Subscriber,Female,1988.0


In [4]:
tripdf = tripdf[['trip_id','starttime','stoptime','from_station_id','from_station_name','to_station_id','to_station_name']]

In [5]:
stationdf = stationdf[['name','dpcapacity']]

In [6]:
stationdata = pd.merge(tripdf, stationdf, how='left',left_on=['from_station_name'], right_on='name')
stationdata = stationdata.sort_values('starttime')

In [7]:
stationdata.head()

Unnamed: 0,trip_id,starttime,stoptime,from_station_id,from_station_name,to_station_id,to_station_name,name,dpcapacity
905698,1109420,1/1/2014 0:17,1/1/2014 0:42,94,Clark St & Armitage Ave,69,Damen Ave & Pierce Ave,Clark St & Armitage Ave,19
905697,1109421,1/1/2014 0:45,1/1/2014 0:55,69,Damen Ave & Pierce Ave,216,California Ave & Division St,Damen Ave & Pierce Ave,19
905649,1109572,1/1/2014 10:13,1/1/2014 10:39,29,Noble St & Milwaukee Ave,222,Milwaukee Ave & Rockwell St,Noble St & Milwaukee Ave,15
905648,1109575,1/1/2014 10:23,1/1/2014 10:31,289,Wells St & Concord Ln,224,Halsted St & Willow St,Wells St & Concord Ln,19
905645,1109578,1/1/2014 10:38,1/1/2014 10:57,152,Lincoln Ave & Diversey Pkwy,291,Wells St & Evergreen Ave,Lincoln Ave & Diversey Pkwy,15


In [8]:
stationdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 905699 entries, 905698 to 249031
Data columns (total 9 columns):
trip_id              905699 non-null int64
starttime            905699 non-null object
stoptime             905699 non-null object
from_station_id      905699 non-null int64
from_station_name    905699 non-null object
to_station_id        905699 non-null int64
to_station_name      905699 non-null object
name                 905699 non-null object
dpcapacity           905699 non-null int64
dtypes: int64(4), object(5)
memory usage: 69.1+ MB


# Data Cleaning and Wrangling

### Trip ID Column

In [9]:
# check for unique trip_ids
stationdata.trip_id.value_counts();

In [10]:
# since there were some trip_ids with more than one count, determine the count of unique rows
stationdata.trip_id.nunique()

905699

In [11]:
# make sure the entries match our nunique count 
stationdata.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 905699 entries, 905698 to 249031
Data columns (total 9 columns):
trip_id              905699 non-null int64
starttime            905699 non-null object
stoptime             905699 non-null object
from_station_id      905699 non-null int64
from_station_name    905699 non-null object
to_station_id        905699 non-null int64
to_station_name      905699 non-null object
name                 905699 non-null object
dpcapacity           905699 non-null int64
dtypes: int64(4), object(5)
memory usage: 69.1+ MB


### General cleaning steps for easier use

In [12]:
# convert start and stop times to datetimes
stationdata['starttime'] = pd.to_datetime(stationdata['starttime'])
stationdata['stoptime'] = pd.to_datetime(stationdata['stoptime'])

### Capacity

In [13]:
departures = stationdata[['starttime','from_station_name','trip_id']].sort_values(['from_station_name','starttime'])
departures = departures.rename(columns={'starttime':'datetime','from_station_name':'station_name'})

departures.head()

Unnamed: 0,datetime,station_name,trip_id
904881,2014-01-09 12:11:00,900 W Harrison,1111738
904715,2014-01-09 16:48:00,900 W Harrison,1112226
904512,2014-01-09 18:43:00,900 W Harrison,1112605
904509,2014-01-09 18:45:00,900 W Harrison,1112609
903649,2014-01-10 17:39:00,900 W Harrison,1114754


In [14]:
# each departure is given a 1 value
departures['trip_counts'] = 1
departures.head()

Unnamed: 0,datetime,station_name,trip_id,trip_counts
904881,2014-01-09 12:11:00,900 W Harrison,1111738,1
904715,2014-01-09 16:48:00,900 W Harrison,1112226,1
904512,2014-01-09 18:43:00,900 W Harrison,1112605,1
904509,2014-01-09 18:45:00,900 W Harrison,1112609,1
903649,2014-01-10 17:39:00,900 W Harrison,1114754,1


In [15]:
departures.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 905699 entries, 904881 to 331
Data columns (total 4 columns):
datetime        905699 non-null datetime64[ns]
station_name    905699 non-null object
trip_id         905699 non-null int64
trip_counts     905699 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 34.5+ MB


In [16]:
arrivals = stationdata[['starttime','to_station_name','trip_id']].sort_values(['to_station_name','starttime'])
arrivals = arrivals.rename(columns={'starttime':'datetime','to_station_name':'station_name'})

arrivals.head()

Unnamed: 0,datetime,station_name,trip_id
905030,2014-01-09 08:45:00,900 W Harrison,1111310
904512,2014-01-09 18:43:00,900 W Harrison,1112605
904086,2014-01-10 08:44:00,900 W Harrison,1113554
904039,2014-01-10 09:01:00,900 W Harrison,1113612
901677,2014-01-13 06:05:00,900 W Harrison,1118830


In [17]:
# each arrivals is given a -1 value
arrivals['trip_counts'] = -1
arrivals.head()

Unnamed: 0,datetime,station_name,trip_id,trip_counts
905030,2014-01-09 08:45:00,900 W Harrison,1111310,-1
904512,2014-01-09 18:43:00,900 W Harrison,1112605,-1
904086,2014-01-10 08:44:00,900 W Harrison,1113554,-1
904039,2014-01-10 09:01:00,900 W Harrison,1113612,-1
901677,2014-01-13 06:05:00,900 W Harrison,1118830,-1


In [18]:
departures_and_arrivals = pd.concat([departures, arrivals])
departures_and_arrivals = departures_and_arrivals.sort_values(['station_name','datetime'])
departures_and_arrivals['date'] = departures_and_arrivals['datetime'].dt.date
departures_and_arrivals = departures_and_arrivals[['datetime','date','station_name','trip_id','trip_counts']]
departures_and_arrivals.head()

Unnamed: 0,datetime,date,station_name,trip_id,trip_counts
905030,2014-01-09 08:45:00,2014-01-09,900 W Harrison,1111310,-1
904881,2014-01-09 12:11:00,2014-01-09,900 W Harrison,1111738,1
904715,2014-01-09 16:48:00,2014-01-09,900 W Harrison,1112226,1
904512,2014-01-09 18:43:00,2014-01-09,900 W Harrison,1112605,1
904512,2014-01-09 18:43:00,2014-01-09,900 W Harrison,1112605,-1


In [19]:
departures_and_arrivals.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1811398 entries, 905030 to 331
Data columns (total 5 columns):
datetime        1811398 non-null datetime64[ns]
date            1811398 non-null object
station_name    1811398 non-null object
trip_id         1811398 non-null int64
trip_counts     1811398 non-null int64
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 82.9+ MB


In [20]:
# get running sum of departures-arrivals
departures_and_arrivals['cumulative_sum'] = departures_and_arrivals.groupby(['station_name','date'])['trip_counts'].cumsum()
departures_and_arrivals


Unnamed: 0,datetime,date,station_name,trip_id,trip_counts,cumulative_sum
905030,2014-01-09 08:45:00,2014-01-09,900 W Harrison,1111310,-1,-1
904881,2014-01-09 12:11:00,2014-01-09,900 W Harrison,1111738,1,0
904715,2014-01-09 16:48:00,2014-01-09,900 W Harrison,1112226,1,1
904512,2014-01-09 18:43:00,2014-01-09,900 W Harrison,1112605,1,2
904512,2014-01-09 18:43:00,2014-01-09,900 W Harrison,1112605,-1,1
904509,2014-01-09 18:45:00,2014-01-09,900 W Harrison,1112609,1,2
904086,2014-01-10 08:44:00,2014-01-10,900 W Harrison,1113554,-1,-1
904039,2014-01-10 09:01:00,2014-01-10,900 W Harrison,1113612,-1,-2
903649,2014-01-10 17:39:00,2014-01-10,900 W Harrison,1114754,1,-1
902956,2014-01-11 16:57:00,2014-01-11,900 W Harrison,1116552,1,1


In [21]:
# maximum departures-arrivals for each station 
max_sum = departures_and_arrivals.groupby(['station_name','date'])['cumulative_sum'].max().reset_index()
max_sum

Unnamed: 0,station_name,date,cumulative_sum
0,900 W Harrison,2014-01-09,2
1,900 W Harrison,2014-01-10,-1
2,900 W Harrison,2014-01-11,1
3,900 W Harrison,2014-01-12,2
4,900 W Harrison,2014-01-13,0
5,900 W Harrison,2014-01-14,2
6,900 W Harrison,2014-01-15,-1
7,900 W Harrison,2014-01-16,7
8,900 W Harrison,2014-01-17,-1
9,900 W Harrison,2014-01-20,0


In [22]:
dfa = pd.merge(max_sum, stationdf, how='left',left_on=['station_name'], right_on='name')
dfa = dfa[['date','station_name','cumulative_sum','dpcapacity']]
dfa['difference'] = dfa['dpcapacity'] - dfa['cumulative_sum']
dfa

Unnamed: 0,date,station_name,cumulative_sum,dpcapacity,difference
0,2014-01-09,900 W Harrison,2,19,17
1,2014-01-10,900 W Harrison,-1,19,20
2,2014-01-11,900 W Harrison,1,19,18
3,2014-01-12,900 W Harrison,2,19,17
4,2014-01-13,900 W Harrison,0,19,19
5,2014-01-14,900 W Harrison,2,19,17
6,2014-01-15,900 W Harrison,-1,19,20
7,2014-01-16,900 W Harrison,7,19,12
8,2014-01-17,900 W Harrison,-1,19,20
9,2014-01-20,900 W Harrison,0,19,19


In [23]:
dfa[(dfa['difference']<0)]

Unnamed: 0,date,station_name,cumulative_sum,dpcapacity,difference
337,2014-06-25,Aberdeen St & Jackson Blvd,16,15,-1
813,2014-06-21,Adler Planetarium,24,19,-5
1179,2014-06-05,Ashland Ave & Augusta Blvd,16,15,-1
1483,2014-05-10,Ashland Ave & Blackhawk St,17,15,-2
1529,2014-06-25,Ashland Ave & Blackhawk St,17,15,-2
1857,2014-06-07,Ashland Ave & Division St,24,19,-5
1875,2014-06-25,Ashland Ave & Division St,26,19,-7
2044,2014-06-29,Ashland Ave & Grace St,16,15,-1
2499,2014-05-24,Ashland Ave & Wellington Ave,21,19,-2
2528,2014-06-22,Ashland Ave & Wellington Ave,20,19,-1


In [24]:
departures_and_arrivals[(departures_and_arrivals['station_name']=='Broadway & Barry Ave')&
                        (departures_and_arrivals['date']==dt.date(2014,6,19))]




Unnamed: 0,datetime,date,station_name,trip_id,trip_counts,cumulative_sum
144322,2014-06-19 06:02:00,2014-06-19,Broadway & Barry Ave,2168021,-1,-1
144308,2014-06-19 06:08:00,2014-06-19,Broadway & Barry Ave,2168036,1,0
144067,2014-06-19 06:54:00,2014-06-19,Broadway & Barry Ave,2168331,1,1
143972,2014-06-19 07:08:00,2014-06-19,Broadway & Barry Ave,2168468,-1,0
143656,2014-06-19 07:40:00,2014-06-19,Broadway & Barry Ave,2168876,1,1
143021,2014-06-19 08:20:00,2014-06-19,Broadway & Barry Ave,2169672,1,2
143031,2014-06-19 08:20:00,2014-06-19,Broadway & Barry Ave,2169657,1,3
142999,2014-06-19 08:21:00,2014-06-19,Broadway & Barry Ave,2169698,-1,2
142541,2014-06-19 08:50:00,2014-06-19,Broadway & Barry Ave,2170324,1,3
141891,2014-06-19 10:22:00,2014-06-19,Broadway & Barry Ave,2171400,1,4


In [25]:
arrivals2 = arrivals.copy()

In [26]:
arrivals2['date'] = arrivals2['datetime'].dt.date

In [27]:
arrivals2.head()

Unnamed: 0,datetime,station_name,trip_id,trip_counts,date
905030,2014-01-09 08:45:00,900 W Harrison,1111310,-1,2014-01-09
904512,2014-01-09 18:43:00,900 W Harrison,1112605,-1,2014-01-09
904086,2014-01-10 08:44:00,900 W Harrison,1113554,-1,2014-01-10
904039,2014-01-10 09:01:00,900 W Harrison,1113612,-1,2014-01-10
901677,2014-01-13 06:05:00,900 W Harrison,1118830,-1,2014-01-13


In [28]:
departures2 = departures.copy()

In [29]:
departures2['date'] = departures2['datetime'].dt.date

In [30]:
departures2.head()

Unnamed: 0,datetime,station_name,trip_id,trip_counts,date
904881,2014-01-09 12:11:00,900 W Harrison,1111738,1,2014-01-09
904715,2014-01-09 16:48:00,900 W Harrison,1112226,1,2014-01-09
904512,2014-01-09 18:43:00,900 W Harrison,1112605,1,2014-01-09
904509,2014-01-09 18:45:00,900 W Harrison,1112609,1,2014-01-09
903649,2014-01-10 17:39:00,900 W Harrison,1114754,1,2014-01-10


In [31]:
arrivals2.loc[( (arrivals2['date']==dt.date(2014,6,19)) & 
               (arrivals2['station_name'] ==  'Broadway & Barry Ave') ), :]

Unnamed: 0,datetime,station_name,trip_id,trip_counts,date
144322,2014-06-19 06:02:00,Broadway & Barry Ave,2168021,-1,2014-06-19
143972,2014-06-19 07:08:00,Broadway & Barry Ave,2168468,-1,2014-06-19
142999,2014-06-19 08:21:00,Broadway & Barry Ave,2169698,-1,2014-06-19
140893,2014-06-19 12:47:00,Broadway & Barry Ave,2172896,-1,2014-06-19
140443,2014-06-19 13:37:00,Broadway & Barry Ave,2173542,-1,2014-06-19
140439,2014-06-19 13:38:00,Broadway & Barry Ave,2173546,-1,2014-06-19
140211,2014-06-19 14:05:00,Broadway & Barry Ave,2173879,-1,2014-06-19
139835,2014-06-19 14:51:00,Broadway & Barry Ave,2174428,-1,2014-06-19
139716,2014-06-19 15:05:00,Broadway & Barry Ave,2174602,-1,2014-06-19
139552,2014-06-19 15:23:00,Broadway & Barry Ave,2174889,-1,2014-06-19


In [32]:
departures2.loc[( (departures2['date']==dt.date(2014,6,19)) & 
               (departures2['station_name'] == 'Broadway & Barry Ave') ), :]

Unnamed: 0,datetime,station_name,trip_id,trip_counts,date
144308,2014-06-19 06:08:00,Broadway & Barry Ave,2168036,1,2014-06-19
144067,2014-06-19 06:54:00,Broadway & Barry Ave,2168331,1,2014-06-19
143656,2014-06-19 07:40:00,Broadway & Barry Ave,2168876,1,2014-06-19
143021,2014-06-19 08:20:00,Broadway & Barry Ave,2169672,1,2014-06-19
143031,2014-06-19 08:20:00,Broadway & Barry Ave,2169657,1,2014-06-19
142541,2014-06-19 08:50:00,Broadway & Barry Ave,2170324,1,2014-06-19
141891,2014-06-19 10:22:00,Broadway & Barry Ave,2171400,1,2014-06-19
141774,2014-06-19 10:41:00,Broadway & Barry Ave,2171601,1,2014-06-19
141227,2014-06-19 12:07:00,Broadway & Barry Ave,2172415,1,2014-06-19
141225,2014-06-19 12:07:00,Broadway & Barry Ave,2172418,1,2014-06-19


In [33]:
stationdf[stationdf['name']=='Broadway & Barry Ave']

Unnamed: 0,name,dpcapacity
110,Broadway & Barry Ave,15
