In [26]:
import pandas as pd
import numpy as np

import geopandas as gpd
from shapely.geometry import Point
import rtree
import pickle

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Raise pandas' display limits so wide/long frames are rendered in full
# (up to 500 rows and 500 columns) instead of being elided with "...".
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [29]:
# Load the trip records; pandas infers the zip compression from the
# file extension, so the archive does not need to be unpacked first.
bike_df = pd.read_csv(filepath_or_buffer="201701-citibike-tripdata.csv.zip")

In [30]:
# Peek at the first five raw rows to check the columns loaded as expected.
bike_df.head(n=5)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender
0,680,2017-01-01 00:00:21,2017-01-01 00:11:41,3226,W 82 St & Central Park West,40.78275,-73.97137,3165,Central Park West & W 72 St,40.775794,-73.976206,25542,Subscriber,1965.0,2
1,1282,2017-01-01 00:00:45,2017-01-01 00:22:08,3263,Cooper Square & E 7 St,40.729236,-73.990868,498,Broadway & W 32 St,40.748549,-73.988084,21136,Subscriber,1987.0,2
2,648,2017-01-01 00:00:57,2017-01-01 00:11:46,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,18147,Customer,,0
3,631,2017-01-01 00:01:10,2017-01-01 00:11:42,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,21211,Customer,,0
4,621,2017-01-01 00:01:25,2017-01-01 00:11:47,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,26819,Customer,,0


In [31]:
# Print per-column dtypes and non-null counts; columns whose non-null count
# is below the row count contain missing values.
bike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 726676 entries, 0 to 726675
Data columns (total 15 columns):
Trip Duration              726676 non-null int64
Start Time                 726676 non-null object
Stop Time                  726676 non-null object
Start Station ID           726676 non-null int64
Start Station Name         726676 non-null object
Start Station Latitude     726676 non-null float64
Start Station Longitude    726676 non-null float64
End Station ID             726676 non-null int64
End Station Name           726676 non-null object
End Station Latitude       726676 non-null float64
End Station Longitude      726676 non-null float64
Bike ID                    726676 non-null int64
User Type                  723483 non-null object
Birth Year                 697600 non-null float64
Gender                     726676 non-null int64
dtypes: float64(5), int64(5), object(5)
memory usage: 83.2+ MB


In [32]:
# Convert the start timestamps from strings to datetime64 so the `.dt`
# accessor can be used on this column.
bike_df['Start Time'] = bike_df['Start Time'].pipe(pd.to_datetime)

In [33]:
# Convert the stop timestamps from strings to datetime64 as well.
bike_df['Stop Time'] = bike_df['Stop Time'].pipe(pd.to_datetime)

In [34]:
# Calendar month number of the trip's start timestamp.
bike_df['month'] = bike_df['Start Time'].dt.month

In [35]:
# Calendar year of the trip's start timestamp.
bike_df['year'] = bike_df['Start Time'].dt.year

In [36]:
# Name of the weekday (e.g. "Sunday") on which the trip started.
# `Series.dt.weekday_name` was deprecated in pandas 0.23 and removed in 1.0;
# `dt.day_name()` is its replacement and returns the same English day names
# when no locale is passed.
bike_df['day_of_week'] = bike_df['Start Time'].dt.day_name()

In [37]:
# Hour component of the trip's start timestamp. Despite the column name,
# this is an integer hour, not a full time of day.
bike_df['time_of_day'] = bike_df['Start Time'].dt.hour

In [38]:
# Re-inspect the first rows to confirm the derived columns
# (month, year, day_of_week, time_of_day) were appended.
bike_df.head()

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,month,year,day_of_week,time_of_day
0,680,2017-01-01 00:00:21,2017-01-01 00:11:41,3226,W 82 St & Central Park West,40.78275,-73.97137,3165,Central Park West & W 72 St,40.775794,-73.976206,25542,Subscriber,1965.0,2,1,2017,Sunday,0
1,1282,2017-01-01 00:00:45,2017-01-01 00:22:08,3263,Cooper Square & E 7 St,40.729236,-73.990868,498,Broadway & W 32 St,40.748549,-73.988084,21136,Subscriber,1987.0,2,1,2017,Sunday,0
2,648,2017-01-01 00:00:57,2017-01-01 00:11:46,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,18147,Customer,,0,1,2017,Sunday,0
3,631,2017-01-01 00:01:10,2017-01-01 00:11:42,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,21211,Customer,,0,1,2017,Sunday,0
4,621,2017-01-01 00:01:25,2017-01-01 00:11:47,3143,5 Ave & E 78 St,40.776829,-73.963888,3152,3 Ave & E 71 St,40.768737,-73.961199,26819,Customer,,0,1,2017,Sunday,0


In [40]:
# Count trips per weekday, most frequent first.
bike_df['day_of_week'].value_counts()

Thursday     137739
Wednesday    134246
Friday       116934
Tuesday       94416
Monday        94325
Sunday        82860
Saturday      66156
Name: day_of_week, dtype: int64