### ML Question 4: Modeling Trip Durations and Rider Arrivals

Import all necessary packages.

In [1]:
# Apache parquet files (to save space)
import pyarrow as pa
import pyarrow.parquet as pq

# Dataframes and numerical
import pandas as pd
import numpy as np
import statistics

# Increase pandas default display 
pd.options.display.max_rows = 250
pd.options.display.max_columns = 250

# Suppress scientific notation of data in pandas
pd.set_option('display.float_format', '{:.2f}'.format)

# Graphing
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.gridspec as gridspec
import seaborn as sns
sns.set_style('darkgrid')
plt.style.use('fivethirtyeight')

# For data preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats

# Library for model creation
import statsmodels.api as sm
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.model_selection import train_test_split,GridSearchCV

# Model evaluation
from sklearn.metrics import mean_squared_error,r2_score

# Silence warnings
import warnings
warnings.filterwarnings('ignore')

Load **.parquet** file of all CitiBike rides from May 2021 through April 2022 into **pandas** dataframe.

In [2]:
# Read the May 2021 - April 2022 trip file with pyarrow and hand it to pandas
CB_Data = pq.read_table(
    source='data/202105-202204-citibike-tripdata.parquet'
).to_pandas()

Preview the dataset.

In [3]:
# First five rides in the raw table
CB_Data.head(n=5)

Unnamed: 0,member_casual,rideable_type,started_at,start_station_name,start_lat,start_lng,start_boro,start_hood,ended_at,end_station_name,end_lat,end_lng,end_boro,end_hood,year,month,week_of_year,day_of_week,hour_of_day,duration_min,distance_mi,speed_mph
1,Member,Classic Bike,2021-05-13 12:48:08,Broadway & W 25 St,40.74,-73.99,Manhattan,Flatiron District,2021-05-13 13:07:37,E 2 St & Avenue B,40.72,-73.98,Manhattan,East Village,2021,5,19,3,12,19.48,1.81,5.57
2,Member,Classic Bike,2021-05-16 08:30:13,46 Ave & 5 St,40.75,-73.95,Queens,Long Island City,2021-05-16 08:45:47,34th Ave & Vernon Blvd,40.77,-73.94,Queens,Long Island City,2021,5,19,6,8,15.57,2.26,8.69
3,Member,Classic Bike,2021-05-01 08:38:14,46 Ave & 5 St,40.75,-73.95,Queens,Long Island City,2021-05-01 08:54:27,34th Ave & Vernon Blvd,40.77,-73.94,Queens,Long Island City,2021,5,17,5,8,16.22,2.26,8.35
4,Member,Classic Bike,2021-05-09 08:12:31,46 Ave & 5 St,40.75,-73.95,Queens,Long Island City,2021-05-09 08:27:05,34th Ave & Vernon Blvd,40.77,-73.94,Queens,Long Island City,2021,5,18,6,8,14.57,2.26,9.29
5,Member,Classic Bike,2021-05-27 07:52:27,E 123 St & Lexington Ave,40.8,-73.94,Manhattan,East Harlem,2021-05-27 08:09:01,1 Ave & E 78 St,40.77,-73.95,Manhattan,Upper East Side,2021,5,21,3,7,16.57,3.25,11.78


In [4]:
# Last five rides in the raw table
CB_Data.tail(n=5)

Unnamed: 0,member_casual,rideable_type,started_at,start_station_name,start_lat,start_lng,start_boro,start_hood,ended_at,end_station_name,end_lat,end_lng,end_boro,end_hood,year,month,week_of_year,day_of_week,hour_of_day,duration_min,distance_mi,speed_mph
28816542,Member,Electric Bike,2022-04-29 09:03:34,Broadway & Berry St,40.71,-73.97,Brooklyn,Williamsburg,2022-04-29 09:16:23,Water St & Main St,40.7,-73.99,Brooklyn,DUMBO,2022,4,17,4,9,12.82,2.24,10.46
28816543,Member,Classic Bike,2022-04-22 13:08:15,Allen St & Rivington St,40.72,-73.99,Manhattan,Lower East Side,2022-04-22 13:29:31,W 29 St & 9 Ave,40.75,-74.0,Manhattan,Chelsea,2022,4,16,4,13,21.27,2.64,7.45
28816544,Member,Classic Bike,2022-04-13 17:10:49,St. Nicholas Terrace & Convent Ave,40.82,-73.95,Manhattan,Harlem,2022-04-13 17:26:08,E 138 St & Grand Concourse,40.81,-73.93,Bronx,Mott Haven,2022,4,15,2,17,15.32,1.75,6.85
28816546,Member,Electric Bike,2022-04-28 15:52:49,W 55 St & 6 Ave,40.76,-73.98,Manhattan,Midtown,2022-04-28 16:06:31,E 91 St & 2 Ave,40.78,-73.95,Manhattan,Upper East Side,2022,4,17,3,15,13.7,3.23,14.13
28816547,Member,Electric Bike,2022-04-11 19:55:31,W 87 St & West End Ave,40.79,-73.98,Manhattan,Upper West Side,2022-04-11 20:09:19,E 91 St & 2 Ave,40.78,-73.95,Manhattan,Upper East Side,2022,4,15,0,19,13.8,2.51,10.92


Describe the dataset.

In [5]:
# Summary statistics for every column (numeric or not), shown with one row per column
CB_Data.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,first,last,mean,std,min,25%,50%,75%,max
member_casual,18658426.0,2.0,Member,13885173.0,NaT,NaT,,,,,,,
rideable_type,18658426.0,2.0,Classic Bike,13019655.0,NaT,NaT,,,,,,,
started_at,18658426.0,12040084.0,2021-06-25 19:08:09,27.0,2021-05-01 00:00:01,2022-04-30 23:59:59,,,,,,,
start_station_name,18658426.0,1587.0,W 21 St & 6 Ave,84118.0,NaT,NaT,,,,,,,
start_lat,18658426.0,,,,NaT,NaT,40.74,0.04,40.63,40.72,40.74,40.76,40.88
start_lng,18658426.0,,,,NaT,NaT,-73.98,0.02,-74.04,-73.99,-73.98,-73.96,-73.88
start_boro,18658426.0,5.0,Manhattan,13542212.0,NaT,NaT,,,,,,,
start_hood,18658426.0,89.0,Chelsea,1429536.0,NaT,NaT,,,,,,,
ended_at,18658426.0,12033810.0,2021-05-05 23:31:09,36.0,2021-05-01 00:06:03,2022-05-01 00:48:28,,,,,,,
end_station_name,18658426.0,1654.0,W 21 St & 6 Ave,88605.0,NaT,NaT,,,,,,,


**Develop Model for Ride Durations Between Stations**

The dependent variable shall be the duration of the ride between two stations.

The independent variables shall be the following:

1. Month of Year
2. Week of Year
3. Hour of Day
4. Start Neighborhood
5. Start Borough
6. End Neighborhood
7. End Borough
8. Bike Type
9. Member Type
10. Distance between Stations
11. Day of Week (weekday vs. weekend)

In [6]:
# Keys that define one observation: station pair, time buckets, neighborhood/borough
# of both ends, bike type, rider type and the distance between the stations
duration_group_keys = [
    'start_station_name', 'end_station_name',
    'month', 'week_of_year', 'day_of_week', 'hour_of_day',
    'start_hood', 'start_boro', 'end_hood', 'end_boro',
    'rideable_type', 'member_casual', 'distance_mi',
]

# Mean ride duration per key combination. The two station levels are reset first,
# one at a time, so they end up between the remaining keys and duration_min
# once the final reset_index() moves the rest of the index into columns.
CB_Duration = (
    CB_Data
    .groupby(duration_group_keys)['duration_min']
    .mean()
    .to_frame()
    .reset_index(level='start_station_name')
    .reset_index(level='end_station_name')
    .reset_index()
)

In [7]:
# (rows, columns) of the aggregated duration table
CB_Duration.shape

(17742205, 14)

In [8]:
# Column dtypes and memory footprint of the aggregated duration table
CB_Duration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17742205 entries, 0 to 17742204
Data columns (total 14 columns):
 #   Column              Dtype  
---  ------              -----  
 0   month               int64  
 1   week_of_year        int64  
 2   day_of_week         int64  
 3   hour_of_day         int64  
 4   start_hood          object 
 5   start_boro          object 
 6   end_hood            object 
 7   end_boro            object 
 8   rideable_type       object 
 9   member_casual       object 
 10  distance_mi         float64
 11  end_station_name    object 
 12  start_station_name  object 
 13  duration_min        float64
dtypes: float64(2), int64(4), object(8)
memory usage: 1.9+ GB


In [9]:
# Station identifiers first, then the target, then the predictors
CB_Duration_col_order = [
    'start_station_name', 'end_station_name',
    'duration_min',
    'distance_mi', 'member_casual', 'rideable_type',
    'month', 'week_of_year', 'day_of_week', 'hour_of_day',
    'start_hood', 'start_boro', 'end_hood', 'end_boro',
]
CB_Duration = CB_Duration.loc[:, CB_Duration_col_order]

Binarize **member_casual** and **rideable_type**.

In [10]:
# Rider type as an integer flag: 1 for Member, 0 otherwise (Casual).
# A vectorised comparison replaces the per-row Python lambda, which is far
# slower on a table of this size and produces the same values.
# NOTE: this cell is not re-runnable in place - once the column holds 0/1 instead
# of the original strings, running it again turns every value into 0.
CB_Duration['member_casual'] = (CB_Duration['member_casual'] == 'Member').astype(int)

# Bike type as an integer flag: 1 for Electric Bike, 0 otherwise (Classic Bike)
CB_Duration['rideable_type'] = (CB_Duration['rideable_type'] == 'Electric Bike').astype(int)

Binarize **day_of_week** according to whether it is a weekday (Monday - Friday) or a weekend day (Saturday/Sunday).

In [11]:
# Weekend flag: 1 when day_of_week >= 5 (Saturday/Sunday), 0 for Monday - Friday.
# Vectorised comparison instead of a per-row lambda; the resulting values are identical.
# NOTE: not re-runnable in place - once the column holds 0/1, applying the rule
# again yields 0 for every row.
CB_Duration['day_of_week'] = (CB_Duration['day_of_week'] >= 5).astype(int)

Dummify neighborhood and borough data for both origins and destinations.

In [12]:
# Select categorical non-ordinal features to one-hot-encode
dummy_cols = ['start_hood','start_boro','end_hood','end_boro']

# Columns to encode; kept under its own name because the concat/drop step below uses it
filtered = list(dummy_cols)

def get_hot_encode(df, columns=None):
    """Build one block of one-hot (dummy) columns per categorical feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the categorical columns to encode.
    columns : list of str, optional
        Labels of the columns to encode. When omitted, the module-level
        ``filtered`` list is used.

    Returns
    -------
    list of pandas.DataFrame
        One int64 dummy frame per requested column, in the order requested.
        Dummy columns are prefixed with the source column name, and the first
        category of each feature is dropped so it acts as the reference level.
    """
    if columns is None:
        columns = filtered
    return [
        pd.get_dummies(df[label], prefix=label, drop_first=True, dtype=np.int64)
        for label in columns
    ]

# Build the dummy blocks while the categorical columns are still present
hot_encoded = get_hot_encode(CB_Duration)

# Swap the categorical columns for their dummy blocks: the remaining original
# columns keep their order and the dummy columns are appended on the right
CB_Duration = pd.concat(
    [CB_Duration.drop(columns=filtered), *hot_encoded],
    axis='columns',
)

In [13]:
# First five rows of the encoded table
CB_Duration.head(n=5)

Unnamed: 0,start_station_name,end_station_name,duration_min,distance_mi,member_casual,rideable_type,month,week_of_year,day_of_week,hour_of_day,start_hood_Battery Park City,start_hood_Bay Ridge,start_hood_Bedford-Stuyvesant,start_hood_Boerum Hill,start_hood_Brooklyn Heights,start_hood_Bushwick,start_hood_Carroll Gardens,start_hood_Central Park,start_hood_Chelsea,start_hood_Chinatown,start_hood_Civic Center,start_hood_Claremont Village,start_hood_Clinton Hill,start_hood_Cobble Hill,start_hood_Columbia St,start_hood_Concourse,start_hood_Concourse Village,start_hood_Crotona Park,start_hood_Crown Heights,start_hood_Cypress Hills,start_hood_DUMBO,start_hood_Ditmars Steinway,start_hood_Downtown Brooklyn,start_hood_East Harlem,start_hood_East Morrisania,start_hood_East Village,start_hood_Financial District,start_hood_Flatbush,start_hood_Flatiron District,start_hood_Fordham,start_hood_Fort Greene,start_hood_Governors Island,start_hood_Gowanus,start_hood_Gramercy,start_hood_Green-Wood Cemetery,start_hood_Greenpoint,start_hood_Greenwich Village,start_hood_Harlem,start_hood_Hell's Kitchen,start_hood_Highbridge,start_hood_Hoboken,start_hood_Hunts Point,start_hood_Inwood,start_hood_Kensington,start_hood_Kingsbridge,start_hood_Kips Bay,start_hood_Long Island City,start_hood_Longwood,start_hood_Lower East Side,start_hood_Melrose,start_hood_Midtown,start_hood_Morningside Heights,start_hood_Morris Heights,start_hood_Morrisania,start_hood_Mott Haven,start_hood_Mount Eden,start_hood_Mount Hope,start_hood_Murray Hill,start_hood_Navy Yard,start_hood_NoHo,start_hood_Nolita,start_hood_Norwood,start_hood_Park Slope,start_hood_Port Morris,start_hood_Prospect Heights,start_hood_Prospect Park,start_hood_Prospect-Lefferts Gardens,start_hood_Randall's Island,start_hood_Red Hook,start_hood_Ridgewood,start_hood_Roosevelt Island,start_hood_SoHo,start_hood_South Slope,start_hood_Stuyvesant Town,start_hood_Sunset Park,start_hood_Theater District,start_hood_Tremont,start_hood_Tribeca,start_hood_Two 
Bridges,start_hood_University Heights,start_hood_Upper East Side,start_hood_Upper West Side,start_hood_Vinegar Hill,start_hood_Washington Heights,start_hood_West Village,start_hood_Williamsburg,start_hood_Windsor Terrace,start_hood_Woodside,start_boro_Brooklyn,start_boro_Manhattan,start_boro_New Jersey,start_boro_Queens,end_hood_Battery Park City,end_hood_Bay Ridge,end_hood_Bedford-Stuyvesant,end_hood_Boerum Hill,end_hood_Brooklyn Heights,end_hood_Bushwick,end_hood_Carroll Gardens,end_hood_Central Park,end_hood_Chelsea,end_hood_Chinatown,end_hood_Civic Center,end_hood_Claremont Village,end_hood_Clinton Hill,end_hood_Cobble Hill,end_hood_Columbia St,end_hood_Concourse,end_hood_Concourse Village,end_hood_Crotona Park,end_hood_Crown Heights,end_hood_Cypress Hills,end_hood_DUMBO,end_hood_Ditmars Steinway,end_hood_Downtown Brooklyn,end_hood_East Harlem,end_hood_East Morrisania,end_hood_East Village,end_hood_Financial District,end_hood_Flatbush,end_hood_Flatiron District,end_hood_Fordham,end_hood_Fort Greene,end_hood_Governors Island,end_hood_Gowanus,end_hood_Gramercy,end_hood_Green-Wood Cemetery,end_hood_Greenpoint,end_hood_Greenwich Village,end_hood_Harlem,end_hood_Hell's Kitchen,end_hood_Highbridge,end_hood_Hoboken,end_hood_Hunts Point,end_hood_Inwood,end_hood_Jersey City,end_hood_Kensington,end_hood_Kingsbridge,end_hood_Kips Bay,end_hood_Long Island City,end_hood_Longwood,end_hood_Lower East Side,end_hood_Melrose,end_hood_Midtown,end_hood_Morningside Heights,end_hood_Morris Heights,end_hood_Morrisania,end_hood_Mott Haven,end_hood_Mount Eden,end_hood_Mount Hope,end_hood_Murray Hill,end_hood_Navy Yard,end_hood_NoHo,end_hood_Nolita,end_hood_Norwood,end_hood_Park Slope,end_hood_Port Morris,end_hood_Prospect Heights,end_hood_Prospect Park,end_hood_Prospect-Lefferts Gardens,end_hood_Randall's Island,end_hood_Red Hook,end_hood_Ridgewood,end_hood_Roosevelt Island,end_hood_SoHo,end_hood_South Slope,end_hood_Stuyvesant Town,end_hood_Sunset Park,end_hood_Theater 
District,end_hood_Tremont,end_hood_Tribeca,end_hood_Two Bridges,end_hood_University Heights,end_hood_Upper East Side,end_hood_Upper West Side,end_hood_Vinegar Hill,end_hood_Washington Heights,end_hood_West Village,end_hood_Williamsburg,end_hood_Windsor Terrace,end_hood_Woodside,end_boro_Brooklyn,end_boro_Manhattan,end_boro_New Jersey,end_boro_Queens
0,1 Ave & E 110 St,1 Ave & E 16 St,37.52,7.14,1,1,6,22,1,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1 Ave & E 110 St,1 Ave & E 16 St,29.88,7.14,1,1,7,29,1,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1 Ave & E 110 St,1 Ave & E 16 St,57.35,7.14,0,1,8,33,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1 Ave & E 110 St,1 Ave & E 16 St,30.1,7.14,1,1,12,50,1,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1 Ave & E 110 St,1 Ave & E 30 St,62.9,6.07,0,1,3,9,1,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [14]:
# Column count, dtypes and memory footprint after one-hot encoding
CB_Duration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17742205 entries, 0 to 17742204
Columns: 195 entries, start_station_name to end_boro_Queens
dtypes: float64(2), int64(191), object(2)
memory usage: 25.8+ GB


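Perform a **log-log linear regression**: log-transform the features and the ride duration, then fit an ordinary least squares model.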

In [None]:
# Feature columns: everything except the two station identifiers and the target
non_feature_cols = ['start_station_name', 'end_station_name', 'duration_min']
x_log = [col for col in CB_Duration.columns if col not in non_feature_cols]

# Stand-in for exact zeros so that log() stays finite
zero_floor = 0.001

# Log-transform every feature in one vectorised pass. This replaces a Python loop over
# every (column, row) cell that used chained assignment on a slice of CB_Duration;
# the per-row index list that loop needed (x_log_index) is therefore no longer built.
features = CB_Duration[x_log].astype('float64')
x = np.log(features.mask(features == 0, zero_floor))
del features

# Log-transform the target as well, giving a log-log linear model
y = np.log(CB_Duration['duration_min'])

# log() of a negative or missing value is NaN; such entries are set to 0 rather than dropped.
# Both objects are already float64 here, so no string-to-number coercion is required.
x = x.fillna(0)
y = y.fillna(0)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 100)

# Ordinary least squares fit with scikit-learn
LR = LinearRegression()
LR.fit(x_train,y_train)

print("Intercept: %f" %LR.intercept_)
print("Coefficients: %s" %str(LR.coef_))
print("Train R^2: %f" %(LR.score(x_train,y_train)))
print("Test R^2: %f" %(LR.score(x_test,y_test)))

# Same model in statsmodels to obtain the full coefficient table (std errors, p-values).
# NOTE(review): the borough dummies are presumably implied by the neighborhood dummies,
# so individual coefficients may not be separately identifiable - confirm before interpreting.
features20 = sm.add_constant(x_train)
ols_sm20   = sm.OLS(y_train,features20)
lm_test20    = ols_sm20.fit()

print(lm_test20.summary())