Import necessary packages.

In [1]:
# Dataframes and numerical
import pandas as pd
import numpy as np

# Apache parquet files (to save space)
import pyarrow as pa
import pyarrow.parquet as pq

Load .parquet file into dataframe.

In [2]:
CB_Data = pq.read_table('CitiBike_data/202106-202205-citibike-tripdata.parquet').to_pandas()

Check raw data of dataframe.

In [3]:
CB_Data.shape

(27380897, 10)

In [4]:
CB_Data.columns

Index(['rideable_type', 'started_at', 'ended_at', 'start_station_name',
       'end_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')

In [5]:
CB_Data.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
0,Classic Bike,2021-06-01 23:12:34,2021-06-01 23:14:46,Driggs Ave & N 9 St,Bayard St & Leonard St,40.718169,-73.955201,40.719156,-73.948854,Member
1,Classic Bike,2021-06-16 17:14:56,2021-06-16 17:29:15,Fulton St & Broadway,Mercer St & Spring St,40.711066,-74.009447,40.723627,-73.999496,Casual
2,Classic Bike,2021-06-07 19:41:55,2021-06-07 19:51:28,Devoe St & Lorimer St,Manhattan Av & Leonard St,40.713352,-73.949103,40.72084,-73.94844,Casual
3,Electric Bike,2021-06-17 15:13:15,2021-06-17 15:33:25,Driggs Ave & N 9 St,Greenwich Ave & Charles St,40.718169,-73.955201,40.735238,-74.000271,Member
4,Electric Bike,2021-06-18 08:27:03,2021-06-18 08:53:37,Graham Ave & Conselyea St,E 30 St & Park Ave S,40.715143,-73.944507,40.744449,-73.983035,Member


In [6]:
CB_Data.dtypes

rideable_type          object
started_at             object
ended_at               object
start_station_name     object
end_station_name       object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [7]:
CB_Data.describe

<bound method NDFrame.describe of           rideable_type           started_at             ended_at  \
0          Classic Bike  2021-06-01 23:12:34  2021-06-01 23:14:46   
1          Classic Bike  2021-06-16 17:14:56  2021-06-16 17:29:15   
2          Classic Bike  2021-06-07 19:41:55  2021-06-07 19:51:28   
3         Electric Bike  2021-06-17 15:13:15  2021-06-17 15:33:25   
4         Electric Bike  2021-06-18 08:27:03  2021-06-18 08:53:37   
...                 ...                  ...                  ...   
29032978   Classic Bike  2022-05-15 07:57:48  2022-05-15 08:12:55   
29032979   Classic Bike  2022-05-05 18:13:05  2022-05-05 18:20:10   
29032980   Classic Bike  2022-05-28 00:12:09  2022-05-28 00:30:00   
29032981   Classic Bike  2022-05-19 13:06:36  2022-05-19 13:18:02   
29032982   Classic Bike  2022-05-09 18:47:28  2022-05-09 18:52:38   

                 start_station_name            end_station_name  start_lat  \
0               Driggs Ave & N 9 St      Bayard St & Leonar

In [8]:
CB_Data.isna().sum()

rideable_type         0
started_at            0
ended_at              0
start_station_name    0
end_station_name      0
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
dtype: int64

In [9]:
CB_Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27380897 entries, 0 to 29032982
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   rideable_type       object 
 1   started_at          object 
 2   ended_at            object 
 3   start_station_name  object 
 4   end_station_name    object 
 5   start_lat           float64
 6   start_lng           float64
 7   end_lat             float64
 8   end_lng             float64
 9   member_casual       object 
dtypes: float64(4), object(6)
memory usage: 2.2+ GB


Create a modified dataframe as a copy of the imported dataframe for feature engineering.

In [10]:
CB_Data_Mod = CB_Data.copy()

Check data types of **started_at** and **ended_at** to see how their time stamp information can be used.

In [12]:
CB_Data_Mod[['started_at', 'ended_at']].dtypes

started_at    object
ended_at      object
dtype: object

Check data types of the first elements of these columns for further detail.

In [16]:
print(type(CB_Data_Mod.started_at[0]))
print(type(CB_Data_Mod.ended_at[0]))

<class 'str'>
<class 'str'>


Since such objects are strings, they need to be converted to timestamps in order to work for extracting date and time information from them to be used as features, as well as calculating ride durations.

In [18]:
# From https://dataindependent.com/pandas/pandas-to-datetime-string-to-date-pd-to_datetime/
CB_Data_Mod.started_at = pd.to_datetime(CB_Data_Mod.started_at, format="%Y-%m-%d %H:%M:%S")
CB_Data_Mod.ended_at = pd.to_datetime(CB_Data_Mod.ended_at, format="%Y-%m-%d %H:%M:%S")

Check the data types of the first elements of these columns again.

In [19]:
print(type(CB_Data_Mod.started_at[0]))
print(type(CB_Data_Mod.ended_at[0]))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Examine what are the valuable bits of information to extract from the timestamp.

In [47]:
# Quantitative
print(f"Year:             {CB_Data_Mod.started_at[0].year}")
print(f"Month:            {CB_Data_Mod.started_at[0].month}")
print(f"Week of the Year: {CB_Data_Mod.started_at[0].week}")
print(f"Day of the Week:  {CB_Data_Mod.started_at[0].day_of_week}")
print(f"Hour of the Day:  {CB_Data_Mod.started_at[0].hour}")
print(f"Ride Duration:    {(CB_Data_Mod.ended_at[0] - CB_Data_Mod.started_at[0])/np.timedelta64(1,'m')} minutes")

# Categorical
print(f"Name of Month:    {CB_Data_Mod.started_at[0].month_name()}")
print(f"Name of Weekday:  {CB_Data_Mod.started_at[0].day_name()}")

Year:             2021
Month:            6
Week of the Year: 22
Day of the Week:  1
Hour of the Day:  23
Ride Duration:    2.2 minutes
Name of Month:    June
Name of Weekday:  Tuesday


With this knowledge, time to create new parameters that quantify them.

In [52]:
CB_Data_Mod['year'] = CB_Data_Mod.started_at.dt.year
CB_Data_Mod['month'] = CB_Data_Mod.started_at.dt.month
CB_Data_Mod['week_of_year'] = CB_Data_Mod.started_at.dt.week
CB_Data_Mod['day_of_week'] = CB_Data_Mod.started_at.dt.day_of_week
CB_Data_Mod['hour_of_day'] = CB_Data_Mod.started_at.dt.hour
CB_Data_Mod['duration_min'] = (CB_Data_Mod.ended_at - CB_Data_Mod.started_at)/np.timedelta64(1,'m')

  CB_Data_Mod['week_of_year'] = CB_Data_Mod.started_at.dt.week


Calculate distance based on Manhattan distance between latitude and longitude.

In [None]:
# Conversion factor here:
# https://www.usgs.gov/faqs/how-much-distance-does-degree-minute-and-second-cover-your-maps#:~:text=One%20degree%20of%20latitude%20equals,one%20second%20equals%2080%20feet.
CB_Data_Mod['distance_mi'] = 69 * ( abs( CB_Data_Mod.start_lat - CB_Data_Mod.end_lat ) 
                                  + abs( CB_Data_Mod.start_lng - CB_Data_Mod.end_lng ) )

Calculate speed in mph based on duration and distance.

In [61]:
CB_Data_Mod['speed_mph'] = CB_Data_Mod.distance_mi / (CB_Data_Mod.duration_min / 60)

In [62]:
CB_Data_Mod.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,year,month,week_of_year,day_of_week,hour_of_day,duration_min,distance_mi,speed_mph
0,Classic Bike,2021-06-01 23:12:34,2021-06-01 23:14:46,Driggs Ave & N 9 St,Bayard St & Leonard St,40.718169,-73.955201,40.719156,-73.948854,Member,2021,6,22,1,23,2.2,0.506033,13.800891
1,Classic Bike,2021-06-16 17:14:56,2021-06-16 17:29:15,Fulton St & Broadway,Mercer St & Spring St,40.711066,-74.009447,40.723627,-73.999496,Casual,2021,6,24,2,17,14.316667,1.553328,6.509873
2,Classic Bike,2021-06-07 19:41:55,2021-06-07 19:51:28,Devoe St & Lorimer St,Manhattan Av & Leonard St,40.713352,-73.949103,40.72084,-73.94844,Casual,2021,6,23,0,19,9.55,0.562419,3.533523
3,Electric Bike,2021-06-17 15:13:15,2021-06-17 15:33:25,Driggs Ave & N 9 St,Greenwich Ave & Charles St,40.718169,-73.955201,40.735238,-74.000271,Member,2021,6,24,3,15,20.166667,4.287591,12.756469
4,Electric Bike,2021-06-18 08:27:03,2021-06-18 08:53:37,Graham Ave & Conselyea St,E 30 St & Park Ave S,40.715143,-73.944507,40.744449,-73.983035,Member,2021,6,24,4,8,26.566667,4.680581,10.570947
