First, import relevant packages:

In [2]:
# Numerical and data
import pandas as pd
import numpy as np

# Apache parquet files (to save space)
import pyarrow as pa
import pyarrow.parquet as pq

Next, load the .parquet file into a dataframe:

In [6]:
CB_Data = pq.read_table('CitiBike_data/202106-202205-citibike-tripdata.parquet').to_pandas()

In [7]:
CB_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29032983 entries, 0 to 29032982
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   rideable_type       object 
 1   started_at          object 
 2   ended_at            object 
 3   start_station_name  object 
 4   end_station_name    object 
 5   start_lat           float64
 6   start_lng           float64
 7   end_lat             float64
 8   end_lng             float64
 9   member_casual       object 
dtypes: float64(4), object(6)
memory usage: 2.2+ GB


In [8]:
CB_Data.columns

Index(['rideable_type', 'started_at', 'ended_at', 'start_station_name',
       'end_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')

In [9]:
CB_Data.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual
0,docked_bike,2021-06-01 23:12:34,2021-06-01 23:14:46,Driggs Ave & N 9 St,Bayard St & Leonard St,40.718169,-73.955201,40.719156,-73.948854,member
1,docked_bike,2021-06-16 17:14:56,2021-06-16 17:29:15,Fulton St & Broadway,Mercer St & Spring St,40.711066,-74.009447,40.723627,-73.999496,casual
2,docked_bike,2021-06-07 19:41:55,2021-06-07 19:51:28,Devoe St & Lorimer St,Manhattan Av & Leonard St,40.713352,-73.949103,40.72084,-73.94844,casual
3,electric_bike,2021-06-17 15:13:15,2021-06-17 15:33:25,Driggs Ave & N 9 St,Greenwich Ave & Charles St,40.718169,-73.955201,40.735238,-74.000271,member
4,electric_bike,2021-06-18 08:27:03,2021-06-18 08:53:37,Graham Ave & Conselyea St,E 30 St & Park Ave S,40.715143,-73.944507,40.744449,-73.983035,member


Count total number of rides during that time frame (June 2021 through May 2022):

In [24]:
no_rides = len(CB_Data)

Check for null values.

In [26]:
# "Ghost bikes" coming in from unknown locations
bike_ghos = CB_Data.loc[pd.isnull(CB_Data.start_station_name)]
no_bike_ghos = len(bike_ghos)

# Bikes that are lost, i.e. not docked at the end
bike_lost = CB_Data.loc[pd.isnull(CB_Data.end_station_name)]
no_bike_lost = len(bike_lost)

# Bikes that are docked at the same station they are picked up, for joyride, rider changing mind, defective bike, etc.
bike_joyr = CB_Data.loc[CB_Data.start_station_name == CB_Data.end_station_name]
no_bike_joyr = len(bike_joyr)

print(f'Total number of Citibike rides from June 2021 through May 2022: {no_rides}')
print(f'Total number of "ghost bikes" in that time frame: {no_bike_ghos}')
print(f'Total number of lost bikes in that time frame: {no_bike_lost}')
print(f'Total number of bikes being docked at the same location in that time frame: {no_bike_joyr}')
print(f'Total number of "dud rides" to be removed from data: {no_bike_ghos + no_bike_lost + no_bike_joyr}')

print(f'Percentage of rides from June 2021 through May 2022 with bikes missing: \
      {100*(no_bike_ghos + no_bike_lost)/no_rides}')
print(f'Percentage of rides from June 2021 through May 2022 with bikes docked at the same location: \
      {100*no_bike_joyr/no_rides}')
print(f'Total number of "dud rides" to be removed from data: \
      {100*(no_bike_ghos + no_bike_lost + no_bike_joyr)/no_rides}')

Total number of Citibike rides from June 2021 through May 2022: 29032983
Total number of "ghost bikes" in that time frame: 245
Total number of lost bikes in that time frame: 102300
Total number of bikes being docked at the same location in that time frame: 1549785
Total number of "dud rides" to be removed from data: 1652330
Percentage of rides from June 2021 through May 2022 with bikes missing:       0.35320173610820493
Percentage of rides from June 2021 through May 2022 with bikes docked at the same location:       5.338015043097707
Total number of "dud rides" to be removed from data:       5.691216779205912


To summarize:
<br>Total number of Citibike rides from June 2021 through May 2022: 29,032,983
<br>Total number of "ghost bikes" in that time frame: 245
<br>Total number of lost bikes in that time frame: 102,300
<br>Total number of bikes being docked at the same location in that time frame: 1,549,785
<br>Total number of "dud rides" to be removed from data: 1,652,330
<br>Percentage of rides from June 2021 through May 2022 with bikes missing:       0.35320173610820493
<br>Percentage of rides from June 2021 through May 2022 with bikes docked at the same location:       5.338015043097707
<br>Total number of "dud rides" to be removed from data:       5.691216779205912

Out of 29 million rides taken from June 2021 through May 2022, 1.55 million (5.7%) such "dud rides" shall be removed.

Now, eliminate these "dud rides" once and for all:

In [40]:
dud_rides = bike_ghos.index.tolist() + bike_lost.index.tolist() + bike_joyr.index.tolist()

CB_Data = CB_Data.drop(axis = 0, index = dud_rides)

In [41]:
len(CB_Data)

27380897

Now, there are 27 million rides to work with, which is still more than enough data!

Make sure that all the values of **rideable_type** are consistent. In other words, **docked_bike** = **classic_bike**.

In [None]:
# CB_Data.rideable_type.loc[CB_Data.rideable_type == 'classic_bike'] = 'Classic Bike'
# CB_Data.rideable_type.loc[CB_Data.rideable_type == 'docked_bike'] = 'Classic Bike'

It turns out that distance between two coordinates were improperly calculated. **Haversine** was used instead of **block distance**, which reflects the movement most found within a city with a rectangluar grid. This will fix that:

In [16]:
# Conversion factor here:
# https://www.usgs.gov/faqs/how-much-distance-does-degree-minute-and-second-cover-your-maps#:~:text=One%20degree%20of%20latitude%20equals,one%20second%20equals%2080%20feet.
# CB_Data.distance = 69 * ( abs( CB_Data.start_lat - CB_Data.end_lat ) 
#                                   + abs( CB_Data.start_lng - CB_Data.end_lng ) )

In [19]:
# CB_Data.speed = CB_Data.distance / (CB_Data.duration / 60)

The issue in the R data analysis was the inability to calculate speed, due to null values in the duration. Therefore, as a final check, such a calculation shall be performed in order to assess the cleanliness of this dataframe so far:

In [57]:
# CB_Data['speed'] = CB_Data['distance'] / (0.0166667 * CB_Data['duration']) # in mph, convert min to hr at denom

So far, so good. Now, the station names need cleanup through an iterative process

In [87]:
# nulls = CB_Data.loc[pd.isnull(CB_Data['CB_start_hood']) | pd.isnull(CB_Data['CB_end_hood'])]

In [88]:
# nulls['start_station_name'].value_counts()

2 Ave & E 29 St                63472
8 Ave & W 16 St                63362
W 44 St & 11 Ave               56702
Forsyth St\t& Grand St         53086
6 Ave & W 45 St                53012
                               ...  
Lab - NYC                          7
E 6 St 2 Ave                       6
Avenue D & E 8 St                  3
Grand Concourse & E 161  St        2
Yankee Ferry Terminal              1
Name: start_station_name, Length: 1592, dtype: int64

In [89]:
# nulls['end_station_name'].value_counts()

8 Ave & W 16 St           63465
2 Ave & E 29 St           62081
Forsyth St\t& Grand St    53520
6 Ave & W 45 St           53259
W 44 St & 11 Ave          52587
                          ...  
Brunswick St                  1
Hudson St & 4 St              1
StuyTown Depot                1
Jackson Square                1
Adams St & 2 St               1
Name: end_station_name, Length: 1672, dtype: int64

In [86]:
# ques = ['Broadway & E 21 St','E 13 St & 2 Ave','Broadway & W 58 St','E 20 St & 2 Ave','5 Ave & E 72 St']
# hood = ['Midtown East','East Village','Midtown West','Midtown East','Upper East Side']
# boro = ['Manhattan','Manhattan','Manhattan','Manhattan','Manhattan']
# zipped = zip(ques, hood, boro)

# for q, h, b in zipped:
#     CB_Data.CB_start_hood.loc[CB_Data.start_station_name == q] = h
#     CB_Data.CB_start_boro.loc[CB_Data.start_station_name == q] = b
#     CB_Data.CB_end_hood.loc[CB_Data.end_station_name == q] = h
#     CB_Data.CB_end_boro.loc[CB_Data.end_station_name == q] = b

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [21]:
# CB_Data.to_csv('CitiBike_data/202105-202204-citibike-trip-data.csv')