Import necessary packages.

In [1]:
# Dataframes and numerical
import pandas as pd
import numpy as np

# Geolocation
import geopandas as gpd
import matplotlib.pyplot as plt

# Apache parquet files (to save space)
import pyarrow as pa
import pyarrow.parquet as pq

# Let pandas render up to 250 rows and 250 columns before truncating
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_columns', 250)

# Silence all warnings for the rest of the session
# (note: this is a blanket filter, so pandas/geopandas warnings are hidden too)
import warnings
warnings.filterwarnings('ignore')

Reference shapefile and list of addresses to be tagged.

In [2]:
# Approach adapted from
# https://towardsdatascience.com/tagging-a-location-to-a-shapefile-area-using-geopandas-5d74336128bf

# NYC neighborhood boundaries, downloaded from
# https://data.beta.nyc/dataset/pediacities-nyc-neighborhoods
# (path is relative to the notebook's working directory)
fp = "CitiBike_data/archive/pediacitiesnycneighborhoods.json"
map_df = gpd.read_file(fp)

# Preview the GeoDataFrame; one row per neighborhood polygon with its borough
map_df.head()

Unnamed: 0,neighborhood,boroughCode,borough,@id,geometry
0,Allerton,2,Bronx,http://nyc.pediacities.com/Resource/Neighborho...,"POLYGON ((-73.84860 40.87167, -73.84582 40.870..."
1,Alley Pond Park,4,Queens,http://nyc.pediacities.com/Resource/Neighborho...,"POLYGON ((-73.74333 40.73888, -73.74371 40.739..."
2,Arden Heights,5,Staten Island,http://nyc.pediacities.com/Resource/Neighborho...,"POLYGON ((-74.16983 40.56108, -74.16982 40.561..."
3,Arlington,5,Staten Island,http://nyc.pediacities.com/Resource/Neighborho...,"POLYGON ((-74.15975 40.64142, -74.15998 40.641..."
4,Arrochar,5,Staten Island,http://nyc.pediacities.com/Resource/Neighborho...,"POLYGON ((-74.06078 40.59319, -74.06079 40.593..."


Load .parquet file into pandas dataframe.

In [3]:
# Load the CitiBike trip log covering May 2021 through April 2022; each row is one
# ride with its start/end station names and their latitude/longitude
list_location = pq.read_table('CitiBike_data/202105-202204-citibike-tripdata.parquet').to_pandas()

# Preview the trips dataframe
list_location.head()

Unnamed: 0,member_casual,rideable_type,started_at,start_station_name,start_lat,start_lng,start_boro,start_hood,ended_at,end_station_name,end_lat,end_lng,end_boro,end_hood,year,month,week_of_year,day_of_week,hour_of_day,duration_min,distance_mi,speed_mph
1,Member,Classic Bike,2021-05-13 12:48:08,Broadway & W 25 St,40.742868,-73.989186,Manhattan,Flatiron District,2021-05-13 13:07:37,E 2 St & Avenue B,40.722175,-73.983688,Manhattan,East Village,2021,5,19,3,12,19.483333,1.807254,5.565537
2,Member,Classic Bike,2021-05-16 08:30:13,46 Ave & 5 St,40.74731,-73.95451,Queens,Hunters Point,2021-05-16 08:45:47,34th Ave & Vernon Blvd,40.765354,-73.939863,Queens,Astoria,2021,5,19,6,8,15.566667,2.255646,8.694139
3,Member,Classic Bike,2021-05-01 08:38:14,46 Ave & 5 St,40.74731,-73.95451,Queens,Hunters Point,2021-05-01 08:54:27,34th Ave & Vernon Blvd,40.765354,-73.939863,Queens,Astoria,2021,5,17,5,8,16.216667,2.255646,8.345659
4,Member,Classic Bike,2021-05-09 08:12:31,46 Ave & 5 St,40.74731,-73.95451,Queens,Hunters Point,2021-05-09 08:27:05,34th Ave & Vernon Blvd,40.765354,-73.939863,Queens,Astoria,2021,5,18,6,8,14.566667,2.255646,9.290991
5,Member,Classic Bike,2021-05-27 07:52:27,E 123 St & Lexington Ave,40.802926,-73.9379,Manhattan,East Harlem,2021-05-27 08:09:01,1 Ave & E 78 St,40.771404,-73.953516,Manhattan,Upper East Side,2021,5,21,3,7,16.566667,3.252584,11.779982


In order to conserve memory resources, a dataframe consisting of every conceivable station name and coordinates shall be created, tagged to boro and neighborhood, and finally assigned to the main dataframe.

However, the problem is that there is a lot of noise in the data regarding actual station locations. To illustrate, observe the number of unique station names for start and end stations as opposed to the number of unique latitudes and longitudes for each ride generated when duplicates are dropped. There are clearly more unique coordinates than there are stations to contain them.

In [4]:
# Number of distinct start-station names (a missing name counts as one value)
list_location['start_station_name'].nunique(dropna=False)

1588

In [5]:
# Distinct (lat, lng) pairs recorded at trip start
list_location.loc[:, ['start_lat', 'start_lng']].drop_duplicates()

Unnamed: 0,start_lat,start_lng
1,40.742868,-73.989186
2,40.747310,-73.954510
5,40.802926,-73.937900
11,40.757973,-73.966033
18,40.710447,-73.965251
...,...,...
26498803,40.773913,-73.954395
26499040,40.752762,-73.992804
26504654,40.797523,-73.948942
26518872,40.754136,-73.996459


In [6]:
# Number of distinct end-station names (a missing name counts as one value)
list_location['end_station_name'].nunique(dropna=False)

1668

In [7]:
# Distinct (lat, lng) pairs recorded at trip end
list_location.loc[:, ['end_lat', 'end_lng']].drop_duplicates()

Unnamed: 0,end_lat,end_lng
1,40.722175,-73.983688
2,40.765354,-73.939863
5,40.771404,-73.953516
6,40.735367,-73.987975
17,40.730562,-73.973985
...,...,...
26527966,40.797523,-73.948942
26532123,40.717597,-74.015882
26741144,40.754136,-73.996459
26819204,40.669767,-73.994744


The solution is to average all coordinates for each unique station name, which results in the number of unique coordinates matching their respective number of stations.

In [8]:
# Mean start coordinates per start-station name
list_location.groupby('start_station_name')[['start_lat', 'start_lng']].mean()

Unnamed: 0_level_0,start_lat,start_lng
start_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1
1 Ave & E 110 St,40.792327,-73.938300
1 Ave & E 16 St,40.732219,-73.981655
1 Ave & E 18 St,40.733812,-73.980544
1 Ave & E 30 St,40.741444,-73.975361
1 Ave & E 39 St,40.747140,-73.971130
...,...,...
Wyckoff Av & Stanhope St,40.703545,-73.917775
Wyckoff Ave & Gates Ave,40.699871,-73.911718
Wyckoff St & Nevins St,40.683426,-73.984275
Wythe Ave & Metropolitan Ave,40.716887,-73.963198


In [9]:
# Mean end coordinates per end-station name
list_location.groupby('end_station_name')[['end_lat', 'end_lng']].mean()

Unnamed: 0_level_0,end_lat,end_lng
end_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1
1 Ave & E 110 St,40.792327,-73.938300
1 Ave & E 16 St,40.732219,-73.981655
1 Ave & E 18 St,40.733812,-73.980544
1 Ave & E 30 St,40.741444,-73.975361
1 Ave & E 39 St,40.747140,-73.971130
...,...,...
Wyckoff Av & Stanhope St,40.703545,-73.917775
Wyckoff Ave & Gates Ave,40.699871,-73.911718
Wyckoff St & Nevins St,40.683426,-73.984275
Wythe Ave & Metropolitan Ave,40.716887,-73.963198


Therefore, a dataframe of unique start station names and another one with unique end station names, both with averaged out coordinates, shall be generated and then merged to eliminate overlaps.

In [10]:
# Mean coordinates per station as observed at trip starts.
# After the groupby the station name is the index, so only the two
# coordinate columns need renaming to the shared 'lat'/'lng' schema.
start_station_df = (
    list_location
    .groupby('start_station_name')[['start_lat', 'start_lng']]
    .mean()
    .rename(columns={'start_lat': 'lat', 'start_lng': 'lng'})
)

# Mean coordinates per station as observed at trip ends
end_station_df = (
    list_location
    .groupby('end_station_name')[['end_lat', 'end_lng']]
    .mean()
    .rename(columns={'end_lat': 'lat', 'end_lng': 'lng'})
)

# Stack both views and collapse to one row per station name (the index).
# No drop_duplicates() here: it compares only the lat/lng values and ignores
# the index, so it would discard a station whose averaged coordinates happen
# to equal another station's. Averaging per name already merges a station's
# start and end rows, and identical rows average to themselves.
stations_df = pd.concat([start_station_df, end_station_df])
stations_df = stations_df.groupby(stations_df.index).mean()

# Station names must be unique for the later tagging by index
assert stations_df.index.is_unique

# Check dataframe finally
stations_df

Unnamed: 0,lat,lng
1 Ave & E 110 St,40.792327,-73.938300
1 Ave & E 16 St,40.732219,-73.981655
1 Ave & E 18 St,40.733812,-73.980544
1 Ave & E 30 St,40.741444,-73.975361
1 Ave & E 39 St,40.747140,-73.971130
...,...,...
Wyckoff Av & Stanhope St,40.703545,-73.917775
Wyckoff Ave & Gates Ave,40.699871,-73.911718
Wyckoff St & Nevins St,40.683426,-73.984275
Wythe Ave & Metropolitan Ave,40.716887,-73.963198


Extract coordinates from the streamlined Pandas dataframe

In [11]:
# Build a GeoDataFrame with one point per station. points_from_xy takes x (longitude)
# first and y (latitude) second. The coordinates are plain longitude/latitude degrees,
# so declare the CRS as WGS84 (EPSG:4326) instead of leaving the frame without one.
stations_gp = gpd.GeoDataFrame(
    stations_df,
    geometry=gpd.points_from_xy(stations_df.lng, stations_df.lat),
    crs="EPSG:4326",
)

# Checking GeoDataFrame
stations_gp.head()

Unnamed: 0,lat,lng,geometry
1 Ave & E 110 St,40.792327,-73.9383,POINT (-73.93830 40.79233)
1 Ave & E 16 St,40.732219,-73.981655,POINT (-73.98166 40.73222)
1 Ave & E 18 St,40.733812,-73.980544,POINT (-73.98054 40.73381)
1 Ave & E 30 St,40.741444,-73.975361,POINT (-73.97536 40.74144)
1 Ave & E 39 St,40.74714,-73.97113,POINT (-73.97113 40.74714)


Finally, create new columns to assign boro and neighborhood based on geometry point.

In [12]:
# Tag every station with the borough and neighborhood whose polygon contains it.
# Stations that fall inside no polygon keep the empty-string placeholder in both columns.
stations_gp['boro'] = ''
stations_gp['hood'] = ''

# Each row of map_df is one neighborhood polygon that also carries its borough name,
# so one point-in-polygon test per polygon fills both columns (the two separate loops
# repeated the identical test). Iterating the columns directly also avoids relying on
# map_df having a default 0..n-1 index.
for hood_geom, boro_name, hood_name in zip(map_df['geometry'],
                                           map_df['borough'],
                                           map_df['neighborhood']):
    inside = stations_gp.within(hood_geom)  # boolean mask over stations
    if inside.any():
        stations_gp.loc[inside, 'boro'] = boro_name
        stations_gp.loc[inside, 'hood'] = hood_name

Check new GeoDataFrame of stations now.

In [13]:
stations_gp.head()

Unnamed: 0,lat,lng,geometry,boro,hood
1 Ave & E 110 St,40.792327,-73.9383,POINT (-73.93830 40.79233),Manhattan,East Harlem
1 Ave & E 16 St,40.732219,-73.981655,POINT (-73.98166 40.73222),Manhattan,Stuyvesant Town
1 Ave & E 18 St,40.733812,-73.980544,POINT (-73.98054 40.73381),Manhattan,Stuyvesant Town
1 Ave & E 30 St,40.741444,-73.975361,POINT (-73.97536 40.74144),Manhattan,Kips Bay
1 Ave & E 39 St,40.74714,-73.97113,POINT (-73.97113 40.74714),Manhattan,Murray Hill


In [14]:
stations_gp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1669 entries, 1 Ave & E 110 St to Yankee Ferry Terminal
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   lat       1669 non-null   float64 
 1   lng       1669 non-null   float64 
 2   geometry  1669 non-null   geometry
 3   boro      1669 non-null   object  
 4   hood      1669 non-null   object  
dtypes: float64(2), geometry(1), object(2)
memory usage: 78.2+ KB


In [15]:
stations_gp.boro.value_counts()

Manhattan    673
Brooklyn     485
Bronx        268
Queens       174
              69
Name: boro, dtype: int64

In [16]:
stations_gp.hood.value_counts()

                             69
Harlem                       68
Williamsburg                 66
Bushwick                     62
Midtown                      57
Long Island City             55
Sunset Park                  52
Astoria                      48
Upper West Side              45
Upper East Side              45
Washington Heights           45
East Harlem                  43
Ditmars Steinway             43
East Village                 42
Bedford-Stuyvesant           36
Chelsea                      35
Longwood                     35
Lower East Side              31
Hell's Kitchen               30
Mott Haven                   28
Greenpoint                   27
Financial District           26
West Village                 24
Fordham                      24
Inwood                       23
Park Slope                   22
Ridgewood                    22
Fort Greene                  21
Kingsbridge                  20
Tribeca                      20
Claremont Village            19
Concours

Investigate which stations are not in any of the distinct boroughs of New York City.

In [17]:
stations_gp.loc[stations_gp.boro == '']

Unnamed: 0,lat,lng,geometry,boro,hood
11 St & Washington St,40.749985,-74.02715,POINT (-74.02715 40.74998),,
12 St & Sinatra Dr N,40.750604,-74.02402,POINT (-74.02402 40.75060),,
14 St Ferry - 14 St & Shipyard Ln,40.752961,-74.024353,POINT (-74.02435 40.75296),,
2 St HBLR - 2 St & Marshall St,40.740802,-74.042435,POINT (-74.04243 40.74080),,
5 Corners Library,40.734961,-74.059503,POINT (-74.05950 40.73496),,
6 St & Grand St,40.744398,-74.034501,POINT (-74.03450 40.74440),,
7 St & Monroe St,40.746413,-74.037977,POINT (-74.03798 40.74641),,
8 St & Washington St,40.745984,-74.028199,POINT (-74.02820 40.74598),,
Adams St & 11 St,40.750916,-74.033541,POINT (-74.03354 40.75092),,
Adams St & 2 St,40.739814,-74.036904,POINT (-74.03690 40.73981),,


It appears that those stations are in New Jersey, used by the few crazy riders who decide to cross the Hudson River, either by ferry with the bike in tow, or by crossing a bridge or tunnel illegally. Fortunately, they are all in Hudson County, so a shapefile of New Jersey places will be used to fill in the gaps.

Found here: https://catalog.data.gov/dataset/tiger-line-shapefile-2016-state-new-jersey-current-place-state-based

In [18]:
# Load the New Jersey "place" boundaries (TIGER/Line 2016 shapefile);
# the path is relative to the notebook's working directory
fp2 = "CitiBike_data/archive/tl_2016_34_place/tl_2016_34_place.shp"
map_df2 = gpd.read_file(fp2)

# Preview the GeoDataFrame; the 'NAME' column holds the place name
map_df2.head()

Unnamed: 0,STATEFP,PLACEFP,PLACENS,GEOID,NAME,NAMELSAD,LSAD,CLASSFP,PCICBSA,PCINECTA,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,34,7600,885169,3407600,Bridgeton,Bridgeton city,25,C5,Y,N,G4110,A,16131425,690143,39.429192,-75.228595,"POLYGON ((-75.25302 39.44406, -75.24995 39.445..."
1,34,67020,885393,3467020,Shiloh,Shiloh borough,21,C5,N,N,G4110,A,3116143,1564,39.4624183,-75.2924402,"POLYGON ((-75.31133 39.46395, -75.31132 39.464..."
2,34,7810,885171,3407810,Brigantine,Brigantine city,25,C5,N,N,G4110,A,16931323,11204171,39.4138574,-74.3779024,"POLYGON ((-74.42105 39.38576, -74.42069 39.385..."
3,34,23940,885222,3423940,Folsom,Folsom borough,21,C5,N,N,G4110,A,21403550,635219,39.5967446,-74.8431626,"POLYGON ((-74.88443 39.60217, -74.88338 39.603..."
4,34,24990,885225,3424990,Franklin Lakes,Franklin Lakes borough,21,C5,N,N,G4110,A,24383136,1214222,41.0075259,-74.2056527,"POLYGON ((-74.24851 40.99220, -74.24847 40.992..."


In [19]:
map_df2.geometry

0      POLYGON ((-75.25302 39.44406, -75.24995 39.445...
1      POLYGON ((-75.31133 39.46395, -75.31132 39.464...
2      POLYGON ((-74.42105 39.38576, -74.42069 39.385...
3      POLYGON ((-74.88443 39.60217, -74.88338 39.603...
4      POLYGON ((-74.24851 40.99220, -74.24847 40.992...
                             ...                        
540    POLYGON ((-75.01234 39.67058, -75.00946 39.673...
541    POLYGON ((-74.30847 40.54117, -74.30840 40.541...
542    POLYGON ((-74.70470 40.18483, -74.70469 40.185...
543    POLYGON ((-74.35499 40.30243, -74.35464 40.302...
544    POLYGON ((-74.58222 40.53504, -74.58144 40.535...
Name: geometry, Length: 545, dtype: geometry

Run this again for the New Jersey cities.

In [20]:
# Fill 'hood' with the New Jersey place name for stations the NYC pass left untagged.
# Only stations with an empty 'boro' are tested, so a neighborhood already assigned
# from the NYC polygons can never be overwritten by a New Jersey place.
nj_candidates = stations_gp.loc[stations_gp['boro'] == '']

for place_geom, place_name in zip(map_df2['geometry'], map_df2['NAME']):
    inside = nj_candidates.within(place_geom)  # boolean mask over the candidates only
    matched = inside[inside].index             # station names located in this place
    if len(matched) > 0:
        stations_gp.loc[matched, 'hood'] = place_name

In [21]:
# Place-name counts for the stations that have no NYC borough
stations_gp.loc[stations_gp['boro'] == '', 'hood'].value_counts()

Jersey City    44
Hoboken        25
Name: hood, dtype: int64

In [22]:
# Stations with no NYC borough are labelled as New Jersey.
# Use a single .loc[row_mask, column] assignment so the write lands on stations_gp
# itself; the chained form `stations_gp['boro'].loc[mask] = ...` may assign to a
# temporary object and leave the frame unchanged (always so under pandas Copy-on-Write).
stations_gp.loc[stations_gp['boro'] == '', 'boro'] = 'New Jersey'

In [23]:
stations_gp.boro.value_counts()

Manhattan     673
Brooklyn      485
Bronx         268
Queens        174
New Jersey     69
Name: boro, dtype: int64

In [24]:
stations_gp.isna().sum()

lat         0
lng         0
geometry    0
boro        0
hood        0
dtype: int64

In [25]:
stations_gp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1669 entries, 1 Ave & E 110 St to Yankee Ferry Terminal
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   lat       1669 non-null   float64 
 1   lng       1669 non-null   float64 
 2   geometry  1669 non-null   geometry
 3   boro      1669 non-null   object  
 4   hood      1669 non-null   object  
dtypes: float64(2), geometry(1), object(2)
memory usage: 78.2+ KB


In [26]:
stations_gp.head()

Unnamed: 0,lat,lng,geometry,boro,hood
1 Ave & E 110 St,40.792327,-73.9383,POINT (-73.93830 40.79233),Manhattan,East Harlem
1 Ave & E 16 St,40.732219,-73.981655,POINT (-73.98166 40.73222),Manhattan,Stuyvesant Town
1 Ave & E 18 St,40.733812,-73.980544,POINT (-73.98054 40.73381),Manhattan,Stuyvesant Town
1 Ave & E 30 St,40.741444,-73.975361,POINT (-73.97536 40.74144),Manhattan,Kips Bay
1 Ave & E 39 St,40.74714,-73.97113,POINT (-73.97113 40.74714),Manhattan,Murray Hill


Since all stations have had their respective boroughs and neighborhoods accounted for, it is time to save this GeoDataFrame as a simple Pandas dataframe with these values saved for easy reference.

In [27]:
# Plain pandas copy of the station table without the geometry column
CB_stations_locations = pd.DataFrame(stations_gp.drop('geometry', axis=1))

In [28]:
type(CB_stations_locations)

pandas.core.frame.DataFrame

In [29]:
CB_stations_locations.head()

Unnamed: 0,lat,lng,boro,hood
1 Ave & E 110 St,40.792327,-73.9383,Manhattan,East Harlem
1 Ave & E 16 St,40.732219,-73.981655,Manhattan,Stuyvesant Town
1 Ave & E 18 St,40.733812,-73.980544,Manhattan,Stuyvesant Town
1 Ave & E 30 St,40.741444,-73.975361,Manhattan,Kips Bay
1 Ave & E 39 St,40.74714,-73.97113,Manhattan,Murray Hill


In [30]:
CB_stations_locations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1669 entries, 1 Ave & E 110 St to Yankee Ferry Terminal
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   lat     1669 non-null   float64
 1   lng     1669 non-null   float64
 2   boro    1669 non-null   object 
 3   hood    1669 non-null   object 
dtypes: float64(2), object(2)
memory usage: 65.2+ KB


Finally, save to .parquet

In [31]:
# Convert the station lookup table to Arrow and write it out as a parquet file
CB_Data_arrow = pa.Table.from_pandas(df=CB_stations_locations)
pq.write_table(CB_Data_arrow, where='CitiBike_data/202206-citibike-stations.parquet')