Import necessary packages.

In [1]:
# Dataframes and numerical
import pandas as pd
import numpy as np

# Geolocation
import geopandas as gpd
import matplotlib.pyplot as plt

# Apache parquet files (to save space)
import pyarrow as pa
import pyarrow.parquet as pq

Reference shapefile and list of addresses to be tagged.

In [2]:
# From https://towardsdatascience.com/tagging-a-location-to-a-shapefile-area-using-geopandas-5d74336128bf

# Set the filepath and load in a shapefile
# Shape file found here:
# https://www.kaggle.com/datasets/jackcook/neighborhoods-in-new-york
fp = "CitiBike_data/archive/ZillowNeighborhoods-NY.shp"
map_df = gpd.read_file(fp)

# Check the GeoDataframe
map_df.head()

Unnamed: 0,State,County,City,Name,RegionID,geometry
0,NY,Suffolk,Town of Islip,Bohemia,3736,"POLYGON ((-73.14423 40.78667, -73.14402 40.785..."
1,NY,Albany,Town of Coeymans,Ravena,6687,"POLYGON ((-73.82263 42.50203, -73.82181 42.501..."
2,NY,Queens,New York,Rego Park,6719,"POLYGON ((-73.85630 40.72303, -73.85654 40.722..."
3,NY,Suffolk,Town of Islip,Saltaire,6912,"POLYGON ((-73.18891 40.63929, -73.18889 40.639..."
4,NY,Albany,Guilderland,Westmere,9545,"POLYGON ((-73.87198 42.69916, -73.86125 42.695..."


Load .parquet file into pandas dataframe.

In [3]:
# Load the CitiBike log file covering June 2021 through May 2022 and their Lat Longs
list_location = pq.read_table('CitiBike_data/202106-202205-citibike-tripdata.parquet').to_pandas()

# Check the Pandas Dataframe
list_location.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,year,month,week_of_year,day_of_week,hour_of_day,duration_min,distance_mi,speed_mph
0,Classic Bike,2021-06-01 23:12:34,2021-06-01 23:14:46,Driggs Ave & N 9 St,Bayard St & Leonard St,40.718169,-73.955201,40.719156,-73.948854,Member,2021,6,22,1,23,2.2,0.506033,13.800891
1,Classic Bike,2021-06-16 17:14:56,2021-06-16 17:29:15,Fulton St & Broadway,Mercer St & Spring St,40.711066,-74.009447,40.723627,-73.999496,Casual,2021,6,24,2,17,14.316667,1.553328,6.509873
2,Classic Bike,2021-06-07 19:41:55,2021-06-07 19:51:28,Devoe St & Lorimer St,Manhattan Av & Leonard St,40.713352,-73.949103,40.72084,-73.94844,Casual,2021,6,23,0,19,9.55,0.562419,3.533523
3,Electric Bike,2021-06-17 15:13:15,2021-06-17 15:33:25,Driggs Ave & N 9 St,Greenwich Ave & Charles St,40.718169,-73.955201,40.735238,-74.000271,Member,2021,6,24,3,15,20.166667,4.287591,12.756469
4,Electric Bike,2021-06-18 08:27:03,2021-06-18 08:53:37,Graham Ave & Conselyea St,E 30 St & Park Ave S,40.715143,-73.944507,40.744449,-73.983035,Member,2021,6,24,4,8,26.566667,4.680581,10.570947


In order to conserve memory resources, a dataframe consisting of every conceivable station name and coordinates shall be created, tagged to boro and neighborhood, and finally assigned to the main dataframe.

However, the problem is that there is a lot of noise in the data regarding actual station locations. To illustrate, observe the number of unique station names for start and end stations as opposed to the number of unique latitudes and longitudes for each ride generated when duplicates are dropped. There are clearly more unique coordinates than there are stations to contain them.

In [21]:
len(list_location.start_station_name.unique().tolist())

1598

In [23]:
list_location[['start_lat', 'start_lng']].drop_duplicates()

Unnamed: 0,start_lat,start_lng
0,40.718169,-73.955201
1,40.711066,-74.009447
2,40.713352,-73.949103
4,40.715143,-73.944507
5,40.697601,-73.993445
...,...,...
29032932,40.771145,-73.990748
29032933,40.866496,-73.897974
29032950,40.719097,-73.999539
29032967,40.839553,-73.900343


In [24]:
len(list_location.end_station_name.unique().tolist())

1681

In [25]:
list_location[['end_lat', 'end_lng']].drop_duplicates()

Unnamed: 0,end_lat,end_lng
0,40.719156,-73.948854
1,40.723627,-73.999496
2,40.720840,-73.948440
3,40.735238,-74.000271
4,40.744449,-73.983035
...,...,...
26440415,40.714145,-74.033552
27744373,40.749943,-74.035865
27853903,40.740802,-74.042521
28498932,40.706575,-74.086701


The solution is to average all coordinates for each unique station name, which results in the number of unique coordinates matching their respective number of stations.

In [28]:
list_location[['start_station_name','start_lat', 'start_lng']].groupby(['start_station_name']).mean()

Unnamed: 0_level_0,start_lat,start_lng
start_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1
1 Ave & E 110 St,40.792327,-73.938299
1 Ave & E 16 St,40.732219,-73.981655
1 Ave & E 18 St,40.733813,-73.980544
1 Ave & E 30 St,40.741444,-73.975361
1 Ave & E 39 St,40.747141,-73.971130
...,...,...
Wyckoff Av & Stanhope St,40.703545,-73.917776
Wyckoff Ave & Gates Ave,40.699871,-73.911717
Wyckoff St & Nevins St,40.683426,-73.984275
Wythe Ave & Metropolitan Ave,40.716887,-73.963198


In [30]:
list_location[['end_station_name','end_lat', 'end_lng']].groupby(['end_station_name']).mean()

Unnamed: 0_level_0,end_lat,end_lng
end_station_name,Unnamed: 1_level_1,Unnamed: 2_level_1
1 Ave & E 110 St,40.792327,-73.938300
1 Ave & E 16 St,40.732219,-73.981656
1 Ave & E 18 St,40.733812,-73.980544
1 Ave & E 30 St,40.741444,-73.975361
1 Ave & E 39 St,40.747140,-73.971130
...,...,...
Wyckoff Av & Stanhope St,40.703545,-73.917775
Wyckoff Ave & Gates Ave,40.699871,-73.911719
Wyckoff St & Nevins St,40.683426,-73.984275
Wythe Ave & Metropolitan Ave,40.716887,-73.963198


Therefore, a dataframe of unique start station names and another one with unique end station names, both with averaged out coordinates, shall be generated and then merged to eliminate overlaps.

In [46]:
# Create a dataframe of unique start station names with coordinates
start_station_df = list_location[['start_station_name','start_lat', 'start_lng']] \
.groupby(['start_station_name']).mean()

# Create a dataframe of unique end station names with coordinates
end_station_df = list_location[['end_station_name','end_lat', 'end_lng']] \
.groupby(['end_station_name']).mean()

# Rename columns in both dataframes to assist merging
start_station_df.rename(columns = {'start_station_name': 'station_name',
                                  'start_lat': 'lat',
                                  'start_lng': 'lng'}, inplace = True)
end_station_df.rename(columns = {'end_station_name': 'station_name',
                                  'end_lat': 'lat',
                                  'end_lng': 'lng'}, inplace = True)

# Concatenate dataframes
stations_df = pd.concat([start_station_df, end_station_df]).drop_duplicates()

# Average out unique stations again to further streamline
stations_df = stations_df.groupby(stations_df.index).mean()

# Check dataframe finally
stations_df

Unnamed: 0,lat,lng
1 Ave & E 110 St,40.792327,-73.938300
1 Ave & E 16 St,40.732219,-73.981655
1 Ave & E 18 St,40.733812,-73.980544
1 Ave & E 30 St,40.741444,-73.975361
1 Ave & E 39 St,40.747140,-73.971130
...,...,...
Wyckoff Av & Stanhope St,40.703545,-73.917775
Wyckoff Ave & Gates Ave,40.699871,-73.911718
Wyckoff St & Nevins St,40.683426,-73.984275
Wythe Ave & Metropolitan Ave,40.716887,-73.963198


Extract coordinates from the streamlined Pandas dataframe

In [53]:
# Changing to a GeoDataFrame to create geometry series
stations_gp = gpd.GeoDataFrame(stations_df, geometry = gpd.points_from_xy(stations_df.lng, stations_df.lat))

# Checking GeoDataFrame
stations_gp.head()

Unnamed: 0,lat,lng,geometry
1 Ave & E 110 St,40.792327,-73.9383,POINT (-73.93830 40.79233)
1 Ave & E 16 St,40.732219,-73.981655,POINT (-73.98166 40.73222)
1 Ave & E 18 St,40.733812,-73.980544,POINT (-73.98054 40.73381)
1 Ave & E 30 St,40.741444,-73.975361,POINT (-73.97536 40.74144)
1 Ave & E 39 St,40.74714,-73.97113,POINT (-73.97113 40.74714)


Finally, create new columns to assign boro and neighborhood based on geometry point.

In [70]:
# Boro (or County)
stations_gp['boro'] = ''
for idx in range(map_df.shape[0]):
    # For every station , find if they reside within a boro
    pip = stations_gp.within(map_df.loc[idx, 'geometry'])
    if pip.sum() > 0: # Found where some of the station reside at map_df.loc[idx]
        stations_gp.loc[pip, 'boro']  = map_df.loc[idx, 'County']

# Neighborhood
stations_gp['hood'] = ''
for idx in range(map_df.shape[0]):
    # For every station , find if they reside within a boro
    pip = stations_gp.within(map_df.loc[idx, 'geometry'])
    if pip.sum() > 0: # Found where some of the station reside at map_df.loc[idx]
        stations_gp.loc[pip, 'hood']  = map_df.loc[idx, 'Name']

Check new GeoDataFrame of stations now.

In [71]:
stations_gp.head()

Unnamed: 0,lat,lng,geometry,boro,hood
1 Ave & E 110 St,40.792327,-73.9383,POINT (-73.93830 40.79233),New York,East Harlem
1 Ave & E 16 St,40.732219,-73.981655,POINT (-73.98166 40.73222),New York,Stuyvesant Town
1 Ave & E 18 St,40.733812,-73.980544,POINT (-73.98054 40.73381),New York,Stuyvesant Town
1 Ave & E 30 St,40.741444,-73.975361,POINT (-73.97536 40.74144),New York,Gramercy
1 Ave & E 39 St,40.74714,-73.97113,POINT (-73.97113 40.74714),New York,Tudor City


In [72]:
stations_gp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1682 entries, 1 Ave & E 110 St to Yankee Ferry Terminal
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   lat       1682 non-null   float64 
 1   lng       1682 non-null   float64 
 2   geometry  1682 non-null   geometry
 3   boro      1682 non-null   object  
 4   hood      1682 non-null   object  
dtypes: float64(2), geometry(1), object(2)
memory usage: 78.8+ KB


In [73]:
stations_gp.boro.value_counts()

New York    689
Kings       476
Bronx       270
Queens      175
             72
Name: boro, dtype: int64

In [74]:
stations_gp.hood.value_counts()

Astoria                      115
Williamsburg                  75
                              72
Bushwick                      57
Washington Heights            50
                            ... 
Prospect Park                  2
Prospect Lefferts Gardens      2
Belmont                        1
Ocean Hill                     1
East New York                  1
Name: hood, Length: 87, dtype: int64

Change the name of **New York** to **Manhattan** and **Kings** to **Brooklyn**.

In [75]:
stations_gp.boro.loc[stations_gp.boro == 'New York'] = 'Manhattan'
stations_gp.boro.loc[stations_gp.boro == 'Kings'] = 'Brooklyn'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [76]:
stations_gp.boro.value_counts()

Manhattan    689
Brooklyn     476
Bronx        270
Queens       175
              72
Name: boro, dtype: int64

Investigate which listings are not in any of the distict boroughs of New York City.

In [77]:
stations_gp.loc[stations_gp.boro == '']

Unnamed: 0,lat,lng,geometry,boro,hood
11 St & Washington St,40.749985,-74.027150,POINT (-74.02715 40.74998),,
12 St & Sinatra Dr N,40.750604,-74.024020,POINT (-74.02402 40.75060),,
14 St Ferry - 14 St & Shipyard Ln,40.752961,-74.024353,POINT (-74.02435 40.75296),,
2 St HBLR - 2 St & Marshall St,40.740802,-74.042435,POINT (-74.04243 40.74080),,
5 Corners Library,40.734961,-74.059503,POINT (-74.05950 40.73496),,
...,...,...,...,...,...
Union St,40.718211,-74.083639,POINT (-74.08364 40.71821),,
Van Vorst Park,40.718489,-74.047727,POINT (-74.04773 40.71849),,
Warren St,40.721124,-74.038051,POINT (-74.03805 40.72112),,
Washington St,40.724294,-74.035483,POINT (-74.03548 40.72429),,


It appears that those stations are in New Jersey for the few crazy riders that decide to cross the Hudson River, either by ferry with the bike in tow, or through crossing a bridge or tunnel illegally. Fortunately, they are all in Hudson County, so then a shape file shall be used in order to fill in the gaps.

Found here: https://catalog.data.gov/dataset/tiger-line-shapefile-2016-state-new-jersey-current-place-state-based

In [83]:
# Set the filepath and load in a shapefile
fp2 = "CitiBike_data/archive/tl_2016_34_place/tl_2016_34_place.shp"
map_df2 = gpd.read_file(fp2)

# Check the GeoDataframe
map_df2.head()

Unnamed: 0,STATEFP,PLACEFP,PLACENS,GEOID,NAME,NAMELSAD,LSAD,CLASSFP,PCICBSA,PCINECTA,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,34,7600,885169,3407600,Bridgeton,Bridgeton city,25,C5,Y,N,G4110,A,16131425,690143,39.429192,-75.228595,"POLYGON ((-75.25302 39.44406, -75.24995 39.445..."
1,34,67020,885393,3467020,Shiloh,Shiloh borough,21,C5,N,N,G4110,A,3116143,1564,39.4624183,-75.2924402,"POLYGON ((-75.31133 39.46395, -75.31132 39.464..."
2,34,7810,885171,3407810,Brigantine,Brigantine city,25,C5,N,N,G4110,A,16931323,11204171,39.4138574,-74.3779024,"POLYGON ((-74.42105 39.38576, -74.42069 39.385..."
3,34,23940,885222,3423940,Folsom,Folsom borough,21,C5,N,N,G4110,A,21403550,635219,39.5967446,-74.8431626,"POLYGON ((-74.88443 39.60217, -74.88338 39.603..."
4,34,24990,885225,3424990,Franklin Lakes,Franklin Lakes borough,21,C5,N,N,G4110,A,24383136,1214222,41.0075259,-74.2056527,"POLYGON ((-74.24851 40.99220, -74.24847 40.992..."


In [84]:
map_df2.geometry

0      POLYGON ((-75.25302 39.44406, -75.24995 39.445...
1      POLYGON ((-75.31133 39.46395, -75.31132 39.464...
2      POLYGON ((-74.42105 39.38576, -74.42069 39.385...
3      POLYGON ((-74.88443 39.60217, -74.88338 39.603...
4      POLYGON ((-74.24851 40.99220, -74.24847 40.992...
                             ...                        
540    POLYGON ((-75.01234 39.67058, -75.00946 39.673...
541    POLYGON ((-74.30847 40.54117, -74.30840 40.541...
542    POLYGON ((-74.70470 40.18483, -74.70469 40.185...
543    POLYGON ((-74.35499 40.30243, -74.35464 40.302...
544    POLYGON ((-74.58222 40.53504, -74.58144 40.535...
Name: geometry, Length: 545, dtype: geometry

Run this again for the New Jersey cities.

In [86]:
# Neighborhood
for idx in range(map_df2.shape[0]):
    # For every station , find if they reside within a boro
    pip = stations_gp.within(map_df2.loc[idx, 'geometry'])
    if pip.sum() > 0: # Found where some of the station reside at map_df.loc[idx]
        stations_gp.loc[pip, 'hood']  = map_df2.loc[idx, 'NAME']

In [88]:
stations_gp.loc[stations_gp.boro == '']['hood'].value_counts()

Jersey City    45
Hoboken        27
Name: hood, dtype: int64

In [91]:
stations_gp['boro'].loc[stations_gp.boro == ''] = 'New Jersey'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [92]:
stations_gp.boro.value_counts()

Manhattan     689
Brooklyn      476
Bronx         270
Queens        175
New Jersey     72
Name: boro, dtype: int64

In [93]:
stations_gp.isna().sum()

lat         0
lng         0
geometry    0
boro        0
hood        0
dtype: int64

In [94]:
stations_gp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1682 entries, 1 Ave & E 110 St to Yankee Ferry Terminal
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   lat       1682 non-null   float64 
 1   lng       1682 non-null   float64 
 2   geometry  1682 non-null   geometry
 3   boro      1682 non-null   object  
 4   hood      1682 non-null   object  
dtypes: float64(2), geometry(1), object(2)
memory usage: 78.8+ KB


In [95]:
stations_gp.head()

Unnamed: 0,lat,lng,geometry,boro,hood
1 Ave & E 110 St,40.792327,-73.9383,POINT (-73.93830 40.79233),Manhattan,East Harlem
1 Ave & E 16 St,40.732219,-73.981655,POINT (-73.98166 40.73222),Manhattan,Stuyvesant Town
1 Ave & E 18 St,40.733812,-73.980544,POINT (-73.98054 40.73381),Manhattan,Stuyvesant Town
1 Ave & E 30 St,40.741444,-73.975361,POINT (-73.97536 40.74144),Manhattan,Gramercy
1 Ave & E 39 St,40.74714,-73.97113,POINT (-73.97113 40.74714),Manhattan,Tudor City


Since all stations have had their respective boroughs and neighborhoods accounted for, it is time to save this GeoDataFrame as a simple Pandas dataframe with these values saved for easy reference.

In [97]:
CB_stations_locations = pd.DataFrame(stations_gp.drop(columns='geometry'))

In [98]:
type(CB_stations_locations)

pandas.core.frame.DataFrame

In [101]:
CB_stations_locations.head()

Unnamed: 0,lat,lng,boro,hood
1 Ave & E 110 St,40.792327,-73.9383,Manhattan,East Harlem
1 Ave & E 16 St,40.732219,-73.981655,Manhattan,Stuyvesant Town
1 Ave & E 18 St,40.733812,-73.980544,Manhattan,Stuyvesant Town
1 Ave & E 30 St,40.741444,-73.975361,Manhattan,Gramercy
1 Ave & E 39 St,40.74714,-73.97113,Manhattan,Tudor City


In [102]:
CB_stations_locations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1682 entries, 1 Ave & E 110 St to Yankee Ferry Terminal
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   lat     1682 non-null   float64
 1   lng     1682 non-null   float64
 2   boro    1682 non-null   object 
 3   hood    1682 non-null   object 
dtypes: float64(2), object(2)
memory usage: 65.7+ KB


Finally, save to .parquet

In [103]:
CB_Data_arrow = pa.Table.from_pandas(CB_stations_locations)
pq.write_table(CB_Data_arrow, 'CitiBike_data/202206-citibike-stations.parquet')