# Data gathering and preparation
This notebook with gather all the different datasets used for this project and prep them to be utilized for exploratory data analysis.

- still need to get COVID data to compare with change in bicycle ridership

In [1]:
# import the necessary packages
import pandas as pd
import numpy as np
import geopandas as gpd
import zipfile
import pickle
from sodapy import Socrata
from shapely.geometry import shape

# import api token
from src import *

### Bicycle Counters

In [2]:
b_counters = pd.read_csv('./data/Bicycle_Counters.csv',
                        index_col='id')
b_counters.sort_values(by='id')

Unnamed: 0_level_0,name,latitude,longitude,domain,site,timezone,interval,sens,installationDate,counter
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Manhattan Bridge 2012 Test Bike Counter,40.69981,-73.98589,New York City DOT,100005020,(UTC-05:00) US/Eastern;DST,15,0,08/31/2012,
1,2nd Avenue - 26th St S,40.73971,-73.97954,New York City DOT,100009424,(UTC-05:00) US/Eastern;DST,15,0,05/22/2015,
2,Prospect Park West,40.671288,-73.971382,New York City DOT,100009425,(UTC-05:00) US/Eastern;DST,15,0,11/07/2016,Y2H13094304
3,Manhattan Bridge Ped Path,40.714573,-73.99495,New York City DOT,100009426,(UTC-05:00) US/Eastern;DST,15,0,12/04/2013,Y2H13074107
4,Williamsburg Bridge Bike Path,40.71053,-73.96145,New York City DOT,100009427,(UTC-05:00) US/Eastern;DST,15,0,12/03/2013,Y2H13074108
5,Ed Koch Queensboro Bridge Shared Path,40.751038,-73.94082,New York City DOT,100009428,(UTC-05:00) US/Eastern;DST,15,0,12/04/2013,Y2H19111445
6,Manhattan Bridge 2013 to 2018 Bike Counter,40.699768,-73.98582,New York City DOT,100009429,(UTC-05:00) US/Eastern;DST,15,0,12/03/2013,
7,Staten Island Ferry,40.643387,-74.072075,New York City DOT,100010017,(UTC-05:00) US/Eastern;DST,15,0,03/31/2016,Y2H13094300
8,Pulaski Bridge,40.742563,-73.951492,New York City DOT,100010018,(UTC-05:00) US/Eastern;DST,15,0,06/24/2017,Y2H13094301
9,Kent Ave btw North 8th St and North 9th St,40.720959,-73.96093,New York City DOT,100010019,(UTC-05:00) US/Eastern;DST,15,0,11/22/2016,Y2H13094302


Bicycle Counter csv has `23` counters with name, lat/lon, site?, installation date and counter columns. Need to figure out what `counter` column is as it has some NaN

In [3]:
b_counters.to_pickle('./pickle/b_counters')

### Bicycle Counts

In [4]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("uczf-rk3c")

# create GeoDataFrame out of request
b_counts = pd.DataFrame.from_records(results)

# preview dataframe
b_counts

Unnamed: 0,id1,counts,date,status,site
0,0,41,2012-08-31T00:00:00.000,4,100005020
1,1,52,2012-08-31T00:15:00.000,4,100005020
2,2,38,2012-08-31T00:30:00.000,4,100005020
3,3,36,2012-08-31T00:45:00.000,4,100005020
4,4,40,2012-08-31T01:00:00.000,4,100005020
...,...,...,...,...,...
995,995,164,2012-09-10T08:45:00.000,4,100005020
996,996,151,2012-09-10T09:00:00.000,4,100005020
997,997,163,2012-09-10T09:15:00.000,4,100005020
998,998,151,2012-09-10T09:30:00.000,4,100005020


#### Next Steps...
`site` column can facilitate merge with `b_counters` dataframe to get lat/lon for mapping purposes.
<br>
I am not sure what the `status` column means.
<br>
I can show change in ridership over time by merging `b_counters` and `b_counts` dataframes.
<br>
If I want to compare with COVID, I can find COVID rates for same time period in areas that bicycle counters are located.

In [5]:
b_counts.to_pickle('./pickle/b_counts')

### Motor Vehicle Collisions

In [6]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("h9gi-nx95", limit=100000)

# create GeoDataFrame out of request
collisions = pd.DataFrame.from_records(results)

pd.set_option('display.max_columns', None)
# preview dataframe
collisions.head()

Unnamed: 0,crash_date,crash_time,on_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,collision_id,vehicle_type_code1,vehicle_type_code2,borough,zip_code,latitude,longitude,location,cross_street_name,off_street_name,contributing_factor_vehicle_3,contributing_factor_vehicle_4,vehicle_type_code_3,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2021-04-14T00:00:00.000,5:32,BRONX WHITESTONE BRIDGE,0,0,0,0,0,0,0,0,Following Too Closely,Unspecified,4407480,Sedan,Sedan,,,,,,,,,,,,,
1,2021-04-13T00:00:00.000,21:35,,1,0,1,0,0,0,0,0,Unspecified,,4407147,Sedan,,BROOKLYN,11217.0,40.68358,-73.97617,"{'latitude': '40.68358', 'longitude': '-73.976...",620 ATLANTIC AVENUE,,,,,,,
2,2021-04-15T00:00:00.000,16:15,HUTCHINSON RIVER PARKWAY,0,0,0,0,0,0,0,0,Pavement Slippery,,4407665,Station Wagon/Sport Utility Vehicle,,,,,,,,,,,,,,
3,2021-04-13T00:00:00.000,16:00,VANDERVORT AVENUE,0,0,0,0,0,0,0,0,Following Too Closely,Unspecified,4407811,Sedan,,BROOKLYN,11222.0,,,,,ANTHONY STREET,,,,,,
4,2021-04-12T00:00:00.000,8:25,EDSON AVENUE,0,0,0,0,0,0,0,0,Unspecified,Unspecified,4406885,Station Wagon/Sport Utility Vehicle,Sedan,,,0.0,0.0,"{'latitude': '0.0', 'longitude': '0.0'}",,,,,,,,


In [7]:
collisions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 29 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   crash_date                     100000 non-null  object
 1   crash_time                     100000 non-null  object
 2   on_street_name                 73201 non-null   object
 3   number_of_persons_injured      99999 non-null   object
 4   number_of_persons_killed       100000 non-null  object
 5   number_of_pedestrians_injured  100000 non-null  object
 6   number_of_pedestrians_killed   100000 non-null  object
 7   number_of_cyclist_injured      100000 non-null  object
 8   number_of_cyclist_killed       100000 non-null  object
 9   number_of_motorist_injured     100000 non-null  object
 10  number_of_motorist_killed      100000 non-null  object
 11  contributing_factor_vehicle_1  99522 non-null   object
 12  contributing_factor_vehicle_2  76526 non-null

In [8]:
# create list of vehicle types that count as bicycles
bike_list = ['Bike','BICYCLE','Minibike','Minicycle']

# filter dataframe for any columns in vehicle type that are in the bike list
b_collisions = collisions[(collisions['vehicle_type_code1'].isin(bike_list)) |\
                         (collisions['vehicle_type_code2'].isin(bike_list)) |\
                         (collisions['vehicle_type_code_3'].isin(bike_list)) |\
                         (collisions['vehicle_type_code_4'].isin(bike_list)) |\
                         (collisions['vehicle_type_code_5'].isin(bike_list))]

In [9]:
# set row display to 20
pd.set_option('display.max_rows',20)

# preview bike list filtered dataframe
b_collisions

Unnamed: 0,crash_date,crash_time,on_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,collision_id,vehicle_type_code1,vehicle_type_code2,borough,zip_code,latitude,longitude,location,cross_street_name,off_street_name,contributing_factor_vehicle_3,contributing_factor_vehicle_4,vehicle_type_code_3,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
53,2021-04-16T00:00:00.000,11:00,,1,0,0,0,1,0,0,0,Turning Improperly,Unspecified,4407792,Station Wagon/Sport Utility Vehicle,Bike,QUEENS,11368,40.74958,-73.86541,"{'latitude': '40.74958', 'longitude': '-73.865...",100-10 ROOSEVELT AVENUE,,,,,,,
92,2021-04-14T00:00:00.000,0:00,BATH AVENUE,1,0,0,0,1,0,0,0,Failure to Yield Right-of-Way,Unspecified,4407649,Station Wagon/Sport Utility Vehicle,Bike,,,40.601864,-74.00232,"{'latitude': '40.601864', 'longitude': '-74.00...",,,,,,,,
130,2021-04-14T00:00:00.000,20:10,WASHINGTON AVENUE,0,0,0,0,0,0,0,0,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,Unspecified,4407381,Station Wagon/Sport Utility Vehicle,Bike,BROOKLYN,11238,40.68821,-73.96583,"{'latitude': '40.68821', 'longitude': '-73.965...",,LAFAYETTE AVENUE,,,,,,
142,2021-04-13T00:00:00.000,17:55,GRANT HIGHWAY,1,0,0,0,1,0,0,0,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,Unspecified,4407789,Station Wagon/Sport Utility Vehicle,Bike,BRONX,10452,40.844105,-73.923065,"{'latitude': '40.844105', 'longitude': '-73.92...",,UNIVERSITY AVENUE,,,,,,
148,2021-04-14T00:00:00.000,19:45,FLATBUSH AVENUE EXTENSION,1,0,0,0,1,0,0,0,Driver Inattention/Distraction,Driver Inattention/Distraction,4407414,Sedan,Bike,BROOKLYN,11201,40.69484,-73.98391,"{'latitude': '40.69484', 'longitude': '-73.983...",,JOHNSON STREET,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99882,2020-04-28T00:00:00.000,13:30,GREENE AVENUE,1,0,0,0,1,0,0,0,Driver Inattention/Distraction,Unspecified,4310872,Sedan,Bike,,,40.6874160,-73.9597200,"{'latitude': '40.687416', 'longitude': '-73.95...",,,,,,,,
99927,2020-06-03T00:00:00.000,0:00,CORONA AVENUE,0,0,0,0,0,0,0,0,Failure to Yield Right-of-Way,Unspecified,4317353,Station Wagon/Sport Utility Vehicle,Bike,QUEENS,11368,40.7442800,-73.8617860,"{'latitude': '40.74428', 'longitude': '-73.861...",,102 STREET,,,,,,
99933,2020-05-23T00:00:00.000,23:43,WOODHAVEN BOULEVARD,1,0,0,0,1,0,0,0,Driver Inattention/Distraction,,4315546,Bike,,QUEENS,11421,40.6975560,-73.8527760,"{'latitude': '40.697556', 'longitude': '-73.85...",,PARK LANE SOUTH,,,,,,
99937,2020-04-28T00:00:00.000,18:15,DELANCEY STREET,1,0,0,0,1,0,0,0,Driver Inattention/Distraction,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,4310822,Station Wagon/Sport Utility Vehicle,Bike,,,40.7179260,-73.9856800,"{'latitude': '40.717926', 'longitude': '-73.98...",,,,,,,,


In [10]:
# look at column dtypes and info
b_collisions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5034 entries, 53 to 99975
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   crash_date                     5034 non-null   object
 1   crash_time                     5034 non-null   object
 2   on_street_name                 4012 non-null   object
 3   number_of_persons_injured      5033 non-null   object
 4   number_of_persons_killed       5034 non-null   object
 5   number_of_pedestrians_injured  5034 non-null   object
 6   number_of_pedestrians_killed   5034 non-null   object
 7   number_of_cyclist_injured      5034 non-null   object
 8   number_of_cyclist_killed       5034 non-null   object
 9   number_of_motorist_injured     5034 non-null   object
 10  number_of_motorist_killed      5034 non-null   object
 11  contributing_factor_vehicle_1  5032 non-null   object
 12  contributing_factor_vehicle_2  4552 non-null   object
 13  c

In [11]:
# pickle dataframe
b_collisions.to_pickle('./pickle/b_collisions')

#### Next Steps...
There are a ton of null values in the motor vehicle dataframe...some are ok, others are not. For instance if we don't even have the `latitude` or `longitude` of the crash site we have no idea where the crash occurred. Consider dropping rows where any null in coordinates.
<br>
Also, show change in ridership vs change in Motor Vehicle Accidents containing some type of bicycle.<br>
Probably wil have to engineer a feature that is a percentage of accidents involving bicycle:
- overall
- per year
- per month during the COVID months


## Shapefiles

### Bike Priority Areas (shapefile)

In [12]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("byz4-8k8n")

# create GeoDataFrame out of request
b_priority_gdf = gpd.GeoDataFrame(results)

In [13]:
# name gdf
b_priority_gdf.name = 'bike_priorty_areas'

# preview
b_priority_gdf

Unnamed: 0,boro_cd_cod,shape_leng,the_geom,shape_area
0,BK 3,36213.8362463,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",79460977.3423
1,BK 5,58026.4575893,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",156204154.973
2,BK 12,52468.9113739,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",99833185.8188
3,BK 14,49259.5617348,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",82168722.5327
4,BK 15,96010.9373532,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",131678761.813
5,BK 17,43287.2618759,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",93791381.4552
6,QN 3,37011.0141973,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",82975598.8703
7,QN 4,36820.7520393,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",65634904.8855
8,QN 5,69923.9500542,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",210416759.573
9,BK 4,37060.9166461,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",56660385.9663


In [14]:
b_priority_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   boro_cd_cod  10 non-null     object
 1   shape_leng   10 non-null     object
 2   the_geom     10 non-null     object
 3   shape_area   10 non-null     object
dtypes: object(4)
memory usage: 448.0+ bytes


### Street Improvement Projects (Corridors)

In [15]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("rmgf-vu32")

# create GeoDataFrame out of request
st_imp_corridors_gdf = gpd.GeoDataFrame(results)

# name gdf
st_imp_corridors_gdf.name = 'street_improvement_corridors'

#preview 
st_imp_corridors_gdf

Unnamed: 0,the_geom,pjct_name,sip_year,end_date,shape_leng
0,"{'type': 'MultiLineString', 'coordinates': [[[...",Baychester Ave (Boston Rd to E 233 St),2016,2016-10-27T00:00:00.000Z,2769.06422937
1,"{'type': 'MultiLineString', 'coordinates': [[[...",Castle Hill Ave (E Tremont Ave to Hart St),2016,2016-12-09T00:00:00.000Z,11603.5811528
2,"{'type': 'MultiLineString', 'coordinates': [[[...",Union Square,2016,2016-12-05T00:00:00.000Z,7772.55689005
3,"{'type': 'MultiLineString', 'coordinates': [[[...",20th Ave,2016,2016-11-18T00:00:00.000Z,4590.7476249
4,"{'type': 'MultiLineString', 'coordinates': [[[...",Baychester Ave-E 241 St (E 233 St to Carpenter...,2018,2018-07-31T00:00:00.000Z,6981.19857529
...,...,...,...,...,...
699,"{'type': 'MultiLineString', 'coordinates': [[[...",Hylan Blvd & New Dorp Lane,2009,2009-06-08T00:00:00.000Z,13440.0780147
700,"{'type': 'MultiLineString', 'coordinates': [[[...",Atlantic Ave Ph I - Capital HWD10105,2020,2020-12-22T00:00:00.000Z,11260.3239094
701,"{'type': 'MultiLineString', 'coordinates': [[[...","University Ave, Washington Bridge to Featherbe...",2020,2020-11-13T00:00:00.000Z,450.510345682
702,"{'type': 'MultiLineString', 'coordinates': [[[...",Main St SBS - Capital HWQ100FJM,2020,2020-06-17T00:00:00.000Z,3569.11425307


In [16]:
st_imp_corridors_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   the_geom    704 non-null    object
 1   pjct_name   704 non-null    object
 2   sip_year    704 non-null    object
 3   end_date    703 non-null    object
 4   shape_leng  704 non-null    object
dtypes: object(5)
memory usage: 27.6+ KB


In [17]:
st_imp_corridors_gdf.name

'street_improvement_corridors'

### Street Improvement Projects (Intersections)

In [18]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("5r5m-chix")

# create GeoDataFrame out of request
st_imp_intersect_gdf = gpd.GeoDataFrame(results)

# name gdf
st_imp_intersect_gdf.name = 'street_improvement_intersections'

#preview 
st_imp_intersect_gdf

Unnamed: 0,the_geom,sip_year,pjct_name,end_date
0,"{'type': 'MultiPoint', 'coordinates': [[-73.90...",2012,East New York Ave at Atlantic Ave,2012-11-15T00:00:00.000Z
1,"{'type': 'MultiPoint', 'coordinates': [[-73.97...",2018,Vanderbilt Ave at E 2nd St and E 3rd St,2018-12-19T00:00:00.000Z
2,"{'type': 'MultiPoint', 'coordinates': [[-73.91...",2014,46th Street and Queens Blvd.,2014-07-28T00:00:00.000Z
3,"{'type': 'MultiPoint', 'coordinates': [[-73.95...",2016,St Marks Ave & Classon Ave,2016-12-05T00:00:00.000Z
4,"{'type': 'MultiPoint', 'coordinates': [[-73.96...",2019,Lexington Ave and 59th St,2019-03-31T00:00:00.000Z
...,...,...,...,...
232,"{'type': 'MultiPoint', 'coordinates': [[-73.97...",2011,4th Avenue,2011-12-06T00:00:00.000Z
233,"{'type': 'MultiPoint', 'coordinates': [[-73.86...",2020,White Plains Rd and Arnow Ave,2021-01-12T00:00:00.000Z
234,"{'type': 'MultiPoint', 'coordinates': [[-73.90...",2017,W 259 St & Netherland Ave (EC),2017-09-29T00:00:00.000Z
235,"{'type': 'MultiPoint', 'coordinates': [[-73.96...",2019,Atlantic Av and Grand Av,2019-07-18T00:00:00.000Z


In [19]:
st_imp_intersect_gdf.name

'street_improvement_intersections'

In [20]:
st_imp_intersect_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 237 entries, 0 to 236
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   the_geom   237 non-null    object
 1   sip_year   237 non-null    object
 2   pjct_name  237 non-null    object
 3   end_date   237 non-null    object
dtypes: object(4)
memory usage: 7.5+ KB


### Cityracks

In [21]:
bike_racks = gpd.read_file(filename='./data/2013-cityracks-shp/city_racks_2013_06_28.shp')
bike_racks

Unnamed: 0,Name,small,large,circular,mini_hoop,total_rack,geometry
0,1 7 AV S,5,0,0,0,5,POINT Z (982903.570 205129.999 0.000)
1,1 BOERUM PL,1,0,0,0,1,POINT Z (987330.416 191302.730 0.000)
2,1 CENTRE ST,10,0,0,0,10,POINT Z (983210.953 199016.513 0.000)
3,1 E 13 ST,1,0,0,0,1,POINT Z (985897.840 207157.885 0.000)
4,1 E 183 ST,0,0,2,0,2,POINT Z (1010993.969 252137.340 0.000)
...,...,...,...,...,...,...,...
11729,997 1 AV,0,1,0,0,1,POINT Z (994160.431 215021.045 0.000)
11730,997 FULTON ST,1,0,0,0,1,POINT Z (994511.753 188057.801 0.000)
11731,998 AMSTERDAM AV,1,0,0,0,1,POINT Z (994125.656 231703.878 0.000)
11732,998 COLUMBUS AV,0,0,0,1,1,POINT Z (994936.194 231279.407 0.000)


In [22]:
bike_racks.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 11734 entries, 0 to 11733
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Name        11734 non-null  object  
 1   small       11734 non-null  int64   
 2   large       11734 non-null  int64   
 3   circular    11734 non-null  int64   
 4   mini_hoop   11734 non-null  int64   
 5   total_rack  11734 non-null  int64   
 6   geometry    11734 non-null  geometry
dtypes: geometry(1), int64(5), object(1)
memory usage: 641.8+ KB


In [23]:
bike_racks.crs

<Projected CRS: EPSG:2263>
Name: NAD83 / New York Long Island (ftUS)
Axis Info [cartesian]:
- X[east]: Easting (US survey foot)
- Y[north]: Northing (US survey foot)
Area of Use:
- name: USA - New York - SPCS - Long Island
- bounds: (-74.26, 40.47, -71.8, 41.3)
Coordinate Operation:
- name: SPCS83 New York Long Island zone (US Survey feet)
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

#### Next Steps...
Where are there more city racks, and are they distributed equitably?
<br>
Maybe bring in some demographic data to compare.
But how would this help with the final ask?

### Bicycle Routes

In [24]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("cc5c-sm6z", limit=100000)

# create GeoDataFrame out of request
b_routes_gdf = gpd.GeoDataFrame(results)

# name gdf
b_routes_gdf.name = 'bike_routes'

# preview
b_routes_gdf

Unnamed: 0,segmentid,street,the_geom,shape_leng,boro,facilitycl,fromstreet,tostreet,onoffst,allclasses,bikedir,lanecount,ft_facilit,tf_facilit,comments,tf2facilit,ft2facilit
0,53,CONFERENCE HOUSE PARK GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",131.771990229,5,I,HYLAN BLVD,SWINNERTON ST,OFF,I,2,2,Greenway,Greenway,,,
1,57,CONFERENCE HOUSE PARK GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",268.689582437,5,I,HYLAN BLVD,SWINNERTON ST,OFF,I,2,2,Greenway,Greenway,,,
2,59,CONFERENCE HOUSE PARK GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",420.193252605,5,I,HYLAN BLVD,SWINNERTON ST,OFF,I,2,2,Greenway,Greenway,,,
3,61,CONFERENCE HOUSE PARK GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",238.242197819,5,I,HYLAN BLVD,SWINNERTON ST,OFF,I,2,2,Greenway,Greenway,,,
4,64,CONFERENCE HOUSE PARK GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",646.293832417,5,I,HYLAN BLVD,SWINNERTON ST,OFF,I,2,2,Greenway,Greenway,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19870,79081,BRONX RIVER GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",116.876290616,2,I,EDGEWATER RD,BRONX RIVER GREENWAY,OFF,I,2,2,Sidewalk,,,,
19871,196825,BRONX RIVER GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",101.800150667,2,I,EDGEWATER RD,BRONX RIVER GREENWAY,OFF,I,2,2,Sidewalk,,,,
19872,196801,BRONX RIVER GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",66.0292754239,2,I,BRONX RIVER GREENWAY,EDGEWATER RD,OFF,I,2,2,,Sidewalk,,,
19873,196800,BRONX RIVER GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",158.356411407,2,I,BRONX RIVER GREENWAY,EDGEWATER RD,OFF,I,2,2,,Sidewalk,,,


In [25]:
b_routes_gdf.name

'bike_routes'

In [26]:
b_routes_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 19875 entries, 0 to 19874
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   segmentid   19875 non-null  object
 1   street      19875 non-null  object
 2   the_geom    19875 non-null  object
 3   shape_leng  19875 non-null  object
 4   boro        19875 non-null  object
 5   facilitycl  19875 non-null  object
 6   fromstreet  19875 non-null  object
 7   tostreet    19875 non-null  object
 8   onoffst     19875 non-null  object
 9   allclasses  19875 non-null  object
 10  bikedir     19875 non-null  object
 11  lanecount   19875 non-null  object
 12  ft_facilit  14732 non-null  object
 13  tf_facilit  14557 non-null  object
 14  comments    1638 non-null   object
 15  tf2facilit  37 non-null     object
 16  ft2facilit  11 non-null     object
dtypes: object(17)
memory usage: 2.6+ MB


### Fromat shapfiles

- create custom function to set CRS
- apply function to each geodataframe
- check each GDF to ensure in correct CRS
- export to file directory

In [27]:
# function to set the geometry for shapefiles
def set_geom(gdf,geom_col,set_crs=4326, to_crs=2263):
    """
    function that converts geom column from a geojson to an
    independent geometry with coordinates
    sets the geometry column 
    sets CRS default is NAD 2263 for NYC
    renames to geomtry
    pass in df
    pass in column name as string
    """
    gdf[geom_col] = gdf[geom_col].apply(shape)
    gdf.set_geometry(geom_col, inplace=True)
    gdf.rename_geometry('geometry', inplace=True)
    gdf.set_crs(set_crs, inplace=True)
    gdf.to_crs(to_crs, inplace=True)
    return gdf.head()

In [28]:
# create list of api derived shape files
shp_files = [b_routes_gdf, st_imp_corridors_gdf, st_imp_intersect_gdf, b_priority_gdf]

In [30]:
# apply custom formula to list of shp_files
for gdf in shp_files:
    set_geom(gdf, 'the_geom')

In [37]:
# check crs for each shapefile
for gdf in shp_files:
    print("{} is in {} crs".format(gdf.name, gdf.crs))

bike_routes is in epsg:2263 crs
street_improvement_corridors is in epsg:2263 crs
street_improvement_intersections is in epsg:2263 crs
bike_priorty_areas is in epsg:2263 crs


In [33]:
# view info for api shpiles
for file in shp_files:
    print(file.info())

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 19875 entries, 0 to 19874
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   segmentid   19875 non-null  object  
 1   street      19875 non-null  object  
 2   geometry    19875 non-null  geometry
 3   shape_leng  19875 non-null  object  
 4   boro        19875 non-null  object  
 5   facilitycl  19875 non-null  object  
 6   fromstreet  19875 non-null  object  
 7   tostreet    19875 non-null  object  
 8   onoffst     19875 non-null  object  
 9   allclasses  19875 non-null  object  
 10  bikedir     19875 non-null  object  
 11  lanecount   19875 non-null  object  
 12  ft_facilit  14732 non-null  object  
 13  tf_facilit  14557 non-null  object  
 14  comments    1638 non-null   object  
 15  tf2facilit  37 non-null     object  
 16  ft2facilit  11 non-null     object  
dtypes: geometry(1), object(16)
memory usage: 2.6+ MB
None
<class 'geopandas.geodataf

In [35]:
# export api shpfiles to shp folder 
for file in shp_files:
    file.to_file('./data/shp/' + file.name + '.shp')