# Data gathering and preparation
This notebook with gather all the different datasets used for this project and prep them to be utilized for exploratory data analysis.

- still need to get COVID data to compare with change in bicycle ridership

In [1]:
# import the necessary packages
import pandas as pd
import numpy as np
import geopandas as gpd
import pickle
from sodapy import Socrata
from shapely.geometry import shape
import os

# import api token
from src import *

### COVID-19 Daily Counts of Cases, Hopitalizations, and Deaths

In [2]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("rc75-m7u3", limit=100000)

# create Dataframe out of request
covid_counts = pd.DataFrame.from_records(results)

# preview dataset
covid_counts.head()

Unnamed: 0,date_of_interest,case_count,probable_case_count,hospitalized_count,death_count,death_count_probable,case_count_7day_avg,all_case_count_7day_avg,hosp_count_7day_avg,death_count_7day_avg,...,si_probable_case_count,si_hospitalized_count,si_death_count,si_probable_death_count,si_case_count_7day_avg,si_all_case_count_7day_avg,si_hospitalized_count_7day_avg,si_death_count_7day_avg,si_all_death_count_7day_avg,incomplete
0,2020-02-29T00:00:00.000,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-03-01T00:00:00.000,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020-03-02T00:00:00.000,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-03-03T00:00:00.000,1,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-03-04T00:00:00.000,5,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
covid_counts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 62 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   date_of_interest                471 non-null    object
 1   case_count                      471 non-null    object
 2   probable_case_count             471 non-null    object
 3   hospitalized_count              471 non-null    object
 4   death_count                     471 non-null    object
 5   death_count_probable            471 non-null    object
 6   case_count_7day_avg             471 non-null    object
 7   all_case_count_7day_avg         471 non-null    object
 8   hosp_count_7day_avg             471 non-null    object
 9   death_count_7day_avg            471 non-null    object
 10  all_death_count_7day_avg        471 non-null    object
 11  bx_case_count                   471 non-null    object
 12  bx_probable_case_count          471 non-null    ob

NYC Open Data SOCRATA API Doesn't really give me what I need. Will probably have to clone the github repo and pull from there.
<br>
I need COVID-19 data on a weekly basis by ZCTA.

### COVID-19 NYC Github Data

#### Antibody by MODZCTA

In [4]:
anti_by_modzcta = pd.read_csv('./coronavirus-data/totals/antibody-by-modzcta.csv')
anti_by_modzcta.head()

Unnamed: 0,modzcta_first,NEIGHBORHOOD_NAME,label,lat,lon,NUM_PEOP_TEST,NUM_PEOP_POS,PERCENT_POSITIVE,TEST_RATE
0,10001,Chelsea/NoMad/West Chelsea,"10001, 10118",40.750693,-73.997137,10720,2063,19.2,38822.169
1,10002,Chinatown/Lower East Side,10002,40.715781,-73.986176,30767,7592,24.7,40846.911
2,10003,East Village/Gramercy/Greenwich Village,10003,40.731825,-73.989164,18887,3383,17.9,34990.304
3,10004,Financial District,10004,40.703675,-74.013106,1526,259,17.0,51343.892
4,10005,Financial District,10005,40.706092,-74.008861,3244,566,17.4,37043.695


In [5]:
# make all column headers lowercase
anti_by_modzcta.columns = [x.lower() for x in anti_by_modzcta.columns]

In [6]:
anti_by_modzcta.to_pickle('./pickle/anti_by_modzcta')

#### Data by MODZCTA

In [7]:
data_by_modzcta = pd.read_csv('./coronavirus-data/totals/data-by-modzcta.csv')
data_by_modzcta.head()

Unnamed: 0,MODIFIED_ZCTA,NEIGHBORHOOD_NAME,BOROUGH_GROUP,label,lat,lon,COVID_CASE_COUNT,COVID_CASE_RATE,POP_DENOMINATOR,COVID_DEATH_COUNT,COVID_DEATH_RATE,PERCENT_POSITIVE,TOTAL_COVID_TESTS
0,10001,Chelsea/NoMad/West Chelsea,Manhattan,"10001, 10118",40.750693,-73.997137,1622,5874.03,27613.09,35,126.75,7.2,22537
1,10002,Chinatown/Lower East Side,Manhattan,10002,40.715781,-73.986176,6113,8115.75,75322.71,287,381.03,11.91,51451
2,10003,East Village/Gramercy/Greenwich Village,Manhattan,10003,40.731825,-73.989164,2929,5426.3,53977.81,49,90.78,6.78,43151
3,10004,Financial District,Manhattan,10004,40.703675,-74.013106,253,8512.45,2972.12,2,67.29,6.58,3860
4,10005,Financial District,Manhattan,10005,40.706092,-74.008861,430,4910.23,8757.23,0,0.0,6.39,6671


In [8]:
data_by_modzcta.columns = [x.lower() for x in data_by_modzcta.columns]

In [9]:
data_by_modzcta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   modified_zcta      177 non-null    int64  
 1   neighborhood_name  177 non-null    object 
 2   borough_group      177 non-null    object 
 3   label              177 non-null    object 
 4   lat                177 non-null    float64
 5   lon                177 non-null    float64
 6   covid_case_count   177 non-null    int64  
 7   covid_case_rate    177 non-null    float64
 8   pop_denominator    177 non-null    float64
 9   covid_death_count  177 non-null    int64  
 10  covid_death_rate   177 non-null    float64
 11  percent_positive   177 non-null    float64
 12  total_covid_tests  177 non-null    int64  
dtypes: float64(6), int64(4), object(3)
memory usage: 18.1+ KB


In [10]:
data_by_modzcta.to_pickle('./pickle/data_by_modzcta')

#### Data by Day

In [11]:
data_by_day = pd.read_csv('coronavirus-data/trends/data-by-day.csv',
                         parse_dates=['date_of_interest'])
data_by_day.head()

Unnamed: 0,date_of_interest,CASE_COUNT,PROBABLE_CASE_COUNT,HOSPITALIZED_COUNT,DEATH_COUNT,PROBABLE_DEATH_COUNT,CASE_COUNT_7DAY_AVG,ALL_CASE_COUNT_7DAY_AVG,HOSP_COUNT_7DAY_AVG,DEATH_COUNT_7DAY_AVG,...,SI_PROBABLE_CASE_COUNT,SI_HOSPITALIZED_COUNT,SI_DEATH_COUNT,SI_PROBABLE_DEATH_COUNT,SI_CASE_COUNT_7DAY_AVG,SI_ALL_CASE_COUNT_7DAY_AVG,SI_HOSPITALIZED_COUNT_7DAY_AVG,SI_DEATH_COUNT_7DAY_AVG,SI_ALL_DEATH_COUNT_7DAY_AVG,INCOMPLETE
0,2020-02-29,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020-03-01,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020-03-02,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-03-03,1,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-03-04,5,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
data_by_day.columns = [x.lower() for x in data_by_day.columns]

In [13]:
data_by_day.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 62 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   date_of_interest                471 non-null    datetime64[ns]
 1   case_count                      471 non-null    int64         
 2   probable_case_count             471 non-null    int64         
 3   hospitalized_count              471 non-null    int64         
 4   death_count                     471 non-null    int64         
 5   probable_death_count            471 non-null    int64         
 6   case_count_7day_avg             471 non-null    int64         
 7   all_case_count_7day_avg         471 non-null    int64         
 8   hosp_count_7day_avg             471 non-null    int64         
 9   death_count_7day_avg            471 non-null    int64         
 10  all_death_count_7day_avg        471 non-null    int64         
 11  bx_cas

In [14]:
data_by_day.shape

(471, 62)

In [15]:
data_by_day.to_pickle('./pickle/data_by_day')

#### Caserate by MODZCTA

In [16]:
caserate_by_modzcta = pd.read_csv('./coronavirus-data/trends/caserate-by-modzcta.csv',
                                 parse_dates=['week_ending'])
caserate_by_modzcta.head()

Unnamed: 0,week_ending,CASERATE_CITY,CASERATE_BX,CASERATE_BK,CASERATE_MN,CASERATE_QN,CASERATE_SI,CASERATE_10001,CASERATE_10002,CASERATE_10003,...,CASERATE_11432,CASERATE_11433,CASERATE_11434,CASERATE_11435,CASERATE_11436,CASERATE_11691,CASERATE_11692,CASERATE_11693,CASERATE_11694,CASERATE_11697
0,2020-08-08,19.05,26.51,18.75,15.96,17.48,16.38,7.24,34.52,5.56,...,23.04,16.43,8.99,20.54,9.98,29.91,28.53,0.0,24.06,58.95
1,2020-08-15,20.06,28.13,19.34,15.84,18.41,22.05,14.49,10.62,3.71,...,41.15,30.12,14.98,23.97,9.98,19.44,19.02,7.89,19.25,0.0
2,2020-08-22,18.75,23.41,16.56,17.74,18.99,18.9,10.86,19.91,16.67,...,24.69,16.43,11.98,17.12,9.98,37.39,4.76,0.0,33.69,29.48
3,2020-08-29,19.68,25.88,20.82,16.7,16.95,18.27,7.24,10.62,18.53,...,18.11,13.69,14.98,18.83,9.98,56.84,4.76,15.78,14.44,29.48
4,2020-09-05,19.95,18.76,21.95,15.66,19.12,31.29,32.59,7.97,18.53,...,11.52,16.43,10.48,22.25,4.99,46.37,14.27,7.89,14.44,88.43


In [17]:
caserate_by_modzcta.columns = [x.lower() for x in caserate_by_modzcta.columns]

In [18]:
caserate_by_modzcta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Columns: 184 entries, week_ending to caserate_11697
dtypes: datetime64[ns](1), float64(183)
memory usage: 63.4 KB


In [19]:
caserate_by_modzcta.shape

(44, 184)

In [20]:
caserate_by_modzcta.to_pickle('./pickle/caserate_by_modzcta')

#### Percent positive by MODZCTA

In [21]:
perc_pos_modzcta = pd.read_csv('./coronavirus-data/trends/percentpositive-by-modzcta.csv',
                              parse_dates=['week_ending'])
perc_pos_modzcta.head()

Unnamed: 0,week_ending,PCTPOS_CITY,PCTPOS_BX,PCTPOS_BK,PCTPOS_MN,PCTPOS_QN,PCTPOS_SI,PCTPOS_10001,PCTPOS_10002,PCTPOS_10003,...,PCTPOS_11432,PCTPOS_11433,PCTPOS_11434,PCTPOS_11435,PCTPOS_11436,PCTPOS_11691,PCTPOS_11692,PCTPOS_11693,PCTPOS_11694,PCTPOS_11697
0,2020-08-08,1.27,1.68,1.3,0.91,1.23,1.45,0.75,1.99,0.27,...,1.92,1.48,0.71,1.39,0.89,1.65,1.38,0.86,1.43,2.47
1,2020-08-15,1.23,1.68,1.25,0.87,1.21,1.45,0.92,0.56,0.17,...,2.82,2.65,0.76,1.55,1.2,1.15,1.42,0.84,0.94,0.0
2,2020-08-22,1.04,1.34,1.01,0.78,1.15,1.18,0.37,1.03,0.26,...,1.65,1.15,1.11,1.58,0.84,1.32,0.13,0.0,2.7,1.06
3,2020-08-29,1.07,1.57,1.19,0.7,1.06,1.04,0.53,0.43,0.37,...,1.2,0.87,1.06,1.19,0.82,2.1,0.63,0.82,0.95,1.14
4,2020-09-05,0.96,1.04,1.13,0.63,1.05,1.25,1.03,0.5,0.38,...,1.42,0.89,0.56,1.27,0.5,1.88,0.57,0.37,0.98,3.42


In [22]:
perc_pos_modzcta.columns = [x.lower() for x in perc_pos_modzcta.columns]

In [23]:
perc_pos_modzcta.shape

(44, 184)

In [24]:
perc_pos_modzcta.to_pickle('./pickle/perc_pos_modzcta')

In [25]:
%who DataFrame

anti_by_modzcta	 caserate_by_modzcta	 covid_counts	 data_by_day	 data_by_modzcta	 perc_pos_modzcta	 


### Bicycle Counters

In [26]:
b_counters = pd.read_csv('./data/Bicycle_Counters.csv',
                        index_col='id')
b_counters.sort_values(by='id')

Unnamed: 0_level_0,name,latitude,longitude,domain,site,timezone,interval,sens,installationDate,counter
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Manhattan Bridge 2012 Test Bike Counter,40.69981,-73.98589,New York City DOT,100005020,(UTC-05:00) US/Eastern;DST,15,0,08/31/2012,
1,2nd Avenue - 26th St S,40.73971,-73.97954,New York City DOT,100009424,(UTC-05:00) US/Eastern;DST,15,0,05/22/2015,
2,Prospect Park West,40.671288,-73.971382,New York City DOT,100009425,(UTC-05:00) US/Eastern;DST,15,0,11/07/2016,Y2H13094304
3,Manhattan Bridge Ped Path,40.714573,-73.99495,New York City DOT,100009426,(UTC-05:00) US/Eastern;DST,15,0,12/04/2013,Y2H13074107
4,Williamsburg Bridge Bike Path,40.71053,-73.96145,New York City DOT,100009427,(UTC-05:00) US/Eastern;DST,15,0,12/03/2013,Y2H13074108
5,Ed Koch Queensboro Bridge Shared Path,40.751038,-73.94082,New York City DOT,100009428,(UTC-05:00) US/Eastern;DST,15,0,12/04/2013,Y2H19111445
6,Manhattan Bridge 2013 to 2018 Bike Counter,40.699768,-73.98582,New York City DOT,100009429,(UTC-05:00) US/Eastern;DST,15,0,12/03/2013,
7,Staten Island Ferry,40.643387,-74.072075,New York City DOT,100010017,(UTC-05:00) US/Eastern;DST,15,0,03/31/2016,Y2H13094300
8,Pulaski Bridge,40.742563,-73.951492,New York City DOT,100010018,(UTC-05:00) US/Eastern;DST,15,0,06/24/2017,Y2H13094301
9,Kent Ave btw North 8th St and North 9th St,40.720959,-73.96093,New York City DOT,100010019,(UTC-05:00) US/Eastern;DST,15,0,11/22/2016,Y2H13094302


Bicycle Counter csv has `23` counters with name, lat/lon, site?, installation date and counter columns. Need to figure out what `counter` column is as it has some NaN

In [27]:
b_counters.to_pickle('./pickle/b_counters')

### Bicycle Counts

In [28]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("uczf-rk3c")

# create GeoDataFrame out of request
b_counts = pd.DataFrame.from_records(results)

# preview dataframe
b_counts

Unnamed: 0,id1,counts,date,status,site
0,0,41,2012-08-31T00:00:00.000,4,100005020
1,1,52,2012-08-31T00:15:00.000,4,100005020
2,2,38,2012-08-31T00:30:00.000,4,100005020
3,3,36,2012-08-31T00:45:00.000,4,100005020
4,4,40,2012-08-31T01:00:00.000,4,100005020
...,...,...,...,...,...
995,995,164,2012-09-10T08:45:00.000,4,100005020
996,996,151,2012-09-10T09:00:00.000,4,100005020
997,997,163,2012-09-10T09:15:00.000,4,100005020
998,998,151,2012-09-10T09:30:00.000,4,100005020


#### Next Steps...
`site` column can facilitate merge with `b_counters` dataframe to get lat/lon for mapping purposes.
<br>
I am not sure what the `status` column means.
<br>
I can show change in ridership over time by merging `b_counters` and `b_counts` dataframes.
<br>
If I want to compare with COVID, I can find COVID rates for same time period in areas that bicycle counters are located.

In [29]:
b_counts.to_pickle('./pickle/b_counts')

### Motor Vehicle Collisions

In [30]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("h9gi-nx95", limit=100000)

# create GeoDataFrame out of request
collisions = pd.DataFrame.from_records(results)

pd.set_option('display.max_columns', None)
# preview dataframe
collisions.head()

Unnamed: 0,crash_date,crash_time,on_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,collision_id,vehicle_type_code1,vehicle_type_code2,borough,zip_code,latitude,longitude,location,cross_street_name,off_street_name,contributing_factor_vehicle_3,contributing_factor_vehicle_4,vehicle_type_code_3,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2021-04-14T00:00:00.000,5:32,BRONX WHITESTONE BRIDGE,0,0,0,0,0,0,0,0,Following Too Closely,Unspecified,4407480,Sedan,Sedan,,,,,,,,,,,,,
1,2021-04-13T00:00:00.000,21:35,,1,0,1,0,0,0,0,0,Unspecified,,4407147,Sedan,,BROOKLYN,11217.0,40.68358,-73.97617,"{'latitude': '40.68358', 'longitude': '-73.976...",620 ATLANTIC AVENUE,,,,,,,
2,2021-04-15T00:00:00.000,16:15,HUTCHINSON RIVER PARKWAY,0,0,0,0,0,0,0,0,Pavement Slippery,,4407665,Station Wagon/Sport Utility Vehicle,,,,,,,,,,,,,,
3,2021-04-13T00:00:00.000,16:00,VANDERVORT AVENUE,0,0,0,0,0,0,0,0,Following Too Closely,Unspecified,4407811,Sedan,,BROOKLYN,11222.0,,,,,ANTHONY STREET,,,,,,
4,2021-04-12T00:00:00.000,8:25,EDSON AVENUE,0,0,0,0,0,0,0,0,Unspecified,Unspecified,4406885,Station Wagon/Sport Utility Vehicle,Sedan,,,0.0,0.0,"{'latitude': '0.0', 'longitude': '0.0'}",,,,,,,,


In [31]:
collisions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 29 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   crash_date                     100000 non-null  object
 1   crash_time                     100000 non-null  object
 2   on_street_name                 73147 non-null   object
 3   number_of_persons_injured      99999 non-null   object
 4   number_of_persons_killed       100000 non-null  object
 5   number_of_pedestrians_injured  100000 non-null  object
 6   number_of_pedestrians_killed   100000 non-null  object
 7   number_of_cyclist_injured      100000 non-null  object
 8   number_of_cyclist_killed       100000 non-null  object
 9   number_of_motorist_injured     100000 non-null  object
 10  number_of_motorist_killed      100000 non-null  object
 11  contributing_factor_vehicle_1  99514 non-null   object
 12  contributing_factor_vehicle_2  76675 non-null

In [32]:
# create list of vehicle types that count as bicycles
bike_list = ['Bike','BICYCLE','Minibike','Minicycle']

# filter dataframe for any columns in vehicle type that are in the bike list
b_collisions = collisions[(collisions['vehicle_type_code1'].isin(bike_list)) |\
                         (collisions['vehicle_type_code2'].isin(bike_list)) |\
                         (collisions['vehicle_type_code_3'].isin(bike_list)) |\
                         (collisions['vehicle_type_code_4'].isin(bike_list)) |\
                         (collisions['vehicle_type_code_5'].isin(bike_list))]

In [33]:
# set row display to 20
pd.set_option('display.max_rows',20)

# preview bike list filtered dataframe
b_collisions

Unnamed: 0,crash_date,crash_time,on_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,contributing_factor_vehicle_2,collision_id,vehicle_type_code1,vehicle_type_code2,borough,zip_code,latitude,longitude,location,cross_street_name,off_street_name,contributing_factor_vehicle_3,contributing_factor_vehicle_4,vehicle_type_code_3,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
54,2021-04-16T00:00:00.000,11:00,,1,0,0,0,1,0,0,0,Turning Improperly,Unspecified,4407792,Station Wagon/Sport Utility Vehicle,Bike,QUEENS,11368,40.74958,-73.86541,"{'latitude': '40.74958', 'longitude': '-73.865...",100-10 ROOSEVELT AVENUE,,,,,,,
93,2021-04-14T00:00:00.000,0:00,BATH AVENUE,1,0,0,0,1,0,0,0,Failure to Yield Right-of-Way,Unspecified,4407649,Station Wagon/Sport Utility Vehicle,Bike,,,40.601864,-74.00232,"{'latitude': '40.601864', 'longitude': '-74.00...",,,,,,,,
131,2021-04-14T00:00:00.000,20:10,WASHINGTON AVENUE,0,0,0,0,0,0,0,0,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,Unspecified,4407381,Station Wagon/Sport Utility Vehicle,Bike,BROOKLYN,11238,40.68821,-73.96583,"{'latitude': '40.68821', 'longitude': '-73.965...",,LAFAYETTE AVENUE,,,,,,
143,2021-04-13T00:00:00.000,17:55,GRANT HIGHWAY,1,0,0,0,1,0,0,0,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,Unspecified,4407789,Station Wagon/Sport Utility Vehicle,Bike,BRONX,10452,40.844105,-73.923065,"{'latitude': '40.844105', 'longitude': '-73.92...",,UNIVERSITY AVENUE,,,,,,
149,2021-04-14T00:00:00.000,19:45,FLATBUSH AVENUE EXTENSION,1,0,0,0,1,0,0,0,Driver Inattention/Distraction,Driver Inattention/Distraction,4407414,Sedan,Bike,BROOKLYN,11201,40.69484,-73.98391,"{'latitude': '40.69484', 'longitude': '-73.983...",,JOHNSON STREET,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99922,2020-07-19T00:00:00.000,20:50,BEDFORD AVENUE,1,0,0,0,1,0,0,0,Alcohol Involvement,Driver Inattention/Distraction,4330581,Station Wagon/Sport Utility Vehicle,Bike,BROOKLYN,11211,40.7189400,-73.9565400,"{'latitude': '40.71894', 'longitude': '-73.956...",,NORTH 9 STREET,,,,,,
99923,2020-08-13T00:00:00.000,19:48,,1,0,0,0,1,0,0,0,Pedestrian/Bicyclist/Other Pedestrian Error/Co...,Unspecified,4338269,Station Wagon/Sport Utility Vehicle,Bike,BRONX,10463,40.8834900,-73.8985700,"{'latitude': '40.88349', 'longitude': '-73.898...",3626 BAILEY AVENUE,,,,,,,
99956,2020-08-11T00:00:00.000,17:45,39 STREET,1,0,0,0,1,0,0,0,Unspecified,Unspecified,4337546,Station Wagon/Sport Utility Vehicle,Bike,BROOKLYN,11218,40.6431660,-73.9905000,"{'latitude': '40.643166', 'longitude': '-73.99...",,FORT HAMILTON PARKWAY,,,,,,
99983,2020-08-13T00:00:00.000,8:14,CATALPA AVENUE,1,0,0,0,1,0,0,0,Unspecified,Unspecified,4338789,Sedan,Bike,QUEENS,11385,40.7034150,-73.8976500,"{'latitude': '40.703415', 'longitude': '-73.89...",,60 LANE,,,,,,


In [34]:
# look at column dtypes and info
b_collisions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4977 entries, 54 to 99992
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   crash_date                     4977 non-null   object
 1   crash_time                     4977 non-null   object
 2   on_street_name                 3959 non-null   object
 3   number_of_persons_injured      4976 non-null   object
 4   number_of_persons_killed       4977 non-null   object
 5   number_of_pedestrians_injured  4977 non-null   object
 6   number_of_pedestrians_killed   4977 non-null   object
 7   number_of_cyclist_injured      4977 non-null   object
 8   number_of_cyclist_killed       4977 non-null   object
 9   number_of_motorist_injured     4977 non-null   object
 10  number_of_motorist_killed      4977 non-null   object
 11  contributing_factor_vehicle_1  4975 non-null   object
 12  contributing_factor_vehicle_2  4508 non-null   object
 13  c

In [35]:
# pickle dataframe
b_collisions.to_pickle('./pickle/b_collisions')

#### Next Steps...
There are a ton of null values in the motor vehicle dataframe...some are ok, others are not. For instance if we don't even have the `latitude` or `longitude` of the crash site we have no idea where the crash occurred. Consider dropping rows where any null in coordinates.
<br>
Also, show change in ridership vs change in Motor Vehicle Accidents containing some type of bicycle.<br>
Probably wil have to engineer a feature that is a percentage of accidents involving bicycle:
- overall
- per year
- per month during the COVID months


In [61]:
# verify what dataframes are currently active in the notebook
%who DataFrame

anti_by_modzcta	 b_collisions	 b_counters	 b_counts	 caserate_by_modzcta	 collisions	 covid_counts	 data_by_day	 data_by_modzcta	 
perc_pos_modzcta	 


## Shapefiles

### Bike Priority Areas (shapefile)

In [36]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("byz4-8k8n")

# create GeoDataFrame out of request
b_priority_gdf = gpd.GeoDataFrame(results)

In [37]:
# name gdf
b_priority_gdf.name = 'bike_priorty_areas'

# preview
b_priority_gdf

Unnamed: 0,the_geom,shape_leng,boro_cd_cod,shape_area
0,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",36213.8362463,BK 3,79460977.3423
1,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",58026.4575893,BK 5,156204154.973
2,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",52468.9113739,BK 12,99833185.8188
3,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",49259.5617348,BK 14,82168722.5327
4,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",96010.9373532,BK 15,131678761.813
5,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",43287.2618759,BK 17,93791381.4552
6,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",37011.0141973,QN 3,82975598.8703
7,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",36820.7520393,QN 4,65634904.8855
8,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",69923.9500542,QN 5,210416759.573
9,"{'type': 'MultiPolygon', 'coordinates': [[[[-7...",37060.9166461,BK 4,56660385.9663


In [38]:
b_priority_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   the_geom     10 non-null     object
 1   shape_leng   10 non-null     object
 2   boro_cd_cod  10 non-null     object
 3   shape_area   10 non-null     object
dtypes: object(4)
memory usage: 448.0+ bytes


### Street Improvement Projects (Corridors)

In [40]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("dy2j-gkig")

# create GeoDataFrame out of request
st_imp_corridors_gdf = gpd.GeoDataFrame(results)

# name gdf
st_imp_corridors_gdf.name = 'street_improvement_corridors'

#preview 
st_imp_corridors_gdf

Unnamed: 0,the_geom,pjct_name,sip_year,end_date,shape_leng
0,"{'type': 'MultiLineString', 'coordinates': [[[...",Baychester Ave (Boston Rd to E 233 St),2016,2016-10-27T00:00:00.000Z,2769.06422937
1,"{'type': 'MultiLineString', 'coordinates': [[[...",Castle Hill Ave (E Tremont Ave to Hart St),2016,2016-12-09T00:00:00.000Z,11603.5811528
2,"{'type': 'MultiLineString', 'coordinates': [[[...",Union Square,2016,2016-12-05T00:00:00.000Z,7772.55689005
3,"{'type': 'MultiLineString', 'coordinates': [[[...",20th Ave,2016,2016-11-18T00:00:00.000Z,4590.7476249
4,"{'type': 'MultiLineString', 'coordinates': [[[...",Baychester Ave-E 241 St (E 233 St to Carpenter...,2018,2018-07-31T00:00:00.000Z,6981.19857529
...,...,...,...,...,...
701,"{'type': 'MultiLineString', 'coordinates': [[[...",Tillary Phase II Capital HWK639WA,2020,2020-06-05T00:00:00.000Z,7874.73172332
702,"{'type': 'MultiLineString', 'coordinates': [[[...",Atlantic Ave Ph I - Capital HWD10105,2020,2020-12-22T00:00:00.000Z,11260.3239094
703,"{'type': 'MultiLineString', 'coordinates': [[[...","University Ave, Washington Bridge to Featherbe...",2020,2020-11-13T00:00:00.000Z,450.510345682
704,"{'type': 'MultiLineString', 'coordinates': [[[...",Main St SBS - Capital HWQ100FJM,2020,2020-06-17T00:00:00.000Z,3569.11425307


In [41]:
st_imp_corridors_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 706 entries, 0 to 705
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   the_geom    706 non-null    object
 1   pjct_name   706 non-null    object
 2   sip_year    706 non-null    object
 3   end_date    705 non-null    object
 4   shape_leng  706 non-null    object
dtypes: object(5)
memory usage: 27.7+ KB


In [42]:
st_imp_corridors_gdf.name

'street_improvement_corridors'

### Street Improvement Projects (Intersections)

In [44]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("egh6-q2b9")

# create GeoDataFrame out of request
st_imp_intersect_gdf = gpd.GeoDataFrame(results)

# name gdf
st_imp_intersect_gdf.name = 'street_improvement_intersections'

#preview 
st_imp_intersect_gdf

Unnamed: 0,sip_year,pjct_name,end_date,the_geom
0,2018,Vanderbilt Ave at E 2nd St and E 3rd St,2018-12-19T00:00:00.000Z,"{'type': 'MultiPoint', 'coordinates': [[-73.97..."
1,2012,East New York Ave at Atlantic Ave,2012-11-15T00:00:00.000Z,"{'type': 'MultiPoint', 'coordinates': [[-73.90..."
2,2014,46th Street and Queens Blvd.,2014-07-28T00:00:00.000Z,"{'type': 'MultiPoint', 'coordinates': [[-73.91..."
3,2016,St Marks Ave & Classon Ave,2016-12-05T00:00:00.000Z,"{'type': 'MultiPoint', 'coordinates': [[-73.95..."
4,2019,Lexington Ave and 59th St,2019-03-31T00:00:00.000Z,"{'type': 'MultiPoint', 'coordinates': [[-73.96..."
...,...,...,...,...
233,2011,4th Avenue,2011-12-06T00:00:00.000Z,"{'type': 'MultiPoint', 'coordinates': [[-73.97..."
234,2020,White Plains Rd and Arnow Ave,2021-01-12T00:00:00.000Z,"{'type': 'MultiPoint', 'coordinates': [[-73.86..."
235,2017,W 259 St & Netherland Ave (EC),2017-09-29T00:00:00.000Z,"{'type': 'MultiPoint', 'coordinates': [[-73.90..."
236,2019,Atlantic Av and Grand Av,2019-07-18T00:00:00.000Z,"{'type': 'MultiPoint', 'coordinates': [[-73.96..."


In [45]:
st_imp_intersect_gdf.name

'street_improvement_intersections'

In [46]:
st_imp_intersect_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 238 entries, 0 to 237
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sip_year   238 non-null    object
 1   pjct_name  238 non-null    object
 2   end_date   238 non-null    object
 3   the_geom   238 non-null    object
dtypes: object(4)
memory usage: 7.6+ KB


### Cityracks

In [47]:
bike_racks = gpd.read_file(filename='./data/2013-cityracks-shp/city_racks_2013_06_28.shp')
bike_racks

Unnamed: 0,Name,small,large,circular,mini_hoop,total_rack,geometry
0,1 7 AV S,5,0,0,0,5,POINT Z (982903.570 205129.999 0.000)
1,1 BOERUM PL,1,0,0,0,1,POINT Z (987330.416 191302.730 0.000)
2,1 CENTRE ST,10,0,0,0,10,POINT Z (983210.953 199016.513 0.000)
3,1 E 13 ST,1,0,0,0,1,POINT Z (985897.840 207157.885 0.000)
4,1 E 183 ST,0,0,2,0,2,POINT Z (1010993.969 252137.340 0.000)
...,...,...,...,...,...,...,...
11729,997 1 AV,0,1,0,0,1,POINT Z (994160.431 215021.045 0.000)
11730,997 FULTON ST,1,0,0,0,1,POINT Z (994511.753 188057.801 0.000)
11731,998 AMSTERDAM AV,1,0,0,0,1,POINT Z (994125.656 231703.878 0.000)
11732,998 COLUMBUS AV,0,0,0,1,1,POINT Z (994936.194 231279.407 0.000)


In [48]:
bike_racks.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 11734 entries, 0 to 11733
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Name        11734 non-null  object  
 1   small       11734 non-null  int64   
 2   large       11734 non-null  int64   
 3   circular    11734 non-null  int64   
 4   mini_hoop   11734 non-null  int64   
 5   total_rack  11734 non-null  int64   
 6   geometry    11734 non-null  geometry
dtypes: geometry(1), int64(5), object(1)
memory usage: 641.8+ KB


In [49]:
bike_racks.crs

<Projected CRS: EPSG:2263>
Name: NAD83 / New York Long Island (ftUS)
Axis Info [cartesian]:
- X[east]: Easting (US survey foot)
- Y[north]: Northing (US survey foot)
Area of Use:
- name: USA - New York - SPCS - Long Island
- bounds: (-74.26, 40.47, -71.8, 41.3)
Coordinate Operation:
- name: SPCS83 New York Long Island zone (US Survey feet)
- method: Lambert Conic Conformal (2SP)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

#### Next Steps...
Where are there more city racks, and are they distributed equitably?
<br>
Maybe bring in some demographic data to compare.
But how would this help with the final ask?

### Bicycle Routes

In [50]:
# set client request for Socrata API
client = Socrata("data.cityofnewyork.us",app_token)

# make request using API endpoint
results = client.get("cc5c-sm6z", limit=100000)

# create GeoDataFrame out of request
b_routes_gdf = gpd.GeoDataFrame(results)

# name gdf
b_routes_gdf.name = 'bike_routes'

# preview
b_routes_gdf

Unnamed: 0,segmentid,street,the_geom,shape_leng,boro,facilitycl,fromstreet,tostreet,onoffst,allclasses,bikedir,lanecount,ft_facilit,tf_facilit,comments,tf2facilit,ft2facilit
0,53,CONFERENCE HOUSE PARK GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",131.771990229,5,I,HYLAN BLVD,SWINNERTON ST,OFF,I,2,2,Greenway,Greenway,,,
1,57,CONFERENCE HOUSE PARK GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",268.689582437,5,I,HYLAN BLVD,SWINNERTON ST,OFF,I,2,2,Greenway,Greenway,,,
2,59,CONFERENCE HOUSE PARK GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",420.193252605,5,I,HYLAN BLVD,SWINNERTON ST,OFF,I,2,2,Greenway,Greenway,,,
3,61,CONFERENCE HOUSE PARK GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",238.242197819,5,I,HYLAN BLVD,SWINNERTON ST,OFF,I,2,2,Greenway,Greenway,,,
4,64,CONFERENCE HOUSE PARK GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",646.293832417,5,I,HYLAN BLVD,SWINNERTON ST,OFF,I,2,2,Greenway,Greenway,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19870,79081,BRONX RIVER GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",116.876290616,2,I,EDGEWATER RD,BRONX RIVER GREENWAY,OFF,I,2,2,Sidewalk,,,,
19871,196825,BRONX RIVER GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",101.800150667,2,I,EDGEWATER RD,BRONX RIVER GREENWAY,OFF,I,2,2,Sidewalk,,,,
19872,196801,BRONX RIVER GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",66.0292754239,2,I,BRONX RIVER GREENWAY,EDGEWATER RD,OFF,I,2,2,,Sidewalk,,,
19873,196800,BRONX RIVER GREENWAY,"{'type': 'MultiLineString', 'coordinates': [[[...",158.356411407,2,I,BRONX RIVER GREENWAY,EDGEWATER RD,OFF,I,2,2,,Sidewalk,,,


In [51]:
b_routes_gdf.name

'bike_routes'

In [52]:
b_routes_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 19875 entries, 0 to 19874
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   segmentid   19875 non-null  object
 1   street      19875 non-null  object
 2   the_geom    19875 non-null  object
 3   shape_leng  19875 non-null  object
 4   boro        19875 non-null  object
 5   facilitycl  19875 non-null  object
 6   fromstreet  19875 non-null  object
 7   tostreet    19875 non-null  object
 8   onoffst     19875 non-null  object
 9   allclasses  19875 non-null  object
 10  bikedir     19875 non-null  object
 11  lanecount   19875 non-null  object
 12  ft_facilit  14732 non-null  object
 13  tf_facilit  14557 non-null  object
 14  comments    1638 non-null   object
 15  tf2facilit  37 non-null     object
 16  ft2facilit  11 non-null     object
dtypes: object(17)
memory usage: 2.6+ MB


### Fromat shapfiles

- create custom function to set CRS
- apply function to each geodataframe
- check each GDF to ensure in correct CRS
- export to file directory

In [53]:
# function to set the geometry for shapefiles
def set_geom(gdf,geom_col,set_crs=4326, to_crs=2263):
    """
    function that converts geom column from a geojson to an
    independent geometry with coordinates
    sets the geometry column 
    sets CRS default is NAD 2263 for NYC
    renames to geomtry
    pass in df
    pass in column name as string
    """
    gdf[geom_col] = gdf[geom_col].apply(shape)
    gdf.set_geometry(geom_col, inplace=True)
    gdf.rename_geometry('geometry', inplace=True)
    gdf.set_crs(set_crs, inplace=True)
    gdf.to_crs(to_crs, inplace=True)
    return gdf.head()

In [54]:
# create list of api derived shape files
shp_files = [b_routes_gdf, st_imp_corridors_gdf, st_imp_intersect_gdf, b_priority_gdf]

In [55]:
# apply custom formula to list of shp_files
for gdf in shp_files:
    set_geom(gdf, 'the_geom')

In [56]:
# check crs for each shapefile
for gdf in shp_files:
    print("{} is in {} crs".format(gdf.name, gdf.crs))

bike_routes is in epsg:2263 crs
street_improvement_corridors is in epsg:2263 crs
street_improvement_intersections is in epsg:2263 crs
bike_priorty_areas is in epsg:2263 crs


In [57]:
# view info for api shpiles
for file in shp_files:
    print(file.info())

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 19875 entries, 0 to 19874
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   segmentid   19875 non-null  object  
 1   street      19875 non-null  object  
 2   geometry    19875 non-null  geometry
 3   shape_leng  19875 non-null  object  
 4   boro        19875 non-null  object  
 5   facilitycl  19875 non-null  object  
 6   fromstreet  19875 non-null  object  
 7   tostreet    19875 non-null  object  
 8   onoffst     19875 non-null  object  
 9   allclasses  19875 non-null  object  
 10  bikedir     19875 non-null  object  
 11  lanecount   19875 non-null  object  
 12  ft_facilit  14732 non-null  object  
 13  tf_facilit  14557 non-null  object  
 14  comments    1638 non-null   object  
 15  tf2facilit  37 non-null     object  
 16  ft2facilit  11 non-null     object  
dtypes: geometry(1), object(16)
memory usage: 2.6+ MB
None
<class 'geopandas.geodataf

In [62]:
# verify what geodataframes are currently active in the notebook
%who GeoDataFrame

b_priority_gdf	 b_routes_gdf	 bike_racks	 file	 gdf	 st_imp_corridors_gdf	 st_imp_intersect_gdf	 


In [59]:
# export api shpfiles to shp folder 
for file in shp_files:
    file.to_file('./data/shp/' + file.name + '.shp')