In [13]:
import pandas as pd
import numpy as np
import os
import geopandas as gpd
from shapely.geometry import Point
import datetime as dt
from shapely import wkt

In [16]:
df = pd.read_csv('Data/Raw/bike_lanes.csv')
df = df[['the_geom','status','lanecount','instdate']]

In [10]:
df['year'] = pd.to_datetime(df['instdate']).dt.year
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = pd.to_datetime(df['instdate']).dt.year


Unnamed: 0,the_geom,status,lanecount,instdate,year
0,MULTILINESTRING ((-74.192429667502 40.52174149...,Current,2,10/01/2007,2007
1,MULTILINESTRING ((-74.16031384424598 40.589027...,Current,2,08/12/2021,2021
2,MULTILINESTRING ((-74.12631986979562 40.635252...,Current,2,09/11/2015,2015
3,MULTILINESTRING ((-74.00973537112554 40.645666...,Current,2,07/02/2013,2013
4,MULTILINESTRING ((-74.02089492413252 40.626542...,Current,2,06/29/2015,2015
...,...,...,...,...,...
27668,MULTILINESTRING ((-73.96432543559295 40.756381...,Current,1,05/24/2024,2024
27669,MULTILINESTRING ((-73.93422324951723 40.859429...,Current,1,06/06/2024,2024
27670,MULTILINESTRING ((-73.93416898292729 40.859258...,Current,1,06/06/2024,2024
27671,MULTILINESTRING ((-73.93399379322436 40.859413...,Current,1,06/06/2024,2024


### Spatial Join Lines to Census Tracts

In [11]:
# Load the census tract shapefile
tracts = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
tracts = tracts.to_crs(epsg = 4326)

In [15]:
geometry = wkt.loads(df['the_geom'])
geo_df = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

# Perform a spatial join to match points to census tracts
geo_df = gpd.sjoin(geo_df, tracts, how='inner', predicate="intersects")
geo_df = geo_df[['status', 'year','GEOID']]
geo_df

Unnamed: 0,status,year,GEOID
0,Current,2007,36085019800
1,Current,2021,36085027705
1,Current,2021,36085027706
2,Current,2015,36085013302
3,Current,2013,36047007800
...,...,...,...
27669,Current,2024,36061028700
27670,Current,2024,36061028100
27670,Current,2024,36061028700
27671,Current,2024,36061028700


In [18]:
geo_df['num_bikelanes'] = 1

b4 = geo_df[geo_df.year<2009]
df_09 = b4.groupby(['GEOID'])[['num_bikelanes']].sum().reset_index()
df_09['year'] = '2009'
df_09.sort_values('GEOID')

Unnamed: 0,GEOID,num_bikelanes,year
0,36005000200,2,2009
1,36005000400,17,2009
2,36005001600,12,2009
3,36005001901,15,2009
4,36005001902,34,2009
...,...,...,...
951,36085024402,16,2009
952,36085027301,7,2009
953,36085029104,6,2009
954,36085029105,12,2009


In [20]:
df_list = [df_09]
for yr in [2010,2011, 2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023]:
    df_yr = geo_df[geo_df.year==yr]
    df = pd.concat([df_list[-1], df_yr])  # add on total from previous year
    grouped = df.groupby(['GEOID'])[['num_bikelanes']].sum().reset_index()
    grouped['year'] = str(yr)
    df_list.append(grouped)

In [21]:
df_out = pd.concat(df_list).sort_values(['GEOID', 'year'])
df_out.sample(5)

Unnamed: 0,GEOID,num_bikelanes,year
349,36047013100,18,2021
855,36061001800,57,2023
457,36047042300,8,2015
739,36047077200,19,2023
799,36061017500,19,2014


In [24]:
df_out[df_out.GEOID=='36061017500']

Unnamed: 0,GEOID,num_bikelanes,year
649,36061017500,11,2009
692,36061017500,13,2010
694,36061017500,13,2011
716,36061017500,13,2012
781,36061017500,13,2013
799,36061017500,19,2014
835,36061017500,19,2015
890,36061017500,22,2016
911,36061017500,22,2017
936,36061017500,22,2018


In [25]:
# df_out.to_parquet('Data/Cleaned/bikelanes.parquet')