In [1]:
import pandas as pd
import numpy as np
import os
import geopandas as gpd
from shapely.geometry import Point
import datetime as dt

In [2]:
# Read the 2015 tree census and keep only the columns the tract analysis uses.
directory = "Data/Raw/tree_census_2015.csv"
dff = pd.read_csv(directory)

keep_cols = ['health', 'borough', 'latitude', 'longitude']
df = dff.loc[:, keep_cols]

# Spot-check ten random rows
df.sample(10)


Unnamed: 0,health,borough,latitude,longitude
596719,Good,Manhattan,40.78259,-73.954878
465300,Good,Brooklyn,40.625495,-73.909341
13802,Good,Brooklyn,40.7209,-73.941768
424124,Good,Queens,40.793354,-73.803162
90837,,Brooklyn,40.685739,-73.983991
59828,Poor,Bronx,40.83785,-73.827457
131057,Good,Staten Island,40.635094,-74.113296
369745,Good,Brooklyn,40.689346,-73.923829
566289,Good,Bronx,40.842559,-73.917855
5586,Good,Bronx,40.842368,-73.899631


In [3]:
# Attach a census tract to every tree via its coordinates

# Tract polygons, reprojected to EPSG:4326 so they share a CRS with the lat/lon points
tracts = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp').to_crs(epsg=4326)

# Each distinct (latitude, longitude) pair only needs to be located once
coords_df = df[["latitude", "longitude"]].drop_duplicates().reset_index(drop=True)
coords = list(zip(coords_df["latitude"], coords_df["longitude"]))  # [(40.735324, -73.998004), (40.715595, -73.987030)]

# Build the point layer; Point takes (x, y), i.e. (longitude, latitude)
geometry = [Point(lon, lat) for lat, lon in coords]
geo_df = gpd.GeoDataFrame(
    {"latitude": coords_df["latitude"], "longitude": coords_df["longitude"]},
    geometry=geometry,
    crs="EPSG:4326",
)

# Left spatial join: every point is kept and picks up the CT2020 of the tract it lies within
joined = gpd.sjoin(geo_df, tracts, how='left', predicate="within")
ct_lookup = joined[["latitude", "longitude", "CT2020"]]

# Bring the tract code back onto the tree rows, then discard the coordinates
df_ct = (
    df.merge(ct_lookup, on=["latitude", "longitude"])
      .rename(columns={"CT2020": "Tract"})
      .drop(['latitude', 'longitude'], axis=1)
)
df_ct

Unnamed: 0,health,borough,Tract
0,Fair,Queens,073900
1,Fair,Queens,097300
2,Good,Brooklyn,044901
3,Good,Brooklyn,044902
4,Good,Brooklyn,016500
...,...,...,...
683783,Good,Brooklyn,051900
683784,Good,Queens,070700
683785,Good,Staten Island,020100
683786,Good,Bronx,023502


In [4]:
### encode the counties

# Borough name fragment -> three-digit county code, tested in this order
# (np.select keeps the first condition that matches).
borough_codes = [("Bronx", "005"),
                 ("Manhattan", "061"),
                 ("Brooklyn", "047"),
                 ("Queens", "081"),
                 ("Staten", "085")]

# na=False turns a missing borough into "no match"; without it str.contains
# returns NaN and np.select rejects the non-boolean condition.
conditions = [df_ct['borough'].str.contains(fragment, na=False, regex=False)
              for fragment, _ in borough_codes]
choices = [code for _, code in borough_codes]

# The default must be a string like the choices: an integer default either fails
# dtype promotion or silently becomes the text "-1", which would then be glued
# onto the tract code. Use "" as a sentinel and convert it to a missing value.
county = pd.Series(np.select(conditions, choices, default=""), index=df_ct.index)
df_ct['county'] = county.where(county != "")

# County code + tract code; rows without a recognised borough or without a tract
# end up with a missing full_tract instead of a malformed identifier.
df_ct['full_tract'] = df_ct['county'] + df_ct['Tract']
df_ct

Unnamed: 0,health,borough,Tract,county,full_tract
0,Fair,Queens,073900,081,081073900
1,Fair,Queens,097300,081,081097300
2,Good,Brooklyn,044901,047,047044901
3,Good,Brooklyn,044902,047,047044902
4,Good,Brooklyn,016500,047,047016500
...,...,...,...,...,...
683783,Good,Brooklyn,051900,047,047051900
683784,Good,Queens,070700,081,081070700
683785,Good,Staten Island,020100,085,085020100
683786,Good,Bronx,023502,005,005023502


In [5]:
# Tree counts per tract, broken out by health category

# Long form: one row per (tract, health) with the number of trees in it
df_health = (
    df_ct.groupby(['full_tract', 'health'])['borough']
         .count()
         .rename('count')
         .reset_index()
)

# Wide form: one row per tract, one column per health category
df_15 = df_health.pivot_table(index='full_tract', columns='health', values='count').reset_index()
df_15['year'] = '2015'

# Total trees = Fair + Good + Poor, counting an absent category as zero
df_15['num_trees'] = df_15[['Fair', 'Good', 'Poor']].fillna(0).sum(axis=1)
df_15

health,full_tract,Fair,Good,Poor,year,num_trees
0,005000200,28.0,341.0,7.0,2015,376.0
1,005000400,58.0,361.0,21.0,2015,440.0
2,005001600,50.0,441.0,12.0,2015,503.0
3,005001901,24.0,64.0,2.0,2015,90.0
4,005001902,74.0,171.0,14.0,2015,259.0
...,...,...,...,...,...,...
2304,085030301,14.0,523.0,16.0,2015,553.0
2305,085030302,63.0,637.0,30.0,2015,730.0
2306,085031901,37.0,335.0,12.0,2015,384.0
2307,085031902,115.0,315.0,49.0,2015,479.0


In [6]:
########### 2005 Trees

directory = "Data/Raw/tree_census_2005.csv"

# Reading this file with the default chunked parser emitted a warning (recorded in
# the cell's stderr output). NOTE(review): presumably a mixed-type DtypeWarning,
# e.g. for boroname, which holds both borough names and the code 5 -- confirm.
# low_memory=False parses each column in one pass so its dtype is inferred from
# the whole column rather than chunk by chunk.
dff = pd.read_csv(directory, low_memory=False)

# Keep only the columns used for the tract-level counts
df = dff[['status','latitude','longitude','boroname']]
df


  dff = pd.read_csv(directory)


Unnamed: 0,status,latitude,longitude,boroname
0,Good,40.632653,-74.000245,Brooklyn
1,Good,40.620084,-73.901453,Brooklyn
2,Good,40.617996,-73.899111,Brooklyn
3,Good,40.619694,-73.901003,Brooklyn
4,Good,40.618323,-73.899467,Brooklyn
...,...,...,...,...
592367,Good,40.586260,-74.148797,5
592368,Good,40.586090,-74.149013,5
592369,Good,40.585802,-74.149156,5
592370,Good,40.585802,-74.149156,5


In [7]:
# Locate each 2005 tree in a census tract

# Read the tract polygons and put them in EPSG:4326, the CRS assigned to the points below
tract_shapes = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
tracts = tract_shapes.to_crs(epsg = 4326)

# Work on the distinct coordinate pairs only
coords_df = df[["latitude","longitude"]].drop_duplicates().reset_index(drop=True)
coords = list(coords_df.itertuples(index=False, name=None)) #[(40.735324, -73.998004), (40.715595, -73.987030)]

# One Point per pair, given as (longitude, latitude); keep the raw coordinates
# alongside the geometry so they can serve as merge keys afterwards
geometry = [Point(pair[1], pair[0]) for pair in coords]
geo_df = gpd.GeoDataFrame(coords_df.copy(), geometry=geometry, crs="EPSG:4326")

# Spatial join (left): each point gets the CT2020 of the tract polygon containing it
ct_lookup = gpd.sjoin(geo_df, tracts, how='left', predicate="within")
ct_lookup = ct_lookup[["latitude", "longitude", "CT2020"]]

# Join the tract codes onto the tree rows by coordinate, then drop the coordinates
merged = df.merge(ct_lookup, on=["latitude", "longitude"])
df_ct = merged.rename(columns={"CT2020":"Tract"}).drop(columns=['latitude','longitude'])
df_ct

Unnamed: 0,status,boroname,Tract
0,Good,Brooklyn,021600
1,Good,Brooklyn,070601
2,Good,Brooklyn,070601
3,Good,Brooklyn,070601
4,Good,Brooklyn,070601
...,...,...,...
592367,Good,5,027704
592368,Good,5,027704
592369,Good,5,027704
592370,Good,5,027704


In [8]:
### encode the counties
df_ct['boroname'] = df_ct['boroname'].astype(str)

# Borough name fragment -> three-digit county code, tested in this order
# (np.select keeps the first condition that matches). The rows displayed for
# this file code Staten Island as "5"; the spelled-out name is accepted as well
# so this cell agrees with the 2015 encoding.
borough_codes = [("Bronx", "005"),
                 ("Manhattan", "061"),
                 ("Brooklyn", "047"),
                 ("Queens", "081"),
                 ("Staten", "085"),
                 ("5", "085")]

conditions = [df_ct['boroname'].str.contains(fragment, regex=False)
              for fragment, _ in borough_codes]
choices = [code for _, code in borough_codes]

# The default must be a string like the choices: an integer default either fails
# dtype promotion or silently becomes the text "-1", which would then be glued
# onto the tract code. Use "" as a sentinel and convert it to a missing value.
county = pd.Series(np.select(conditions, choices, default=""), index=df_ct.index)
df_ct['county'] = county.where(county != "")

# County code + tract code; rows without a recognised borough or without a tract
# end up with a missing full_tract instead of a malformed identifier.
df_ct['full_tract'] = df_ct['county'] + df_ct['Tract']
df_ct

Unnamed: 0,status,boroname,Tract,county,full_tract
0,Good,Brooklyn,021600,047,047021600
1,Good,Brooklyn,070601,047,047070601
2,Good,Brooklyn,070601,047,047070601
3,Good,Brooklyn,070601,047,047070601
4,Good,Brooklyn,070601,047,047070601
...,...,...,...,...,...
592367,Good,5,027704,085,085027704
592368,Good,5,027704,085,085027704
592369,Good,5,027704,085,085027704
592370,Good,5,027704,085,085027704


In [9]:
# Long form: number of trees for every (tract, status) combination
df_health = (
    df_ct.groupby(['full_tract', 'status'])['boroname']
         .count()
         .rename('count')
         .reset_index()
)

# Wide form: one row per tract, one column per status value
df_05 = df_health.pivot_table(index='full_tract', columns='status', values='count').reset_index()
df_05['year'] = '2005'

# Total = Excellent + Good + Poor (the Dead column is left out), absent categories count as zero
df_05['num_trees'] = df_05[['Excellent', 'Good', 'Poor']].fillna(0).sum(axis=1)
df_05


status,full_tract,Dead,Excellent,Good,Poor,year,num_trees
0,005000200,3.0,92.0,135.0,13.0,2005,240.0
1,005000400,8.0,42.0,149.0,5.0,2005,196.0
2,005001600,3.0,78.0,134.0,19.0,2005,231.0
3,005001901,,33.0,10.0,3.0,2005,46.0
4,005001902,1.0,29.0,39.0,5.0,2005,73.0
...,...,...,...,...,...,...,...
2289,085030301,10.0,45.0,357.0,50.0,2005,452.0
2290,085030302,16.0,155.0,515.0,41.0,2005,711.0
2291,085031901,11.0,7.0,192.0,11.0,2005,210.0
2292,085031902,15.0,7.0,221.0,20.0,2005,248.0


In [10]:
## Impute missing 2005 counts by back-casting 2015 counts with the average percent change
count_cols = ['full_tract', 'year', 'num_trees']

# Tracts present in both censuses; after the merge *_x is 2005 and *_y is 2015
combined = df_05[count_cols].merge(df_15[count_cols], on='full_tract')
combined['pct_chng'] = ((combined['num_trees_y'] - combined['num_trees_x']) / combined['num_trees_x'])

# A 2005 count of zero makes the ratio +/-inf (or NaN for 0/0); a single such
# tract would make the mean infinite, so those ratios are excluded.
avg_pct_chng = combined['pct_chng'].replace([np.inf, -np.inf], np.nan).mean()

# Tracts seen in 2015 but not in 2005. .copy() so the assignments below write to
# an independent frame rather than to a slice of df_15.
temp_15 = df_15.loc[~df_15['full_tract'].isin(df_05['full_tract']), count_cols].copy()

# pct_chng = (n_2015 - n_2005) / n_2005   =>   n_2005 = n_2015 / (1 + pct_chng).
# Multiplying n_2015 by pct_chng (as before) does not invert the change: with an
# average change above 100% it even yields a 2005 estimate larger than the 2015 count.
temp_15['num_trees'] = (temp_15['num_trees'] / (1 + avg_pct_chng)).round()
temp_15['year'] = '2005'

### Combine tree counts by year
stacked = pd.concat([df_05[count_cols], temp_15, df_15[count_cols]])
stacked


Unnamed: 0,full_tract,year,num_trees
0,005000200,2005,240.0
1,005000400,2005,196.0
2,005001600,2005,231.0
3,005001901,2005,46.0
4,005001902,2005,73.0
...,...,...,...
2304,085030301,2015,553.0
2305,085030302,2015,730.0
2306,085031901,2015,384.0
2307,085031902,2015,479.0


In [11]:
###### Change frequency of counts to yearly

stacked['year'] = pd.to_datetime(stacked['year'])

# For each tract: put the observations on a year-end grid, then fill the years
# between them linearly. The interpolation is done inside the per-tract function
# so a gap can only ever be filled from that tract's own values; interpolating
# the concatenated result instead would run across tract boundaries.
yearly = (
    stacked.groupby('full_tract')[['year', 'num_trees']]
           .apply(lambda g: g.resample("YE", on='year').mean().interpolate(method='linear'))
           .reset_index()
)

# remove irrelevant years
yearly = yearly[yearly.year>='2010']

# add more years: reindex every tract onto 2010-2022 year-ends; years without a
# value become NaN here. The reindexed index has no name, so after reset_index
# the date column is called 'level_1'.
full_date_range = pd.date_range(start='2010-12-31', end='2022-12-31', freq='YE')
all_years = (
    yearly.groupby('full_tract')[['year', 'num_trees']]
          .apply(lambda g: g.set_index('year').reindex(full_date_range))
          .reset_index()
)
all_years

Unnamed: 0,full_tract,level_1,num_trees
0,005000200,2010-12-31,308.0
1,005000200,2011-12-31,321.6
2,005000200,2012-12-31,335.2
3,005000200,2013-12-31,348.8
4,005000200,2014-12-31,362.4
...,...,...,...
30012,085032300,2018-12-31,
30013,085032300,2019-12-31,
30014,085032300,2020-12-31,
30015,085032300,2021-12-31,


In [12]:
# Fill the remaining NaN years per tract with a quadratic fit, extrapolating
# past the last observed year.
# NOTE(review): extrapolated counts are not clipped, so a declining tract could
# go below zero -- confirm whether that matters downstream.
kw = dict(method="quadratic", fill_value="extrapolate")

# transform returns values aligned to all_years' own index, so the date column
# ('level_1') stays attached to the right rows. This avoids re-attaching it by
# position, which relied on the index layout produced by groupby.apply.
extrap = all_years.assign(
    num_trees=all_years.groupby('full_tract')['num_trees'].transform(lambda x: x.interpolate(**kw))
)[['full_tract', 'num_trees', 'level_1']]

In [14]:
# Give the date column its proper name and keep only the calendar year
extrap = (
    extrap.rename(columns={'level_1': 'year'})
          .assign(year=lambda frame: frame['year'].dt.year)
)
extrap

Unnamed: 0,full_tract,num_trees,year
0,005000200,308.0,2010
1,005000200,321.6,2011
2,005000200,335.2,2012
3,005000200,348.8,2013
4,005000200,362.4,2014
...,...,...,...
30012,085032300,104.3,2018
30013,085032300,102.4,2019
30014,085032300,100.5,2020
30015,085032300,98.6,2021


In [15]:
# save to cleaned data
# extrap.to_parquet("Data/Cleaned/tree_census.parquet")