In [318]:
import pandas as pd
import datetime as dt
import geopandas as gpd
from shapely.geometry import Point

In [319]:
rent = pd.read_csv('Data/Raw/zillow_data.csv')
rent = rent[rent.City=='New York']
rent.drop(['RegionID', 'SizeRank', 'RegionType', 'StateName',
       'State', 'City', 'Metro', 'CountyName'], axis=1, inplace=True)

In [320]:
# group by year and get average

df = rent.melt(id_vars='RegionName',var_name='year',value_name='med_rent')
df['year'] = pd.to_datetime(df['year']).dt.year
df = df.groupby(['RegionName','year'])['med_rent'].mean().reset_index()
df.RegionName = df.RegionName.astype(str)
df['year'] = pd.to_datetime(df['year'].astype(str))


### Add Missing years 

In [321]:
# expand date range to 2010

full_date_range = pd.date_range(start='2010-01-01', end='2024-01-01', freq='YS')
more_yrs = df.groupby('RegionName')[['year','med_rent']].apply(lambda x: x.set_index('year').reindex(full_date_range))
more_yrs = more_yrs.reset_index().rename(columns={'level_1':'year'})
more_yrs[-14:]

Unnamed: 0,RegionName,year,med_rent
2176,11694,2011-01-01,
2177,11694,2012-01-01,
2178,11694,2013-01-01,
2179,11694,2014-01-01,
2180,11694,2015-01-01,
2181,11694,2016-01-01,
2182,11694,2017-01-01,
2183,11694,2018-01-01,
2184,11694,2019-01-01,
2185,11694,2020-01-01,


In [322]:
# interpolate missing values TODO: interpolate using trend/percent change instead of just backfilling repeat value

df = more_yrs.groupby('RegionName')[['year','med_rent']].apply(lambda x: x.bfill()).reset_index().drop('level_1', axis=1)
df

Unnamed: 0,RegionName,year,med_rent
0,10001,2010-01-01,4040.470666
1,10001,2011-01-01,4040.470666
2,10001,2012-01-01,4040.470666
3,10001,2013-01-01,4040.470666
4,10001,2014-01-01,4040.470666
...,...,...,...
2185,11694,2020-01-01,2733.333333
2186,11694,2021-01-01,2733.333333
2187,11694,2022-01-01,2733.333333
2188,11694,2023-01-01,2733.333333


### Match Zip Codes to CT's  

In [323]:
# get census tract to ZCTA conversion

# Load the shapefiles
tracts = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
tracts = tracts.to_crs(epsg = 4326)
zips = gpd.read_file('Data/Raw/zcta2020_shapefile/tl_2020_us_zcta520.shp')
zips = zips.to_crs(epsg = 4326)

# join files together on intersecion
mapping = gpd.sjoin(zips, tracts, how='inner', predicate="intersects")[['ZCTA5CE20','GEOID']]
mapping

# add census tract to main dataframe
merged = df.merge(mapping, left_on='RegionName', right_on='ZCTA5CE20', how='left').drop(['ZCTA5CE20'],axis=1)

Unnamed: 0,ZCTA5CE20,GEOID
9038,10152,36061010000
9038,10152,36061010200
9039,10153,36061011202
9039,10153,36061011201
9039,10153,36061011401
...,...,...
33785,11249,36047051700
33785,11249,36061002400
33785,11249,36047056900
33785,11249,36047055700


In [325]:
# remove 2024, 2023
merged['year'] = pd.to_datetime(merged['year']).dt.year
dff = merged[merged.year.isin([2024, 2023])==False]
dff.sort_values('RegionName', inplace=True)
dff.drop_duplicates(['year','GEOID'], inplace=True)  # same zip code is matching with multiple CT's so we drop dupes randomly


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.sort_values('RegionName', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.drop_duplicates(['year','GEOID'], inplace=True)  # same zip code is matching with multiple CT's so we drop dupes randomly


Unnamed: 0,RegionName,year,med_rent,GEOID
0,10001,2010,4040.470666,36061005600
149,10001,2018,4043.817517,36061009901
150,10001,2018,4043.817517,36061009300
151,10001,2018,4043.817517,36061007600
152,10001,2018,4043.817517,36061010100
...,...,...,...,...
55278,11694,2014,2733.333333,36081092800
55279,11694,2014,2733.333333,36081093401
55280,11694,2014,2733.333333,36081093402
55300,11694,2016,2733.333333,36081092800


In [326]:
# adjust CT format
dff.drop('RegionName', axis=1, inplace=True)
dff['GEOID'] = dff['GEOID'].apply(lambda x: x[2:])
dff.rename(columns={'GEOID':'full_tract'}, inplace=True)
dff['year'] = dff['year'].astype(str)
dff

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.drop('RegionName', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff['GEOID'] = dff['GEOID'].apply(lambda x: x[2:])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.rename(columns={'GEOID':'full_tract'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http

Unnamed: 0,year,med_rent,full_tract
0,2010,4040.470666,061005600
149,2018,4043.817517,061009901
150,2018,4043.817517,061009300
151,2018,4043.817517,061007600
152,2018,4043.817517,061010100
...,...,...,...
55278,2014,2733.333333,081092800
55279,2014,2733.333333,081093401
55280,2014,2733.333333,081093402
55300,2016,2733.333333,081092800


In [327]:
# dff.to_parquet('Data/Cleaned/zillow_clean.parquet')