In [199]:
import pandas as pd
import datetime as dt
import geopandas as gpd
from shapely.geometry import Point

In [228]:
rent = pd.read_csv('Data/Raw/zillow_data.csv')
rent = rent[rent.City=='New York']
rent.drop(['RegionID', 'SizeRank', 'RegionType', 'StateName',
       'State', 'City', 'Metro', 'CountyName'], axis=1, inplace=True)

In [232]:
# group by year and get average

df = rent.melt(id_vars='RegionName',var_name='year',value_name='med_rent')
df['year'] = pd.to_datetime(df['year']).dt.year
df = df.groupby(['RegionName','year'])['med_rent'].mean().reset_index()
df.RegionName = df.RegionName.astype(str)
# df['year'] = pd.to_datetime(df['year'])
df[-12:]
pd.to_datetime(df['year'])

0      1970-01-01 00:00:00.000002015
1      1970-01-01 00:00:00.000002016
2      1970-01-01 00:00:00.000002017
3      1970-01-01 00:00:00.000002018
4      1970-01-01 00:00:00.000002019
                    ...             
1455   1970-01-01 00:00:00.000002020
1456   1970-01-01 00:00:00.000002021
1457   1970-01-01 00:00:00.000002022
1458   1970-01-01 00:00:00.000002023
1459   1970-01-01 00:00:00.000002024
Name: year, Length: 1460, dtype: datetime64[ns]

### Add Missing years 

In [231]:
# expand date range to 2010

full_date_range = pd.date_range(start='2010', end='2024', freq='YE')
more_yrs = df.groupby('RegionName')[['year','med_rent']].apply(lambda x: x.set_index('year').reindex(full_date_range))
more_yrs = more_yrs.reset_index().rename(columns={'level_1':'year'})
more_yrs[-20:]

Unnamed: 0,RegionName,year,med_rent
2024,11691,2018-12-31,
2025,11691,2019-12-31,
2026,11691,2020-12-31,
2027,11691,2021-12-31,
2028,11691,2022-12-31,
2029,11691,2023-12-31,
2030,11694,2010-12-31,
2031,11694,2011-12-31,
2032,11694,2012-12-31,
2033,11694,2013-12-31,


In [194]:
# interpolate missing values

more_yrs.groupby('RegionName')[['year','med_rent']].apply(lambda x: x.bfill()).reset_index()


Unnamed: 0,RegionName,level_1,year,med_rent
0,10001,0,2010-12-31,4067.927377
1,10001,1,2011-12-31,4067.927377
2,10001,2,2012-12-31,4067.927377
3,10001,3,2013-12-31,4067.927377
4,10001,4,2014-12-31,4067.927377
...,...,...,...,...
2185,11694,2185,2020-12-31,
2186,11694,2186,2021-12-31,
2187,11694,2187,2022-12-31,
2188,11694,2188,2023-12-31,


### Match Zip Codes to CT's  

In [78]:
# get census tract to ZCTA conversion

# Load the shapefiles
tracts = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
tracts = tracts.to_crs(epsg = 4326)
zips = gpd.read_file('Data/Raw/zcta2020_shapefile/tl_2020_us_zcta520.shp')
zips = zips.to_crs(epsg = 4326)

# join files together on intersecion
mapping = gpd.sjoin(zips, tracts, how='inner', predicate="intersects")[['ZCTA5CE20','GEOID']]
mapping

Unnamed: 0,ZCTA5CE20,GEOID
9038,10152,36061010000
9038,10152,36061010200
9039,10153,36061011202
9039,10153,36061011201
9039,10153,36061011401
...,...,...
33785,11249,36047051700
33785,11249,36061002400
33785,11249,36047056900
33785,11249,36047055700


In [79]:
# add census tract to main dataframe

df = df.merge(mapping, left_on='RegionName', right_on='ZCTA5CE20', how='left').drop('ZCTA5CE20',axis=1)

In [80]:
# remove 2024, 2023
df = df[df.year.isin([2024, 2023])==False]


Unnamed: 0,RegionName,year,med_rent,GEOID
0,10001,2015,4040.470666,36061005600
1,10001,2015,4040.470666,36061005800
2,10001,2015,4040.470666,36061009100
3,10001,2015,4040.470666,36061007400
4,10001,2015,4040.470666,36061009500
...,...,...,...,...
36903,11694,2022,,36081093402
36904,11694,2022,,36081094203
36905,11694,2022,,36081093800
36906,11694,2022,,36081094201


In [89]:
# if an entire CT is null, 

df.sort_values(['RegionName','GEOID','year'], inplace=True)
df.sample(5)

Unnamed: 0,RegionName,year,med_rent,GEOID
36850,11694,2017,,36081093800
24468,11229,2015,,36047063600
13226,10473,2016,,36005011502
23324,11224,2015,,36047036002
7742,10451,2017,,36005005902


In [90]:
df[df.RegionName=='10462']

Unnamed: 0,RegionName,year,med_rent,GEOID
10779,10462,2015,,36005007200
10815,10462,2016,,36005007200
10851,10462,2017,1372.818998,36005007200
10887,10462,2018,1472.700870,36005007200
10923,10462,2019,1547.303019,36005007200
...,...,...,...,...
10911,10462,2018,1472.700870,36005033400
10947,10462,2019,1547.303019,36005033400
10983,10462,2020,1601.519144,36005033400
11019,10462,2021,1672.559096,36005033400
