In [6]:
import pandas as pd
import datetime as dt
import geopandas as gpd
from shapely.geometry import Point

In [25]:
rent = pd.read_csv('Data/Raw/zillow_data.csv')
rent = rent[rent.City=='New York']
rent.drop(['RegionID', 'SizeRank', 'RegionType', 'StateName',
       'State', 'City', 'Metro', 'CountyName'], axis=1, inplace=True)


In [26]:
# group by year and get average

df = rent.melt(id_vars='RegionName',var_name='year',value_name='med_rent')
df['year'] = pd.to_datetime(df['year']).dt.year
df = df.groupby(['RegionName','year'])['med_rent'].mean().reset_index()  # get the average of that year for that zipcode
df.RegionName = df.RegionName.astype(str)
df['year'] = pd.to_datetime(df['year'].astype(str))


### Add Missing years 

In [27]:
# expand date range to 2010

full_date_range = pd.date_range(start='2010-01-01', end='2024-01-01', freq='YS')
more_yrs = df.groupby('RegionName')[['year','med_rent']].apply(lambda x: x.set_index('year').reindex(full_date_range))
more_yrs = more_yrs.reset_index().rename(columns={'level_1':'year'})
more_yrs[-10:]

Unnamed: 0,RegionName,year,med_rent
2180,11694,2015-01-01,
2181,11694,2016-01-01,
2182,11694,2017-01-01,
2183,11694,2018-01-01,
2184,11694,2019-01-01,
2185,11694,2020-01-01,
2186,11694,2021-01-01,
2187,11694,2022-01-01,
2188,11694,2023-01-01,
2189,11694,2024-01-01,2733.333333


In [28]:
# # interpolate missing values TODO: interpolate using trend/percent change instead of just backfilling repeat value

# df = more_yrs.groupby('RegionName')[['year','med_rent']].apply(lambda x: x.bfill()).reset_index().drop('level_1', axis=1)
# df

### Match Zip Codes to CT's  

In [29]:
# get census tract to ZCTA conversion

# Load the shapefiles
tracts = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
tracts = tracts.to_crs(epsg = 4326)
zips = gpd.read_file('Data/Raw/zcta2020_shapefile/tl_2020_us_zcta520.shp')
zips = zips.to_crs(epsg = 4326)

# join files together on intersecion
mapping = gpd.sjoin(zips, tracts, how='inner', predicate="intersects")[['ZCTA5CE20','GEOID']]
mapping

# add census tract to main dataframe
merged = df.merge(mapping, left_on='RegionName', right_on='ZCTA5CE20', how='left').drop(['ZCTA5CE20'],axis=1)

In [30]:
# remove 2024
merged['year'] = pd.to_datetime(merged['year']).dt.year
dff = merged[merged.year.isin([2024])==False]
dff.sort_values('RegionName', inplace=True)
dff.drop_duplicates(['year','GEOID'], inplace=True)  # same zip code is matching with multiple CT's so we drop dupes randomly


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.sort_values('RegionName', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.drop_duplicates(['year','GEOID'], inplace=True)  # same zip code is matching with multiple CT's so we drop dupes randomly


In [33]:
# adjust dtype
dff.drop('RegionName', axis=1, inplace=True)
dff['year'] = dff['year'].astype(str)
dff

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.drop('RegionName', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff['year'] = dff['year'].astype(str)


Unnamed: 0,year,med_rent,GEOID
0,2015,4040.470666,36061005600
104,2020,3889.975996,36061009902
105,2020,3889.975996,36061009903
106,2020,3889.975996,36061011700
107,2020,3889.975996,36061008200
...,...,...,...
36847,2017,,36081093401
36846,2017,,36081092800
36845,2017,,36081092200
36855,2018,,36081091800


In [34]:
# dff.to_parquet('Data/Cleaned/zillow_clean.parquet')