In [16]:
import pandas as pd
import numpy as np
import datetime as dt
import geopandas as gpd
from shapely.geometry import Point

In [42]:
rent = pd.read_csv('Data/Raw/zillow_data.csv')
rent = rent[rent.City=='New York']
rent.drop(['RegionID', 'SizeRank', 'RegionType', 'StateName',
       'State', 'City', 'Metro', 'CountyName'], axis=1, inplace=True)


In [43]:
# group by year and get average

df = rent.melt(id_vars='RegionName',var_name='year',value_name='med_rent')
df['year'] = pd.to_datetime(df['year']).dt.year
df = df.groupby(['RegionName','year'])['med_rent'].mean().reset_index()  # get the average of that year for that zipcode
df.RegionName = df.RegionName.astype(str)
df['year'] = pd.to_datetime(df['year'].astype(str))


### Add Missing years 

In [44]:
# expand date range to 2010

full_date_range = pd.date_range(start='2010-01-01', end='2024-01-01', freq='YS')
more_yrs = df.groupby('RegionName')[['year','med_rent']].apply(lambda x: x.set_index('year').reindex(full_date_range))
more_yrs = more_yrs.reset_index().rename(columns={'level_1':'year'})
more_yrs = more_yrs.sort_values(['RegionName','year'])
more_yrs

Unnamed: 0,RegionName,year,med_rent
0,10001,2010-01-01,
1,10001,2011-01-01,
2,10001,2012-01-01,
3,10001,2013-01-01,
4,10001,2014-01-01,
...,...,...,...
2185,11694,2020-01-01,
2186,11694,2021-01-01,
2187,11694,2022-01-01,
2188,11694,2023-01-01,


In [45]:
from sklearn.linear_model import LinearRegression

def backfill_trend(df, column):
    non_null_values = df[column].dropna()
    if len(non_null_values) < 5:  # if there aren't enough observations to make a trend, will rely in spatial interpolation in postprocessing
        return df

    first_valid_index = non_null_values.index[0]
    valid_indices = np.arange(len(df))
    valid_values = df[column].values
    
    # Prepare data for linear regression
    known_indices = np.where(~np.isnan(valid_values))[0].reshape(-1, 1)
    known_values = valid_values[~np.isnan(valid_values)]
    
    model = LinearRegression()
    model.fit(known_indices, known_values)
    
    missing_indices = np.where(np.isnan(valid_values))[0].reshape(-1, 1)
    predicted_values = model.predict(missing_indices)
    
    # Assign backfilled values
    df[column].iloc[missing_indices.flatten()] = predicted_values
    
    return df

# Apply backfill function
for zipcode in more_yrs['RegionName'].unique():
    zip_df = more_yrs[more_yrs['RegionName']==zipcode]
    more_yrs.drop(zip_df.index, inplace=True)
    zip_df = backfill_trend(zip_df, 'med_rent')
    more_yrs = pd.concat([more_yrs, zip_df])
more_yrs

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df[column].iloc[missing_indices.flatten()] = predicted_values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d

Unnamed: 0,RegionName,year,med_rent
0,10001,2010-01-01,3140.842684
1,10001,2011-01-01,3266.392901
2,10001,2012-01-01,3391.943118
3,10001,2013-01-01,3517.493335
4,10001,2014-01-01,3643.043552
...,...,...,...
2185,11694,2020-01-01,
2186,11694,2021-01-01,
2187,11694,2022-01-01,
2188,11694,2023-01-01,


### Match Zip Codes to CT's  

In [46]:
# get census tract to ZCTA conversion

# Load the shapefiles
tracts = gpd.read_file('Data/Raw/tracts2020_shapefile/nyct2020.shp')
tracts = tracts.to_crs(epsg = 4326)
zips = gpd.read_file('Data/Raw/zcta2020_shapefile/tl_2020_us_zcta520.shp')
zips = zips.to_crs(epsg = 4326)

# join files together on intersecion
mapping = gpd.sjoin(zips, tracts, how='inner', predicate="intersects")[['ZCTA5CE20','GEOID']]
mapping

# add census tract to main dataframe
merged = df.merge(mapping, left_on='RegionName', right_on='ZCTA5CE20', how='left').drop(['ZCTA5CE20'],axis=1)

In [47]:
# remove 2024
merged['year'] = pd.to_datetime(merged['year']).dt.year
dff = merged[merged.year.isin([2024])==False]
dff.sort_values('RegionName', inplace=True)
dff.drop_duplicates(['year','GEOID'], inplace=True)  # same zip code is matching with multiple CT's so we drop dupes randomly


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.sort_values('RegionName', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.drop_duplicates(['year','GEOID'], inplace=True)  # same zip code is matching with multiple CT's so we drop dupes randomly


In [48]:
# adjust dtype
dff.drop('RegionName', axis=1, inplace=True)
dff['year'] = dff['year'].astype(str)
dff

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.drop('RegionName', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff['year'] = dff['year'].astype(str)


Unnamed: 0,year,med_rent,GEOID
0,2015,4040.470666,36061005600
104,2020,3889.975996,36061009902
105,2020,3889.975996,36061009903
106,2020,3889.975996,36061011700
107,2020,3889.975996,36061008200
...,...,...,...
36847,2017,,36081093401
36846,2017,,36081092800
36845,2017,,36081092200
36855,2018,,36081091800


In [49]:
# dff.to_parquet('Data/Cleaned/zillow_clean.parquet')