In [31]:
import pandas as pd
import numpy as np
from scipy import stats

In [None]:
# Supress auto scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Load Datasets

### Top airbnb zipcodes throughout nyc
Get zip codes that need to be mapped to zillow home values

In [32]:
nyc_top_zipcodes_all_years = pd.read_csv("./zipcode_data/nyc_top_zipcodes_all_years.csv")
display(nyc_top_zipcodes_all_years.head())

Unnamed: 0,zipcode,borough,year,count
0,10469,BRONX,2022,122
1,10466,BRONX,2022,119
2,10463,BRONX,2018,105
3,10466,BRONX,2021,99
4,10456,BRONX,2022,98


### Zillow home value prices
Zillow home value data for each zipcode in nyc years 2002-2022

In [36]:
zillow_home_value = pd.read_csv('../data/clean/zhome_value.csv', low_memory=False)
zillow_home_value["date"] = pd.to_datetime(zillow_home_value["date"]).apply(lambda x: x.year)
zillow_home_value = zillow_home_value.rename(columns={"date": "year", "zip_code": "zipcode"})
display(zillow_home_value.head())

Unnamed: 0,zipcode,state,city,borough,year,home_value
0,11368,NY,New York,QUEENS,2002,249885.0
1,11385,NY,New York,QUEENS,2002,333404.0
2,11236,NY,New York,BROOKLYN,2002,255687.0
3,11208,NY,New York,BROOKLYN,2002,235676.0
4,10467,NY,New York,BRONX,2002,174696.0


In [18]:
zillow_home_value.groupby("borough")['home_value'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BRONX,496.0,376852.772,120358.922,90412.167,302601.604,367200.417,449662.438,755567.333
BROOKLYN,815.0,667060.102,294326.306,201736.5,451608.958,588411.833,849253.708,1531767.1
MANHATTAN,877.0,1144716.03,610890.393,196200.167,726529.583,1002733.667,1380581.583,3509708.75
QUEENS,1193.0,525833.348,163973.358,189217.545,404326.917,495207.0,613614.5,1103964.4
STATEN_ISLAND,252.0,445088.871,102858.575,205149.583,380906.167,431854.458,513230.396,758961.3


### Filter by significant years from top zipcodes
Find min and max year for top zipcodes that need to be mapped from "nyc_top_zipcodes_all_years"

In [23]:
min_year, max_year = nyc_top_zipcodes_all_years["year"].min(), nyc_top_zipcodes_all_years["year"].max()
print("MIN={}\nMAX={}".format(min_year, max_year))

MIN=2015
MAX=2022


Using **min,max** years from nyc_top_zipcodes, narrow down zillow dataset years to min, max years

In [20]:
zillow_home_value = zillow_home_value[zillow_home_value["year"].between(min_year, max_year, inclusive=True)]
display(zillow_home_value)

  zillow_home_value = zillow_home_value[zillow_home_value["year"].between(min_year, max_year, inclusive=True)]


Unnamed: 0,zipcode,year,borough,home_value
13,10001,2015,MANHATTAN,2280604.000
14,10001,2016,MANHATTAN,2369453.250
15,10001,2017,MANHATTAN,2271803.500
16,10001,2018,MANHATTAN,2351996.833
17,10001,2019,MANHATTAN,2380720.417
...,...,...,...,...
3628,11694,2018,QUEENS,769236.167
3629,11694,2019,QUEENS,819271.333
3630,11694,2020,QUEENS,829625.583
3631,11694,2021,QUEENS,897179.167


### Group by zipcode, year, borough and compute the average for home value

In [26]:
zillow_home_value = zillow_home_value.groupby(["zipcode", "year", "borough"])["home_value"].mean().to_frame().reset_index()
display(zillow_home_value)

Unnamed: 0,zipcode,year,borough,home_value
0,10001,2015,MANHATTAN,2280604.000
1,10001,2016,MANHATTAN,2369453.250
2,10001,2017,MANHATTAN,2271803.500
3,10001,2018,MANHATTAN,2351996.833
4,10001,2019,MANHATTAN,2380720.417
...,...,...,...,...
1407,11694,2018,QUEENS,769236.167
1408,11694,2019,QUEENS,819271.333
1409,11694,2020,QUEENS,829625.583
1410,11694,2021,QUEENS,897179.167


# Map home value to top airbnb zipcodes for each borough in NYC

Helper function to narrow down each data frame to borough and map home prices for that year

In [27]:
def merge_zipcode_home_value(zipcodes, zillow_home_value, boro):
    # Narrow down by borough
    merge_cols = ['zipcode','year']   
    boro_zipcodes = zipcodes[zipcodes["borough"] == boro]
    # And remove "borough" column to prevent duplicates on merge
    boro_home_value = zillow_home_value[zillow_home_value["borough"]== boro].drop(["borough"], axis=1)
    
    merged_df = pd.merge(boro_zipcodes, boro_home_value,  how='left', left_on=merge_cols, right_on=merge_cols)
    return merged_df

Display mapped home values for each top zipcode in each borough, also create cumulative df "nyc_top_zipcode_home_val"

In [24]:
boros = nyc_top_zipcodes_all_years["borough"].unique().tolist()
all_boro_home_value = pd.DataFrame()

for borough in boros:
    boro_data = merge_zipcode_home_value(nyc_top_zipcodes_all_years, zillow_home_value, borough).sort_values(by=["home_value"], ascending=False)
    print("Borough={}".format(borough))
    display(boro_data)
    all_boro_home_value = pd.concat([all_boro_home_value, boro_data])

Borough=BRONX


Unnamed: 0,zipcode,borough,year,count,home_value
0,10469,BRONX,2022,122,592184.8
1,10466,BRONX,2022,119,560980.3
3,10466,BRONX,2021,99,528377.083
4,10456,BRONX,2022,98,454902.9
2,10463,BRONX,2018,105,357383.417


Borough=BROOKLYN


Unnamed: 0,zipcode,borough,year,count,home_value
2,11211,BROOKLYN,2022,2730,1150638.4
3,11211,BROOKLYN,2020,2719,1036530.25
0,11211,BROOKLYN,2019,6231,1023273.75
1,11211,BROOKLYN,2018,3227,997202.917
4,11211,BROOKLYN,2017,2645,967886.0


Borough=MANHATTAN


Unnamed: 0,zipcode,borough,year,count,home_value
3,10013,MANHATTAN,2019,1779,3310300.5
4,10024,MANHATTAN,2019,1721,1726462.083
1,10018,MANHATTAN,2018,2485,1463682.25
2,10003,MANHATTAN,2017,2184,1402998.5
0,10009,MANHATTAN,2017,2593,1091941.333


Borough=QUEENS


Unnamed: 0,zipcode,borough,year,count,home_value
4,11103,QUEENS,2018,443,881343.583
1,11103,QUEENS,2017,584,822617.0
0,11106,QUEENS,2018,648,618868.333
3,11104,QUEENS,2019,465,578418.083
2,11106,QUEENS,2017,531,576068.75


Borough=STATEN ISLAND


Unnamed: 0,zipcode,borough,year,count,home_value
0,10301,STATEN ISLAND,2019,212,
1,10301,STATEN ISLAND,2022,143,
2,10301,STATEN ISLAND,2021,126,
3,10301,STATEN ISLAND,2015,124,
4,10301,STATEN ISLAND,2016,118,


### NYC data for all top airbnb zipcodes and their home values 2002 - 2022

In [25]:
display(all_boro_home_value)

Unnamed: 0,zipcode,borough,year,count,home_value
0,10469,BRONX,2022,122,592184.8
1,10466,BRONX,2022,119,560980.3
3,10466,BRONX,2021,99,528377.083
4,10456,BRONX,2022,98,454902.9
2,10463,BRONX,2018,105,357383.417
2,11211,BROOKLYN,2022,2730,1150638.4
3,11211,BROOKLYN,2020,2719,1036530.25
0,11211,BROOKLYN,2019,6231,1023273.75
1,11211,BROOKLYN,2018,3227,997202.917
4,11211,BROOKLYN,2017,2645,967886.0


### Test for normality

In [13]:
# Generate Shapiro wilk test for 
def generate_shapiro_test(all_boro_home_value, boro):
    data = all_boro_home_value[all_boro_home_value["borough"] == boro]
    shapiro_wilk_data = stats.shapiro(data['home_value'])
    print(boro)
    print("W={}, P={}".format(shapiro_wilk_data[0], shapiro_wilk_data[1]))

In [14]:
generate_shapiro_test(all_boro_home_value, "BRONX")
generate_shapiro_test(all_boro_home_value, "BROOKLYN")
generate_shapiro_test(all_boro_home_value, "QUEENS")
generate_shapiro_test(all_boro_home_value, "MANHATTAN")

BRONX
W=0.9325530529022217, P=0.6138810515403748
BROOKLYN
W=0.8819567561149597, P=0.31827250123023987
QUEENS
W=0.810954213142395, P=0.09922929108142853
MANHATTAN
W=0.7880368232727051, P=0.06450823694467545
