In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import os

In [2]:
# Suppress automatic scientific notation: render every float with three decimals
pd.options.display.float_format = '{:.3f}'.format

# Load Datasets

### Top airbnb zipcodes throughout nyc
Get zip codes that need to be mapped to zillow home values

In [3]:
# Read the top zip codes and normalise borough names so that spaces become
# underscores (e.g. "STATEN ISLAND" -> "STATEN_ISLAND").
nyc_top_zipcodes_all_years = (
    pd.read_csv("./zipcode_data/nyc_top_zipcodes_all_years.csv")
    .assign(borough=lambda frame: frame['borough'].str.replace(' ', '_'))
)
display(nyc_top_zipcodes_all_years.head())

Unnamed: 0,zipcode,borough,year,count
0,10469,BRONX,2022,122
1,10466,BRONX,2022,119
2,10463,BRONX,2018,105
3,10466,BRONX,2021,99
4,10456,BRONX,2022,98


### Zillow home value prices
Zillow home value data for each NYC zip code, covering the years 2002-2022

In [4]:
# Load the cleaned Zillow home values, reduce the "date" column to its calendar
# year, and rename columns to match the top-zipcode frame ("year", "zipcode").
zillow_home_value = pd.read_csv('../data/clean/zhome_value.csv', low_memory=False)
# Vectorised .dt accessor instead of a per-element Python lambda
zillow_home_value["date"] = pd.to_datetime(zillow_home_value["date"]).dt.year
zillow_home_value = zillow_home_value.rename(columns={"date": "year", "zip_code": "zipcode"})
display(zillow_home_value.head())

Unnamed: 0,zipcode,state,city,borough,year,home_value
0,11368,NY,New York,QUEENS,2002,249885.0
1,11385,NY,New York,QUEENS,2002,333404.0
2,11236,NY,New York,BROOKLYN,2002,255687.0
3,11208,NY,New York,BROOKLYN,2002,235676.0
4,10467,NY,New York,BRONX,2002,174696.0


### Filter by significant years from top zipcodes
Find min and max year for top zipcodes that need to be mapped from "nyc_top_zipcodes_all_years"

In [5]:
# Earliest and latest year that appear among the top zip codes
min_year = nyc_top_zipcodes_all_years["year"].min()
max_year = nyc_top_zipcodes_all_years["year"].max()
print("MIN={}\nMAX={}".format(min_year, max_year))

MIN=2015
MAX=2022


Using the **min** and **max** years from `nyc_top_zipcodes_all_years`, restrict the Zillow dataset to that range of years (both endpoints included)

In [6]:
# Keep only Zillow rows whose year lies in [min_year, max_year].
# `inclusive` must be the string "both": the boolean form is deprecated and
# is rejected by newer pandas releases.
zillow_home_value = zillow_home_value[zillow_home_value["year"].between(min_year, max_year, inclusive="both")]
display(zillow_home_value)

  zillow_home_value = zillow_home_value[zillow_home_value["year"].between(min_year, max_year, inclusive=True)]


Unnamed: 0,zipcode,state,city,borough,year,home_value
26486,11368,NY,New York,QUEENS,2015,417755.000
26487,11385,NY,New York,QUEENS,2015,537186.000
26488,11236,NY,New York,BROOKLYN,2015,401310.000
26489,11208,NY,New York,BROOKLYN,2015,396395.000
26490,10467,NY,New York,BRONX,2015,278072.000
...,...,...,...,...,...,...
43058,10464,NY,New York,BRONX,2022,609781.000
43059,10004,NY,New York,MANHATTAN,2022,1081807.000
43060,10006,NY,New York,MANHATTAN,2022,902566.000
43061,11243,NY,New York,BROOKLYN,2022,1426544.000


### Group by zipcode, year, borough and compute the average for home value

In [7]:
# Collapse to one row per (zipcode, year, borough) holding the mean home value
zillow_home_value = (
    zillow_home_value
    .groupby(["zipcode", "year", "borough"])["home_value"]
    .mean()
    .reset_index()
)
display(zillow_home_value)

Unnamed: 0,zipcode,year,borough,home_value
0,10001,2015,MANHATTAN,2280604.000
1,10001,2016,MANHATTAN,2369453.250
2,10001,2017,MANHATTAN,2271803.500
3,10001,2018,MANHATTAN,2351996.833
4,10001,2019,MANHATTAN,2380720.417
...,...,...,...,...
1407,11694,2018,QUEENS,769236.167
1408,11694,2019,QUEENS,819271.333
1409,11694,2020,QUEENS,829625.583
1410,11694,2021,QUEENS,897179.167


### Map home value to top airbnb zipcodes for each borough in NYC

In [8]:
# Helper function that creates csv file from data frame
def create_csv(data, folder_name, fname_suffix, boro, display_=False):
    """Write ``data`` to ``./zipcode_data/<folder_name>/<boro>_<fname_suffix>.csv``.

    Parameters
    ----------
    data : DataFrame to save (written without its index).
    folder_name : sub-folder of ``./zipcode_data`` that receives the file.
    fname_suffix : text appended after the borough in the file name.
    boro : borough name; lower-cased for the file name.
    display_ : when True, print the borough and display ``data`` before saving.

    Prints the path of the file that was created. Returns None.
    """
    outdir = "./zipcode_data/{}".format(folder_name)
    # makedirs also creates a missing "./zipcode_data" parent and does not
    # fail if the directory already exists (no exists()/mkdir() race).
    os.makedirs(outdir, exist_ok=True)
    if display_:
        print("Borough={}".format(boro))
        display(data)
    file_name = "{}/{}_{}.csv".format(outdir, boro.lower(), fname_suffix)
    data.to_csv(file_name, index=False)
    print("Created {}".format(file_name))

Helper function to narrow down each data frame to borough and map home prices for that year

In [9]:
def merge_zipcode_home_value(zipcodes, zillow_home_value, boro):
    """Attach home values to the zip codes of a single borough.

    Both frames are restricted to rows whose "borough" equals ``boro``; the
    zip-code rows are then left-joined to the home values on
    ("zipcode", "year"), so zip codes without a match keep NaN values.
    """
    join_keys = ['zipcode', 'year']
    left = zipcodes.loc[zipcodes["borough"] == boro]
    # "borough" is dropped from the right side so the merge does not produce
    # duplicated borough columns.
    right = zillow_home_value.loc[zillow_home_value["borough"] == boro].drop(columns="borough")
    return left.merge(right, how='left', on=join_keys)

Display the mapped home values for each top zipcode in each borough, and also build the cumulative DataFrame `all_boro_home_value`

In [10]:
# For every borough: map home values onto its top zip codes, sort by home value
# (highest first), save the result as a CSV and keep it for the combined frame.
boros = ["STATEN_ISLAND", "BROOKLYN", "BRONX", "QUEENS", "MANHATTAN"]
boro_frames = []
for boro in boros:
    data = merge_zipcode_home_value(nyc_top_zipcodes_all_years, zillow_home_value, boro).sort_values(by=["home_value"], ascending=False)
    create_csv(data, "top_zipcodes", "top_zipcode_home_value", boro, True)
    boro_frames.append(data)
# Concatenate once instead of re-copying a growing frame on every iteration
# (and without seeding the result from an empty DataFrame).
all_boro_home_value = pd.concat(boro_frames)

Borough=STATEN_ISLAND


Unnamed: 0,zipcode,borough,year,count,home_value
1,10301,STATEN_ISLAND,2022,143,667275.2
2,10301,STATEN_ISLAND,2021,126,613591.25
0,10301,STATEN_ISLAND,2019,212,581955.833
4,10301,STATEN_ISLAND,2016,118,478215.833
3,10301,STATEN_ISLAND,2015,124,438929.167


Created ./zipcode_data/top_zipcodes/staten_island_top_zipcode_home_value.csv
Borough=BROOKLYN


Unnamed: 0,zipcode,borough,year,count,home_value
2,11211,BROOKLYN,2022,2730,1150638.4
3,11211,BROOKLYN,2020,2719,1036530.25
0,11211,BROOKLYN,2019,6231,1023273.75
1,11211,BROOKLYN,2018,3227,997202.917
4,11211,BROOKLYN,2017,2645,967886.0


Created ./zipcode_data/top_zipcodes/brooklyn_top_zipcode_home_value.csv
Borough=BRONX


Unnamed: 0,zipcode,borough,year,count,home_value
0,10469,BRONX,2022,122,592184.8
1,10466,BRONX,2022,119,560980.3
3,10466,BRONX,2021,99,528377.083
4,10456,BRONX,2022,98,454902.9
2,10463,BRONX,2018,105,357383.417


Created ./zipcode_data/top_zipcodes/bronx_top_zipcode_home_value.csv
Borough=QUEENS


Unnamed: 0,zipcode,borough,year,count,home_value
4,11103,QUEENS,2018,443,881343.583
1,11103,QUEENS,2017,584,822617.0
0,11106,QUEENS,2018,648,618868.333
3,11104,QUEENS,2019,465,578418.083
2,11106,QUEENS,2017,531,576068.75


Created ./zipcode_data/top_zipcodes/queens_top_zipcode_home_value.csv
Borough=MANHATTAN


Unnamed: 0,zipcode,borough,year,count,home_value
3,10013,MANHATTAN,2019,1779,3310300.5
4,10024,MANHATTAN,2019,1721,1726462.083
1,10018,MANHATTAN,2018,2485,1463682.25
2,10003,MANHATTAN,2017,2184,1402998.5
0,10009,MANHATTAN,2017,2593,1091941.333


Created ./zipcode_data/top_zipcodes/manhattan_top_zipcode_home_value.csv


### NYC data for all top airbnb zipcodes and their home values 2015 - 2022

In [11]:
# Show the combined frame holding every borough's top zip codes with their home values
display(all_boro_home_value)

Unnamed: 0,zipcode,borough,year,count,home_value
1,10301,STATEN_ISLAND,2022,143,667275.2
2,10301,STATEN_ISLAND,2021,126,613591.25
0,10301,STATEN_ISLAND,2019,212,581955.833
4,10301,STATEN_ISLAND,2016,118,478215.833
3,10301,STATEN_ISLAND,2015,124,438929.167
2,11211,BROOKLYN,2022,2730,1150638.4
3,11211,BROOKLYN,2020,2719,1036530.25
0,11211,BROOKLYN,2019,6231,1023273.75
1,11211,BROOKLYN,2018,3227,997202.917
4,11211,BROOKLYN,2017,2645,967886.0


### Test for normality

In [12]:
# Generate Shapiro-Wilk test for the home values of one borough
def generate_shapiro_test(all_boro_home_value, boro):
    """Run a Shapiro-Wilk test on one borough's ``home_value`` column.

    Rows are selected where "borough" equals ``boro``; the borough name and
    the resulting W statistic and p-value are printed. Returns None.
    """
    borough_rows = all_boro_home_value.loc[all_boro_home_value["borough"] == boro]
    w_stat, p_value = stats.shapiro(borough_rows['home_value'])
    print(boro)
    print("W={}, P={}".format(w_stat, p_value))

In [13]:
# Run the normality test for each of these boroughs, in this order
for shapiro_boro in ("BRONX", "BROOKLYN", "QUEENS", "MANHATTAN"):
    generate_shapiro_test(all_boro_home_value, shapiro_boro)

BRONX
W=0.9325530529022217, P=0.6138810515403748
BROOKLYN
W=0.8819567561149597, P=0.31827250123023987
QUEENS
W=0.810954213142395, P=0.09922929108142853
MANHATTAN
W=0.7880368232727051, P=0.06450823694467545


### Get Remaining Zip codes (with top 5 excluded) for each borough

In [14]:
# Non-intersecting zipcodes: all Zillow zipcode rows minus the top zipcodes.
# A left merge with indicator=True tags each Zillow row; rows tagged
# "left_only" have no counterpart in nyc_top_zipcodes_all_years.
remaining_zipcodes = (
    zillow_home_value
    .merge(nyc_top_zipcodes_all_years, indicator=True, how="left")
    .loc[lambda x: x["_merge"] == "left_only"]
    # keyword form: the positional axis argument of drop() is deprecated
    .drop(columns="_merge")
)

  remaining_zipcodes = zillow_home_value.merge(nyc_top_zipcodes_all_years, indicator=True, how="left")[lambda x: x._merge=='left_only'].drop('_merge',1)


In [15]:
def get_remaining_zipcodes_zipcodes(remaining_zipcodes, boro):
    """Return the rows of ``remaining_zipcodes`` whose "borough" equals ``boro``."""
    in_borough = remaining_zipcodes["borough"] == boro
    return remaining_zipcodes.loc[in_borough]

In [16]:
# Non intersecting zipcodes
def create_remaining_zipcode_csvs(remaining_zipcodes, display=False):
    """Write one "remaining zipcodes" CSV per borough via ``create_csv``.

    ``display`` is forwarded to ``create_csv`` as its display flag.
    """
    for boro in ("BRONX", "BROOKLYN", "QUEENS", "MANHATTAN", "STATEN_ISLAND"):
        create_csv(
            get_remaining_zipcodes_zipcodes(remaining_zipcodes, boro),
            "remaining_zipcodes",
            "remaining_zipcode_home_value",
            boro,
            display,
        )

In [17]:
# Uncomment to (re)generate the per-borough remaining-zipcode CSV files:
# create_remaining_zipcode_csvs(remaining_zipcodes, True)

In [18]:
# Load the per-zipcode, per-year counts for the remaining (non-top) zip codes.
# NOTE(review): presumably Airbnb listing counts with zipcode/borough/year/count columns — confirm against the CSV
remaining_zipcode_count = pd.read_csv("./zipcode_data/nyc_remaining_zipcode_count.csv")

In [19]:
# Number of distinct zip codes in the remaining-zipcode counts
print("Remaining zipcodes count:", remaining_zipcode_count["zipcode"].nunique(dropna=False))

Remaining zipcodes count: 186


In [20]:
# Number of distinct zip codes covered by the Zillow home value data
print("Zillow zipcode count:", zillow_home_value["zipcode"].nunique(dropna=False))

Zillow zipcode count: 177


In [21]:
# Left-join the remaining zip code counts to the Zillow home values on
# (year, zipcode); zip codes missing from Zillow keep NaN home values.
merge_cols = ["year", "zipcode"]
remaining_mapped = remaining_zipcode_count.merge(zillow_home_value, how='left', on=merge_cols)

In [22]:
# Inspect the left-merge result; rows without a Zillow match show NaN in borough_y / home_value
remaining_mapped

Unnamed: 0,zipcode,borough_x,year,count,borough_y,home_value
0,10001,MANHATTAN,2015,350,MANHATTAN,2280604.000
1,10001,MANHATTAN,2016,666,MANHATTAN,2369453.250
2,10001,MANHATTAN,2017,418,MANHATTAN,2271803.500
3,10001,MANHATTAN,2018,491,MANHATTAN,2351996.833
4,10001,MANHATTAN,2019,275,MANHATTAN,2380720.417
...,...,...,...,...,...,...
1399,11694,QUEENS,2020,11,QUEENS,829625.583
1400,11694,QUEENS,2021,43,QUEENS,897179.167
1401,11694,QUEENS,2022,64,QUEENS,955955.100
1402,11697,QUEENS,2016,1,,


In [25]:
# Rockefeller Center (10020): check what types of areas we're missing.
# Look for significant counts and/or places
remaining_mapped.loc[remaining_mapped["home_value"].isna()]

Unnamed: 0,zipcode,borough_x,year,count,borough_y,home_value
108,10020,MANHATTAN,2015,460,,
109,10020,MANHATTAN,2016,458,,
110,10020,MANHATTAN,2017,219,,
111,10020,MANHATTAN,2018,253,,
112,10020,MANHATTAN,2019,533,,
...,...,...,...,...,...,...
1327,11430,QUEENS,2022,2,,
1368,11580,QUEENS,2021,7,,
1369,11580,QUEENS,2022,7,,
1402,11697,QUEENS,2016,1,,
