In [17]:
import pandas as pd
import numpy as np
from scipy import stats

In [18]:
# Suppress automatic scientific notation: render every float with 3 decimals
def _format_three_decimals(value):
    return '%.3f' % value

pd.set_option('display.float_format', _format_three_decimals)

# Load Datasets

### Top Airbnb zipcodes throughout NYC (2015-2022)
Load the top Airbnb zipcodes that need to be merged with the Zillow home values.

In [3]:
# Read the top-zipcode table and swap spaces for underscores in the borough
# labels.
top_zipcodes = pd.read_csv("./zipcode_data/nyc_top_zipcodes_all_years.csv").assign(
    borough=lambda frame: frame["borough"].replace(' ', '_', regex=True)
)
display(top_zipcodes.head())

Unnamed: 0,zipcode,borough,year,count
0,10469,BRONX,2022,122
1,10466,BRONX,2022,119
2,10463,BRONX,2018,105
3,10466,BRONX,2021,99
4,10456,BRONX,2022,98


### Zillow home values (2002-2022)
Load the Zillow home values for each zipcode in NYC.

In [4]:
# Load the Zillow home values.
home_values = pd.read_csv('../data/clean/zhome_value.csv', low_memory=False)
# Reduce each date to its calendar year with the vectorised .dt accessor
# (replaces a per-row Python lambda).
home_values["date"] = pd.to_datetime(home_values["date"]).dt.year
# Use the same column names as top_zipcodes ("year", "zipcode").
home_values = home_values.rename(columns={"date": "year", "zip_code": "zipcode"})
display(home_values.head())

Unnamed: 0,zipcode,state,city,borough,year,home_value
0,11368,NY,New York,QUEENS,2002,249885.0
1,11385,NY,New York,QUEENS,2002,333404.0
2,11236,NY,New York,BROOKLYN,2002,255687.0
3,11208,NY,New York,BROOKLYN,2002,235676.0
4,10467,NY,New York,BRONX,2002,174696.0


# Filter by significant years and compute yearly mean

### Filter by significant years from top zipcodes
Find the earliest and latest year present in `top_zipcodes`; only Zillow home values from that range need to be mapped.

In [5]:
# Earliest and latest year that appear in the top-zipcode table
airbnb_years = top_zipcodes["year"]
min_year = airbnb_years.min()
max_year = airbnb_years.max()
print("MIN={}\nMAX={}".format(min_year, max_year))

MIN=2015
MAX=2022


Use **MIN, MAX** years from top_zipcodes and narrow down zillow home_value years between [min, max]

In [22]:
# Keep only the rows whose year lies in [min_year, max_year] (both ends included)
year_in_window = home_values["year"].between(min_year, max_year, inclusive="both")
home_values = home_values[year_in_window]
observed_years = home_values["year"].unique().tolist()
print("Years present in Airbnb zipcode home value:", observed_years, end="\n\n")
home_values.info()

Years present in Airbnb zipcode home value: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1412 entries, 0 to 1411
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   zipcode     1412 non-null   int64  
 1   year        1412 non-null   int64  
 2   borough     1412 non-null   object 
 3   home_value  1412 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 55.2+ KB


### Compute yearly mean
Group by **zipcode, year, and borough** to compute the average `home_value` for each zipcode on a yearly basis. Notice the reduction in entries: 16577 --> 1412.

In [7]:
# Average home_value per (zipcode, year, borough); the group keys are kept as
# ordinary columns instead of becoming the index.
grouped_home_value = home_values.groupby(["zipcode", "year", "borough"], as_index=False)["home_value"]
home_values = grouped_home_value.mean()
home_values.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1412 entries, 0 to 1411
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   zipcode     1412 non-null   int64  
 1   year        1412 non-null   int64  
 2   borough     1412 non-null   object 
 3   home_value  1412 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 44.2+ KB


# Merge Airbnb top zipcode counts with Zillow home values
Merge the top 5 zipcodes of each borough with their corresponding Zillow home value, matching on
- **year, zipcode, and borough**

### Dataframes

In [8]:
# Show the first rows of both frames that are about to be merged
for frame_label, frame in (("top_zipcodes", top_zipcodes), ("home_values", home_values)):
    print(frame_label)
    display(frame.head())

top_zipcodes


Unnamed: 0,zipcode,borough,year,count
0,10469,BRONX,2022,122
1,10466,BRONX,2022,119
2,10463,BRONX,2018,105
3,10466,BRONX,2021,99
4,10456,BRONX,2022,98


home_values


Unnamed: 0,zipcode,year,borough,home_value
0,10001,2015,MANHATTAN,2280604.0
1,10001,2016,MANHATTAN,2369453.25
2,10001,2017,MANHATTAN,2271803.5
3,10001,2018,MANHATTAN,2351996.833
4,10001,2019,MANHATTAN,2380720.417


### Merging dataframes

In [9]:
# Left join: every top-zipcode row is kept and gains the home_value column
# wherever year, zipcode and borough all match.
merge_cols = ["year", "zipcode", "borough"]
merged_top_zipcodes = top_zipcodes.merge(home_values, how='left', on=merge_cols)
print("merged_top_zipcodes")
display(merged_top_zipcodes.head())

merged_top_zipcodes


Unnamed: 0,zipcode,borough,year,count,home_value
0,10469,BRONX,2022,122,592184.8
1,10466,BRONX,2022,119,560980.3
2,10463,BRONX,2018,105,357383.417
3,10466,BRONX,2021,99,528377.083
4,10456,BRONX,2022,98,454902.9


In [10]:
def generate_shapiro_test(df, boro):
    """Run a Shapiro-Wilk normality test on the home values of one borough.

    Args:
        df: DataFrame with a "borough" column and a numeric "home_value" column.
        boro: Borough label used to select the rows to test.

    Returns:
        A string of the form "<boro>:::W=<statistic>, P=<p-value>".

    Note:
        Missing home values are dropped before testing, because a single NaN
        would otherwise turn the statistic and the p-value into NaN. scipy
        documents a minimum of three observations for this test.
    """
    borough_values = df.loc[df["borough"] == boro, "home_value"].dropna()
    # Tuple unpacking works for both the plain tuple and the named result
    # object that different scipy versions return.
    w_statistic, p_value = stats.shapiro(borough_values)
    return "{}:::W={}, P={}".format(boro, w_statistic, p_value)

### Test for normality

In [11]:
# Distinct borough labels, in order of first appearance in top_zipcodes
boros = top_zipcodes["borough"].unique()
shapiro_summaries = [
    generate_shapiro_test(merged_top_zipcodes, borough_name)
    for borough_name in boros
]
shapiro_summaries

['BRONX:::W=0.9325530529022217, P=0.6138810515403748',
 'BROOKLYN:::W=0.8819567561149597, P=0.31827250123023987',
 'MANHATTAN:::W=0.7880368232727051, P=0.06450823694467545',
 'QUEENS:::W=0.810954213142395, P=0.09922929108142853',
 'STATEN_ISLAND:::W=0.941604495048523, P=0.6773035526275635']

# Merge remaining Airbnb zipcode counts with Zillow home values
Merge the remaining Airbnb zipcode counts (excluding the top 5) with their Zillow home values, matching on
- **year, zipcode, and borough**

### Dataframes

In [12]:
# Load the counts for the remaining zipcodes, then preview both merge inputs
remaining_zipcode_count = pd.read_csv("./zipcode_data/nyc_remaining_zipcode_count.csv")
for frame_label, frame in (
    ("remaining_zipcode_count", remaining_zipcode_count),
    ("home_values", home_values),
):
    print(frame_label)
    display(frame.head())

remaining_zipcode_count


Unnamed: 0,zipcode,borough,year,count
0,10001,MANHATTAN,2015,350
1,10001,MANHATTAN,2016,666
2,10001,MANHATTAN,2017,418
3,10001,MANHATTAN,2018,491
4,10001,MANHATTAN,2019,275


home_values


Unnamed: 0,zipcode,year,borough,home_value
0,10001,2015,MANHATTAN,2280604.0
1,10001,2016,MANHATTAN,2369453.25
2,10001,2017,MANHATTAN,2271803.5
3,10001,2018,MANHATTAN,2351996.833
4,10001,2019,MANHATTAN,2380720.417


### Merging dataframes

In [13]:
# Left join: every remaining-zipcode row is kept; rows without a matching
# (year, zipcode, borough) in home_values get a missing home_value.
# The former indicator=True / drop('_merge') pair was removed: it added a
# column only to delete it again, so the resulting frame is unchanged.
merge_cols = ["year", "zipcode", "borough"]
merged_reminaing = remaining_zipcode_count.merge(home_values, how="left", on=merge_cols)
print("merged_reminaing")
display(merged_reminaing.head())

merged_reminaing


Unnamed: 0,zipcode,borough,year,count,home_value
0,10001,MANHATTAN,2015,350,2280604.0
1,10001,MANHATTAN,2016,666,2369453.25
2,10001,MANHATTAN,2017,418,2271803.5
3,10001,MANHATTAN,2018,491,2351996.833
4,10001,MANHATTAN,2019,275,2380720.417


# Check for missing home values from remaining zipcodes

In [14]:
# Rockefeller Center (10020) is one example. Check what kinds of areas have no
# home value, looking for significant counts and/or notable places.
home_value_is_missing = merged_reminaing["home_value"].isna()
missing_zipcodes = merged_reminaing[home_value_is_missing]

In [30]:
def get_boro_missing_zipcodes(df, boro):
    """Summarise the rows of ``df`` that belong to one borough.

    Args:
        df: DataFrame with "borough", "year" and "zipcode" columns.
        boro: Borough label to filter on.

    Returns:
        A multi-line string holding the borough label, the number of matching
        rows (printed under the label "missing zipcode_count"), the sorted
        distinct years and the sorted distinct zipcodes, ending in two
        newlines.
    """
    boro_rows = df[df["borough"] == boro]
    years_sorted = sorted(boro_rows["year"].unique().tolist())
    zipcodes_sorted = sorted(boro_rows["zipcode"].unique().tolist())
    report_parts = [
        "{}\nmissing zipcode_count:{}".format(boro, len(boro_rows)),
        "\nmissing years: {}".format(years_sorted),
        "\nmissing zipcodes:{}\n\n".format(zipcodes_sorted),
    ]
    return "".join(report_parts)

### Explain in the EDA why these zipcodes are missing home values and why they are not needed
- So far, the missing **MANHATTAN** zipcodes are justifiable, along with parts of **BROOKLYN**
- TODO: justify **BRONX** and **QUEENS**

In [31]:
# Print one missing-value report per borough. A plain loop is used because
# print() is called only for its side effect; no throwaway list of None values
# is built, so nothing has to be suppressed afterwards.
boros = top_zipcodes["borough"].unique()
for boro in boros:
    print(get_boro_missing_zipcodes(missing_zipcodes, boro))

BRONX
missing zipcode_count:25
missing years: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
missing zipcodes:[10474, 10550, 10704, 10705, 10803]


BROOKLYN
missing zipcode_count:8
missing years: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
missing zipcodes:[11693]


MANHATTAN
missing zipcode_count:20
missing years: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
missing zipcodes:[10020, 10037, 10115]


QUEENS
missing zipcode_count:54
missing years: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
missing zipcodes:[11001, 11005, 11040, 11096, 11109, 11351, 11359, 11371, 11430, 11580, 11697]


STATEN_ISLAND
missing zipcode_count:0
missing years: []
missing zipcodes:[]





It is common practice to drop rows with NaN ("not a number") values from a dataset before analyzing it, as these rows can introduce bias or errors into the analysis. In our case, we found it necessary to drop rows with NaN values when the zip codes correspond to locations that do not have homes, such as hospitals, churches, stadiums, etc. The Zillow home value data is likely not applicable to these types of locations, and including them in the analysis could skew the results.
By dropping the rows with NaN values, we can ensure that the analysis is based on a more accurate sample of the data. This helps improve the reliability and validity of the analysis and provides more meaningful insights into the data. We have created CSV files with the zip codes that do not have Zillow home value data, so that this data remains accessible if it is needed.
In our dataframe, Staten Island has no zip codes missing. Queens has 11 zip codes missing. Manhattan has 3 zip codes missing. Brooklyn has 1 zip code missing. The Bronx has 5 zip codes missing.
