In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Supress auto scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Load Datasets

### Top airbnb zipcodes throughout nyc  (2015-2022)
Get top airbnb zip codes that need to be merged with zillow home values

In [3]:
top_zipcodes = pd.read_csv("./zipcode_data/unmapped_data/nyc_top_zipcodes_count.csv")
display(top_zipcodes)

Unnamed: 0,zipcode,borough,year,count
0,10469,BRONX,2022,122
1,10466,BRONX,2022,119
2,10463,BRONX,2018,105
3,10466,BRONX,2021,99
4,10456,BRONX,2022,98
5,11211,BROOKLYN,2019,6231
6,11211,BROOKLYN,2018,3227
7,11211,BROOKLYN,2022,2730
8,11211,BROOKLYN,2020,2719
9,11211,BROOKLYN,2017,2645


### Zillow home values (2002-2022)
Get Zillow home values for each zipcode in nyc

In [4]:
home_values = pd.read_csv('../data/clean/zhome_value.csv', low_memory=False)

In [5]:
# EXPORT
# home_values.to_csv("../data/clean/zhome_value.csv", index=False)
# display(home_values)
# print("CSV Generated")

# Filter by significant years and compute yearly mean

### Filter by significant years from top zipcodes
Find min and max year for top zipcodes that need to be mapped from "zillow_home_values"

In [6]:
min_year, max_year = top_zipcodes["year"].min(), top_zipcodes["year"].max()
print("MIN={}\nMAX={}".format(min_year, max_year))

MIN=2015
MAX=2022


Use **MIN, MAX** years from top_zipcodes and narrow down zillow home_value years between [min, max]

In [7]:
home_values = home_values[home_values["year"].between(min_year, max_year, inclusive="both")]
print("Years present in Airbnb zipcode home value:", home_values["year"].unique().tolist(), end="\n\n")

Years present in Airbnb zipcode home value: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]



### Compute yearly mean
Group by **zipcode, year, and borough** to compute the average for home_value for each zipcode on a yearly basis. Notice the reduction in entries. 16577 --> 1412

In [8]:
home_values = home_values.groupby(["zipcode", "year", "borough"])["home_value"].mean().to_frame().reset_index()

# Merge airbnb top zipcode count count to zillow home values
Merge all the top 5 zipcodes for each borough to their appropriate zillow home value based on
- **year, zipcode, and borough**

### Dataframes

In [9]:
print("top_zipcodes")
display(top_zipcodes.head())
print("home_values")
display(home_values.head())

top_zipcodes


Unnamed: 0,zipcode,borough,year,count
0,10469,BRONX,2022,122
1,10466,BRONX,2022,119
2,10463,BRONX,2018,105
3,10466,BRONX,2021,99
4,10456,BRONX,2022,98


home_values


Unnamed: 0,zipcode,year,borough,home_value
0,10001,2015,MANHATTAN,2280604.0
1,10001,2016,MANHATTAN,2369453.25
2,10001,2017,MANHATTAN,2271803.5
3,10001,2018,MANHATTAN,2351996.833
4,10001,2019,MANHATTAN,2380720.417


### Merging dataframes

In [10]:
merge_cols = ["year", "zipcode", "borough"]
merged_top_zipcodes = pd.merge(top_zipcodes, home_values,  how='left', left_on=merge_cols, right_on=merge_cols)
merged_top_zipcodes = merged_top_zipcodes.dropna()
print("merged_top_zipcodes")
display(merged_top_zipcodes.head())
print("count:", len(merged_reminaing))

merged_top_zipcodes


Unnamed: 0,zipcode,borough,year,count,home_value
0,10469,BRONX,2022,122,592184.8
1,10466,BRONX,2022,119,560980.3
2,10463,BRONX,2018,105,357383.417
3,10466,BRONX,2021,99,528377.083
4,10456,BRONX,2022,98,454902.9


NameError: name 'merged_reminaing' is not defined

In [None]:
# EXPORT
# merged_top_zipcodes.to_csv("./zipcode_data/mapped_data/top_zipcodes_home_val_mapped.csv", index=False)
# display(merged_top_zipcodes)
# print("CSV Generated")

In [None]:
# Generate Shapiro wilk test for 
def generate_shapiro_test(df, boro):
    data = df[df["borough"] == boro]
    shapiro_wilk_data = stats.shapiro(data['home_value'])
    return "{}:::W={}, P={}".format(boro, shapiro_wilk_data[0], shapiro_wilk_data[1])

### Test for normality

In [None]:
boros = top_zipcodes["borough"].unique()
[generate_shapiro_test(merged_top_zipcodes, boro) for boro in boros]

# Merge remaining airbnb zipcode count  to zillow home values
Merge remaining zillow zipcodes (excluding top 5) to remaining airbnb count based on
- **year, zipcode, and borough**

### Dataframes

In [None]:
remaining_zipcode_count = pd.read_csv("./zipcode_data/unmapped_data/nyc_remaining_zipcode_count.csv")
print("remaining_zipcode_count")
display(remaining_zipcode_count.head())
print("home_values")
display(home_values.head())

### Merging dataframes

In [None]:
merge_cols = ["year", "zipcode", "borough"]
merged_reminaing = remaining_zipcode_count.merge(home_values, indicator=True, how="left", left_on=merge_cols, right_on=merge_cols).drop(columns=['_merge'])
merged_reminaing = merged_reminaing.dropna()
print("merged_reminaing")
display(merged_reminaing.head())
print("count:", len(merged_reminaing))

In [None]:
# EXPORT
# merged_reminaing.to_csv("./zipcode_data/mapped_data/remaining_zipcodes_home_val_mapped.csv", index=False)
# display(merged_reminaing)
# print("CSV Generated")

# Check for missing home values from remaining zipcodes

In [None]:
#Rockerfeller center 10020, check what type of areas we're missing.Look for significant counts andor/ places
missing_zipcodes = merged_reminaing[merged_reminaing["home_value"].isnull()]   

In [None]:
# EXPORT
# missing_zipcodes.to_csv("./zipcode_data/missing_home_rent_values/missing_zipcode_home_values.csv", index=False)
# display(missing_zipcodes)
# print("CSV Generated")

In [None]:
def get_boro_missing_zipcodes(df, boro):
    missing_boro = df[df["borough"] == boro]
    missing_years = missing_boro["year"].unique().tolist()
    missing_zipcodes = missing_boro["zipcode"].unique().tolist()
    missing_zipcodes.sort();     missing_years.sort()
    str1 = "{}\nmissing zipcode_count:{}".format(boro, len(missing_boro))
    str2 = "\nmissing years: {}".format(missing_years)
    str3 = "\nmissing zipcodes:{}\n\n".format(missing_zipcodes)
    return str1 + str2 + str3

### Explain in EDA as to why these zipcodes are missing home values and not necessary
- So far, **MANHATTAN** has justifiable zipcodes along with parts of **BROOKLYN**
- TODO: **BRONX**, **QUEENS**

In [None]:
boros = top_zipcodes["borough"].unique()
[print(get_boro_missing_zipcodes(missing_zipcodes, boro)) for boro in boros]; display()