In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Supress auto scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Load Datasets

### Top airbnb zipcodes throughout nyc  (2015-2022)
Get top airbnb zip codes that need to be merged with zillow home values

In [3]:
top_zipcodes = pd.read_csv("./zipcode_data/unmapped_data/nyc_top_zipcodes_count.csv")
display(top_zipcodes)

Unnamed: 0,zipcode,borough,year,count
0,10469,BRONX,2022,122
1,10466,BRONX,2022,119
2,10463,BRONX,2018,105
3,10466,BRONX,2021,99
4,10456,BRONX,2022,98
5,11211,BROOKLYN,2019,6231
6,11211,BROOKLYN,2018,3227
7,11211,BROOKLYN,2022,2730
8,11211,BROOKLYN,2020,2719
9,11211,BROOKLYN,2017,2645


In [4]:
# EXPORT
# top_zipcodes.to_csv("./zipcode_data/nyc_top_zipcodes_all_years.csv", index=False)
# display(top_zipcodes)
# print("CSV Generated")

### Rent values (2002-2022)
Get rent values for each zipcode in nyc

In [5]:
rent_vals = pd.read_csv('../data/clean/nyc_rental_prices.csv', low_memory=False)

In [6]:
# EXPORT
# rent_vals.to_csv("../data/clean/nyc_rental_prices.csv", index=False)
# display(rent_vals)
# print("CSV Generated")

### Compute yearly mean
Group by **zipcode, year, and borough** to compute the average for home_value for each zipcode on a yearly basis. Entry amount stays the same.

In [7]:
rent_vals = rent_vals.groupby(["zipcode", "year", "borough"])["rent"].mean().to_frame().reset_index()

# Merge airbnb top zipcode count count to zillow rent values
Merge all the top 5 zipcodes for each borough to their appropriate rent values based on
- **year, zipcode, and borough**

### Dataframes

In [8]:
print("top_zipcodes")
display(top_zipcodes.head())
print("rent_vals")
display(rent_vals.head())

top_zipcodes


Unnamed: 0,zipcode,borough,year,count
0,10469,BRONX,2022,122
1,10466,BRONX,2022,119
2,10463,BRONX,2018,105
3,10466,BRONX,2021,99
4,10456,BRONX,2022,98


rent_vals


Unnamed: 0,zipcode,year,borough,rent
0,7020,2022,QUEENS,2741.0
1,10001,2015,MANHATTAN,4010.0
2,10001,2016,MANHATTAN,4022.983
3,10001,2017,MANHATTAN,3958.383
4,10001,2018,MANHATTAN,4001.067


### Merging dataframes

In [9]:
merge_cols = ["year", "zipcode", "borough"]
merged_top_zipcodes = pd.merge(top_zipcodes, rent_vals,  how='left', left_on=merge_cols, right_on=merge_cols)
merged_top_zipcodes = merged_top_zipcodes.dropna()
print("merged_top_zipcodes")
display(merged_top_zipcodes.head())
print("count:", len(merged_top_zipcodes))

merged_top_zipcodes


Unnamed: 0,zipcode,borough,year,count,rent
0,10469,BRONX,2022,122,1883.011
1,10466,BRONX,2022,119,1934.625
2,10463,BRONX,2018,105,2308.025
4,10456,BRONX,2022,98,2235.944
5,11211,BROOKLYN,2019,6231,3335.467


count: 19


In [10]:
merged_top_zipcodes = merged_top_zipcodes.groupby(["zipcode", "borough", "year"])[["rent", "count"]].mean().reset_index()

In [11]:
# merged_top_zipcodes.to_csv("../stats/zipcode_data/mapped_data/top_zipcodes_rent_val_mapped.csv", index=False)
# display(merged_top_zipcodes)
# print("CSV Generated")

# Merge remaining airbnb zipcode count  to rent values
Merge remaining zillow zipcodes (excluding top 5) to remaining airbnb count based on
- **year, zipcode, and borough**

### Dataframes

In [12]:
remaining_zipcode_count = pd.read_csv("./zipcode_data/unmapped_data/nyc_remaining_zipcode_count.csv")
print("remaining_zipcode_count")
display(remaining_zipcode_count.head())
print("rent_vals")
display(rent_vals.head())

remaining_zipcode_count


Unnamed: 0,zipcode,borough,year,count
0,10001,MANHATTAN,2015,350
1,10001,MANHATTAN,2016,666
2,10001,MANHATTAN,2017,418
3,10001,MANHATTAN,2018,491
4,10001,MANHATTAN,2019,275


rent_vals


Unnamed: 0,zipcode,year,borough,rent
0,7020,2022,QUEENS,2741.0
1,10001,2015,MANHATTAN,4010.0
2,10001,2016,MANHATTAN,4022.983
3,10001,2017,MANHATTAN,3958.383
4,10001,2018,MANHATTAN,4001.067


### Merging dataframes

In [13]:
merge_cols = ["year", "zipcode", "borough"]
merged_reminaing = remaining_zipcode_count.merge(rent_vals, indicator=True, how="left", left_on=merge_cols, right_on=merge_cols).drop(columns=['_merge'])
merged_reminaing = merged_reminaing.dropna()
print("merged_reminaing")
display(merged_reminaing.head())
print("count:", len(merged_reminaing))

merged_reminaing


Unnamed: 0,zipcode,borough,year,count,rent
0,10001,MANHATTAN,2015,350,4010.0
1,10001,MANHATTAN,2016,666,4022.983
2,10001,MANHATTAN,2017,418,3958.383
3,10001,MANHATTAN,2018,491,4001.067
4,10001,MANHATTAN,2019,275,4145.267


count: 732


In [14]:
merged_reminaing = merged_reminaing.groupby(["zipcode", "borough", "year"])[["rent", "count"]].mean().reset_index()

In [15]:
# merged_reminaing.to_csv("./zipcode_data/mapped_data/remaining_zipcodes_rent_val_mapped.csv", index=False)
# display(merged_reminaing)
# print("CSV Generated")

# Check for missing home values from remaining zipcodes

In [16]:
#Rockerfeller center 10020, check what type of areas we're missing.Look for significant counts andor/ places
missing_zipcodes = merged_reminaing[merged_reminaing["rent"].isnull()]   

In [17]:
# EXPORT
# missing_zipcodes.to_csv("./zipcode_data/missing_home_rent_values/missing_zipcode_rent_values.csv", index=False)
# display(missing_zipcodes)
# print("CSV Generated")

In [18]:
def get_boro_missing_zipcodes(df, boro):
    missing_boro = df[df["borough"] == boro]
    missing_years = missing_boro["year"].unique().tolist()
    missing_zipcodes = missing_boro["zipcode"].unique().tolist()
    missing_zipcodes.sort();     missing_years.sort()
    str1 = "{}\nmissing zipcode_count:{}".format(boro, len(missing_boro))
    str2 = "\nmissing years: {}".format(missing_years)
    str3 = "\nmissing zipcodes:{}\n\n".format(missing_zipcodes)
    return str1 + str2 + str3

### Explain in EDA as to why these zipcodes are missing home values and not necessary
- So far, **MANHATTAN** has justifiable zipcodes along with parts of **BROOKLYN**
- TODO: **BRONX**, **QUEENS**

In [19]:
boros = top_zipcodes["borough"].unique()
[print(get_boro_missing_zipcodes(missing_zipcodes, boro)) for boro in boros]; display()

BRONX
missing zipcode_count:0
missing years: []
missing zipcodes:[]


BROOKLYN
missing zipcode_count:0
missing years: []
missing zipcodes:[]


MANHATTAN
missing zipcode_count:0
missing years: []
missing zipcodes:[]


QUEENS
missing zipcode_count:0
missing years: []
missing zipcodes:[]


STATEN_ISLAND
missing zipcode_count:0
missing years: []
missing zipcodes:[]


