In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned listings file, then swap spaces for underscores in the
# borough names so they match keys such as "STATEN_ISLAND" used further down.
df = pd.read_csv(
    '../data/clean/inside_airbnb_clean.gz',
    low_memory=False,
).assign(
    borough=lambda listings: listings["borough"].replace(' ', '_', regex=True)
)

In [3]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,id,last_scraped,host_id,host_name,neighbourhood,latitude,longitude,room_type,bedrooms,availability_365,calculated_host_listings_count,geom,zipcode,borough,year,availability_pct
0,2539,2022-09-07,2787,John,Kensington,40.64529,-73.97238,Private room,1.0,356,9,"40.64529,-73.97238",11218,BROOKLYN,2022,97.534247
1,5121,2022-09-07,7356,Garon,Bedford-Stuyvesant,40.68535,-73.95512,Private room,1.0,335,2,"40.68535,-73.95512",11238,BROOKLYN,2022,91.780822
2,45910,2022-09-07,204539,Mark,Ridgewood,40.70309,-73.89963,Entire home/apt,5.0,365,6,"40.70309,-73.89963",11385,QUEENS,2022,100.0
3,5136,2022-09-07,7378,Rebecca,Sunset Park,40.66265,-73.99454,Entire home/apt,2.0,179,1,"40.66265,-73.99454",11215,BROOKLYN,2022,49.041096
4,45935,2022-09-07,204586,L,Mott Haven,40.80635,-73.92201,Private room,1.0,83,1,"40.80635,-73.92201",10454,BRONX,2022,22.739726


In [4]:
def get_groupby_count(df, groupby_cols, value):
    """Count the non-null entries of ``value`` within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate.
    groupby_cols : str or list of str
        Column(s) that define the groups.
    value : str
        Column whose non-null entries are counted per group.

    Returns
    -------
    pandas.DataFrame
        One row per group: the grouping columns followed by a ``count``
        column holding the per-group tally.
    """
    per_group = df.groupby(groupby_cols)[value]
    # Series -> single-column frame so the group keys can be restored as
    # ordinary columns by reset_index().
    tally = per_group.count().to_frame()
    return tally.reset_index().rename(columns={value: 'count'})

In [5]:
# Listing counts per (count_col, borough, year) combination
def create_boro_count_df(df, count_col):
    """Count listings for every (``count_col``, borough, year) combination.

    ``count_col`` is the column to break the counts down by (e.g. "zipcode").
    Returns a frame with columns ``count_col``, "borough", "year" and "count",
    where "count" is the number of non-null "id" values in each group.
    """
    group_keys = [count_col, "borough", "year"]
    counts = get_groupby_count(df, group_keys, "id")
    return counts.reset_index(drop=True)

In [6]:
# Listings counted per (zipcode, borough, year).
zipcode_count = create_boro_count_df(df, count_col="zipcode")

In [7]:
# Gets the leading zipcodes for a borough at a specific year or across all years
def get_leading_zipcode(df, n, boro, year=None):
    """Return the ``n`` rows with the highest ``count`` for one borough.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns "borough", "year" and "count".
    n : int
        Maximum number of rows to return.
    boro : str
        Value of the "borough" column to keep.
    year : optional
        When given, only rows whose "year" equals this value are ranked.
        When ``None`` (default), rows from every year are ranked together.

    Returns
    -------
    pandas.DataFrame
        Up to ``n`` rows sorted by "count" descending, with a fresh 0-based
        index. Empty if nothing matches.

    Notes
    -----
    With ``year=None`` the counts are NOT summed across years; rows are
    ranked exactly as they appear in ``df``. If ``df`` holds one row per
    (zipcode, year), the same zipcode can therefore occupy several of the
    top ``n`` slots.
    """
    keep = df["borough"] == boro
    # Identity check against None (PEP 8) instead of `!= None`.
    if year is not None:
        keep = keep & (df["year"] == year)
    ranked = df[keep].sort_values(["count"], ascending=False)
    return ranked.head(n).reset_index(drop=True)

In [8]:
# Top zipcodes for each borough across ALL years.
# No year filter is applied, so the per-(zipcode, year) rows of zipcode_count
# are ranked directly and one zipcode may appear several times in a borough.
n = 5  # Number of leading rows to keep per borough
top_bronx, top_brooklyn, top_manhattan, top_queens, top_staten_island = [
    get_leading_zipcode(zipcode_count, n, boro)
    for boro in ("BRONX", "BROOKLYN", "MANHATTAN", "QUEENS", "STATEN_ISLAND")
]
nyc_top_zipcodes = pd.concat(
    [top_bronx, top_brooklyn, top_manhattan, top_queens, top_staten_island]
)
display(nyc_top_zipcodes)

Unnamed: 0,zipcode,borough,year,count
0,10469,BRONX,2022,122
1,10466,BRONX,2022,119
2,10463,BRONX,2018,105
3,10466,BRONX,2021,99
4,10456,BRONX,2022,98
0,11211,BROOKLYN,2019,6231
1,11211,BROOKLYN,2018,3227
2,11211,BROOKLYN,2022,2730
3,11211,BROOKLYN,2020,2719
4,11211,BROOKLYN,2017,2645


In [9]:
# Top zipcodes for each borough, restricted to a single year (2021).
n = 5  # Number of leading rows to keep per borough
year = 2021  # Only rows from this year are ranked
(
    bronx_zipcodes_year,
    brooklyn_zipcodes_year,
    manhattan_zipcodes_year,
    queens_zipcodes_year,
    staten_island_zipcodes_year,
) = [
    get_leading_zipcode(zipcode_count, n, boro, year)
    for boro in ("BRONX", "BROOKLYN", "MANHATTAN", "QUEENS", "STATEN_ISLAND")
]
nyc_zipcodes_2021 = pd.concat(
    [
        bronx_zipcodes_year,
        brooklyn_zipcodes_year,
        manhattan_zipcodes_year,
        queens_zipcodes_year,
        staten_island_zipcodes_year,
    ]
)
display(nyc_zipcodes_2021)

Unnamed: 0,zipcode,borough,year,count
0,10466,BRONX,2021,99
1,10469,BRONX,2021,89
2,10467,BRONX,2021,77
3,10456,BRONX,2021,70
4,10454,BRONX,2021,69
0,11211,BROOKLYN,2021,2101
1,11237,BROOKLYN,2021,1036
2,11233,BROOKLYN,2021,1013
3,11216,BROOKLYN,2021,984
4,11221,BROOKLYN,2021,934


In [10]:
# EXPORT: write both top-zipcode tables to CSV without the row index.
file_prefix = "zipcode_data/unmapped_data/"
nyc_top_zipcodes_name = f"{file_prefix}nyc_top_zipcodes_count.csv"
nyc_zipcodes_2021_name = f"{file_prefix}nyc_zipcodes_2021.csv"
nyc_top_zipcodes.to_csv(path_or_buf=nyc_top_zipcodes_name, index=False)
nyc_zipcodes_2021.to_csv(path_or_buf=nyc_zipcodes_2021_name, index=False)
print("Created CSVs")

Created CSVs


In [11]:
# Anti-join: keep the rows of zipcode_count that have no exact match in
# nyc_top_zipcodes. No `on=` is passed, so the merge keys are all columns the
# two frames have in common.
nyc_remaining_zipcodes = (
    zipcode_count
    .merge(nyc_top_zipcodes, how="left", indicator=True)
    .loc[lambda merged: merged["_merge"] == 'left_only']
    .drop(columns=['_merge'])
)
display(nyc_remaining_zipcodes)

Unnamed: 0,zipcode,borough,year,count
0,10001,MANHATTAN,2015,350
1,10001,MANHATTAN,2016,666
2,10001,MANHATTAN,2017,418
3,10001,MANHATTAN,2018,491
4,10001,MANHATTAN,2019,275
...,...,...,...,...
1424,11694,QUEENS,2020,11
1425,11694,QUEENS,2021,43
1426,11694,QUEENS,2022,64
1427,11697,QUEENS,2016,1


In [12]:
# EXPORT: write the rows not among the top zipcodes to CSV without the row index.
file_prefix = "zipcode_data/unmapped_data/"
nyc_remaining_zipcodes_name = f"{file_prefix}nyc_remaining_zipcode_count.csv"
nyc_remaining_zipcodes.to_csv(path_or_buf=nyc_remaining_zipcodes_name, index=False)
print("Created CSV")

Created CSV
