In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned Inside Airbnb extract. pandas infers gzip compression from
# the ".gz" suffix; low_memory=False reads the file in one pass so column
# dtypes are inferred from the whole column rather than chunk by chunk.
df = pd.read_csv(
    '../data/clean/inside_airbnb_clean.gz',
    low_memory=False,
)

In [3]:
# Preview the first rows (DataFrame.head defaults to 5) to sanity-check the load.
df.head()

Unnamed: 0,id,last_scraped,host_id,host_name,neighbourhood,latitude,longitude,room_type,bedrooms,availability_365,calculated_host_listings_count,geom,zipcode,borough,year,availability_pct
0,2539,2022-09-07,2787,John,Kensington,40.64529,-73.97238,Private room,1.0,356,9,"40.64529,-73.97238",11218,BROOKLYN,2022,97.534247
1,5121,2022-09-07,7356,Garon,Bedford-Stuyvesant,40.68535,-73.95512,Private room,1.0,335,2,"40.68535,-73.95512",11238,BROOKLYN,2022,91.780822
2,45910,2022-09-07,204539,Mark,Ridgewood,40.70309,-73.89963,Entire home/apt,5.0,365,6,"40.70309,-73.89963",11385,QUEENS,2022,100.0
3,5136,2022-09-07,7378,Rebecca,Sunset Park,40.66265,-73.99454,Entire home/apt,2.0,179,1,"40.66265,-73.99454",11215,BROOKLYN,2022,49.041096
4,45935,2022-09-07,204586,L,Mott Haven,40.80635,-73.92201,Private room,1.0,83,1,"40.80635,-73.92201",10454,BRONX,2022,22.739726


In [4]:
# Summarise column dtypes and non-null counts to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335503 entries, 0 to 335502
Data columns (total 16 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              335503 non-null  int64  
 1   last_scraped                    335503 non-null  object 
 2   host_id                         335503 non-null  int64  
 3   host_name                       331779 non-null  object 
 4   neighbourhood                   335503 non-null  object 
 5   latitude                        335503 non-null  float64
 6   longitude                       335503 non-null  float64
 7   room_type                       335503 non-null  object 
 8   bedrooms                        74098 non-null   float64
 9   availability_365                335503 non-null  int64  
 10  calculated_host_listings_count  335503 non-null  int64  
 11  geom                            335503 non-null  object 
 12  zipcode         

In [5]:
def get_groupby_count(df, groupby_cols, value):
    """Count non-null ``value`` entries for each group of ``groupby_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to aggregate.
    groupby_cols : str or list of str
        Column(s) that define the groups.
    value : str
        Column whose non-null entries are counted within each group.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping columns as regular columns and
        the tally stored in a column named ``count``.
    """
    per_group = df.groupby(groupby_cols)[value].count()
    # Move the group keys out of the index so they become ordinary columns.
    table = per_group.to_frame().reset_index()
    return table.rename(columns={value: 'count'})

In [6]:
# Generate per-borough, per-year counts for the given column (e.g. zipcode)
def create_boro_count_df(df, count_col):
    """Count rows per (``count_col``, borough, year) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``count_col``, ``borough``, ``year`` and ``id``.
    count_col : str
        Column to break the counts down by (for example ``"zipcode"``).

    Returns
    -------
    pandas.DataFrame
        Columns ``count_col``, ``borough``, ``year`` and ``count``, where
        ``count`` is the number of non-null ``id`` values in each group.
    """
    group_keys = [count_col, "borough", "year"]
    counts = get_groupby_count(df, group_keys, "id")
    return counts.reset_index(drop=True)

In [7]:
# Rows counted per (zipcode, borough, year); input to the top-zipcode lookups.
zipcode_count = create_boro_count_df(df, count_col="zipcode")

In [8]:
# Gets the leading zipcodes for a borough in a specific year, or across all years when no year is given (rows are not summed across years)
def get_leading_zipcode(df, n, boro, year=None, aggregate_years=False):
    """Return the ``n`` rows with the largest ``count`` for one borough.

    Parameters
    ----------
    df : pandas.DataFrame
        Count table with at least ``borough``, ``year`` and ``count`` columns
        (for example the output of ``create_boro_count_df``).
    n : int
        Maximum number of rows to return.
    boro : str
        Value of the ``borough`` column to keep.
    year : int, optional
        When given, only rows for that year are considered. When ``None``
        (default), rows from every year are considered.
    aggregate_years : bool, default False
        Only used when ``year`` is ``None``. With the default ``False`` each
        (key, year) row is ranked on its own, so the same zipcode can occupy
        several of the top ``n`` slots. With ``True`` the counts are first
        summed across years per remaining key column, so each zipcode appears
        at most once and the result has no ``year`` column.

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows sorted by ``count`` descending, with a fresh
        0-based index.
    """
    # `is not None` is the idiomatic identity check and avoids surprising
    # element-wise comparisons if an array-like is ever passed.
    data = df if year is None else df[df["year"] == year]
    data = data[data["borough"] == boro]

    if aggregate_years and year is None:
        # Collapse the per-year rows into one total per remaining key.
        # Note: groupby drops rows whose key is NaN by default.
        key_cols = [c for c in data.columns if c not in ("year", "count")]
        data = data.groupby(key_cols, as_index=False)["count"].sum()

    data = data.sort_values(["count"], ascending=False)
    return data.head(n).reset_index(drop=True)

In [9]:
# Top zipcodes for each borough throughout all years
# NOTE(review): zipcode_count holds one row per (zipcode, borough, year), so
# without a year filter the same zipcode can appear more than once in a
# borough's top-n table.
n = 5  # Top n zipcodes with the most appearances
(
    top_bronx_zipcodes,
    top_brooklyn_zipcodes,
    top_manhattan_zipcodes,
    top_queens_zipcodes,
    top_staten_island_zipcodes,
) = [
    get_leading_zipcode(zipcode_count, n, boro)
    for boro in ("BRONX", "BROOKLYN", "MANHATTAN", "QUEENS", "STATEN ISLAND")
]

In [10]:
# Inspect the Bronx result across all years.
top_bronx_zipcodes

Unnamed: 0,zipcode,borough,year,count
0,10469,BRONX,2022,122
1,10466,BRONX,2022,119
2,10463,BRONX,2018,105
3,10466,BRONX,2021,99
4,10456,BRONX,2022,98


In [11]:
# Top zipcodes for each borough throughout 2020 ONLY
n = 5  # Top n zipcodes with the most appearances
year = 2020  # Year to narrow down borough
(
    bronx_zipcodes_year,
    brooklyn_zipcodes_year,
    manhattan_zipcodes_year,
    queens_zipcodes_year,
    staten_island_zipcodes_year,
) = [
    get_leading_zipcode(zipcode_count, n, boro, year)
    for boro in ("BRONX", "BROOKLYN", "MANHATTAN", "QUEENS", "STATEN ISLAND")
]

In [12]:
# Inspect the Bronx result for the selected year.
bronx_zipcodes_year

Unnamed: 0,zipcode,borough,year,count
0,10467,BRONX,2020,59
1,10456,BRONX,2020,53
2,10463,BRONX,2020,51
3,10454,BRONX,2020,47
4,10469,BRONX,2020,38
