In [1]:
import pandas as pd
import numpy as np


In [2]:
from IPython.display import display, HTML

def display_df(orders, height=300):
    """
    Displays a scrollable HTML table for a given Pandas DataFrame.
    
    Parameters:
    orders (pd.DataFrame): The DataFrame to display.
    height (int): The height of the scrollable div in pixels (default is 300).
    
    Returns:
    None: Displays the scrollable HTML table in a Jupyter Notebook or IPython environment.
    """
    # Convert DataFrame to HTML
    html_table = orders.to_html(classes='table table-striped', index=False)
    
    # Create scrollable div with the HTML table
    scrollable_html = f"""
    <div style="height:{height}px; overflow:auto;">
        {html_table}
    </div>
    """
    
    # Display the HTML
    display(HTML(scrollable_html))

## Retrieving Data Set
Source: https://insideairbnb.com/get-the-data/ (Data is for New York City, New York, United States)

This is a quarterly data of all listings in New York City.

Variable Information:
https://docs.google.com/spreadsheets/d/1iWCNJcSutYqpULSQHlNyGInUvHg2BoUGoNRIGa6Szc4/edit?gid=1322284596#gid=1322284596

In [3]:
# Import data
listings = pd.read_csv("Data/listings.csv")

In [4]:
listings.shape

(37548, 75)

In [5]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37548 entries, 0 to 37547
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            37548 non-null  int64  
 1   listing_url                                   37548 non-null  object 
 2   scrape_id                                     37548 non-null  int64  
 3   last_scraped                                  37548 non-null  object 
 4   source                                        37548 non-null  object 
 5   name                                          37546 non-null  object 
 6   description                                   36504 non-null  object 
 7   neighborhood_overview                         20574 non-null  object 
 8   picture_url                                   37547 non-null  object 
 9   host_id                                       37548 non-null 

In [6]:
# Set the display options to show all rows
pd.set_option('display.max_rows', None)

# Calculate and display the sum of null values for each column
null_values = listings.isnull().sum()
print(null_values)

# Optionally, reset the display option to default
pd.reset_option('display.max_rows')

id                                                  0
listing_url                                         0
scrape_id                                           0
last_scraped                                        0
source                                              0
name                                                2
description                                      1044
neighborhood_overview                           16974
picture_url                                         1
host_id                                             0
host_url                                            0
host_name                                           5
host_since                                          5
host_location                                    7999
host_about                                      16224
host_response_time                              15001
host_response_rate                              15001
host_acceptance_rate                            14983
host_is_superhost           

## Data Pre-processing
This process remove columns that have a lot of null values and we think that it is not significant to our analysis. Also, we will create dummy variables for categorical variables.

### Handling Null Values

There are some columns that have null values. The following handle null values by removing rows or replacing the value with a default value.

In [7]:
# Total duplicated rows
print("Total Duplicated Rows:")
print(listings.duplicated().sum())

Total Duplicated Rows:
0


In [8]:
display_df(listings.head(5))

id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
2595,https://www.airbnb.com/rooms/2595,20241104040953,2024-11-04,city scrape,Skylit Midtown Castle Sanctuary,"Beautiful, spacious skylit studio in the heart of Midtown, Manhattan. <br /><br />1 BED / FULL BATH / FULL KITCHEN / CENTRALLY LOCATED / HIGH SPEED WIFI","Centrally located in the heart of Manhattan just a few blocks from all subway connections in the very desirable Midtown location a few minutes walk to Times Square, the Theater District, Bryant Park and Herald Square.",https://a0.muscache.com/pictures/miso/Hosting-2595/original/9e5435f8-414f-4ebe-b104-679e942a60ef.jpeg,2845,https://www.airbnb.com/users/show/2845,Jennifer,2008-09-09,"Woodstock, NY","A New Yorker since 2000! My passion is creating beautiful, unique spaces where unforgettable memories are made. It's my pleasure to host people from around the world and meet new faces. Welcome travelers! \n\nI am a Sound Therapy Practitioner and Kundalini Yoga & Meditation teacher. I work with energy and sound for relaxation and healing, using Symphonic gong, singing bowls, tuning forks, drums, voice and other instruments.\n\nAny questions, please text or call Jennifer at 646.498.8710.",within a day,90%,21%,f,https://a0.muscache.com/im/pictures/user/50fc57af-a6a3-4e88-8f16-efd6cac7c9bc.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/user/50fc57af-a6a3-4e88-8f16-efd6cac7c9bc.jpg?aki_policy=profile_x_medium,Midtown,7.0,9.0,"['email', 'phone', 'work_email']",t,t,Neighborhood highlights,Midtown,Manhattan,40.75356,-73.98559,Entire rental unit,Entire home/apt,1,1.0,1 bath,0.0,1.0,"[""Fire extinguisher"", ""Smoke alarm"", ""Stove"", ""Air conditioning"", ""Paid parking off premises"", ""Extra pillows and blankets"", ""Self check-in"", ""Wifi"", ""Hair dryer"", ""Free street parking"", ""TV"", ""Heating"", ""Dishes and silverware"", ""Long term stays allowed"", ""Carbon monoxide alarm"", ""Bed linens"", ""Baking sheet"", ""Dedicated workspace"", ""Iron"", ""Hot water"", ""Coffee maker"", ""Keypad"", ""Refrigerator"", ""Ethernet connection"", ""Cleaning available during stay"", ""Bathtub"", ""Luggage dropoff allowed"", ""Cooking basics"", ""Essentials"", ""Hangers"", ""Kitchen"", ""Oven""]",$240.00,30,1125,30.0,30.0,1125.0,1125.0,30.0,1125.0,,t,30,60,90,365,2024-11-04,49,0,0,2009-11-21,2022-06-21,4.68,4.73,4.63,4.77,4.8,4.81,4.4,,f,3,3,0,0,0.27
6848,https://www.airbnb.com/rooms/6848,20241104040953,2024-11-04,city scrape,Only 2 stops to Manhattan studio,"Comfortable studio apartment with super comfortable king size bed and full kitchen and bathroom located in FABULOUS Williamsburg, Brooklyn.",,https://a0.muscache.com/pictures/e4f031a7-f146-40fd-98fd-b399a940c505.jpg,15991,https://www.airbnb.com/users/show/15991,Allen & Irina,2009-05-06,"New York, NY","We love to travel. When we travel we like to stay in a comfortable place that is clean, neat and sweet smelling, for a reasonable price. That's what we rent to you. We love city life but we also love outdoor adventures. We like keep up with all that's new and exciting around New York and we're happy to tell you where to find it. Want to hear the new young musicians that people are excited about or the promising new artists - we'll tell you about them. The best restaurants - ask us.\r\nWant to take a break from the city we'll tell you about wilderness canoeing on the Delaware river - just an hour and a half's drive. We can tell you where to find the great salt water fishing with clean beautiful waters just an hour away from the city or how to take a one day trip by bus to ski in Vermont.\r\n\r\n\r\nWHY WILLIAMSBURG?\r\n\r\nWilliamsburg is FABULOUS. It's fast growing and it's fast changing. It's hip. In the 1960's the new and exciting place where culture was blooming was Greenwich Village. In the 70's the SOHO neighborhood was where cheap industrial loft space were being turned into places where the adventurous could move to New York, live cheaply and make the new art, music, and culture. These days Williamsburg has that special chemistry of the New York that's constantly renewing itself.",within a few hours,100%,100%,t,https://a0.muscache.com/im/users/15991/profile_pic/1259104907/original.jpg?aki_policy=profile_small,https://a0.muscache.com/im/users/15991/profile_pic/1259104907/original.jpg?aki_policy=profile_x_medium,Williamsburg,1.0,1.0,"['email', 'phone']",t,t,,Williamsburg,Brooklyn,40.70935,-73.95342,Entire rental unit,Entire home/apt,3,1.0,1 bath,2.0,1.0,"[""Fire extinguisher"", ""Smoke alarm"", ""Stove"", ""Air conditioning"", ""Extra pillows and blankets"", ""Wifi"", ""Hair dryer"", ""Free street parking"", ""TV"", ""Washer"", ""Heating"", ""Microwave"", ""Dishes and silverware"", ""Carbon monoxide alarm"", ""Bed linens"", ""Iron"", ""Hot water"", ""Coffee maker"", ""Refrigerator"", ""Shampoo"", ""Dishwasher"", ""Cooking basics"", ""Essentials"", ""Hangers"", ""Kitchen"", ""Oven""]",$83.00,30,120,30.0,30.0,120.0,120.0,30.0,120.0,,t,0,15,15,185,2024-11-04,195,4,1,2009-05-25,2024-10-05,4.58,4.59,4.85,4.85,4.8,4.69,4.58,,f,1,1,0,0,1.04
6872,https://www.airbnb.com/rooms/6872,20241104040953,2024-11-04,city scrape,Uptown Sanctuary w/ Private Bath (Month to Month),"This charming distancing-friendly month-to-month home away from home located in Historic Harlem, Uptown Sanctuary is ideal for lovers of travel, work-life balance, art, soulful living, culture, and kindness.<br /><br />Spacious bedroom available (if you require more space, please inquiry about the master bedroom with its own ensuite bathroom or renting the whole place).<br /><br />➨ Minimum 30-day stay only, thank you for understanding!<br />➨ Additional fees apply for events and must be approved in advance.","This sweet Harlem sanctuary is a 10-20 minute ride from downtown Manhattan, a 20-25 minute ride from Laguardia Airport and is walking distance to transportation making this location a step away from the city buzz when it's time for intimate social relaxation and restoration.",https://a0.muscache.com/pictures/miso/Hosting-6872/original/50573585-928d-4de4-8c3d-da27a5a2b179.jpeg,16104,https://www.airbnb.com/users/show/16104,Kae,2009-05-07,"New York, NY","A former life in fashion and wellness has left me well traveled and a lover of all things work-life balance and zen living. I have great living chemistry with independent, considerate professional females who don't work from home.",a few days or more,30%,33%,f,https://a0.muscache.com/im/pictures/user/d865acc2-3cba-4f03-bf38-c50819aad378.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/user/d865acc2-3cba-4f03-bf38-c50819aad378.jpg?aki_policy=profile_x_medium,East Harlem,2.0,2.0,"['email', 'phone', 'work_email']",t,t,Neighborhood highlights,East Harlem,Manhattan,40.80107,-73.94255,Private room in condo,Private room,1,1.0,1 shared bath,1.0,1.0,"[""Heating"", ""Washer"", ""Fire extinguisher"", ""Smoke alarm"", ""Elevator"", ""Paid parking garage off premises"", ""Essentials"", ""Luggage dropoff allowed"", ""Hot water"", ""Exterior security cameras on property"", ""Long term stays allowed"", ""Host greets you"", ""Carbon monoxide alarm"", ""Hangers"", ""Kitchen"", ""Wifi"", ""Free street parking"", ""Dryer""]",$65.00,30,180,30.0,30.0,180.0,180.0,30.0,180.0,,t,23,53,83,83,2024-11-04,1,0,0,2022-06-05,2022-06-05,5.0,5.0,5.0,5.0,5.0,5.0,5.0,,f,2,0,2,0,0.03
6990,https://www.airbnb.com/rooms/6990,20241104040953,2024-11-04,city scrape,UES Beautiful Blue Room,Beautiful peaceful healthy home,"Location: Five minutes to Central Park, Museum Mile (Guggenheim Museum, Metropolitan Museum, Whitney Museum, The Cooper Hewitt Museum, The Frick Collection and many more. Movie Stars pepper the hood and the area is considered “The New Downtown”. Today’s hipsters moved uptown to escape the over gentrified neighborhoods, such as the LES, the Village and now Chelsea.<br /><br />Shopping: Many cool consignment shops to get your Haute couture, such as the famous Encore.<br /><br />Food: Around the corner, Gourmet Garage, Japanese, French, Mexican, Health food bars, and Joy Burgers<br /><br />Laundry: There is a laundry service on the corner, drop before 10am and pickup after 5pm, washed, dried and folded for $11. You will not miss a beat on your venture and you can still look good.",https://a0.muscache.com/pictures/be6cd5b3-9295-4b6d-bf9a-a1ca4e2c9ea5.jpg,16800,https://www.airbnb.com/users/show/16800,Cyn,2009-05-12,"New York, NY","Capturing the Steinbeck side of life in its Fillini moment.\r\nHome is a special place, it is a live-in work of art... A great experience I hope all to enjoy...",within an hour,100%,100%,t,https://a0.muscache.com/im/pictures/user/17c4d833-81c5-41a5-a212-c3f249ff1237.jpg?aki_policy=profile_small,https://a0.muscache.com/im/pictures/user/17c4d833-81c5-41a5-a212-c3f249ff1237.jpg?aki_policy=profile_x_medium,East Harlem,1.0,6.0,"['email', 'phone']",t,t,Neighborhood highlights,East Harlem,Manhattan,40.78778,-73.94759,Private room in rental unit,Private room,1,1.0,1 shared bath,1.0,1.0,"[""Fire extinguisher"", ""Smoke alarm"", ""Stove"", ""Air conditioning"", ""Paid parking off premises"", ""Wifi"", ""Hair dryer"", ""Free street parking"", ""TV"", ""Washer"", ""Heating"", ""Carbon monoxide alarm"", ""Hot water"", ""Coffee maker"", ""Refrigerator"", ""Shampoo"", ""Host greets you"", ""Breakfast"", ""Kitchen""]",$71.00,30,365,30.0,30.0,365.0,365.0,30.0,365.0,,t,0,23,53,284,2024-11-04,250,4,1,2009-10-28,2024-10-31,4.88,4.83,4.95,4.96,4.95,4.85,4.85,,f,1,0,1,0,1.37
7064,https://www.airbnb.com/rooms/7064,20241104040953,2024-11-04,previous scrape,"Amazing location! Wburg. Large, bright & tranquil","Large, private loft-like room in a spacious 2-story apt in the heart of Williamsburg, Brooklyn. The apt is in the heart of a vibrant neighborhood - while there's lots going on outside, inside it is peaceful inside, making it a beautiful base for your visit.","- One stop from the East Village, Lower East Side; an easy ride to other parts of Brooklyn and Manhattan<br /><br />- We are surrounded by innovative restaurants serving delicious food of every kind--Japanese, American, Vietnamese, Korean, French, Italian, Turkish, etc.<br /><br />- Grocery stores (Whole Foods, etc.) and delis nearby<br /><br />- Art galleries, local designers and boutiques all around<br /><br />- Pottery studio right downstairs with a beautiful garden.<br /><br />- Cinema 1 min walk away; art-film house 7-min walk<br /><br />- Two great parks a 10 min walk away (McCarren Park and Domino Park)<br /><br />- This is a really fun area - everything you could want is here!",https://a0.muscache.com/pictures/13708959/7e745a2f_original.jpg,17297,https://www.airbnb.com/users/show/17297,Joelle,2009-05-15,"New York, NY","I have lived in the same apartment in Brooklyn for more than 20 years and I love it. I also love to travel, and have been to Brazil, Peru, Costa Rica, Mexico, Germany, Italy, France as well as all over the US and Canada. I am in my early 50s, curious, responsible, and organized.\r\n\r\nFalo muito bem português. Mon français est comme ci comme ça. Mi español es también más o menos.",,,0%,f,https://a0.muscache.com/im/users/17297/profile_pic/1259105689/original.jpg?aki_policy=profile_small,https://a0.muscache.com/im/users/17297/profile_pic/1259105689/original.jpg?aki_policy=profile_x_medium,Williamsburg,2.0,2.0,"['email', 'phone', 'work_email']",t,t,Neighborhood highlights,Williamsburg,Brooklyn,40.71248,-73.95881,Private room in loft,Private room,2,,1 shared bath,1.0,,"[""Heating"", ""Washer"", ""Dishes and silverware"", ""Smoke alarm"", ""Essentials"", ""Air conditioning"", ""Fast wifi \u2013 273 Mbps"", ""Refrigerator"", ""Hangers"", ""Kitchen"", ""Dedicated workspace"", ""Shampoo"", ""Iron"", ""Dryer"", ""Coffee maker""]",,30,45,30.0,30.0,45.0,45.0,30.0,45.0,,t,0,0,0,0,2024-11-04,13,0,0,2010-08-17,2022-09-12,4.91,5.0,4.91,5.0,5.0,5.0,5.0,,f,2,0,2,0,0.08


Let's see what are the available columns in the dataset.

In [9]:
# Set the display options to show all rows
pd.set_option('display.max_rows', None)

# Calculate and display the sum of null values for each column
null_values = listings.isnull().sum()
print(null_values)

# Optionally, reset the display option to default
pd.reset_option('display.max_rows')

id                                                  0
listing_url                                         0
scrape_id                                           0
last_scraped                                        0
source                                              0
name                                                2
description                                      1044
neighborhood_overview                           16974
picture_url                                         1
host_id                                             0
host_url                                            0
host_name                                           5
host_since                                          5
host_location                                    7999
host_about                                      16224
host_response_time                              15001
host_response_rate                              15001
host_acceptance_rate                            14983
host_is_superhost           

To streamline the dataset and reduce noise, we are removing the following columns:

- **`bedrooms` and `beds`**:  
  These columns contain a high proportion of null values. Their information is largely captured by the `accommodates` column, which is effective in identifying the guest capacity of a listing.

- **`bathrooms`**:  
  This column is being dropped in favor of `bathrooms_text`, which conveys similar information but with significantly fewer missing values (`bathrooms_text` has only 30 nulls). Retaining `bathrooms_text` helps preserve data integrity while minimizing the need for imputation.


In [10]:
# Select necessary columns
columns_to_exclude = ['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
                      'description', 'neighborhood_overview', 'picture_url', 'host_id',
                      'host_url', 'host_name', 'host_location', 'host_about',
                      'host_response_time', 'host_response_rate', 'host_acceptance_rate', 
                      'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 
                      'neighbourhood', 'property_type', 'bathrooms',                                         # Removing bathrooms here
                      'bedrooms', 'beds', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',      # Removing bedrooms and beds here
                      'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm',
                      'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability',
                      'availability_30', 'availability_60', 'availability_90', 'calendar_last_scraped', 
                      'number_of_reviews_ltm', 'first_review', 'last_review', 'review_scores_rating', 
                      'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
                      'review_scores_communication', 'review_scores_location',
                      'review_scores_value', 'license', 'instant_bookable',
                      'calculated_host_listings_count',
                      'calculated_host_listings_count_entire_homes',
                      'calculated_host_listings_count_private_rooms',
                      'calculated_host_listings_count_shared_rooms', 'reviews_per_month']

# Drop the columns
listings_filtered = listings.drop(columns=columns_to_exclude)

In [11]:
listings_filtered.columns

Index(['host_since', 'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'room_type', 'accommodates', 'bathrooms_text', 'amenities',
       'price', 'availability_365', 'number_of_reviews',
       'number_of_reviews_l30d'],
      dtype='object')

In [12]:
# Set the display options to show all rows
pd.set_option('display.max_rows', None)

# Calculate and display the sum of null values for each column
null_values = listings_filtered.isnull().sum()
print(null_values)

# Optionally, reset the display option to default
pd.reset_option('display.max_rows')

host_since                          5
host_is_superhost                 485
host_listings_count                 5
host_total_listings_count           5
host_verifications                  5
host_has_profile_pic                5
host_identity_verified              5
neighbourhood_cleansed              0
neighbourhood_group_cleansed        0
latitude                            0
longitude                           0
room_type                           0
accommodates                        0
bathrooms_text                     30
amenities                           0
price                           14807
availability_365                    0
number_of_reviews                   0
number_of_reviews_l30d              0
dtype: int64


We have less features to focus on now. Let's delete rows with N/A values, except price and host_is_superhost. We will deal with price and host_is_superhost later.

In [13]:
listings_filtered.columns

Index(['host_since', 'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'room_type', 'accommodates', 'bathrooms_text', 'amenities',
       'price', 'availability_365', 'number_of_reviews',
       'number_of_reviews_l30d'],
      dtype='object')

In [14]:
# Remove rows with na values
columns_to_remove = ['host_since', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 
       'room_type', 'accommodates', 'bathrooms_text', 'amenities',
       'availability_365', 'number_of_reviews', 'number_of_reviews_l30d']


listings_filtered = listings_filtered.dropna(subset=columns_to_remove)

In [15]:
# Set the display options to show all rows
pd.set_option('display.max_rows', None)

# Calculate and display the sum of null values for each column
null_values = listings_filtered.isnull().sum()
print(null_values)

# Optionally, reset the display option to default
pd.reset_option('display.max_rows')

host_since                          0
host_is_superhost                 485
host_listings_count                 0
host_total_listings_count           0
host_verifications                  0
host_has_profile_pic                0
host_identity_verified              0
neighbourhood_cleansed              0
neighbourhood_group_cleansed        0
latitude                            0
longitude                           0
room_type                           0
accommodates                        0
bathrooms_text                      0
amenities                           0
price                           14785
availability_365                    0
number_of_reviews                   0
number_of_reviews_l30d              0
dtype: int64


In [16]:
# HANDLE 'host_is_superhost' NULL VALUES
# Assuming if host_is_superhost is null, it means that host is not superhost
listings_filtered['host_is_superhost'] = listings_filtered['host_is_superhost'].fillna('f')

# TRANSFORM f/t to 0/1
# Transform 'f' to 0 and 't' to 1
listings_filtered['host_is_superhost'] = listings_filtered['host_is_superhost'].replace({'f': 0, 't': 1})
listings_filtered['host_has_profile_pic'] = listings_filtered['host_has_profile_pic'].replace({'f': 0, 't': 1})
listings_filtered['host_identity_verified'] = listings_filtered['host_identity_verified'].replace({'f': 0, 't': 1})



We've already removed null values and dropped unnecessary columns. The next focus is the `prices` column, which still contains a significant number of missing values. Let's conduct a deeper analysis to understand the nature and potential impact of these missing entries.

In [17]:
import plotly.express as px
import pandas as pd

na_visual = listings_filtered[['longitude', 'latitude', 'price']].copy()

# Set color based on price availability
na_visual['color'] = na_visual['price'].apply(lambda x: 'Price Available' if pd.notna(x) else 'Price Unavailable')

# Create the map with Plotly
fig = px.scatter_mapbox(
    na_visual,
    lat="latitude",
    lon="longitude",
    color="color",  # color based on price availability with labels
    hover_data=["price"],  # show price in the hover
    title="Listings with Price",
    mapbox_style="carto-positron",  # Choose your preferred style
    color_discrete_map={"Price Available": "blue", "Price Unavailable": "red"}  # Set color for the labels
)

# Update layout to center on New York City with a suitable zoom level
fig.update_layout(
    mapbox=dict(
        zoom=10,  # Adjust zoom level for New York City
        center=dict(lat=40.7128, lon=-74.0060)  # Center the map on New York City
    ),
    margin={"r": 0, "t": 40, "l": 0, "b": 0},
)

# Show the figure
fig.show()


The missing price values in the dataset are randomly distributed across the map rather than being concentrated in specific areas. Since the missing values don't show a pattern (such as being clustered in a particular neighborhood or price range), removing them won't introduce bias into the analysis. **Let's remove the rows with missing prices.**

In [18]:
listings_filtered.columns

Index(['host_since', 'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'room_type', 'accommodates', 'bathrooms_text', 'amenities',
       'price', 'availability_365', 'number_of_reviews',
       'number_of_reviews_l30d'],
      dtype='object')

In [19]:
# Remove rows with missing prices
listings_filtered.dropna(subset=['price'], inplace=True)

In [20]:
# Set the display options to show all rows
pd.set_option('display.max_rows', None)

null_values = listings_filtered.isnull().sum()
print(null_values)

# Optionally, reset the display option to default
pd.reset_option('display.max_rows')

host_since                      0
host_is_superhost               0
host_listings_count             0
host_total_listings_count       0
host_verifications              0
host_has_profile_pic            0
host_identity_verified          0
neighbourhood_cleansed          0
neighbourhood_group_cleansed    0
latitude                        0
longitude                       0
room_type                       0
accommodates                    0
bathrooms_text                  0
amenities                       0
price                           0
availability_365                0
number_of_reviews               0
number_of_reviews_l30d          0
dtype: int64


### Feature Engineering

This step will cover the transformation of a feature into another that is more predic.

In [21]:
# 'host_since' FEATURE ENGINEERING
# Convert the 'host_since' column from year to total number of days hosting

# Get current time
current_time = pd.Timestamp.today()
# Convert to date type
listings_filtered['host_since'] = pd.to_datetime(listings_filtered['host_since'], format='%Y-%m-%d')
# Create new column
listings_filtered['host_tenure'] = listings_filtered['host_since'].apply(
    lambda x: (current_time - x).days
)
# Drop 'host_since'
listings_filtered.drop(columns=['host_since'], inplace=True)


# 'bathrooms_text' FEATURE ENGINEERING
# Convert the 'bathrooms_text' column from bathroom text description to the following classifications:
#  - is_shared_bath: is it a shared bathroom?
#  - is_less_than_1_bath: is the unit equipped with only half-bath?

# Check if it is shared bathroom
listings_filtered['is_shared_bath'] = listings_filtered['bathrooms_text'].str.contains('shared', case=False, na=False)
# Extract the number from the 'bathrooms_text' column
listings_filtered['bathroom_count'] = listings_filtered['bathrooms_text'].str.extract(r'(\d+\.\d+|\d+)').astype(float)
# Check if the bathroom is half or 0
listings_filtered['is_less_than_1_bath'] = listings_filtered['bathroom_count'].apply(lambda x: x < 1) | listings_filtered['bathrooms_text'].str.contains('half', case=False, na=False)
# Remove the bathroom_count and bathrooms_text
listings_filtered = listings_filtered.drop(columns=['bathroom_count', 'bathrooms_text'])


# Transform False to 0 and True to 1
listings_filtered['is_shared_bath'] = listings_filtered['is_shared_bath'].replace({False: 0, True: 1})
listings_filtered['is_less_than_1_bath'] = listings_filtered['is_less_than_1_bath'].replace({False: 0, True: 1})


In [22]:
# Distance to downtown FEATURE ENGINEERING

def haversine_distance(lat1, lon1, lat2, lon2):
    # Earth radius in kilometers
    R = 6371  
    
    # Convert degrees to radians
    lat1_rad, lon1_rad = np.radians(lat1), np.radians(lon1)
    lat2_rad, lon2_rad = np.radians(lat2), np.radians(lon2)
    
    # Differences
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    
    # Haversine formula
    a = np.sin(dlat/2.0)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    return R * c  # in kilometers

# Downtown NYC coordinates
downtown_lat = 40.7060
downtown_lon = -74.0086

# Apply the function to create a new distance_to_downtown
listings_filtered['distance_to_downtown'] = haversine_distance(
    listings_filtered['latitude'],
    listings_filtered['longitude'],
    downtown_lat,
    downtown_lon
)

### Dummy Variables Creation

This step will cover the creation of dummy variables.

In [23]:
# AMENITIES DATA CLEANING
# Convert the 'amenities' column from strings to actual lists
amenities_series = listings_filtered['amenities'].apply(eval)
# Create a set of all unique elements in the lists
unique_amenities = set([item for sublist in amenities_series for item in sublist])

# Sum up unique values for each amenity
tv_unique_cnt = len({item for item in unique_amenities if 'tv' in item.lower()})
wifi_unique_cnt = len({item for item in unique_amenities if 'wifi' in item.lower()})
shampoo_unique_cnt = len({item for item in unique_amenities if 'shampoo' in item.lower()})
conditioner_unique_cnt = len({item for item in unique_amenities if 'conditioner' in item.lower()})
soap_unique_cnt = len({item for item in unique_amenities if 'soap' in item.lower()})
stove_unique_cnt = len({item for item in unique_amenities if 'stove' in item.lower()})
fridge_unique_cnt = len({item for item in unique_amenities if 'refrigerator' in item.lower()})
oven_unique_cnt = len({item for item in unique_amenities if 'oven' in item.lower()})

print("TV unique amenities: ", tv_unique_cnt)
print("Wifi unique amenities: ", wifi_unique_cnt) 
print("Shampoo unique amenities: ", shampoo_unique_cnt) 
print("Conditioner unique amenities: ", conditioner_unique_cnt) 
print("Soap unique amenities: ", soap_unique_cnt) 
print("Stove unique amenities: ", stove_unique_cnt) 
print("_________________________________")
print("Total unique amenities: ", len(unique_amenities)) 

TV unique amenities:  1996
Wifi unique amenities:  613
Shampoo unique amenities:  490
Conditioner unique amenities:  505
Soap unique amenities:  616
Stove unique amenities:  292
_________________________________
Total unique amenities:  5935


In [24]:
print("TV unique values:")
list({item for item in unique_amenities if 'tv' in item.lower()})[:5]

TV unique values:


['60 inch HDTV with HBO Max, standard cable, Fire TV, Apple TV, Hulu, Netflix, Amazon Prime Video',
 '50 inch HDTV with Amazon Prime Video, Apple TV, Disney+, Hulu, HBO Max, Netflix, standard cable',
 'HDTV with Chromecast, Fire TV',
 '55 inch HDTV with HBO Max, Apple TV, Netflix, Hulu',
 '32 inch TV with Netflix']

In [25]:
print("Exercise equipment unique values:")
list({item for item in unique_amenities if 'workout bench' in item.lower()})[:5]

Exercise equipment unique values:


['Exercise equipment: free weights, yoga mat, workout bench',
 'Exercise equipment: elliptical, free weights, stationary bike, treadmill, yoga mat, workout bench, rowing',
 'Exercise equipment: elliptical, free weights, yoga mat, workout bench',
 'Exercise equipment: free weights, stationary bike, treadmill, workout bench',
 'Exercise equipment: elliptical, free weights, treadmill, yoga mat, workout bench']

We have a lot of unique amenities in the dataset. TV itself has 2,385 unique values. Hosts tend to put TV size, streaming services it includes, etc. We are going to clasify amenities into a predefined amenities that I have compiled in the form of list below:

In [26]:
# Create popular amenities in Airbnb
amenities = [
    # basic essentials
    "wifi",
    "air conditioning",
    "heating",
    "hot water",
    "soap",
    "shampoo",
    "conditioner",
    "hair dryer",
    "hangers",
    "iron",
    "coffee maker",
    "refrigerator",
    "freezer",
    "stove",
    "microwave",
    "dishwasher",
    "washer",
    "dryer",
    "first aid kit",
    "parking",

    # comfort & convenience
    "tv",
    "netflix",
    "hulu",
    "disney+",
    "apple tv",
    "amazon prime",
    "hbo",
    "roku",
    "cable",
    "sound system",
    "extra pillows and blankets",
    "nintendo switch",
    "ps3",
    "ps4",
    "ps5",
    "nintendo wii",
    "xbox",
    "arcade",
    "board",
    "ping pong table",
    "pool table",
    "game console",
    "reading materials (books, magazines)",
    "patio or balcony",
    "fire pit",
    "grill",

    # luxury & additional features
    "view",
    "hot tub",
    "bathtub",
    "pool",
    "sauna",
    "gym",
    "dishwasher",
    "hammocks",
    "yoga mat",
    "elliptical",
    "free weights",
    "stationary bike",
    "treadmill",
    "workout bench",
    "rowing",
    "pets",
    "theater",
    "resort",
    "beach",

    # accessibility features
    "elevator"
]

# Dummy variables for 'amenities'
# Initialize dummy columns with 0s
for feature in amenities:
    feature_name = "amenity_" + feature
    listings_filtered[feature_name] = listings_filtered['amenities'].apply(
        lambda x: 1 if feature in x.lower() else 0
    )
# Drop the original 'amenities' column, no longer needed
listings_filtered = listings_filtered.drop(columns=['amenities'])


In [27]:
# OTHER FEATURES DATA CLEANING
# Transform 'price' from string to float
listings_filtered['price'] = listings_filtered['price'].str.replace(r'[\$,]', '', regex=True).astype(float)

# Dummy variables for 'host_verifications'
# Convert the 'host_verifications' column from strings to actual lists
listings_filtered['host_verifications'] = listings_filtered['host_verifications'].apply(
    lambda x: x.strip("[]").replace("'", "").split(", ") if isinstance(x, str) else []
)
# Create a set of all unique elements in the lists
unique_features = set([item for sublist in listings_filtered['host_verifications'] for item in sublist])
# Initialize dummy columns with 0s
for feature in unique_features:
    feature_name = 'host_verif_' + feature
    listings_filtered[feature_name] = listings_filtered['host_verifications'].apply(lambda x: 1 if feature in x else 0)
# Drop the original 'host_verifications' column, no longer needed
listings_filtered = listings_filtered.drop(columns=['host_verifications'])


# Dummy variables for 'neighborhood_group_cleansed' and 'room_type'
listings_filtered = pd.get_dummies(listings_filtered, columns=['neighbourhood_cleansed', 'room_type'])

# Dummy variables for 'neighbourhood_cleansed' (we don't want to remove the original column for train test split purpose)
dummy = pd.get_dummies(listings_filtered[['neighbourhood_group_cleansed']], prefix=['neighbourhood_group_cleansed'])
listings_filtered = pd.concat([listings_filtered, dummy], axis=1)

# Convert all boolean columns to integers
listings_filtered[listings_filtered.select_dtypes(include=['bool']).columns] = listings_filtered.select_dtypes(include=['bool']).astype(int)




In [28]:
display_df(listings_filtered.head(10))

host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,latitude,longitude,accommodates,price,availability_365,number_of_reviews,number_of_reviews_l30d,host_tenure,is_shared_bath,is_less_than_1_bath,distance_to_downtown,amenity_wifi,amenity_air conditioning,amenity_heating,amenity_hot water,amenity_soap,amenity_shampoo,amenity_conditioner,amenity_hair dryer,amenity_hangers,amenity_iron,amenity_coffee maker,amenity_refrigerator,amenity_freezer,amenity_stove,amenity_microwave,amenity_dishwasher,amenity_washer,amenity_dryer,amenity_first aid kit,amenity_parking,amenity_tv,amenity_netflix,amenity_hulu,amenity_disney+,amenity_apple tv,amenity_amazon prime,amenity_hbo,amenity_roku,amenity_cable,amenity_sound system,amenity_extra pillows and blankets,amenity_nintendo switch,amenity_ps3,amenity_ps4,amenity_ps5,amenity_nintendo wii,amenity_xbox,amenity_arcade,amenity_board,amenity_ping pong table,amenity_pool table,amenity_game console,"amenity_reading materials (books, magazines)",amenity_patio or balcony,amenity_fire pit,amenity_grill,amenity_view,amenity_hot tub,amenity_bathtub,amenity_pool,amenity_sauna,amenity_gym,amenity_hammocks,amenity_yoga mat,amenity_elliptical,amenity_free weights,amenity_stationary bike,amenity_treadmill,amenity_workout bench,amenity_rowing,amenity_pets,amenity_theater,amenity_resort,amenity_beach,amenity_elevator,host_verif_,host_verif_work_email,host_verif_email,host_verif_phone,neighbourhood_cleansed_Allerton,neighbourhood_cleansed_Arden Heights,neighbourhood_cleansed_Arrochar,neighbourhood_cleansed_Arverne,neighbourhood_cleansed_Astoria,neighbourhood_cleansed_Bath Beach,neighbourhood_cleansed_Battery Park City,neighbourhood_cleansed_Bay Ridge,neighbourhood_cleansed_Bay Terrace,"neighbourhood_cleansed_Bay Terrace, Staten Island",neighbourhood_cleansed_Baychester,neighbourhood_cleansed_Bayside,neighbourhood_cleansed_Bayswater,neighbourhood_cleansed_Bedford-Stuyvesant,neighbourhood_cleansed_Belle Harbor,neighbourhood_cleansed_Bellerose,neighbourhood_cleansed_Belmont,neighbourhood_cleansed_Bensonhurst,neighbourhood_cleansed_Bergen Beach,neighbourhood_cleansed_Boerum Hill,neighbourhood_cleansed_Borough Park,neighbourhood_cleansed_Breezy Point,neighbourhood_cleansed_Briarwood,neighbourhood_cleansed_Brighton Beach,neighbourhood_cleansed_Bronxdale,neighbourhood_cleansed_Brooklyn Heights,neighbourhood_cleansed_Brownsville,neighbourhood_cleansed_Bull's Head,neighbourhood_cleansed_Bushwick,neighbourhood_cleansed_Cambria Heights,neighbourhood_cleansed_Canarsie,neighbourhood_cleansed_Carroll Gardens,neighbourhood_cleansed_Castle Hill,neighbourhood_cleansed_Castleton Corners,neighbourhood_cleansed_Chelsea,"neighbourhood_cleansed_Chelsea, Staten Island",neighbourhood_cleansed_Chinatown,neighbourhood_cleansed_City Island,neighbourhood_cleansed_Civic Center,neighbourhood_cleansed_Claremont Village,neighbourhood_cleansed_Clason Point,neighbourhood_cleansed_Clifton,neighbourhood_cleansed_Clinton Hill,neighbourhood_cleansed_Co-op City,neighbourhood_cleansed_Cobble Hill,neighbourhood_cleansed_College Point,neighbourhood_cleansed_Columbia St,neighbourhood_cleansed_Concord,neighbourhood_cleansed_Concourse,neighbourhood_cleansed_Concourse Village,neighbourhood_cleansed_Coney Island,neighbourhood_cleansed_Corona,neighbourhood_cleansed_Country Club,neighbourhood_cleansed_Crown Heights,neighbourhood_cleansed_Cypress Hills,neighbourhood_cleansed_DUMBO,neighbourhood_cleansed_Ditmars Steinway,neighbourhood_cleansed_Dongan Hills,neighbourhood_cleansed_Douglaston,neighbourhood_cleansed_Downtown Brooklyn,neighbourhood_cleansed_Dyker Heights,neighbourhood_cleansed_East Elmhurst,neighbourhood_cleansed_East Flatbush,neighbourhood_cleansed_East Harlem,neighbourhood_cleansed_East Morrisania,neighbourhood_cleansed_East New York,neighbourhood_cleansed_East Village,neighbourhood_cleansed_Eastchester,neighbourhood_cleansed_Edenwald,neighbourhood_cleansed_Edgemere,neighbourhood_cleansed_Elmhurst,neighbourhood_cleansed_Eltingville,neighbourhood_cleansed_Emerson Hill,neighbourhood_cleansed_Far Rockaway,neighbourhood_cleansed_Fieldston,neighbourhood_cleansed_Financial District,neighbourhood_cleansed_Flatbush,neighbourhood_cleansed_Flatiron District,neighbourhood_cleansed_Flatlands,neighbourhood_cleansed_Flushing,neighbourhood_cleansed_Fordham,neighbourhood_cleansed_Forest Hills,neighbourhood_cleansed_Fort Greene,neighbourhood_cleansed_Fort Hamilton,neighbourhood_cleansed_Fort Wadsworth,neighbourhood_cleansed_Fresh Meadows,neighbourhood_cleansed_Gerritsen Beach,neighbourhood_cleansed_Glendale,neighbourhood_cleansed_Gowanus,neighbourhood_cleansed_Gramercy,neighbourhood_cleansed_Graniteville,neighbourhood_cleansed_Grant City,neighbourhood_cleansed_Gravesend,neighbourhood_cleansed_Great Kills,neighbourhood_cleansed_Greenpoint,neighbourhood_cleansed_Greenwich Village,neighbourhood_cleansed_Grymes Hill,neighbourhood_cleansed_Harlem,neighbourhood_cleansed_Hell's Kitchen,neighbourhood_cleansed_Highbridge,neighbourhood_cleansed_Hollis,neighbourhood_cleansed_Holliswood,neighbourhood_cleansed_Howard Beach,neighbourhood_cleansed_Howland Hook,neighbourhood_cleansed_Huguenot,neighbourhood_cleansed_Hunts Point,neighbourhood_cleansed_Inwood,neighbourhood_cleansed_Jackson Heights,neighbourhood_cleansed_Jamaica,neighbourhood_cleansed_Jamaica Estates,neighbourhood_cleansed_Jamaica Hills,neighbourhood_cleansed_Kensington,neighbourhood_cleansed_Kew Gardens,neighbourhood_cleansed_Kew Gardens Hills,neighbourhood_cleansed_Kingsbridge,neighbourhood_cleansed_Kips Bay,neighbourhood_cleansed_Laurelton,neighbourhood_cleansed_Lighthouse Hill,neighbourhood_cleansed_Little Italy,neighbourhood_cleansed_Little Neck,neighbourhood_cleansed_Long Island City,neighbourhood_cleansed_Longwood,neighbourhood_cleansed_Lower East Side,neighbourhood_cleansed_Manhattan Beach,neighbourhood_cleansed_Marble Hill,neighbourhood_cleansed_Mariners Harbor,neighbourhood_cleansed_Maspeth,neighbourhood_cleansed_Melrose,neighbourhood_cleansed_Middle Village,neighbourhood_cleansed_Midland Beach,neighbourhood_cleansed_Midtown,neighbourhood_cleansed_Midwood,neighbourhood_cleansed_Mill Basin,neighbourhood_cleansed_Morningside Heights,neighbourhood_cleansed_Morris Heights,neighbourhood_cleansed_Morris Park,neighbourhood_cleansed_Morrisania,neighbourhood_cleansed_Mott Haven,neighbourhood_cleansed_Mount Eden,neighbourhood_cleansed_Mount Hope,neighbourhood_cleansed_Murray Hill,neighbourhood_cleansed_Navy Yard,neighbourhood_cleansed_Neponsit,neighbourhood_cleansed_New Brighton,neighbourhood_cleansed_New Dorp Beach,neighbourhood_cleansed_New Springville,neighbourhood_cleansed_NoHo,neighbourhood_cleansed_Nolita,neighbourhood_cleansed_North Riverdale,neighbourhood_cleansed_Norwood,neighbourhood_cleansed_Oakwood,neighbourhood_cleansed_Olinville,neighbourhood_cleansed_Ozone Park,neighbourhood_cleansed_Park Slope,neighbourhood_cleansed_Parkchester,neighbourhood_cleansed_Pelham Bay,neighbourhood_cleansed_Pelham Gardens,neighbourhood_cleansed_Port Morris,neighbourhood_cleansed_Port Richmond,neighbourhood_cleansed_Prince's Bay,neighbourhood_cleansed_Prospect Heights,neighbourhood_cleansed_Prospect-Lefferts Gardens,neighbourhood_cleansed_Queens Village,neighbourhood_cleansed_Randall Manor,neighbourhood_cleansed_Red Hook,neighbourhood_cleansed_Rego Park,neighbourhood_cleansed_Richmond Hill,neighbourhood_cleansed_Richmondtown,neighbourhood_cleansed_Ridgewood,neighbourhood_cleansed_Riverdale,neighbourhood_cleansed_Rockaway Beach,neighbourhood_cleansed_Roosevelt Island,neighbourhood_cleansed_Rosebank,neighbourhood_cleansed_Rosedale,neighbourhood_cleansed_Schuylerville,neighbourhood_cleansed_Sea Gate,neighbourhood_cleansed_Sheepshead Bay,neighbourhood_cleansed_Shore Acres,neighbourhood_cleansed_Silver Lake,neighbourhood_cleansed_SoHo,neighbourhood_cleansed_Soundview,neighbourhood_cleansed_South Beach,neighbourhood_cleansed_South Ozone Park,neighbourhood_cleansed_South Slope,neighbourhood_cleansed_Springfield Gardens,neighbourhood_cleansed_Spuyten Duyvil,neighbourhood_cleansed_St. Albans,neighbourhood_cleansed_St. George,neighbourhood_cleansed_Stapleton,neighbourhood_cleansed_Stuyvesant Town,neighbourhood_cleansed_Sunnyside,neighbourhood_cleansed_Sunset Park,neighbourhood_cleansed_Theater District,neighbourhood_cleansed_Throgs Neck,neighbourhood_cleansed_Todt Hill,neighbourhood_cleansed_Tompkinsville,neighbourhood_cleansed_Tottenville,neighbourhood_cleansed_Tremont,neighbourhood_cleansed_Tribeca,neighbourhood_cleansed_Two Bridges,neighbourhood_cleansed_Unionport,neighbourhood_cleansed_University Heights,neighbourhood_cleansed_Upper East Side,neighbourhood_cleansed_Upper West Side,neighbourhood_cleansed_Van Nest,neighbourhood_cleansed_Vinegar Hill,neighbourhood_cleansed_Wakefield,neighbourhood_cleansed_Washington Heights,neighbourhood_cleansed_West Brighton,neighbourhood_cleansed_West Farms,neighbourhood_cleansed_West Village,neighbourhood_cleansed_Westchester Square,neighbourhood_cleansed_Westerleigh,neighbourhood_cleansed_Whitestone,neighbourhood_cleansed_Williamsbridge,neighbourhood_cleansed_Williamsburg,neighbourhood_cleansed_Willowbrook,neighbourhood_cleansed_Windsor Terrace,neighbourhood_cleansed_Woodhaven,neighbourhood_cleansed_Woodlawn,neighbourhood_cleansed_Woodrow,neighbourhood_cleansed_Woodside,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,neighbourhood_group_cleansed_Bronx,neighbourhood_group_cleansed_Brooklyn,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island
0,7.0,9.0,1,1,Manhattan,40.75356,-73.98559,1,240.0,365,49,0,6132,0,0,5.632655,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,1.0,1.0,1,1,Brooklyn,40.70935,-73.95342,3,83.0,185,195,1,5893,0,0,4.666069,1,1,1,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
0,2.0,2.0,1,1,Manhattan,40.80107,-73.94255,1,65.0,83,1,0,5892,1,0,11.945956,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
1,1.0,6.0,1,1,Manhattan,40.78778,-73.94759,1,71.0,284,250,1,5887,1,0,10.44544,1,1,1,1,0,1,0,1,0,0,1,1,0,1,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
1,2.0,2.0,1,1,Brooklyn,40.69194,-73.97389,2,205.0,215,390,5,5882,0,0,3.317591,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
1,4.0,4.0,1,1,Brooklyn,40.718807,-73.956177,2,290.0,259,13,0,5856,0,0,4.64232,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
0,2.0,3.0,1,1,Brooklyn,40.684556,-73.939634,5,170.0,219,190,0,5828,0,0,6.284235,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
0,1.0,1.0,1,1,Manhattan,40.76724,-73.98664,2,175.0,65,58,0,5801,0,0,7.056463,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,6.0,6.0,1,1,Brooklyn,40.68294,-73.95682,2,90.0,261,81,0,5773,1,0,5.062812,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
0,2.0,3.0,1,0,Manhattan,40.72296,-73.98383,2,75.0,123,316,0,5727,1,0,2.813339,1,0,1,1,0,1,0,1,1,1,1,1,0,1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


In [29]:
# Set the display options to show all rows
pd.set_option('display.max_rows', None)

null_values = listings_filtered.isnull().sum()
print(null_values)

# Optionally, reset the display option to default
pd.reset_option('display.max_rows')

host_is_superhost                                    0
host_listings_count                                  0
host_total_listings_count                            0
host_has_profile_pic                                 0
host_identity_verified                               0
neighbourhood_group_cleansed                         0
latitude                                             0
longitude                                            0
accommodates                                         0
price                                                0
availability_365                                     0
number_of_reviews                                    0
number_of_reviews_l30d                               0
host_tenure                                          0
is_shared_bath                                       0
is_less_than_1_bath                                  0
distance_to_downtown                                 0
amenity_wifi                                         0
amenity_ai

### KMeans as another Feature Engineering

To enhance the feature set for model training, we will apply KMeans clustering to the dataset and introduce a new column called `cluster`. This column will capture latent groupings or patterns within the data, providing an additional categorical feature that may improve model performance.

In [30]:
display_df(listings_filtered.head(10))

host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,latitude,longitude,accommodates,price,availability_365,number_of_reviews,number_of_reviews_l30d,host_tenure,is_shared_bath,is_less_than_1_bath,distance_to_downtown,amenity_wifi,amenity_air conditioning,amenity_heating,amenity_hot water,amenity_soap,amenity_shampoo,amenity_conditioner,amenity_hair dryer,amenity_hangers,amenity_iron,amenity_coffee maker,amenity_refrigerator,amenity_freezer,amenity_stove,amenity_microwave,amenity_dishwasher,amenity_washer,amenity_dryer,amenity_first aid kit,amenity_parking,amenity_tv,amenity_netflix,amenity_hulu,amenity_disney+,amenity_apple tv,amenity_amazon prime,amenity_hbo,amenity_roku,amenity_cable,amenity_sound system,amenity_extra pillows and blankets,amenity_nintendo switch,amenity_ps3,amenity_ps4,amenity_ps5,amenity_nintendo wii,amenity_xbox,amenity_arcade,amenity_board,amenity_ping pong table,amenity_pool table,amenity_game console,"amenity_reading materials (books, magazines)",amenity_patio or balcony,amenity_fire pit,amenity_grill,amenity_view,amenity_hot tub,amenity_bathtub,amenity_pool,amenity_sauna,amenity_gym,amenity_hammocks,amenity_yoga mat,amenity_elliptical,amenity_free weights,amenity_stationary bike,amenity_treadmill,amenity_workout bench,amenity_rowing,amenity_pets,amenity_theater,amenity_resort,amenity_beach,amenity_elevator,host_verif_,host_verif_work_email,host_verif_email,host_verif_phone,neighbourhood_cleansed_Allerton,neighbourhood_cleansed_Arden Heights,neighbourhood_cleansed_Arrochar,neighbourhood_cleansed_Arverne,neighbourhood_cleansed_Astoria,neighbourhood_cleansed_Bath Beach,neighbourhood_cleansed_Battery Park City,neighbourhood_cleansed_Bay Ridge,neighbourhood_cleansed_Bay Terrace,"neighbourhood_cleansed_Bay Terrace, Staten Island",neighbourhood_cleansed_Baychester,neighbourhood_cleansed_Bayside,neighbourhood_cleansed_Bayswater,neighbourhood_cleansed_Bedford-Stuyvesant,neighbourhood_cleansed_Belle Harbor,neighbourhood_cleansed_Bellerose,neighbourhood_cleansed_Belmont,neighbourhood_cleansed_Bensonhurst,neighbourhood_cleansed_Bergen Beach,neighbourhood_cleansed_Boerum Hill,neighbourhood_cleansed_Borough Park,neighbourhood_cleansed_Breezy Point,neighbourhood_cleansed_Briarwood,neighbourhood_cleansed_Brighton Beach,neighbourhood_cleansed_Bronxdale,neighbourhood_cleansed_Brooklyn Heights,neighbourhood_cleansed_Brownsville,neighbourhood_cleansed_Bull's Head,neighbourhood_cleansed_Bushwick,neighbourhood_cleansed_Cambria Heights,neighbourhood_cleansed_Canarsie,neighbourhood_cleansed_Carroll Gardens,neighbourhood_cleansed_Castle Hill,neighbourhood_cleansed_Castleton Corners,neighbourhood_cleansed_Chelsea,"neighbourhood_cleansed_Chelsea, Staten Island",neighbourhood_cleansed_Chinatown,neighbourhood_cleansed_City Island,neighbourhood_cleansed_Civic Center,neighbourhood_cleansed_Claremont Village,neighbourhood_cleansed_Clason Point,neighbourhood_cleansed_Clifton,neighbourhood_cleansed_Clinton Hill,neighbourhood_cleansed_Co-op City,neighbourhood_cleansed_Cobble Hill,neighbourhood_cleansed_College Point,neighbourhood_cleansed_Columbia St,neighbourhood_cleansed_Concord,neighbourhood_cleansed_Concourse,neighbourhood_cleansed_Concourse Village,neighbourhood_cleansed_Coney Island,neighbourhood_cleansed_Corona,neighbourhood_cleansed_Country Club,neighbourhood_cleansed_Crown Heights,neighbourhood_cleansed_Cypress Hills,neighbourhood_cleansed_DUMBO,neighbourhood_cleansed_Ditmars Steinway,neighbourhood_cleansed_Dongan Hills,neighbourhood_cleansed_Douglaston,neighbourhood_cleansed_Downtown Brooklyn,neighbourhood_cleansed_Dyker Heights,neighbourhood_cleansed_East Elmhurst,neighbourhood_cleansed_East Flatbush,neighbourhood_cleansed_East Harlem,neighbourhood_cleansed_East Morrisania,neighbourhood_cleansed_East New York,neighbourhood_cleansed_East Village,neighbourhood_cleansed_Eastchester,neighbourhood_cleansed_Edenwald,neighbourhood_cleansed_Edgemere,neighbourhood_cleansed_Elmhurst,neighbourhood_cleansed_Eltingville,neighbourhood_cleansed_Emerson Hill,neighbourhood_cleansed_Far Rockaway,neighbourhood_cleansed_Fieldston,neighbourhood_cleansed_Financial District,neighbourhood_cleansed_Flatbush,neighbourhood_cleansed_Flatiron District,neighbourhood_cleansed_Flatlands,neighbourhood_cleansed_Flushing,neighbourhood_cleansed_Fordham,neighbourhood_cleansed_Forest Hills,neighbourhood_cleansed_Fort Greene,neighbourhood_cleansed_Fort Hamilton,neighbourhood_cleansed_Fort Wadsworth,neighbourhood_cleansed_Fresh Meadows,neighbourhood_cleansed_Gerritsen Beach,neighbourhood_cleansed_Glendale,neighbourhood_cleansed_Gowanus,neighbourhood_cleansed_Gramercy,neighbourhood_cleansed_Graniteville,neighbourhood_cleansed_Grant City,neighbourhood_cleansed_Gravesend,neighbourhood_cleansed_Great Kills,neighbourhood_cleansed_Greenpoint,neighbourhood_cleansed_Greenwich Village,neighbourhood_cleansed_Grymes Hill,neighbourhood_cleansed_Harlem,neighbourhood_cleansed_Hell's Kitchen,neighbourhood_cleansed_Highbridge,neighbourhood_cleansed_Hollis,neighbourhood_cleansed_Holliswood,neighbourhood_cleansed_Howard Beach,neighbourhood_cleansed_Howland Hook,neighbourhood_cleansed_Huguenot,neighbourhood_cleansed_Hunts Point,neighbourhood_cleansed_Inwood,neighbourhood_cleansed_Jackson Heights,neighbourhood_cleansed_Jamaica,neighbourhood_cleansed_Jamaica Estates,neighbourhood_cleansed_Jamaica Hills,neighbourhood_cleansed_Kensington,neighbourhood_cleansed_Kew Gardens,neighbourhood_cleansed_Kew Gardens Hills,neighbourhood_cleansed_Kingsbridge,neighbourhood_cleansed_Kips Bay,neighbourhood_cleansed_Laurelton,neighbourhood_cleansed_Lighthouse Hill,neighbourhood_cleansed_Little Italy,neighbourhood_cleansed_Little Neck,neighbourhood_cleansed_Long Island City,neighbourhood_cleansed_Longwood,neighbourhood_cleansed_Lower East Side,neighbourhood_cleansed_Manhattan Beach,neighbourhood_cleansed_Marble Hill,neighbourhood_cleansed_Mariners Harbor,neighbourhood_cleansed_Maspeth,neighbourhood_cleansed_Melrose,neighbourhood_cleansed_Middle Village,neighbourhood_cleansed_Midland Beach,neighbourhood_cleansed_Midtown,neighbourhood_cleansed_Midwood,neighbourhood_cleansed_Mill Basin,neighbourhood_cleansed_Morningside Heights,neighbourhood_cleansed_Morris Heights,neighbourhood_cleansed_Morris Park,neighbourhood_cleansed_Morrisania,neighbourhood_cleansed_Mott Haven,neighbourhood_cleansed_Mount Eden,neighbourhood_cleansed_Mount Hope,neighbourhood_cleansed_Murray Hill,neighbourhood_cleansed_Navy Yard,neighbourhood_cleansed_Neponsit,neighbourhood_cleansed_New Brighton,neighbourhood_cleansed_New Dorp Beach,neighbourhood_cleansed_New Springville,neighbourhood_cleansed_NoHo,neighbourhood_cleansed_Nolita,neighbourhood_cleansed_North Riverdale,neighbourhood_cleansed_Norwood,neighbourhood_cleansed_Oakwood,neighbourhood_cleansed_Olinville,neighbourhood_cleansed_Ozone Park,neighbourhood_cleansed_Park Slope,neighbourhood_cleansed_Parkchester,neighbourhood_cleansed_Pelham Bay,neighbourhood_cleansed_Pelham Gardens,neighbourhood_cleansed_Port Morris,neighbourhood_cleansed_Port Richmond,neighbourhood_cleansed_Prince's Bay,neighbourhood_cleansed_Prospect Heights,neighbourhood_cleansed_Prospect-Lefferts Gardens,neighbourhood_cleansed_Queens Village,neighbourhood_cleansed_Randall Manor,neighbourhood_cleansed_Red Hook,neighbourhood_cleansed_Rego Park,neighbourhood_cleansed_Richmond Hill,neighbourhood_cleansed_Richmondtown,neighbourhood_cleansed_Ridgewood,neighbourhood_cleansed_Riverdale,neighbourhood_cleansed_Rockaway Beach,neighbourhood_cleansed_Roosevelt Island,neighbourhood_cleansed_Rosebank,neighbourhood_cleansed_Rosedale,neighbourhood_cleansed_Schuylerville,neighbourhood_cleansed_Sea Gate,neighbourhood_cleansed_Sheepshead Bay,neighbourhood_cleansed_Shore Acres,neighbourhood_cleansed_Silver Lake,neighbourhood_cleansed_SoHo,neighbourhood_cleansed_Soundview,neighbourhood_cleansed_South Beach,neighbourhood_cleansed_South Ozone Park,neighbourhood_cleansed_South Slope,neighbourhood_cleansed_Springfield Gardens,neighbourhood_cleansed_Spuyten Duyvil,neighbourhood_cleansed_St. Albans,neighbourhood_cleansed_St. George,neighbourhood_cleansed_Stapleton,neighbourhood_cleansed_Stuyvesant Town,neighbourhood_cleansed_Sunnyside,neighbourhood_cleansed_Sunset Park,neighbourhood_cleansed_Theater District,neighbourhood_cleansed_Throgs Neck,neighbourhood_cleansed_Todt Hill,neighbourhood_cleansed_Tompkinsville,neighbourhood_cleansed_Tottenville,neighbourhood_cleansed_Tremont,neighbourhood_cleansed_Tribeca,neighbourhood_cleansed_Two Bridges,neighbourhood_cleansed_Unionport,neighbourhood_cleansed_University Heights,neighbourhood_cleansed_Upper East Side,neighbourhood_cleansed_Upper West Side,neighbourhood_cleansed_Van Nest,neighbourhood_cleansed_Vinegar Hill,neighbourhood_cleansed_Wakefield,neighbourhood_cleansed_Washington Heights,neighbourhood_cleansed_West Brighton,neighbourhood_cleansed_West Farms,neighbourhood_cleansed_West Village,neighbourhood_cleansed_Westchester Square,neighbourhood_cleansed_Westerleigh,neighbourhood_cleansed_Whitestone,neighbourhood_cleansed_Williamsbridge,neighbourhood_cleansed_Williamsburg,neighbourhood_cleansed_Willowbrook,neighbourhood_cleansed_Windsor Terrace,neighbourhood_cleansed_Woodhaven,neighbourhood_cleansed_Woodlawn,neighbourhood_cleansed_Woodrow,neighbourhood_cleansed_Woodside,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,neighbourhood_group_cleansed_Bronx,neighbourhood_group_cleansed_Brooklyn,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island
0,7.0,9.0,1,1,Manhattan,40.75356,-73.98559,1,240.0,365,49,0,6132,0,0,5.632655,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,1.0,1.0,1,1,Brooklyn,40.70935,-73.95342,3,83.0,185,195,1,5893,0,0,4.666069,1,1,1,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
0,2.0,2.0,1,1,Manhattan,40.80107,-73.94255,1,65.0,83,1,0,5892,1,0,11.945956,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
1,1.0,6.0,1,1,Manhattan,40.78778,-73.94759,1,71.0,284,250,1,5887,1,0,10.44544,1,1,1,1,0,1,0,1,0,0,1,1,0,1,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
1,2.0,2.0,1,1,Brooklyn,40.69194,-73.97389,2,205.0,215,390,5,5882,0,0,3.317591,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
1,4.0,4.0,1,1,Brooklyn,40.718807,-73.956177,2,290.0,259,13,0,5856,0,0,4.64232,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
0,2.0,3.0,1,1,Brooklyn,40.684556,-73.939634,5,170.0,219,190,0,5828,0,0,6.284235,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
0,1.0,1.0,1,1,Manhattan,40.76724,-73.98664,2,175.0,65,58,0,5801,0,0,7.056463,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,6.0,6.0,1,1,Brooklyn,40.68294,-73.95682,2,90.0,261,81,0,5773,1,0,5.062812,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
0,2.0,3.0,1,0,Manhattan,40.72296,-73.98383,2,75.0,123,316,0,5727,1,0,2.813339,1,0,1,1,0,1,0,1,1,1,1,1,0,1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

# Prepare your data
kmeans_listings_filtered = listings_filtered.copy()
kmeans_listings_filtered.drop(columns=['neighbourhood_group_cleansed'], inplace=True)
X = kmeans_listings_filtered.select_dtypes(include='number').copy()
# Optional: X = X.drop(columns=['price'])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Set up 10-fold CV
kf = KFold(n_splits=10, shuffle=True, random_state=42)
k_range = range(1, 11)

optimal_ks = []

# Loop over folds
for fold, (train_idx, _) in enumerate(kf.split(X_scaled), start=1):
    X_train_fold = X_scaled[train_idx]
    
    # compute inertia for k=1…10 on this fold
    inertia = []
    for k in k_range:
        km = KMeans(n_clusters=k, n_init='auto', random_state=42)
        km.fit(X_train_fold)
        inertia.append(km.inertia_)
    
    # second-derivative “elbow” pick
    inertia_diff  = np.diff(inertia)
    inertia_diff2 = np.diff(inertia_diff)
    opt_k = np.argmax(inertia_diff2) + 2  # +2 to correct for diff indexing
    
    print(f"Fold {fold:2d} optimal k = {opt_k}")
    optimal_ks.append(opt_k)

# Compute the average (and round to nearest int)
avg_optimal_k = int(np.round(np.mean(optimal_ks)))
print(f"\n Average optimal number of clusters over 10 folds: {avg_optimal_k}")

# (Optional) Refit KMeans on the full dataset with this average k
kmeans_final = KMeans(n_clusters= avg_optimal_k, n_init='auto', random_state=42)
listings_filtered['cluster'] = kmeans_final.fit_predict(X_scaled)

# Check final distribution
print(listings_filtered['cluster'].value_counts())


Fold  1 optimal k = 3
Fold  2 optimal k = 3
Fold  3 optimal k = 2
Fold  4 optimal k = 2
Fold  5 optimal k = 7
Fold  6 optimal k = 8
Fold  7 optimal k = 2
Fold  8 optimal k = 5
Fold  9 optimal k = 3
Fold 10 optimal k = 2

 Average optimal number of clusters over 10 folds: 4
cluster
3    9195
2    6767
1    6429
0     337
Name: count, dtype: int64



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [32]:
import plotly.express as px

# Make sure 'latitude' and 'longitude' are in the dataframe
if 'latitude' not in listings_filtered.columns or 'longitude' not in listings_filtered.columns:
    raise ValueError("Latitude and longitude columns are required for mapping.")

# Create map of clustered listings
fig = px.scatter_mapbox(
    listings_filtered,
    lat="latitude",
    lon="longitude",
    color="cluster",
    mapbox_style="carto-positron",
    zoom=10,
    height=600,
    width=1000,
    title="Airbnb Clusters by Location (Train Set)",
    hover_data=['neighbourhood_group_cleansed', 'price']
)

fig.show()


### Isolation Forest

As part of the data preprocessing pipeline, we will apply Isolation Forest to detect and remove outliers. This unsupervised anomaly detection method is effective for identifying data points that deviate significantly from the overall distribution, helping to improve the robustness and accuracy of the downstream machine learning model.

First of all, let's see what is the distribution of price from the overall dataset

In [33]:
listings_filtered['price'].describe()

count    22728.000000
mean       216.801478
std        380.715446
min          8.000000
25%         85.000000
50%        149.000000
75%        250.000000
max      20000.000000
Name: price, dtype: float64

It seems that, on average, Airbnb listings have a price of around $217 per night.
Half of all listings (the median) are priced at $149 or below, and 75% of listings are priced at $250 or less.
The cheapest listing is $8, which may be a data error or a very low-budget option, and the most expensive listing is $20,000, which is almost certainly an extreme outlier and could represent a luxury property or even a mispriced listing.

Let's train Isolation Forest model to identify anomalies:

In [34]:
from sklearn.ensemble import IsolationForest
import plotly.express as px

# Step 1: Prepare data
X = listings_filtered[['price']].copy()

# Step 2: Instantiate Isolation Forest
isolation_forest = IsolationForest(
    n_estimators=100,
    contamination=0.01,  # 1% of data is expected to be outliers
    random_state=42
)

# Step 3: Fit model
isolation_forest.fit(X)

# Step 4: Predict anomalies (-1 = outlier, 1 = inlier)
listings_filtered['Anomaly'] = isolation_forest.predict(X)

# Step 5: Map results to 'Yes'/'No'
listings_filtered['Anomaly'] = listings_filtered['Anomaly'].map({-1: 'Yes', 1: 'No'})

# Optional: Visualize
fig = px.scatter(
    listings_filtered,
    x=listings_filtered.index,
    y='price',
    color='Anomaly',
    title="Price Outlier Detection with Isolation Forest",
    labels={'index': 'Index', 'price': 'Price'},
    color_discrete_map={'Yes': 'red', 'No': 'blue'}
)
fig.show()



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [35]:
listings_filtered.groupby(by='Anomaly').size()

Anomaly
No     22501
Yes      227
dtype: int64

In [36]:
# Remove anomaly
listings_filtered = listings_filtered[listings_filtered['Anomaly'] == 'No'].copy()
# Drop Anomaly column
listings_filtered.drop(columns=['Anomaly'], inplace=True)


### Price Normalization

To address the strong right skew in the price distribution, we apply a log transformation using `log(1 + price)`. This normalization technique reduces the impact of extreme values and helps stabilize variance, making the distribution more symmetric and suitable for machine learning algorithms that assume normally distributed input features.

In [37]:
import plotly.figure_factory as ff

price_data = listings_filtered['price']
fig = ff.create_distplot([price_data], group_labels=['Price'], show_hist=True, show_rug=False)
fig.update_layout(title='Price Distribution with KDE')
fig.show()


In [38]:
import plotly.figure_factory as ff
import numpy as np

# Log-transform the price to reduce skewness
log_price_data = np.log1p(listings_filtered['price'])

# Plot the log-transformed distribution
fig = ff.create_distplot([log_price_data], group_labels=['Log(Price + 1)'], show_hist=True, show_rug=False)
fig.update_layout(title='Log-Transformed Price Distribution with KDE')
fig.show()


From the above, we can tell the `log(1 + price)` helps "normalize" the distribution. Let's keep this in mind and use the `log( 1+ price)` instead of `price` when training the model.

### Feature Selection

This step selects which feature to include in the ML model. Anything that is significant (0.1 or higher correlation), we will include them in the ML model training.

In [39]:
corr_table = listings_filtered.drop(['neighbourhood_group_cleansed'], axis=1).corr().reset_index()

display_df(corr_table[corr_table['index'] == 'cluster'][['index', 'price']])

index,price
cluster,0.020813


Apparently, the clustering produced by K-Means has a really low correlation with price at only 0.02. 

In [40]:
# Filter for rows where 'index' is 'price' and absolute correlation > 0.1
filtered_corr = corr_table[corr_table['price'].abs() > 0.1]
filtered_corr = filtered_corr.reindex(filtered_corr['price'].abs().sort_values(ascending=False).index)

display_df(filtered_corr[['index', 'price']])


index,price
price,1.0
accommodates,0.470422
room_type_Private room,-0.393948
is_shared_bath,-0.366793
room_type_Entire home/apt,0.364735
host_total_listings_count,0.31437
host_listings_count,0.311224
neighbourhood_group_cleansed_Manhattan,0.30977
distance_to_downtown,-0.285179
amenity_gym,0.283316


Airbnb prices are most influenced by how many guests a listing can accommodate, with larger spaces typically commanding higher rates. Entire homes or apartments tend to cost more than private rooms, as they offer full privacy and dedicated amenities. Hosts with multiple listings also charge more, likely due to their experience and ability to provide better service.

Location plays a key role too. Listings in Manhattan are priced higher, while those in Queens and Brooklyn are generally cheaper. Distance from downtown also lowers price, reflecting how central, well-connected areas drive up listing value. Overall, space, privacy, host experience, and location are the biggest factors shaping Airbnb pricing.


In [41]:
filtered_corr_list = filtered_corr['index'].tolist()
filtered_corr_list.append('neighbourhood_group_cleansed')  # we add this back for train/test stratification split

In [42]:
listings_filtered_reduced_features = listings_filtered[filtered_corr_list]

In [43]:
display_df(listings_filtered_reduced_features.head(10))

price,accommodates,room_type_Private room,is_shared_bath,room_type_Entire home/apt,host_total_listings_count,host_listings_count,neighbourhood_group_cleansed_Manhattan,distance_to_downtown,amenity_gym,amenity_shampoo,amenity_elevator,longitude,amenity_pets,amenity_hair dryer,amenity_dishwasher,neighbourhood_cleansed_Midtown,amenity_iron,amenity_tv,amenity_air conditioning,amenity_dryer,neighbourhood_group_cleansed_Queens,room_type_Hotel room,amenity_sound system,amenity_coffee maker,amenity_washer,neighbourhood_group_cleansed_Brooklyn,host_verif_work_email,amenity_hangers,neighbourhood_cleansed_Financial District,neighbourhood_cleansed_Theater District,amenity_pool,neighbourhood_group_cleansed
240.0,1,0,0,1,9.0,7.0,1,5.632655,0,0,0,-73.98559,0,1,0,1,1,1,1,1,0,0,0,1,0,0,1,1,0,0,0,Manhattan
83.0,3,0,0,1,1.0,1.0,0,4.666069,0,1,0,-73.95342,0,1,1,0,1,1,1,1,0,0,0,1,1,1,0,1,0,0,0,Brooklyn
65.0,1,1,1,0,2.0,2.0,1,11.945956,0,0,1,-73.94255,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,Manhattan
71.0,1,1,1,0,6.0,1.0,1,10.44544,0,1,0,-73.94759,0,1,0,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,Manhattan
205.0,2,1,0,0,2.0,2.0,0,3.317591,0,1,0,-73.97389,0,1,0,0,1,1,1,1,0,0,0,1,1,1,0,1,0,0,0,Brooklyn
290.0,2,0,0,1,4.0,4.0,0,4.64232,0,1,0,-73.956177,0,1,1,0,1,1,0,1,0,0,1,1,1,1,0,1,0,0,1,Brooklyn
170.0,5,0,0,1,3.0,2.0,0,6.284235,0,1,0,-73.939634,0,1,0,0,1,1,0,1,0,0,0,1,0,1,0,1,0,0,0,Brooklyn
175.0,2,0,0,1,1.0,1.0,1,7.056463,0,1,1,-73.98664,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,Manhattan
90.0,2,1,1,0,6.0,6.0,0,5.062812,0,1,0,-73.95682,0,1,0,0,1,1,1,1,0,0,0,1,1,1,0,1,0,0,0,Brooklyn
75.0,2,1,1,0,3.0,2.0,1,2.813339,0,1,0,-73.98383,0,1,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0,0,0,Manhattan


In [44]:
listings_filtered_reduced_features.groupby(by='neighbourhood_group_cleansed').size()

neighbourhood_group_cleansed
Bronx             948
Brooklyn         7957
Manhattan        9731
Queens           3551
Staten Island     314
dtype: int64

## Predicting Price using ML model

In this section, we train and evaluate multiple machine learning models to predict Airbnb listing prices. To ensure robust performance and minimize overfitting, we use **10-fold cross-validation** for each model.

The models included in this comparison are:

- **Linear Regression** – A simple, interpretable baseline model.  
- **Random Forest** – A robust ensemble model that handles non-linear relationships well.  
- **XGBoost** – A gradient boosting algorithm known for its accuracy and speed.  
- **CatBoost** – A gradient boosting model optimized for categorical features.  
- **Decision Tree** – A basic tree-based model for capturing non-linear splits.

Each model’s performance is evaluated using key metrics such as **RMSE** and **R² score** to assess predictive accuracy and generalization capability.


In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from catboost import CatBoostRegressor

# Prepare your data
X = listings_filtered_reduced_features.drop(columns=['price', 'neighbourhood_group_cleansed'])  # Features
y = listings_filtered_reduced_features['price']  # Target (Price)

# Set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Evaluation function
def evaluate_model(model, name):
    # RMSE (use negative because cross_val_score minimizes loss)
    rmse_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_root_mean_squared_error', n_jobs=-1)
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2', n_jobs=-1)

    print(f"{name} - Avg RMSE: {rmse_scores.mean():.4f}, Std RMSE: {rmse_scores.std():.4f}, "
          f"Avg R²: {r2_scores.mean():.4f}, Std R²: {r2_scores.std():.4f}")

# Run evaluation on each model
print("10-Fold Cross-Validation Results:\n")

evaluate_model(LinearRegression(), "Linear Regression")
evaluate_model(RandomForestRegressor(random_state=42), "Random Forest")
evaluate_model(xgb.XGBRegressor(random_state=42, verbosity=0), "XGBoost")
evaluate_model(CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, verbose=0), "CatBoost")
evaluate_model(DecisionTreeRegressor(random_state=42), "Decision Tree")


10-Fold Cross-Validation Results:

Linear Regression - Avg RMSE: 117.6904, Std RMSE: 2.8494, Avg R²: 0.4711, Std R²: 0.0160
Random Forest - Avg RMSE: 96.8173, Std RMSE: 4.1248, Avg R²: 0.6423, Std R²: 0.0176
XGBoost - Avg RMSE: 97.0641, Std RMSE: 3.1983, Avg R²: 0.6405, Std R²: 0.0098
CatBoost - Avg RMSE: 95.4080, Std RMSE: 3.5745, Avg R²: 0.6527, Std R²: 0.0131
Decision Tree - Avg RMSE: 132.6259, Std RMSE: 6.9330, Avg R²: 0.3274, Std R²: 0.0621


We're going to pick XGBoost as it has an average R^2 of 0.64, which is strong, while having a standard deviation R^2 of 0.0098, the lowest, indicating the most consistent performance across folds.

In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# Prepare data
y = listings_filtered_reduced_features['price']
stratify_col = listings_filtered_reduced_features['neighbourhood_group_cleansed']
X = listings_filtered_reduced_features.drop(columns=['price', 'neighbourhood_group_cleansed'])

# Train-test split
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=stratify_col,
    random_state=1
)

# KFold setup
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Define choices separately for mapping
n_estimators_choices = [100, 200, 300, 800]
max_depth_choices = [4, 6, 10]

# Define the objective function
def objective(params):
    xgb_params = {
        'n_estimators': n_estimators_choices[params['n_estimators']],
        'learning_rate': params['learning_rate'],
        'max_depth': max_depth_choices[params['max_depth']],
        'random_state': 42,
        'verbosity': 0,
        'n_jobs': -1
    }

    model = xgb.XGBRegressor(**xgb_params)

    rmse_scores = -cross_val_score(
        model, X_train_full, y_train_full,
        scoring='neg_root_mean_squared_error',
        cv=kf,
        n_jobs=-1
    )

    avg_rmse = np.mean(rmse_scores)
    print(f"Trying params: {xgb_params}")
    print(f"Avg CV RMSE: {avg_rmse:.4f}")

    return {'loss': avg_rmse, 'status': STATUS_OK}

# Hyperparameter space
space = {
    'n_estimators': hp.choice('n_estimators', list(range(len(n_estimators_choices)))),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
    'max_depth': hp.choice('max_depth', list(range(len(max_depth_choices))))
}

# Run optimization
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)
)

# Map back choices
best['n_estimators'] = n_estimators_choices[best['n_estimators']]
best['max_depth'] = max_depth_choices[best['max_depth']]

print(f"\n📌 Best Hyperparameters: {best}")

# Train final model with best params
final_model = xgb.XGBRegressor(
    n_estimators=best['n_estimators'],
    learning_rate=best['learning_rate'],
    max_depth=best['max_depth'],
    random_state=42,
    verbosity=0,
    n_jobs=-1
)
final_model.fit(X_train_full, y_train_full)

# Evaluate
y_pred = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
final_r2 = r2_score(y_test, y_pred)

print(f"\n📊 Final Model Performance on Test Set:")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")


Trying params: {'n_estimators': 100, 'learning_rate': 0.0658107097908977, 'max_depth': 10, 'random_state': 42, 'verbosity': 0, 'n_jobs': -1}
Avg CV RMSE: 99.1338                                  
Trying params: {'n_estimators': 800, 'learning_rate': 0.06474242459540229, 'max_depth': 4, 'random_state': 42, 'verbosity': 0, 'n_jobs': -1}
Avg CV RMSE: 98.4142                                                           
Trying params: {'n_estimators': 100, 'learning_rate': 0.016629492290547172, 'max_depth': 4, 'random_state': 42, 'verbosity': 0, 'n_jobs': -1}
Avg CV RMSE: 116.2001                                                          
Trying params: {'n_estimators': 300, 'learning_rate': 0.029099540529400177, 'max_depth': 6, 'random_state': 42, 'verbosity': 0, 'n_jobs': -1}
Avg CV RMSE: 98.9692                                                           
Trying params: {'n_estimators': 200, 'learning_rate': 0.018698018110930824, 'max_depth': 6, 'random_state': 42, 'verbosity': 0, 'n_jobs': -

Let's try XGBoost (same as the above but using log transformation as the distribution of price is slightly right skewed after removing outliers. The following uses `log(1 + price)`.

In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# Log-transform the target
y = np.log1p(listings_filtered_reduced_features['price'])  # log(price + 1)
stratify_col = listings_filtered_reduced_features['neighbourhood_group_cleansed']
X = listings_filtered_reduced_features.drop(columns=['price', 'neighbourhood_group_cleansed'])

# Train-test split
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=stratify_col,
    random_state=1
)

# KFold setup
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Define hyperparameter choices
n_estimators_choices = [100, 200, 300, 800]
max_depth_choices = [4, 6, 10]

# Define objective function
def objective(params):
    xgb_params = {
        'n_estimators': n_estimators_choices[params['n_estimators']],
        'learning_rate': params['learning_rate'],
        'max_depth': max_depth_choices[params['max_depth']],
        'random_state': 42,
        'verbosity': 0,
        'n_jobs': -1
    }

    model = xgb.XGBRegressor(**xgb_params)

    rmse_scores = -cross_val_score(
        model, X_train_full, y_train_full,
        scoring='neg_root_mean_squared_error',
        cv=kf,
        n_jobs=-1
    )

    avg_rmse = np.mean(rmse_scores)
    print(f"Trying params: {xgb_params}")
    print(f"Avg CV RMSE (log scale): {avg_rmse:.4f}")

    return {'loss': avg_rmse, 'status': STATUS_OK}

# Hyperparameter search space
space = {
    'n_estimators': hp.choice('n_estimators', list(range(len(n_estimators_choices)))),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
    'max_depth': hp.choice('max_depth', list(range(len(max_depth_choices))))
}

# Run optimization
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)
)

# Decode best parameters
best['n_estimators'] = n_estimators_choices[best['n_estimators']]
best['max_depth'] = max_depth_choices[best['max_depth']]

print(f"\n📌 Best Hyperparameters: {best}")

# Train final model
final_model = xgb.XGBRegressor(
    n_estimators=best['n_estimators'],
    learning_rate=best['learning_rate'],
    max_depth=best['max_depth'],
    random_state=42,
    verbosity=0,
    n_jobs=-1
)
final_model.fit(X_train_full, y_train_full)

# Predict and inverse log
y_pred_log = final_model.predict(X_test)
y_pred = np.expm1(y_pred_log)        # reverse log1p
y_test_actual = np.expm1(y_test)     # reverse log1p

# Final evaluation on original price scale
final_rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
final_r2 = r2_score(y_test_actual, y_pred)

print(f"\n Final Model Performance on Test Set (original scale):")
print(f"RMSE: {final_rmse:.4f}")
print(f"R²: {final_r2:.4f}")


Trying params: {'n_estimators': 100, 'learning_rate': 0.0658107097908977, 'max_depth': 10, 'random_state': 42, 'verbosity': 0, 'n_jobs': -1}
Avg CV RMSE (log scale): 0.3778                       
Trying params: {'n_estimators': 800, 'learning_rate': 0.06474242459540229, 'max_depth': 4, 'random_state': 42, 'verbosity': 0, 'n_jobs': -1}
Avg CV RMSE (log scale): 0.3793                                                  
Trying params: {'n_estimators': 100, 'learning_rate': 0.016629492290547172, 'max_depth': 4, 'random_state': 42, 'verbosity': 0, 'n_jobs': -1}
Avg CV RMSE (log scale): 0.4691                                                  
Trying params: {'n_estimators': 300, 'learning_rate': 0.029099540529400177, 'max_depth': 6, 'random_state': 42, 'verbosity': 0, 'n_jobs': -1}
Avg CV RMSE (log scale): 0.3832                                                  
Trying params: {'n_estimators': 200, 'learning_rate': 0.018698018110930824, 'max_depth': 6, 'random_state': 42, 'verbosity': 0, 'n_jo

The model explains **66% of the variation** in listing prices, showing good predictive power. With an average error of about **$95**, it performs reliably on unseen data.

Training on log-transformed prices helped improve consistency, though the gain was modest — likely due to only slight skewness in the price distribution.

Overall, the model is **accurate and stable**, making it a solid tool for price prediction.


It seems like the results (R^2 and RMSE) are not that different when we use `prices` and `log(1 + price)`. **This might be due to the slight right skewed of the distribution.** Log transformation has the most impact when the target variable is heavily skewed (e.g., long tail with extreme outliers).

## Visualizing Actual Vs. Predicted

In this step, we want to visualize between actual prices of the test set vs. the predicted prices of the test set we got from XGBoost training.

The following shows the top 10 rows of Actual vs. Predicted, along with the neighbourhood group of that particular listing.

What's the R^2 and RMSE for each neighbourhood group

In [48]:
# Get the indices from the test set to retrieve neighbourhood info
test_indices = y_test.index

# Extract neighbourhood group from original DataFrame using test indices
neighbourhood_test = listings_filtered_reduced_features.loc[test_indices, 'neighbourhood_group_cleansed']

# Create comparison DataFrame
comparison_df = pd.DataFrame({
    'Neighbourhood Group': neighbourhood_test.values,
    'Actual Price': y_test_actual.values,
    'Predicted Price': y_pred
})

# Round prices for readability
comparison_df = comparison_df.round(2)

# Display first 10 rows
print(comparison_df.head(10))


  Neighbourhood Group  Actual Price  Predicted Price
0               Bronx          65.0        66.989998
1            Brooklyn         142.0        82.830002
2           Manhattan          48.0        49.209999
3              Queens          40.0        42.189999
4              Queens         108.0       112.199997
5           Manhattan         380.0       232.619995
6               Bronx          60.0        80.059998
7              Queens         124.0       128.190002
8           Manhattan         105.0       120.059998
9           Manhattan         211.0       214.759995


In [49]:
from sklearn.metrics import mean_squared_error, r2_score

# Add predictions and actuals to a new DataFrame along with neighborhood group
comparison_df = pd.DataFrame({
    'Neighbourhood Group': neighbourhood_test.values,
    'Actual Price': y_test_actual.values,
    'Predicted Price': y_pred
})

# Round prices
comparison_df = comparison_df.round(2)

# Group by neighborhood and compute metrics
metrics_by_group = comparison_df.groupby('Neighbourhood Group').apply(
    lambda g: pd.Series({
        'R²': r2_score(g['Actual Price'], g['Predicted Price']),
        'RMSE': np.sqrt(mean_squared_error(g['Actual Price'], g['Predicted Price']))
    })
).reset_index()

# Round for cleaner display
metrics_by_group = metrics_by_group.round(4)

# Show the result
print(metrics_by_group)


  Neighbourhood Group      R²      RMSE
0               Bronx  0.6011   53.9771
1            Brooklyn  0.5668   86.6341
2           Manhattan  0.6491  110.5819
3              Queens  0.5058   77.3224
4       Staten Island  0.4366   49.5099


Let's visualize the above: plotting actual vs. predicted prices, grouped or color-coded by neighborhood group

In [50]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Scatter plot with color by neighbourhood group
fig = px.scatter(
    comparison_df,
    x='Actual Price',
    y='Predicted Price',
    color='Neighbourhood Group',
    opacity=0.7,
    title='Actual vs. Predicted Prices by Neighbourhood Group',
    labels={
        'Actual Price': 'Actual Price',
        'Predicted Price': 'Predicted Price',
        'Neighbourhood Group': 'Neighbourhood Group'
    },
    color_discrete_sequence=px.colors.qualitative.T10
)

# Add ideal fit line (y = x)
min_val = comparison_df['Actual Price'].min()
max_val = comparison_df['Actual Price'].max()

fig.add_trace(
    go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        line=dict(color='gray', dash='dash'),
        name='Ideal Fit'
    )
)

fig.update_layout(
    legend_title_text='Neighbourhood Group',
    xaxis=dict(showgrid=True),
    yaxis=dict(showgrid=True),
    template='plotly_white'
)

fig.show()


# Save the figure as an interactive HTML file
pio.write_html(fig, file='actual_vs_predicted_prices.html', auto_open=True)
