In [1]:
import pandas as pd
import numpy as np

# Notebook-wide pandas rendering preferences
pd.options.display.max_columns = None  # never truncate the columns of wide frames
pd.options.display.float_format = "{:.2f}".format  # render floats with two decimals


In [8]:
import os
import pandas as pd

# Directory that holds the three source CSV files.
# Set the AIRBNB_DATA_PATH environment variable to point the notebook at a
# different location; when it is unset or empty, the original local folder is
# used, so existing setups keep working unchanged.
DATA_PATH = (
    os.environ.get("AIRBNB_DATA_PATH")
    or r"D:\PROJECT\Airbnb Listings Analysis\dataset"
)

# Load each table from DATA_PATH into its own DataFrame.
listings = pd.read_csv(os.path.join(DATA_PATH, "listings.csv"))
neighbourhoods = pd.read_csv(os.path.join(DATA_PATH, "neighbourhoods.csv"))
reviews = pd.read_csv(os.path.join(DATA_PATH, "reviews.csv"))
print("done")

done


In [9]:
# Report the (rows, columns) shape of every loaded table, one line per table.
for caption, table in (
    ("Listings shape:", listings),
    ("Neighbourhoods shape:", neighbourhoods),
    ("Reviews shape:", reviews),
):
    print(caption, table.shape)


Listings shape: (2876, 18)
Neighbourhoods shape: (8, 2)
Reviews shape: (318549, 2)



Initial Data Inspection

In [10]:

# Preview the first five listings to see the available columns and sample values.
listings.head(n=5)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,155305,Cottage! BonPaul + Sharky's Hostel,746673,BonPaul,,28806,35.58,-82.6,Entire home/apt,95.0,1,454,2025-06-14,2.69,8,162,16,
1,197263,Tranquil Room & Private Bath,961396,Timothy,,28806,35.58,-82.64,Private room,44.0,2,87,2024-09-08,0.56,2,73,5,
2,209068,Terrace Cottage,1029919,Kevin,,28804,35.62,-82.55,Entire home/apt,90.0,30,67,2025-05-03,0.4,1,268,2,
3,246315,Asheville Dreamer's Cabin,1292070,Annie,,28805,35.6,-82.51,Private room,61.0,7,53,2019-10-30,0.32,3,62,0,
4,314540,Asheville Urban Farmhouse Entire Home 4.6 mi t...,381660,Tom,,28806,35.59,-82.63,Entire home/apt,200.0,1,35,2025-06-13,0.22,1,139,11,


In [11]:
# Preview the first five review records.
reviews.head(n=5)


Unnamed: 0,listing_id,date
0,155305,2011-07-31
1,155305,2011-08-23
2,155305,2011-09-19
3,155305,2011-10-28
4,155305,2012-07-01


In [12]:
# Preview the first five rows of the neighbourhood lookup table.
neighbourhoods.head(n=5)


Unnamed: 0,neighbourhood_group,neighbourhood
0,,28704
1,,28715
2,,28732
3,,28801
4,,28803


In [13]:
# Print the structural summary of `reviews`: row count, per-column non-null
# counts, dtypes and memory usage.
reviews.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318549 entries, 0 to 318548
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   listing_id  318549 non-null  int64 
 1   date        318549 non-null  object
dtypes: int64(1), object(1)
memory usage: 4.9+ MB


In [15]:
# Print the structural summary of `listings`: row count, per-column non-null
# counts and dtypes.
listings.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2876 entries, 0 to 2875
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              2876 non-null   int64  
 1   name                            2876 non-null   object 
 2   host_id                         2876 non-null   int64  
 3   host_name                       2876 non-null   object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   2876 non-null   int64  
 6   latitude                        2876 non-null   float64
 7   longitude                       2876 non-null   float64
 8   room_type                       2876 non-null   object 
 9   price                           2536 non-null   float64
 10  minimum_nights                  2876 non-null   int64  
 11  number_of_reviews               2876 non-null   int64  
 12  last_review                     26

In [16]:
# Print the structural summary of `neighbourhoods`: row count, per-column
# non-null counts, dtypes and memory usage.
neighbourhoods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   neighbourhood_group  0 non-null      float64
 1   neighbourhood        8 non-null      int64  
dtypes: float64(1), int64(1)
memory usage: 260.0 bytes



Missing Value Inspection 

In [17]:
# Number of missing values per column, columns with the most gaps first.
(
    listings
    .isna()
    .sum()
    .sort_values(ascending=False)
)


license                           2876
neighbourhood_group               2876
price                              340
reviews_per_month                  263
last_review                        263
id                                   0
name                                 0
neighbourhood                        0
host_name                            0
host_id                              0
room_type                            0
longitude                            0
latitude                             0
number_of_reviews                    0
minimum_nights                       0
calculated_host_listings_count       0
availability_365                     0
number_of_reviews_ltm                0
dtype: int64

In [18]:
# Percentage of missing values per column, columns with the most gaps first.
(
    listings
    .isna()
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)


license                          100.00
neighbourhood_group              100.00
price                             11.82
reviews_per_month                  9.14
last_review                        9.14
id                                 0.00
name                               0.00
neighbourhood                      0.00
host_name                          0.00
host_id                            0.00
room_type                          0.00
longitude                          0.00
latitude                           0.00
number_of_reviews                  0.00
minimum_nights                     0.00
calculated_host_listings_count     0.00
availability_365                   0.00
number_of_reviews_ltm              0.00
dtype: float64

In [19]:
# Percentage of missing values for a selected subset of columns.
key_columns = [
    "neighbourhood_group",
    "neighbourhood",
    "price",
    "room_type",
    "reviews_per_month",
    "last_review",
]
listings.loc[:, key_columns].isna().mean().mul(100)


neighbourhood_group   100.00
neighbourhood           0.00
price                  11.82
room_type               0.00
reviews_per_month       9.14
last_review             9.14
dtype: float64

Cleaning Actions

In [21]:
# Step 1: Drop Useless Columns
# `license` and `neighbourhood_group` are 100% missing, so they carry no
# information. The result is reassigned rather than modified in place, so the
# frame object shown by earlier cells is not silently mutated.
# errors="ignore" keeps the cell safe to re-run once the columns are gone.
listings = listings.drop(
    columns=["license", "neighbourhood_group"],
    errors="ignore",
)


In [23]:
# Step 2: Handle Missing Prices
# Report how many listings lack a price before removing them. The count is
# printed explicitly because a bare expression is only displayed by Jupyter
# when it is the last line of a cell.
missing_price_count = int(listings["price"].isna().sum())
print(f"Listings without a price: {missing_price_count} of {len(listings)}")

# Drop rows with no price (rather than imputing a value for them).
listings = listings.dropna(subset=["price"])


In [24]:
# Re-check the number of missing values per column after the cleaning steps.
(
    listings
    .isna()
    .sum()
)


id                                  0
name                                0
host_id                             0
host_name                           0
neighbourhood                       0
latitude                            0
longitude                           0
room_type                           0
price                               0
minimum_nights                      0
number_of_reviews                   0
last_review                       204
reviews_per_month                 204
calculated_host_listings_count      0
availability_365                    0
number_of_reviews_ltm               0
dtype: int64

In [None]:
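# Business Interpretation (Important)
#
# About 9% of all listings (263 of 2,876) have no review activity. After
# dropping the listings without a price, 204 of the remaining 2,536
# (about 8%) still have no reviews.
#
# This indicates either:
#   - new supply entering the market, or
#   - low-performing listings.
#
# This helps with demand segmentation later.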