In [146]:
# Import packages
import pandas as pd
import numpy as np
import plotly.express as px

In [181]:
# Connect data
df = pd.read_csv('/Users/haribudiarto/Downloads/airbnb.csv')
# Preview data
df.head()

Unnamed: 0,Host Id,Host Since,Name,Neighbourhood,Property Type,Review Scores Rating (bin),Room Type,Zipcode,Beds,Number of Records,Number Of Reviews,Price,Review Scores Rating
0,5162530,,1 Bedroom in Prime Williamsburg,Brooklyn,Apartment,,Entire home/apt,11249.0,1.0,1,0,145,
1,33134899,,"Sunny, Private room in Bushwick",Brooklyn,Apartment,,Private room,11206.0,1.0,1,1,37,
2,39608626,,Sunny Room in Harlem,Manhattan,Apartment,,Private room,10032.0,1.0,1,1,28,
3,500,6/26/08,Gorgeous 1 BR with Private Balcony,Manhattan,Apartment,,Entire home/apt,10024.0,3.0,1,0,199,
4,500,6/26/08,Trendy Times Square Loft,Manhattan,Apartment,95.0,Private room,10036.0,3.0,1,39,549,96.0


In [252]:
# Remove space from column name
df.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)
df = df.rename(columns={'Neighbourhood_': 'Neighbourhood', 'Review_Scores_Rating_(bin)': 'Review_Scores_Rating(bin)'})
df.head()

Unnamed: 0,Host_Id,Host_Since,Name,Neighbourhood,Property_Type,Review_Scores_Rating(bin),Room_Type,Zipcode,Beds,Number_of_Records,Number_Of_Reviews,Price,Review_Scores_Rating,Host_Since_Date,HS_Year,HS_Month,HS_Day
4,500,6/26/08,Trendy Times Square Loft,Manhattan,Apartment,95.0,Private room,10036.0,3.0,1,39,,96.0,2008-06-26,2008,6,26
5,1039,7/25/08,Big Greenpoint 1BD w/ Skyline View,Brooklyn,Apartment,100.0,Entire home/apt,11222.0,1.0,1,4,149.0,100.0,2008-07-25,2008,7,25
6,1783,8/12/08,Amazing Also,Manhattan,Apartment,100.0,Entire home/apt,10004.0,1.0,1,9,,100.0,2008-08-12,2008,8,12
7,2078,8/15/08,"Colorful, quiet, & near the subway!",Brooklyn,Apartment,90.0,Private room,11201.0,1.0,1,80,90.0,94.0,2008-08-15,2008,8,15
8,2339,8/20/08,East Village Cocoon: 2 Bedroom Flat,Manhattan,Apartment,90.0,Entire home/apt,10009.0,2.0,1,95,,90.0,2008-08-20,2008,8,20


In [194]:
# Check data type
df.dtypes

Host_Id                         int64
Host_Since                     object
Name                           object
Neighbourhood_                 object
Property_Type                  object
Review_Scores_Rating_(bin)    float64
Room_Type                      object
Zipcode                       float64
Beds                          float64
Number_of_Records               int64
Number_Of_Reviews               int64
Price                         float64
Review_Scores_Rating          float64
dtype: object

In [None]:
# Drop NA and Transform column into numeric
df.dropna(inplace=True)
df['Price'] = df['Price'].replace(',', '.')
df['Price'] = pd.to_numeric(df['Price'],errors='coerce')
df.dtypes

In [201]:
# Visualizing room distribution count
ex_level = df['Room_Type'].value_counts()
fig = px.pie(ex_level, names = ex_level.index, values = ex_level.values, 
                title = 'Room Type Distribution')
fig.show()

In [253]:
# Visualizing host increment per neighbourhood per yead
df['Host_Since_Date'] = pd.to_datetime(df['Host_Since'])
df['HS_Year'] = df['Host_Since_Date'].dt.year
df['HS_Month'] = df['Host_Since_Date'].dt.month
df['HS_Day'] = df['Host_Since_Date'].dt.day

group = df.groupby(['HS_Year','Neighbourhood']).size().reset_index(name='Count')
fig = px.line(group, x = "HS_Year", y = "Count", color = "Neighbourhood",
              title = 'Host Increment per Neighbourhood & Year')
fig.show()


In [250]:
# Visualizing review score and number of review between Room_type
fig = px.scatter(df,x = "Review_Scores_Rating", y = 'Number_Of_Reviews', 
                 color = "Room_Type",facet_col="Room_Type")
fig.update_xaxes(title_text='Number of Reviews')
fig.update_yaxes(title_text='Review Scores')
fig.show()

In [254]:
# Visualizing price distribution between location and room type
dfo = df
z_scores = np.abs((dfo['Price'] - dfo['Price'].mean()) / dfo['Price'].std())
threshold = 2  # Adjust the threshold as needed
dfo['Price'] = np.where(z_scores > threshold, np.nan, df['Price'])
dfo = dfo.dropna()

fig = px.box(dfo,y = "Price", color = 'Room_Type', x = 'Neighbourhood',title = 'Price Distribution between Location and Room Type' )
fig.show()

Host Id                         int64
Host Since                     object
Name                           object
Neighbourhood                  object
Property Type                  object
Review Scores Rating (bin)    float64
Room Type                      object
Zipcode                       float64
Beds                          float64
Number of Records               int64
Number Of Reviews               int64
Price                         float64
Review Scores Rating          float64
dtype: object