In [1]:
# !pip install missingno
# !pip install geopy

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the session. This also hides
# deprecation warnings raised by the plotting and data libraries.
warnings.filterwarnings('ignore')
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [3]:
# Colab-only step: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the pickled DataFrame from the mounted Drive folder.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df_EDA = pd.read_pickle('/content/drive/My Drive/Airbnb/df_EDA.pkl')

In [5]:
# Work on a copy so the loaded frame (df_EDA) is left unchanged.
df = df_EDA.copy()

In [6]:
# df = df.drop(columns=['Listing ID','Host ID'])
# Column names, non-null counts and dtypes of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23536 entries, 0 to 23535
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Listing ID             23536 non-null  int64         
 1   Accomodates            23536 non-null  float64       
 2   Accuracy Rating        18888 non-null  float64       
 3   Bathrooms              23507 non-null  float64       
 4   Bedrooms               23516 non-null  float64       
 5   Beds                   23501 non-null  float64       
 6   Checkin Rating         18870 non-null  float64       
 7   Cleanliness Rating     18892 non-null  float64       
 8   Communication Rating   18886 non-null  float64       
 9   Guests Included        23536 non-null  float64       
 10  Host ID                23536 non-null  float64       
 11  Latitude               23536 non-null  float64       
 12  Location Rating        18871 non-null  float64       
 13  L

## Define data column lists

In [7]:
# Column groups used to select features by kind.
numerical_columns = [
    'Accomodates',
    'Bathrooms',
    'Bedrooms',
    'Beds',
    'Guests Included',
    'Min Nights',
    'Reviews',
    'Price',
]
rating_columns = [
    'Value Rating',
    'Location Rating',
    'Cleanliness Rating',
    'Checkin Rating',
    'Accuracy Rating',
    'Communication Rating',
    'Host Response Rate',
    'Overall Rating',
]
boolean_columns = [
    'Is Superhost',
    'Is Exact Location',
    'Instant Bookable',
]
categorical_columns = [
    'Room Type',
    'Property Type Reduced',
    'Neighborhood Group',
    'Postal Code Reduced',
    'Host Response Time',
]  # , 'Neighbourhood Grouped'
date_columns = [
    'review_date',
    'Host Since',
]
# Everything that is not categorical or a date: numerical, then rating, then boolean.
non_categorical_columns = [*numerical_columns, *rating_columns, *boolean_columns]

In [8]:
# Count, number of distinct values, most frequent value and its frequency per categorical column.
df[categorical_columns].describe()

Unnamed: 0,Room Type,Property Type Reduced,Neighborhood Group,Postal Code Reduced,Host Response Time
count,23536,23536,23536,22951,13046
unique,3,15,12,7,4
top,Private room,Apartment,Friedrichshain-Kreuzberg,10,within an hour
freq,11694,20935,5726,14667,6816


# EDA - Exploratory Data Analysis

## Data Protocol

In [9]:
# Data protocol: export summary tables of the frame to Excel workbooks
# (paths are relative, so files land in the current working directory).

# Data type of every column
df.dtypes.to_excel("df_datatype.xlsx", sheet_name='data_types')

# Per-column maximum of the numerical, rating and boolean columns
df[non_categorical_columns].max().to_excel("df_max.xlsx", sheet_name='max')

# Per-column minimum. This used to call .max() again and write to
# "df_max.xlsx", which replaced the maxima with a sheet mislabelled 'min'.
df[non_categorical_columns].min().to_excel("df_min.xlsx", sheet_name='min')

# Cell-level missing-value mask (True where a value is missing)
df.isnull().to_excel("df_isnull.xlsx", sheet_name='isnull')

# Number of distinct values per column
df.nunique().to_excel("df_nunique.xlsx", sheet_name='nunique')

# Summary of the categorical columns (count, unique, top, freq)
df[categorical_columns].describe().to_excel("df_categorical.xlsx", sheet_name='categorical')

## Descriptive statistics
   

In [10]:
# (number of rows, number of columns) of the working frame.
df.shape

(23536, 37)

In [11]:
# Summary statistics for the numerical feature columns.
df[numerical_columns].describe()

Unnamed: 0,Accomodates,Bathrooms,Bedrooms,Beds,Guests Included,Min Nights,Reviews,Price
count,23536.0,23507.0,23516.0,23501.0,23536.0,23536.0,23536.0,23536.0
mean,2.671737,1.095971,1.16172,1.641802,1.346151,6.897774,19.238826,69.612424
std,1.567812,0.34615,0.660736,1.228642,0.856297,24.879032,40.360483,216.660408
min,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,2.0,1.0,1.0,1.0,1.0,2.0,1.0,32.0
50%,2.0,1.0,1.0,1.0,1.0,3.0,5.0,49.0
75%,3.0,1.0,1.0,2.0,1.0,4.0,17.0,75.0
max,16.0,8.5,10.0,22.0,16.0,1000.0,545.0,9000.0


In [14]:
# Summary statistics for the rating columns.
# Note: 'Host Response Rate' and 'Overall Rating' reach 100, while the other ratings top out at 10.
df[rating_columns].describe()

Unnamed: 0,Value Rating,Location Rating,Cleanliness Rating,Checkin Rating,Accuracy Rating,Communication Rating,Host Response Rate,Overall Rating
count,18868.0,18871.0,18892.0,18870.0,18888.0,18886.0,13046.0,18914.0
mean,9.417638,9.550315,9.334904,9.728829,9.67985,9.746479,91.842174,94.564344
std,0.840456,0.748713,1.031248,0.695275,0.737798,0.6796,19.447852,7.599949
min,2.0,2.0,2.0,2.0,2.0,2.0,0.0,20.0
25%,9.0,9.0,9.0,10.0,10.0,10.0,95.0,92.0
50%,10.0,10.0,10.0,10.0,10.0,10.0,100.0,97.0
75%,10.0,10.0,10.0,10.0,10.0,10.0,100.0,100.0
max,10.0,10.0,10.0,10.0,10.0,10.0,100.0,100.0


In [None]:
# Summary statistics for the 'Host Since' year / month / day columns.
df[['Host Since Year', 'Host Since Month', 'Host Since Day']].describe()

Several rows with unusually high values can be identified and may in some cases be dropped at a certain threshold

The description of price shows that 75% of the listings charge 75€ or less, but the maximum price is extremely large - up to 9000€.

To exclude these outliers from the dataset, we cap the price at 600€.

The data also contains listings with a price of 0, which does not make sense, so we exclude them too.

In [None]:
# Maximum of the columns suspected to contain extreme values, one row per column.
outlier_candidates = ['Beds', 'Min Nights', 'Reviews', 'Price']
candidate_summary = df[outlier_candidates].describe()
candidate_summary.loc[['max']].T

## Target Value

In [None]:
# Summary statistics of the target column.
df[['Price']].describe()

The description of price shows that 75% of the listings charge 75€ or less, but the maximum price is extremely large - up to 9000€.

### Distribution of Price

In [None]:
def target_value_distributions(df, column='Price', bins=200):
  """Plot a histogram with a KDE overlay for one column of ``df``.

  Args:
    df: DataFrame that contains ``column``.
    column: Name of the column to plot. Defaults to 'Price', the target.
    bins: Number of histogram bins. Defaults to 200.

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  plt.figure(figsize=(10, 6))
  sns.histplot(df[column], bins=bins, kde=True)
  plt.title(f"Distribution of {column}")
  plt.xlabel(column)
  plt.ylabel("Frequency")
  plt.show()

In [None]:
# Distribution of Price before dropping extreme outlier values
target_value_distributions(df)

In [None]:
# Wide, flat horizontal boxplot of the target column.
price_values = df['Price']
plt.figure(figsize=(16, 2))
sns.boxplot(price_values, orient='h')
plt.show()

To exclude these outliers from the dataset, we cap the price at 600€.

In [None]:
# Upper bound on Price; rows above it are treated as outliers and dropped.
PRICE_CAP = 600

# Keep rows whose Price is strictly positive and at most PRICE_CAP.
has_valid_price = (df['Price'] > 0) & (df['Price'] <= PRICE_CAP)

# .copy() makes the filtered frame independent of the unfiltered one, so
# later column assignments on df do not write into a slice of another frame.
df = df[has_valid_price].copy()
df['Price'].describe()

In [None]:
# Distribution of Price for the current (price-filtered) frame
target_value_distributions(df)

### Location vs Price

In [None]:
# Listings plotted by coordinates, with marker colour taken from Price.
df.plot.scatter(
    x="Longitude",
    y="Latitude",
    c="Price",
    cmap="gist_heat_r",
    colorbar=True,
    alpha=0.7,
    figsize=(8,5),
    sharex=False,
);

## Missing values

In [None]:
# Boolean mask of missing cells, shared by the heatmap and the per-column counts.
null_mask = df.isnull()

# Heatmap of the mask: rows of the frame on the y-axis, columns on the x-axis.
plt.figure(figsize=(12, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis')
plt.title("Missing Values Heatmap")
plt.show()

# Per-column missing counts, restricted to columns with gaps, largest first.
missing_values = null_mask.sum()
missing_values = missing_values.loc[missing_values > 0].sort_values(ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x=missing_values.values, y=missing_values.index, palette="viridis")
plt.title("Missing Values Count per Column")
plt.xlabel("Count of Missing Values")
plt.ylabel("Columns")
plt.show()


# Same counts as plain text.
print("\nMissing Values:")
print(missing_values)

## Dummies

In [None]:
def plot_counts_for_columns(df, columns_names, rows=1, cols=3, figsize_=(20, 6)):
  """Draw one count plot per column on a ``rows`` x ``cols`` subplot grid.

  Args:
    df: DataFrame holding the columns to plot.
    columns_names: Iterable of column names; one subplot is drawn for each,
      so it must not contain more than ``rows * cols`` names.
    rows: Number of subplot rows.
    cols: Number of subplot columns.
    figsize_: Figure size in inches, passed to ``plt.figure``.

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  plt.figure(figsize=figsize_)
  plt.subplots_adjust(hspace = 0.5)
  for position, col in enumerate(columns_names, 1):
    plt.subplot(rows, cols, position)
    sns.countplot(data=df, x=col, palette="viridis")
    # Decorate after plotting so the labels set here are the ones that stay
    # on the axes, whatever labels the plotting call applies itself.
    plt.title(f"'{col}'")
    plt.xlabel(col)
    plt.ylabel("Count")

  plt.tight_layout()
  plt.show()

In [None]:
# Display the list of boolean column names.
boolean_columns

In [None]:
# Cast 'Is Superhost' to bool. assign() builds a new frame instead of writing
# a column into df, which at this point is the result of a row filter.
# NOTE(review): astype(bool) maps NaN and any non-empty string to True -
# confirm the column holds only clean 0/1 or True/False values before casting.
df = df.assign(**{'Is Superhost': df['Is Superhost'].astype(bool)})
df[boolean_columns].info()

In [None]:
# Count plots for the boolean columns, using the function's default 1 x 3 grid.
plot_counts_for_columns(df, boolean_columns)

## Categorical

In [None]:
# Number of rows with a missing 'Postal Code Reduced'.
df['Postal Code Reduced'].isna().sum()
# df['Postal Code Reduced'] = df['Postal Code Reduced'].astype(str)

In [None]:
# Display the list of categorical column names.
categorical_columns

In [None]:
# x tick-label rotation in degrees per column; columns not listed fall back to 0.
xticks_map = {'Room Type':0, 'Property Type Reduced':90, 'Neighborhood Group':90, 'Neighbourhood Grouped':90, 'Postal Code Reduced':0, 'Host Response Time':45}

# One count plot per categorical column on a 2 x 3 grid.
plt.figure(figsize=(18, 15))
plt.subplots_adjust(hspace = 0.5)
for i, col in enumerate(categorical_columns, 1):
  plt.subplot(2, 3, i)
  sns.countplot(data=df, x=col, palette="viridis")
  # Decorate after drawing: tick rotation and font size act on the tick labels
  # that exist at call time, so they must be applied once the category ticks
  # are in place, and the axis labels set here are the ones that remain.
  plt.title(f"Distribution of `{col}`", fontsize=12, fontweight="bold")
  plt.xlabel(col, fontsize=14)
  plt.xticks(rotation=xticks_map.get(col, 0), fontsize=14)
  plt.ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# df = df_EDA.copy()

In [None]:
# df['Host Since Year'] = df['Host Since Year'].astype('str').apply(lambda x: (x.split('.')[0]))
# df['Host Since Year'] = df['Host Since Year'].apply(lambda x: None if x == 'nan' else x).astype('category')
# df['Host Since Month'] = df['Host Since Month'].astype('str').apply(lambda x: x.split('.')[0])
# df['Host Since Month'] = df['Host Since Month'].apply(lambda x: None if x == 'nan' else x).astype('category')
# df['Host Since Day'] = df['Host Since Day'].astype('str').apply(lambda x: x.split('.')[0])
# df['Host Since Day'] = df['Host Since Day'].apply(lambda x: None if x == 'nan' else x).astype('category')

In [None]:
# Dtype and non-null count of 'Host Since Year'.
df['Host Since Year'].info()

In [None]:
# Number of rows with a missing 'Host Since Year'.
df['Host Since Year'].isna().sum()

In [None]:
# df['Host Since Month'].value_counts().index.sort_values()
# df['Host Since Month'].value_counts().index.sort_values( key=lambda x: pd.to_numeric(x, errors='coerce'))

In [None]:
# Count plots for the 'Host Since' date parts (three panels on a 2 x 2 grid).
plt.figure(figsize=(15, 8))
# Adjust spacing
plt.subplots_adjust(hspace=0.6)
for i, col in enumerate(['Host Since Year','Host Since Month','Host Since Day'], 1):
  plt.subplot(2, 2, i)
  # Order the bars by numeric value rather than by frequency.
  sorted_order = df[col].value_counts().index.sort_values( key=lambda x: pd.to_numeric(x, errors='coerce'))
  sns.countplot(data=df, x=col, order=sorted_order, palette="viridis")
  # Decorate after drawing so the settings apply to the final ticks and labels.
  plt.title(f"Distribution of `{col}`", fontsize=10, fontweight="bold")
  plt.xlabel(col)
  plt.xticks(rotation=xticks_map.get(col, 0))
  plt.ylabel("Count")

plt.tight_layout()
plt.show()

## Continuous and numerical data

### Histograms for continuous numbers

Checking the distribution of key numerical features

In [None]:
# Display the list of numerical column names.
numerical_columns

In [None]:
def plot_histplot_for_columns(df, numerical_columns):
  """Plot a 50-bin histogram for each given column on a two-column grid.

  Up to eight columns are drawn on the original 4 x 2 grid in a 12 x 12 inch
  figure. For more columns the grid and the figure height grow, instead of
  ``plt.subplot`` failing on a ninth panel.

  Args:
    df: DataFrame holding the columns to plot.
    numerical_columns: Iterable of column names; missing values are dropped
      per column before plotting.

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  columns = list(numerical_columns)
  # Never fewer than 4 rows, so the layout for <= 8 columns is unchanged.
  n_rows = max(4, (len(columns) + 1) // 2)
  plt.figure(figsize=(12, 3 * n_rows))
  plt.subplots_adjust(hspace = 0.5)
  for i, col in enumerate(columns, 1):
    plt.subplot(n_rows, 2, i)
    sns.histplot(df[col].dropna(), bins=50, kde=False)
    plt.title(f"Distribution of {col}")

  plt.tight_layout()
  plt.show()

In [None]:
# Checking the distribution of key numerical features.
# The commented-out list below is an alternative column selection that is not used here.
# numerical_columns = [
#     "Price", "Reviews", "Overall Rating", "Bedrooms", "Bathrooms", "Beds", "Accomodates"
# ]

plot_histplot_for_columns(df, numerical_columns)

As expected, some of the numerical features behave more like categorical data (for example, Accomodates, Bathrooms and Beds).
It is worth using bar plots to show their relationship with the target value.

In [None]:
# Histograms of the rating columns.
plot_histplot_for_columns(df, rating_columns)

### Bars for continuous numbers

In [None]:
# Display the list of categorical column names used in the bar plots of this section.
categorical_columns

In [None]:
# Bar plot of Price against each categorical column, one panel per column on a 3 x 2 grid.
price = df['Price']
plt.figure(figsize=(12, 12))
plt.subplots_adjust(hspace = 0.5)
panel = 0
for col in categorical_columns:
  panel += 1
  plt.subplot(3, 2, panel)
  sns.barplot(x=df[col], y=price, errcolor="red")
  plt.title(f"Distribution of Price by '{col}'")
  # Vertical tick labels keep long category names readable.
  plt.xticks(rotation=90)

plt.tight_layout()
plt.show()

In [None]:
# Number of rows per room type.
df['Room Type'].value_counts()

In [None]:
# Price histogram split by 'Room Type', with the bars of each group drawn side by side.
# sns.histplot(data = df[df['Room Type'] == 'Private room'], x="Price", bins=50, kde=True)
sns.histplot(data = df, x="Price", bins=50, hue="Room Type", kde=True, multiple="dodge")

In [None]:
boolean_columns

In [None]:
# Price histogram split by 'Is Superhost', with the bars of each group drawn side by side.
sns.histplot(data = df, x="Price", bins=50, hue="Is Superhost", kde=True, multiple="dodge")

In [None]:
# Price histogram split by 'Neighborhood Group', with the bars of each group drawn side by side.
sns.histplot(data = df, x="Price", bins=50, hue="Neighborhood Group", kde=True, multiple="dodge")

In [None]:
# For each count-like feature: a scatter of Price (left panel) next to a bar plot of Price (right panel).
bar_plot_columns = ['Bedrooms', 'Bathrooms', 'Beds']
plt.figure(figsize=(15, 10))
plt.subplots_adjust(hspace = 0.5)
for row, col in enumerate(bar_plot_columns):
  left_panel = 2 * row + 1

  plt.subplot(3, 2, left_panel)
  sns.scatterplot(x=df[col], y=df['Price'], color='red')
  plt.title(f"Distribution of Price by '{col}'")

  plt.subplot(3, 2, left_panel + 1)
  sns.barplot(x=df[col], y=df['Price'], errcolor="red")
  plt.title(f"Distribution of Price by '{col}'")

plt.tight_layout()
plt.show()

In [None]:
# Bar plot of Price per feature value, with the feature's mean (red) and median (green) marked.
bar_plot_columns = ['Bedrooms', 'Bathrooms', 'Beds']
plt.figure(figsize=(15, 6))
plt.subplots_adjust(hspace = 0.5)
for i, col in enumerate(bar_plot_columns, 1):
  plt.subplot(2, 2, i)
  sns.barplot(x=df[col], y=df['Price'], errcolor="red")
  plt.title(f"Distribution of Price by '{col}'")

  # The bar plot's x-axis is categorical: the sorted distinct values sit at
  # slots 0..k-1, not at their numeric value. Convert a raw value into a slot
  # position (interpolating between neighbouring values) before drawing, or
  # the marker lands on the wrong bar, e.g. for half-step Bathrooms values.
  levels = np.sort(df[col].dropna().unique())
  slots = np.arange(len(levels))
  mean_slot = np.interp(df[col].mean(), levels, slots)
  median_slot = np.interp(df[col].median(), levels, slots)
  plt.axvline(mean_slot, color='red', linestyle='dashed', linewidth=2, label='mean')
  plt.axvline(median_slot, color='green', linestyle='dashed', linewidth=2, label='median')
  plt.legend()

plt.tight_layout()
plt.show()

## Skewness

## Outliers

### Target Value

In [None]:
# Boxplots of selected numerical features for spotting outliers.
# NOTE(review): this rebinds the notebook-level name `numerical_columns` to a
# different selection than the list defined near the top of the notebook;
# any cell run after this one sees the list below.
numerical_columns = ["Price", "Reviews", "Overall Rating", "Bedrooms", "Bathrooms", "Beds", "Accomodates"]

plt.figure(figsize=(12, 10))
slot = 1
for col in numerical_columns:
    plt.subplot(4, 2, slot)
    # Missing values are dropped per column before plotting.
    observed = df[col].dropna()
    sns.boxplot(x=observed, palette="coolwarm")
    plt.title(f"Boxplot of {col}")
    slot += 1

plt.tight_layout()
plt.show()
