In [None]:
# !pip install missingno
# !pip install geopy

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

# Notebook-wide settings: suppress every warning category and let pandas
# render all columns of wide frames instead of eliding the middle ones.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None

In [2]:
# Colab-only step: mount Google Drive at /content/drive so that the CSV read
# by the next cell (a path under /content/drive) is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
# Load the EDA dataset from the mounted Drive; index_col=0 makes the first
# CSV column the row index instead of a data column.
df_EDA = pd.read_csv('/content/drive/My Drive/Airbnb/df_EDA.csv', index_col=0)

In [25]:
# Work on an independent copy; df_EDA keeps the data exactly as loaded.
df = df_EDA.copy()

In [26]:
# Column names, non-null counts and dtypes of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23536 entries, 0 to 23535
Data columns (total 46 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Listing ID             23536 non-null  int64  
 1   Accomodates            23536 non-null  float64
 2   Accuracy Rating        18888 non-null  float64
 3   Bathrooms              23507 non-null  float64
 4   Bedrooms               23516 non-null  float64
 5   Beds                   23501 non-null  float64
 6   Checkin Rating         18870 non-null  float64
 7   Cleanliness Rating     18892 non-null  float64
 8   Communication Rating   18886 non-null  float64
 9   Guests Included        23536 non-null  float64
 10  Host Response Rate     13046 non-null  float64
 11  Latitude               23536 non-null  float64
 12  Location Rating        18871 non-null  float64
 13  Longitude              23536 non-null  float64
 14  Min Nights             23536 non-null  float64
 15  Overall

In [None]:
# df = df.drop(columns=['Review ID', 'Reviewer ID', 'Reviewer Name', 'Listing URL','Listing Name',
#                       'Host URL', 'Host Name', 'City', 'Country Code', 'Country','Comments'])

# EDA - Exploratory Data Analysis

# EDA - Exploratory Data Analysis - GPT

In [29]:
# Column names, non-null counts and dtypes of the working frame.
# NOTE(review): same call as the earlier df.info() cell; the column-drop cell
# above is commented out, so df has not changed in between.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23536 entries, 0 to 23535
Data columns (total 46 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Listing ID             23536 non-null  int64  
 1   Accomodates            23536 non-null  float64
 2   Accuracy Rating        18888 non-null  float64
 3   Bathrooms              23507 non-null  float64
 4   Bedrooms               23516 non-null  float64
 5   Beds                   23501 non-null  float64
 6   Checkin Rating         18870 non-null  float64
 7   Cleanliness Rating     18892 non-null  float64
 8   Communication Rating   18886 non-null  float64
 9   Guests Included        23536 non-null  float64
 10  Host Response Rate     13046 non-null  float64
 11  Latitude               23536 non-null  float64
 12  Location Rating        18871 non-null  float64
 13  Longitude              23536 non-null  float64
 14  Min Nights             23536 non-null  float64
 15  Overall

I've conducted an exploratory data analysis (EDA) on your dataset, covering:

1. Missing Values Analysis: Visualized the missing values in a heatmap and bar chart.
2. Summary Statistics: Displayed key statistics for numerical columns.
3. Numerical Feature Distributions: Plotted histograms for features like price, reviews, ratings, and accommodations.
4. Correlation Analysis: Generated a heatmap to highlight relationships between numerical variables.
5. Categorical Feature Analysis: Visualized the distribution of key categorical variables.

### Summary statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) as produced
# by DataFrame.describe() with its default arguments.
summary_stats = df.describe()

# Heading and table are emitted together, one per line.
print("Summary Statistics:", summary_stats, sep="\n")

## Missing values

In [None]:
# Boolean mask of null cells, and the per-column null counts derived from it
null_mask = df.isnull()
missing_values = null_mask.sum()

# Heatmap of the mask: one row per record, one column per feature
_, heatmap_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=heatmap_ax)
heatmap_ax.set_title("Missing Values Heatmap")
plt.show()

# Keep only the columns that actually have gaps, largest count first,
# and show them as a horizontal bar chart
missing_values = missing_values.loc[missing_values.gt(0)].sort_values(ascending=False)
_, bar_ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=missing_values.values, y=missing_values.index, palette="viridis", ax=bar_ax)
bar_ax.set_xlabel("Count of Missing Values")
bar_ax.set_ylabel("Columns")
bar_ax.set_title("Missing Values Count per Column")
plt.show()


# Same counts as plain text, preceded by a blank line and a heading
print("\nMissing Values:", missing_values, sep="\n")

## Checking the distribution of key numerical features

In [None]:
# Numeric features to inspect; this list is reused by the correlation cell
numerical_columns = [
    "Price", "Reviews", "Overall Rating", "Bedrooms", "Bathrooms", "Beds", "Accomodates"
]

# One histogram (30 bins, KDE overlay) per feature on a 4x2 grid;
# nulls are dropped per column before plotting
fig = plt.figure(figsize=(12, 10))
for position, feature in enumerate(numerical_columns, start=1):
    ax = fig.add_subplot(4, 2, position)
    sns.histplot(df[feature].dropna(), bins=30, kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature}")

fig.tight_layout()
plt.show()


## Checking correlations among numerical features

In [None]:
# Pairwise correlations (DataFrame.corr() defaults) between the features
# listed in numerical_columns
correlation_matrix = df[numerical_columns].corr()

# Annotated heatmap of the matrix, values shown with two decimals
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix of Numerical Features")
plt.show()


## Visualizing categorical feature distributions

In [None]:
# Categorical features to summarise, one count plot each on a 2x2 grid

categorical_columns = ["Room Type", "Property Type Reduced", "Neighbourhood Grouped", "Is Superhost"]

fig = plt.figure(figsize=(12, 10))
for position, feature in enumerate(categorical_columns, start=1):
    ax = fig.add_subplot(2, 2, position)
    values = df[feature]
    # Bars are ordered by frequency, most common category first
    sns.countplot(y=values, order=values.value_counts().index, palette="viridis", ax=ax)
    ax.set_title(f"Distribution of {feature}")
    ax.set_xlabel("Count")

fig.tight_layout()
plt.show()


# Outliers

In [None]:
# Boxplots of the key numeric features, used to eyeball outliers.
# The feature list is redefined here so the cell does not depend on the
# distribution cell having been run.
numerical_columns = [
    "Price", "Reviews", "Overall Rating", "Bedrooms", "Bathrooms", "Beds", "Accomodates"
]

fig = plt.figure(figsize=(12, 10))
for position, feature in enumerate(numerical_columns, start=1):
    ax = fig.add_subplot(4, 2, position)
    # Nulls are dropped per column before plotting
    sns.boxplot(x=df[feature].dropna(), palette="coolwarm", ax=ax)
    ax.set_title(f"Boxplot of {feature}")

fig.tight_layout()
plt.show()


The boxplots reveal potential outliers in several numerical features, particularly:

1. Price: Likely contains extreme values (high-end listings or pricing errors).
2. Reviews: Some listings have significantly higher review counts than others.
3. Bedrooms, Bathrooms, Beds, Accommodates: Some properties might have unrealistic values (e.g., too many or too few bedrooms relative to accommodations).

### Outlier removal

In [None]:
# Implementing Outlier Removal using the IQR Method

def remove_outliers(df, column, whisker=1.5, keep_missing=False):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``column`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : label
        Name of the numeric column the fences are computed on.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the fences.
    keep_missing : bool, default False
        Whether rows with a missing value in ``column`` are retained.
        A comparison against NaN is always False, so with the default such
        rows are dropped together with the outliers; pass ``True`` to keep
        them for later imputation.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` that passes the filter, with the original index
        labels.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    lower_bound = q1 - whisker * spread
    upper_bound = q3 + whisker * spread

    # between() is inclusive on both ends and evaluates to False for NaN
    inside = values.between(lower_bound, upper_bound)
    if keep_missing:
        inside = inside | values.isna()
    return df[inside]

# Filter a copy of df one column at a time. Each pass recomputes the IQR
# fences on the rows that survived the previous pass, so the column order
# affects the result.
outlier_columns = ["Price", "Reviews", "Bedrooms", "Bathrooms", "Beds", "Accomodates"]
cleaned_df = df.copy()
for column_name in outlier_columns:
    cleaned_df = remove_outliers(cleaned_df, column_name)

# Text dump of the filtered frame
print(f"Cleaned Dataset After Outlier Removal\n {cleaned_df}")

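I have removed outliers from key numerical features (Price, Reviews, Bedrooms, Bathrooms, Beds, Accommodates) using the IQR method. Note that rows with a missing value in any of these columns are removed as well, because a missing value never satisfies the range check. The cleaned dataset is now available for further analysis.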

# Feature Engineering

Feature Engineering Suggestions
To enhance the dataset and improve model performance, consider:

1. Handling Outliers
 - Use log transformation for skewed features like Price and Reviews to reduce the impact of extreme values.
 - Use IQR (Interquartile Range) method to remove or cap outliers in Price, Bedrooms, Bathrooms, etc.
2. Creating New Features
 - Price Per Person: Price / Accommodates
 - Host Experience: Convert Host Since Year to Years of Hosting = Current Year - Host Since Year
 - Review Activity: Reviews / (Last Review Year - First Review Year + 1) to measure review frequency.
 - Availability Score: Reviews × Overall Rating, i.e. a score based on the number of reviews and the overall rating.
3. Encoding Categorical Variables
 - Convert Room Type, Property Type, Neighbourhood Grouped into one-hot encoded or label encoded features for ML models.
 - Convert Is Superhost and Instant Bookable into binary (0/1) values.

In [None]:
from datetime import datetime

# Work on a copy so the outlier-filtered frame (cleaned_df) is left unchanged
df_features = cleaned_df.copy()

# Feature 1: Price per Person
# A capacity of 0 is treated as missing so the ratio is NaN rather than +/-inf.
df_features["Price Per Person"] = df_features["Price"] / df_features["Accomodates"].replace(0, np.nan)

# Feature 2: Years of Hosting
# NOTE: uses the year in which the notebook is executed, so this column
# changes when the notebook is re-run in a later year.
current_year = datetime.now().year
df_features["Years of Hosting"] = current_year - df_features["Host Since Year"]

# Feature 3: Review Frequency = Reviews / (Last Review Year - First Review Year + 1)
# The +1 counts both the first and the last calendar year, so a listing whose
# reviews all fall in one year is divided by 1 and a 2016-2017 listing by 2.
# clip(lower=1) keeps the divisor positive if the two years are inconsistent
# (last < first); missing years stay NaN.
review_years = (
    df_features["Last Review Year"] - df_features["First Review Year"] + 1
).clip(lower=1)
df_features["Review Frequency"] = df_features["Reviews"] / review_years

# Feature 4: Availability Score = Reviews * Overall Rating
# (raw product; neither factor is normalized; NaN where the rating is missing)
df_features["Availability Score"] = df_features["Reviews"] * df_features["Overall Rating"]

# Feature 5: Encoding 'Is Superhost' into Binary (1 for 't', 0 for 'f')
# NOTE(review): any value other than 't'/'f' (including True/False) maps to NaN -
# confirm the column still holds 't'/'f' strings at this point.
df_features["Is Superhost Binary"] = df_features["Is Superhost"].map({"t": 1, "f": 0})

# Feature 6: Encoding 'Instant Bookable' into Binary
# NOTE(review): astype(int) works for bool/numeric data but raises on NaN or on
# 't'/'f' strings - confirm the dtype; this is encoded differently from
# 'Is Superhost' above.
df_features["Instant Bookable Binary"] = df_features["Instant Bookable"].astype(int)

# Display the dataset with newly engineered features
df_features
