In [None]:
# Optional one-time setup: uncomment the lines below to install these packages if they are missing.
# !pip install missingno
# !pip install geopy

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings

# Suppress all warnings so library notices do not clutter the cell outputs
warnings.simplefilter('ignore')

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', 500)
## sample

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Provenance (kept for reference): the commented-out lines below presumably show how the
# 2019 subset was derived from the full "Airbnb Berlin.csv" export — TODO confirm.
# AirbnbBerlin_df = pd.read_csv('/content/drive/My Drive/Airbnb/Airbnb Berlin.csv', index_col=0)
# AirbnbBerlin_df = pd.read_csv('/content/Airbnb Berlin.csv')
# df_2019 = AirbnbBerlin_df[AirbnbBerlin_df['review_date'].astype(str).str[6:] == '19']

# Alternative (Google Drive) location of the same file:
# df_2019 = pd.read_csv('/content/drive/My Drive/Airbnb/AirbnbBerlin_2019.csv', index_col=0)
# Load the data set from a path relative to the notebook's working directory.
df_2019 = pd.read_csv('./content/AirbnbBerlin_2019.csv')

# 1. Data Preparation

### Features Selection
1. If a categorical column is not relevant to the analysis, we can remove it.
2. Listing URL, Listing Name, Host URL, Host Name: these values are mostly unique to each listing, so they are not useful for category reduction.

In [None]:
# Columns that are not helpful for prediction and are therefore removed up front
columns_to_drop = [
    # review / reviewer fields
    'Review ID', 'Reviewer ID', 'Reviewer Name',
    # listing and host fields
    'Listing URL', 'Listing Name',
    'Host ID', 'Host URL', 'Host Name',
    # location fields
    'City', 'Country Code', 'Country',
    # remaining columns
    'First Review', 'Last Review', 'Square Feet', 'Business Travel Ready',
]

# Work on a reduced copy; df_2019 itself is left untouched
df = df_2019.drop(columns=columns_to_drop)

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66833 entries, 0 to 66832
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 66833 non-null  int64  
 1   review_date           66833 non-null  object 
 2   Comments              66782 non-null  object 
 3   Listing ID            66833 non-null  int64  
 4   Host Since            66833 non-null  object 
 5   Host Response Time    65868 non-null  object 
 6   Host Response Rate    65868 non-null  object 
 7   Is Superhost          66833 non-null  object 
 8   neighbourhood         66833 non-null  object 
 9   Neighborhood Group    66833 non-null  object 
 10  Postal Code           65737 non-null  object 
 11  Latitude              66833 non-null  float64
 12  Longitude             66833 non-null  float64
 13  Is Exact Location     66833 non-null  object 
 14  Property Type         66833 non-null  object 
 15  Room Type          

### Clean Text

In [None]:
# 1. Clean Text: strip currency symbols, percent signs and commas so that the columns
#    can be converted to float. Raw-string patterns are used because "\$" and "\%" are
#    invalid escape sequences in a normal string literal (newer Python versions warn).
df['Price'] = df['Price'].replace(r'[$,]', '', regex=True).astype(float)
df['Host Response Rate'] = df['Host Response Rate'].replace(r'[%,]', '', regex=True).astype(float)

# Fix Postal Code incorrect values: keep only the first five characters, which removes
# '\n' and other irrelevant trailing text.
# Missing postal codes stay missing (NaN) instead of being turned into the literal
# string 'nan' by the string cast.
postal_code_raw = df['Postal Code']
df['Postal Code'] = postal_code_raw.astype(str).str[:5].where(postal_code_raw.notna())

### Reduce Large Categories

1. Group Rare Categories: If a categorical column (like 'Reviewer Name') has many unique values, we can group its infrequent categories into an "Other" category.
2. Merge Similar Categories: If there are similar categories (e.g., different spellings or formats of the same category), we can merge them.
3. Binning: For numerical categories (like "Overall Rating" or "Accommodates"), we can create bins to reduce the number of unique values.

In [None]:
# Bucket edges and the matching names for the 'Host Response Rate' tiers
bins = [0, 50, 80, 95, 100]
labels = ["Low Response", "Moderate Response", "High Response", "Very High Response"]

# Turn the numeric response rate into one of the four named tiers
response_rate = df["Host Response Rate"]
df["Host Response Rate Reduced"] = pd.cut(
    response_rate,
    bins=bins,
    labels=labels,
    include_lowest=True,  # an exact 0 belongs to the first tier
)

# Reduce categories: divide the overall rating by ten and round up
df['Overall Rating Reduced'] = np.ceil(df['Overall Rating'] / 10)

# 2. Grouping neighbourhoods into Neighborhood Groups:
#    every neighbourhood is assigned the first non-missing group recorded for it.
if 'Neighborhood Group' in df.columns:
    groups_per_neighbourhood = df.groupby('neighbourhood')['Neighborhood Group']
    neighbourhood_mapping = groups_per_neighbourhood.first()
    df['Neighbourhood Grouped'] = df['neighbourhood'].map(neighbourhood_mapping)

# 3. Reducing Property Types
# Broader groups and the property types that are folded into each of them
property_groups = {
    "Vacation Rental": [
        "Villa", "Cottage", "Bungalow", "Cabin", "Tiny house",
        "Earth house", "Treehouse", "Hut", "Barn",
    ],
    "Boats & Houseboats": ["Houseboat", "Boat"],
    "Mobile/Alternative Lodging": [
        "Camper/RV", "Cave", "Pension (South Korea)", "Casa particular (Cuba)",
    ],
}

# Flat lookup: property type -> group name
property_mapping = {
    property_type: group_name
    for group_name, members in property_groups.items()
    for property_type in members
}

# Frequent property types that keep their own name
top_property_types = [
    "Apartment", "Loft", "House", "Townhouse", "Condominium", "Serviced apartment",
    "Hotel", "Hostel", "Guesthouse", "Bed and breakfast", "Boutique hotel"
]

# Frequent types map to themselves, grouped types map to their group (the group wins if
# a type appears in both), and anything not listed ends up as "Other".
reduced_lookup = {property_type: property_type for property_type in top_property_types}
reduced_lookup.update(property_mapping)

df['Property Type Reduced'] = df['Property Type'].map(reduced_lookup).fillna("Other")

# 4. Binning Postal Codes (first two digits represent broad area)
# Missing postal codes — real NaN values, or the literal text 'nan' left behind by a
# string cast — are kept as NaN instead of becoming the bogus area code 'na'.
postal_code_text = df['Postal Code'].astype(str)
postal_code_missing = df['Postal Code'].isna() | postal_code_text.str.lower().eq('nan')
df['Postal Code Reduced'] = postal_code_text.str[:2].mask(postal_code_missing)  # Use only first 2 digits


### Transform/Manipulate data

In [None]:
# Extracting years from date columns
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the date layout —
# confirm the inferred layout matches the raw strings.
df['Host Since'] = pd.to_datetime(df['Host Since'])
df['Host Since Year'] = df['Host Since'].dt.year

df['review_date'] = pd.to_datetime(df['review_date'])
df['Review Date Year'] = df['review_date'].dt.year

# 5. transform 't'/'f' flags into bool
# Each flag column is converted exactly once ('Instant Bookable' used to be listed twice).
flag_to_bool = {'t': True, 'f': False}
for flag_column in ('Instant Bookable', 'Is Superhost', 'Is Exact Location'):
    df[flag_column] = df[flag_column].replace(flag_to_bool)

In [None]:
# Preview the first two rows to check the newly derived columns
df.head(n=2)

Unnamed: 0,index,review_date,Comments,Listing ID,Host Since,Host Response Time,Host Response Rate,Is Superhost,neighbourhood,Neighborhood Group,Postal Code,Latitude,Longitude,Is Exact Location,Property Type,Room Type,Accomodates,Bathrooms,Bedrooms,Beds,Price,Guests Included,Min Nights,Reviews,Overall Rating,Accuracy Rating,Cleanliness Rating,Checkin Rating,Communication Rating,Location Rating,Value Rating,Instant Bookable,Host Response Rate Reduced,Overall Rating Reduced,Neighbourhood Grouped,Property Type Reduced,Postal Code Reduced,Host Since Year,Review Date Year
0,0,2019-01-03,All is awesome in this house ;),10029891,2014-10-20,within a few hours,100.0,False,Kreuzberg,Friedrichshain-Kreuzberg,10967,52.49147,13.40926,False,Apartment,Entire home/apt,4,1.0,2.0,3.0,50.0,2,3,7,97.0,10.0,9.0,9.0,10.0,10.0,9.0,False,Very High Response,10.0,Friedrichshain-Kreuzberg,Apartment,10,2014,2019
1,1,2019-03-09,"Good location, train station down the block wi...",10029891,2014-10-20,within a few hours,100.0,False,Kreuzberg,Friedrichshain-Kreuzberg,10967,52.49147,13.40926,False,Apartment,Entire home/apt,4,1.0,2.0,3.0,50.0,2,3,7,97.0,10.0,9.0,9.0,10.0,10.0,9.0,False,Very High Response,10.0,Friedrichshain-Kreuzberg,Apartment,10,2014,2019


In [None]:
# Raw columns that are removed at this point. Most of them have a derived counterpart
# ('... Reduced', '... Year', 'Neighbourhood Grouped').
# NOTE(review): 'Instant Bookable', 'Is Superhost' and 'Is Exact Location' are dropped
# without a derived replacement — confirm this is intended.
superseded_columns = [
    'Host Since', 'review_date', 'neighbourhood', 'Property Type', 'Postal Code',
    'Host Response Rate', 'Overall Rating', 'Instant Bookable', 'Is Superhost',
    'Is Exact Location',
    'Comments',
]
df = df.drop(columns=superseded_columns)

In [None]:
# Preview the first two rows of the reduced frame
df.head(n=2)

Unnamed: 0,index,Listing ID,Host Response Time,Neighborhood Group,Latitude,Longitude,Room Type,Accomodates,Bathrooms,Bedrooms,Beds,Price,Guests Included,Min Nights,Reviews,Accuracy Rating,Cleanliness Rating,Checkin Rating,Communication Rating,Location Rating,Value Rating,Host Response Rate Reduced,Overall Rating Reduced,Neighbourhood Grouped,Property Type Reduced,Postal Code Reduced,Host Since Year,Review Date Year
0,0,10029891,within a few hours,Friedrichshain-Kreuzberg,52.49147,13.40926,Entire home/apt,4,1.0,2.0,3.0,50.0,2,3,7,10.0,9.0,9.0,10.0,10.0,9.0,Very High Response,10.0,Friedrichshain-Kreuzberg,Apartment,10,2014,2019
1,1,10029891,within a few hours,Friedrichshain-Kreuzberg,52.49147,13.40926,Entire home/apt,4,1.0,2.0,3.0,50.0,2,3,7,10.0,9.0,9.0,10.0,10.0,9.0,Very High Response,10.0,Friedrichshain-Kreuzberg,Apartment,10,2014,2019


In [None]:
# Summarise the remaining columns: dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66833 entries, 0 to 66832
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   index                       66833 non-null  int64   
 1   Listing ID                  66833 non-null  int64   
 2   Host Response Time          65868 non-null  object  
 3   Neighborhood Group          66833 non-null  object  
 4   Latitude                    66833 non-null  float64 
 5   Longitude                   66833 non-null  float64 
 6   Room Type                   66833 non-null  object  
 7   Accomodates                 66833 non-null  int64   
 8   Bathrooms                   66791 non-null  float64 
 9   Bedrooms                    66763 non-null  float64 
 10  Beds                        66829 non-null  float64 
 11  Price                       66833 non-null  float64 
 12  Guests Included             66833 non-null  int64   
 13  Min Nights      

# 2. Exploratory Data Analysis (EDA)

# 5. Feature Engineering

## Enriching
Create a new feature: the distance of each listing from the city center.

In [None]:
from geopy.distance import great_circle


def distance_to_mid(lat, lon, centre=(52.5027778, 13.404166666666667)):
    """Return the great-circle distance in kilometres from a point to a reference centre.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the accommodation.
    centre : tuple of (float, float), optional
        ``(latitude, longitude)`` of the reference point. The default is the
        coordinate pair this notebook uses as the centre of Berlin.

    Returns
    -------
    float
        The ``.km`` value of ``geopy.distance.great_circle`` between ``centre``
        and ``(lat, lon)``.
    """
    accommodation = (lat, lon)
    return great_circle(centre, accommodation).km

# Distance of every listing from the city centre, in kilometres. Iterating over the two
# coordinate columns avoids the per-row Series that df.apply(axis=1) would build.
distance_km = pd.Series(
    [distance_to_mid(lat, lon) for lat, lon in zip(df['Latitude'], df['Longitude'])],
    index=df.index,
    dtype=float,
)

bins = [0, 0.5, 1, 2, 4, 8, 16, 32]
labels = ["0_5_km", "1_km", "2_km", "4_km", "8_km", "16_km", "32_km"]

# Apply pd.cut() to create the binned column. The unrounded distance is binned on
# purpose: rounding to 0.1 km first would move values just above a bin edge
# (e.g. 0.54 km -> 0.5) into the lower bucket.
# NOTE(review): distances above 32 km fall outside the last bin and become NaN.
df["Distance From Center"] = pd.cut(distance_km, bins=bins, labels=labels, include_lowest=True)