In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

pd.set_option('display.max_columns', None)

ImportError: DLL load failed while importing _imaging: The specified module could not be found.

In [None]:
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Load the merged Airbnb listings into a DataFrame and preview the first rows.
# The path is relative to the notebook's working directory.
raw_csv_path = "../Data/Original/Merged_Data/airbnb_data.csv"
df = pd.read_csv(raw_csv_path)
df.head()

In [None]:
# Summarise the frame: column names, dtypes and non-null counts
df.info()

## Meaning of variables

- realSum : Price of accommodation for two people for two nights (EUR)
- room_type : type of accommodation
- room_shared : variable for shared rooms
- room_private : variable for private rooms
- person_capacity : maximum number of guests
- host_is_superhost : superhost status
- multi : if listing belongs to host with 2-4 offers
- biz : if listing belongs to host with > 4 offers
- cleanliness_rating
- guest_satisfaction_overall : overall rating of listing
- bedrooms : number of bedrooms (0 for studios)
- dist : distance from city centre (in km)
- metro_dist : distance from nearest metro station (in km)
- attr_index : attraction index of listing location
- attr_index_norm : normalized attraction index (0 - 100)
- rest_index : restaurant index of listing location
- rest_index_norm : normalized restaurant index (0 - 100)
- lng : longitude
- lat : latitude
- city
- day_type : weekend or weekday

Konok's EDA

- add heatmap, see correlation, understand factors.

# Data Preprocessing

In [None]:
# Summary statistics for every column; include='all' adds the non-numeric columns to the report
df.describe(include='all')  # to showcase both numerical and categorical variables

In [None]:
# check for NaNs: number of missing values in each column
df.isna().sum()

In [None]:
# check for duplicates
# The loaded frame includes an 'Unnamed: 0' column, which appears to be a row index that was
# written into the CSV (TODO confirm). Such a counter is unrelated to the listing itself, so
# including it can make otherwise identical rows look distinct; it is left out of the comparison.
comparison_cols = [col for col in df.columns if col != 'Unnamed: 0']
df.duplicated(subset=comparison_cols).sum()

In [None]:
# Preview the first rows again before inspecting the categorical columns
df.head()

Check that each categorical column contains the expected number of categories.

In [14]:
# Number of distinct room types (missing values are not counted)
df['room_type'].nunique()

3

In [28]:
# Frequency of each bedroom count, most common first
df['bedrooms'].value_counts()

bedrooms
1     36333
2      9290
0      4485
3      1477
4        96
5        10
9        10
6         2
10        2
8         2
Name: count, dtype: int64

In [None]:
# Plot a correlation heatmap of the numeric columns.
# The raw frame cannot be passed to sns.heatmap directly: it still contains text columns
# (room_type, city, day_type) and has one row per listing, so the call either fails or
# produces an unreadable plot. The pairwise correlation matrix is what shows which
# factors move together.
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 12))
# Fix the colour scale to the full correlation range so that 0 sits at the neutral colour.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".1f", vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Heatmap of Numeric Columns')
plt.show()

## 1. Convert Categorical Variables to Encodings


Label Encodings are used when there is an inherent order between the categories i.e. 1, 2, 3 actually mean something in that order. But in our case, all the categories in a column are independent and don't represent any order. Therefore, we go with One Hot Encoding.

### A. Binary Encoding (for binary categories)

One-hot encoding isn't typically used for binary variables like True and False because it's redundant; the information is already encoded in a single binary feature.

True --> 1
False --> 0 

In [19]:
# Cast the boolean flag columns to 0/1 integers.
# Note: the other binary categorical columns are already stored in binary form (0, 1),
# so they need no conversion.
for flag_col in ('room_shared', 'room_private', 'host_is_superhost'):
    df[flag_col] = df[flag_col].astype(int)
df.head()

Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,city,day_type
0,194.033698,Private room,0,1,2.0,0,1,0,10.0,93.0,1,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam,weekdays
1,344.245776,Private room,0,1,4.0,0,0,0,8.0,85.0,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,amsterdam,weekdays
2,264.101422,Private room,0,1,2.0,0,0,1,9.0,87.0,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103,amsterdam,weekdays
3,433.529398,Private room,0,1,4.0,0,0,1,9.0,90.0,2,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,amsterdam,weekdays
4,485.552926,Private room,0,1,2.0,1,0,0,10.0,98.0,1,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508,amsterdam,weekdays


### B. One Hot Encoding

Used when the categories within a column are independent of one another, i.e. they have no meaningful order.

In [29]:
def one_hot_encode_column(df, column_name):
    """Return a new DataFrame with ``column_name`` replaced by one-hot indicator columns.

    One float column (values 0.0 / 1.0) is appended per distinct category, named
    ``"<column_name>_<category>"`` and ordered by sorted category value. The
    original column is removed from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode. It is NOT modified; re-running a
        cell that calls this function on the same input gives the same result.
    column_name : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``column_name`` and with the indicator columns
        appended on the right. The row index of ``df`` is preserved.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    if column_name not in df.columns:
        raise KeyError(f"Column {column_name!r} not found in DataFrame")

    source = df[column_name]

    # pd.get_dummies keeps the index of `source`, so the indicator columns line up with
    # the rest of the frame even when the index is not a default 0..n-1 range. It also
    # avoids the scikit-learn `sparse=` / `sparse_output=` keyword, whose name depends
    # on the installed version.
    indicators = pd.get_dummies(
        source,
        prefix=column_name,
        # Give missing values their own indicator column (only when there are any)
        # instead of silently encoding them as all-zero rows.
        dummy_na=bool(source.isna().any()),
        dtype=float,
    )

    # Drop on a copy (no inplace=True) so the caller's frame is left untouched.
    return pd.concat([df.drop(columns=column_name), indicators], axis=1)
    

In [33]:
# One-hot encode every nominal column.
# 'room_type' is encoded here as well: the saved output of this cell already shows the
# room_type_* columns, so leaving that call commented out would make a fresh
# "Restart & Run All" produce a different frame from the one displayed and exported.
for nominal_col in ('room_type', 'city', 'day_type'):
    # Skip columns that were already encoded so the cell can be re-run without a KeyError.
    if nominal_col in df.columns:
        df = one_hot_encode_column(df, nominal_col)
df.head()



Unnamed: 0,realSum,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,city_amsterdam,city_athens,city_barcelona,city_berlin,city_budapest,city_lisbon,city_london,city_paris,city_rome,city_vienna,day_type_weekdays,day_type_weekends
0,194.033698,0,1,2.0,0,1,0,10.0,93.0,1,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,344.245776,0,1,4.0,0,0,0,8.0,85.0,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,264.101422,0,1,2.0,0,0,1,9.0,87.0,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,433.529398,0,1,4.0,0,0,1,9.0,90.0,2,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,485.552926,0,1,2.0,1,0,0,10.0,98.0,1,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Do we need to do Standard Scaling, Principal Component Analysis?

In [36]:
# Write the preprocessed frame to disk; index=False keeps the row index out of the file
preprocessed_csv_path = '../Data/Preprocessed/airbnb_data_preprocessed.csv'
df.to_csv(preprocessed_csv_path, index=False)