# Goal

Our goal with this notebook is to prepare the data for different models that will predict the price of a listing based on other features.
By doing so we will see what features influence the most the price of a listing.

# Import Libraries

In [1]:
from collections import Counter
from pathlib import Path
from typing import Optional, List, Tuple

import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Load Data

In [2]:
df = pd.read_csv(Path("data") / "data.csv")

In [3]:
df.head()

Unnamed: 0,id,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,...,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,city,has_license,is_bathroom_private
0,2818,3159,2008-09-24,within an hour,1.0,1.0,t,Indische Buurt,1,1,...,4.82,t,1,0,1,0,1.81,Amsterdam,t,f
1,20168,59484,2009-12-02,within an hour,1.0,0.98,f,Grachtengordel,2,2,...,4.49,t,2,0,2,0,2.3,Amsterdam,t,t
2,27886,97647,2010-03-23,within an hour,1.0,1.0,t,Westelijke Eilanden,1,1,...,4.79,t,1,0,1,0,1.84,Amsterdam,t,f
3,28871,124245,2010-05-13,within an hour,1.0,0.99,t,Amsterdam Centrum,2,2,...,4.82,f,2,0,2,0,2.7,Amsterdam,t,f
4,29051,124245,2010-05-13,within an hour,1.0,0.99,t,Amsterdam Centrum,2,2,...,4.75,f,2,0,2,0,3.98,Amsterdam,t,f


In [4]:
df = df.drop(columns=["id"])

# Preprocess Data

### Drop Outliers

In [5]:
df = df[(0 < df["price"]) & (df["price"] < 2000)]

### Pick Target & Split Data

In [6]:
df["price"].isna().sum()

0

In [7]:
X = df.drop(columns=["price"])
y = df["price"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42, shuffle=True)

In [9]:
len(X_train)

5728

In [10]:
len(X_test)

1432

### Transform DateTime Features

In [11]:
def compute_time_delta(df: pd.DataFrame, columns: List[str], min_max_scaler: Optional[MinMaxScaler] = None) -> Tuple[pd.DataFrame, MinMaxScaler]:
    for column in columns:
        df[column] = pd.to_datetime(df[column], format="%Y-%m-%d")
    df[columns] = df[columns].apply(lambda row: [item.value for item in row], axis=1, result_type='expand')

    if min_max_scaler is None:
        min_max_scaler = MinMaxScaler()
        df[columns] = min_max_scaler.fit_transform(df[columns])
    else:
        df[columns] = min_max_scaler.transform(df[columns])

    return df, min_max_scaler

In [12]:
X_train, min_max_scaler = compute_time_delta(X_train, columns=["host_since", "first_review", "last_review"])
X_test, _ = compute_time_delta(X_test, columns=["host_since", "first_review", "last_review"], min_max_scaler=min_max_scaler)

In [13]:
X_train.head()

Unnamed: 0,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,...,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,city,has_license,is_bathroom_private
5269,420673373,0.958782,within an hour,1.0,1.0,f,,0,0,"['email', 'phone']",...,4.86,t,1,1,0,0,1.41,Amsterdam,t,f
5444,429206111,0.970761,within an hour,0.91,1.0,f,,0,0,"['email', 'phone']",...,5.0,t,10,10,0,0,0.79,Amsterdam,t,f
7029,293464674,0.812589,within a few hours,1.0,1.0,f,,1,1,"['email', 'phone', 'jumio', 'offline_governmen...",...,3.0,t,1,0,1,0,0.36,Rotterdam,t,f
7134,80550302,0.575431,within an hour,0.75,0.9,f,,1,1,"['email', 'phone', 'facebook', 'reviews', 'off...",...,,t,3,0,3,0,,Rotterdam,t,f
4457,237371423,0.765076,within an hour,0.92,0.99,f,,8,8,"['email', 'phone', 'offline_government_id', 's...",...,4.61,t,9,0,9,0,1.89,Amsterdam,t,f


### List to Columns

In [14]:
def str_list_to_list(item: str) -> List[str]:
    return eval(item)

def list_to_columns(df: pd.DataFrame, column: str, most_common_n: int = 5, most_frequent_columns: Optional[List[str]] = None) -> Tuple[pd.DataFrame, List[str]]:
    df = df.copy()

    rows = df[column].transform(lambda item: str_list_to_list(item))

    if most_frequent_columns is None:
        counter = Counter()
        for row in rows:
            for item in row:
                counter[item] += 1
        columns = [c[0] for c in counter.most_common(most_common_n)]
    else:
        columns = most_frequent_columns[:most_common_n]
    rows = [
        ["t" if col in row else "f" for col in columns]
        for row in rows
    ]
    column_names = [f"{column}_{c}" for c in columns]
    df[column_names] = rows

    df = df.drop(columns=[column])

    return df, columns

In [15]:
X_train, columns_host_verifications = list_to_columns(X_train, column="host_verifications")
X_test, _ = list_to_columns(X_test, column="host_verifications", most_frequent_columns=columns_host_verifications)

In [16]:
X_train, columns_amenities = list_to_columns(X_train, column="amenities", most_common_n=10)
X_test, _ = list_to_columns(X_test, column="amenities", most_frequent_columns=columns_amenities, most_common_n=10)

In [17]:
X_train.head()

Unnamed: 0,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_has_profile_pic,...,amenities_Wifi,amenities_Essentials,amenities_Smoke alarm,amenities_Heating,amenities_Hangers,amenities_Long term stays allowed,amenities_Kitchen,amenities_Hair dryer,amenities_Hot water,amenities_Iron
5269,420673373,0.958782,within an hour,1.0,1.0,f,,0,0,t,...,t,t,t,f,t,t,t,f,t,f
5444,429206111,0.970761,within an hour,0.91,1.0,f,,0,0,t,...,t,f,t,f,f,t,t,f,f,f
7029,293464674,0.812589,within a few hours,1.0,1.0,f,,1,1,t,...,t,t,t,f,f,t,t,t,f,f
7134,80550302,0.575431,within an hour,0.75,0.9,f,,1,1,t,...,t,t,f,t,t,t,t,t,t,f
4457,237371423,0.765076,within an hour,0.92,0.99,f,,8,8,t,...,t,t,t,t,t,t,f,f,f,f


In [18]:
X_test.head()

Unnamed: 0,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_has_profile_pic,...,amenities_Wifi,amenities_Essentials,amenities_Smoke alarm,amenities_Heating,amenities_Hangers,amenities_Long term stays allowed,amenities_Kitchen,amenities_Hair dryer,amenities_Hot water,amenities_Iron
1895,25677161,0.46599,within an hour,1.0,0.88,t,,1,1,t,...,f,t,t,f,t,t,t,t,t,t
6778,5923086,0.337665,within an hour,1.0,0.71,t,,2,2,t,...,t,t,t,t,t,t,t,t,t,t
240,5878526,0.337056,within a few hours,1.0,0.57,f,Oost,1,1,t,...,t,t,t,t,t,t,t,t,t,t
4612,837675,0.208731,within an hour,1.0,1.0,f,,1,1,t,...,t,t,t,t,t,t,t,t,t,t
5698,32424525,0.489543,within an hour,0.8,1.0,f,,1,1,t,...,t,f,t,f,f,t,t,f,f,f


In [19]:
set(X_test.columns) - set(X_train.columns)

set()

### Impute Values

In [20]:
nans_mask = (X_train.isna() | X_test.isna()).any(axis=0)
nans_mask = nans_mask[nans_mask == True]
nans_mask

host_response_time             True
host_neighbourhood             True
bedrooms                       True
beds                           True
minimum_minimum_nights         True
maximum_minimum_nights         True
minimum_maximum_nights         True
maximum_maximum_nights         True
minimum_nights_avg_ntm         True
maximum_nights_avg_ntm         True
review_scores_rating           True
review_scores_accuracy         True
review_scores_cleanliness      True
review_scores_checkin          True
review_scores_communication    True
review_scores_location         True
review_scores_value            True
reviews_per_month              True
dtype: bool

#### Categorical

In [21]:
OBJECT_CATEGORICAL_VARIABLES = X_train.select_dtypes(include=["object"]).columns

In [22]:
object_categorical_mode_imputer = SimpleImputer(strategy="most_frequent")
X_train[OBJECT_CATEGORICAL_VARIABLES] = object_categorical_mode_imputer.fit_transform(X_train[OBJECT_CATEGORICAL_VARIABLES])

In [23]:
X_test[OBJECT_CATEGORICAL_VARIABLES] = object_categorical_mode_imputer.transform(X_test[OBJECT_CATEGORICAL_VARIABLES])

In [24]:
X_train.head()

Unnamed: 0,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_has_profile_pic,...,amenities_Wifi,amenities_Essentials,amenities_Smoke alarm,amenities_Heating,amenities_Hangers,amenities_Long term stays allowed,amenities_Kitchen,amenities_Hair dryer,amenities_Hot water,amenities_Iron
5269,420673373,0.958782,within an hour,1.0,1.0,f,Oud-West,0,0,t,...,t,t,t,f,t,t,t,f,t,f
5444,429206111,0.970761,within an hour,0.91,1.0,f,Oud-West,0,0,t,...,t,f,t,f,f,t,t,f,f,f
7029,293464674,0.812589,within a few hours,1.0,1.0,f,Oud-West,1,1,t,...,t,t,t,f,f,t,t,t,f,f
7134,80550302,0.575431,within an hour,0.75,0.9,f,Oud-West,1,1,t,...,t,t,f,t,t,t,t,t,t,f
4457,237371423,0.765076,within an hour,0.92,0.99,f,Oud-West,8,8,t,...,t,t,t,t,t,t,f,f,f,f


In [25]:
X_test.head()

Unnamed: 0,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_has_profile_pic,...,amenities_Wifi,amenities_Essentials,amenities_Smoke alarm,amenities_Heating,amenities_Hangers,amenities_Long term stays allowed,amenities_Kitchen,amenities_Hair dryer,amenities_Hot water,amenities_Iron
1895,25677161,0.46599,within an hour,1.0,0.88,t,Oud-West,1,1,t,...,f,t,t,f,t,t,t,t,t,t
6778,5923086,0.337665,within an hour,1.0,0.71,t,Oud-West,2,2,t,...,t,t,t,t,t,t,t,t,t,t
240,5878526,0.337056,within a few hours,1.0,0.57,f,Oost,1,1,t,...,t,t,t,t,t,t,t,t,t,t
4612,837675,0.208731,within an hour,1.0,1.0,f,Oud-West,1,1,t,...,t,t,t,t,t,t,t,t,t,t
5698,32424525,0.489543,within an hour,0.8,1.0,f,Oud-West,1,1,t,...,t,f,t,f,f,t,t,f,f,f


### Continuous

In [26]:
X.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [27]:
nans_mask = (X_train.isna() | X_test.isna()).any(axis=0)
nans_mask = nans_mask[nans_mask == True]
nans_mask

bedrooms                       True
beds                           True
minimum_minimum_nights         True
maximum_minimum_nights         True
minimum_maximum_nights         True
maximum_maximum_nights         True
minimum_nights_avg_ntm         True
maximum_nights_avg_ntm         True
review_scores_rating           True
review_scores_accuracy         True
review_scores_cleanliness      True
review_scores_checkin          True
review_scores_communication    True
review_scores_location         True
review_scores_value            True
reviews_per_month              True
dtype: bool

In [28]:
continuous_nan_columns = set(nans_mask.index) - set(OBJECT_CATEGORICAL_VARIABLES)
continuous_nan_columns = list(continuous_nan_columns)
continuous_nan_columns

['minimum_minimum_nights',
 'bedrooms',
 'maximum_nights_avg_ntm',
 'reviews_per_month',
 'minimum_maximum_nights',
 'review_scores_rating',
 'review_scores_accuracy',
 'maximum_maximum_nights',
 'review_scores_checkin',
 'minimum_nights_avg_ntm',
 'review_scores_cleanliness',
 'review_scores_value',
 'review_scores_location',
 'beds',
 'review_scores_communication',
 'maximum_minimum_nights']

In [29]:
mean_imputer = SimpleImputer(strategy="mean")
X_train[continuous_nan_columns] = mean_imputer.fit_transform(X_train[continuous_nan_columns])

In [30]:
X_test[continuous_nan_columns] = mean_imputer.transform(X_test[continuous_nan_columns])

### Final Check

In [31]:
nans_mask = (X_train.isna() | X_test.isna()).any(axis=0)
nans_mask = nans_mask[nans_mask == True]
nans_mask

Series([], dtype: bool)

Alright. We are almost ready to feed the data to the model.

# Categorical Variables

The models that we indent to use can handle categorical variables as long as we map them to ordinal variables. Therefore, there is no need for OHE (or other methods).

In [32]:
from sklearn.base import TransformerMixin


class FeatureTransformer(TransformerMixin):

    def __init__(self, categorical_features: List[str]):
        self.categorical_features = categorical_features

    def fit(self, X):

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Pass a pandas.DataFrame")

        if not isinstance(self.categorical_features, list):
            raise ValueError(
                "Pass categorical_features as a list of column names")

        self.encoding = {}
        for c in self.categorical_features:

            _, int_id = X[c].factorize()
            self.encoding[c] = dict(zip(list(int_id), range(0,len(int_id)+1)))

        return self

    def transform(self, X):

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Pass a pandas.DataFrame")

        if not hasattr(self, 'encoding'):
            raise AttributeError("FeatureTransformer must be fitted")

        df = X.drop(self.categorical_features, axis=1)

        for c in sorted(self.categorical_features):
            df[c] = X[c].map(self.encoding[c]).fillna(0)

        return df

In [33]:
encoder = FeatureTransformer(categorical_features=list(OBJECT_CATEGORICAL_VARIABLES))
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [34]:
X_train.head()

Unnamed: 0,host_id,host_since,host_response_rate,host_acceptance_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,...,host_verifications_email,host_verifications_government_id,host_verifications_jumio,host_verifications_phone,host_verifications_reviews,instant_bookable,is_bathroom_private,neighbourhood_cleansed,property_type,room_type
5269,420673373,0.958782,1.0,1.0,0,0,52.2986,4.95581,2,1.0,...,0,0,0,0,0,0,0,0,0,0
5444,429206111,0.970761,0.91,1.0,0,0,52.36525,4.87946,4,1.0,...,0,0,0,0,0,0,0,1,1,0
7029,293464674,0.812589,1.0,1.0,1,1,51.92125,4.49959,2,1.0,...,0,1,1,0,0,0,0,2,2,1
7134,80550302,0.575431,0.75,0.9,1,1,51.91032,4.52203,1,1.0,...,0,1,0,0,1,0,0,3,3,1
4457,237371423,0.765076,0.92,0.99,8,8,52.38586,4.87644,2,1.0,...,0,1,0,0,0,0,0,4,4,1


In [35]:
X_test.head()

Unnamed: 0,host_id,host_since,host_response_rate,host_acceptance_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,...,host_verifications_email,host_verifications_government_id,host_verifications_jumio,host_verifications_phone,host_verifications_reviews,instant_bookable,is_bathroom_private,neighbourhood_cleansed,property_type,room_type
1895,25677161,0.46599,1.0,0.88,1,1,52.37546,4.87656,4,2.0,...,0,1,1,0,1,1,1,4.0,3.0,1
6778,5923086,0.337665,1.0,0.71,2,2,51.92407,4.49146,1,1.0,...,0,1,1,0,1,1,0,29.0,18.0,1
240,5878526,0.337056,1.0,0.57,1,1,52.36064,4.97643,4,1.0,...,0,1,1,0,1,1,0,41.0,10.0,0
4612,837675,0.208731,1.0,1.0,1,1,52.38283,4.96715,4,2.0,...,0,1,1,0,1,1,0,6.0,27.0,0
5698,32424525,0.489543,0.8,1.0,1,1,52.385609,4.878721,4,3.0,...,0,1,1,0,1,1,0,4.0,0.0,0


In [36]:
X_train.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

In [37]:
X_test.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

# Export

In [38]:
X_train.head()

Unnamed: 0,host_id,host_since,host_response_rate,host_acceptance_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,...,host_verifications_email,host_verifications_government_id,host_verifications_jumio,host_verifications_phone,host_verifications_reviews,instant_bookable,is_bathroom_private,neighbourhood_cleansed,property_type,room_type
5269,420673373,0.958782,1.0,1.0,0,0,52.2986,4.95581,2,1.0,...,0,0,0,0,0,0,0,0,0,0
5444,429206111,0.970761,0.91,1.0,0,0,52.36525,4.87946,4,1.0,...,0,0,0,0,0,0,0,1,1,0
7029,293464674,0.812589,1.0,1.0,1,1,51.92125,4.49959,2,1.0,...,0,1,1,0,0,0,0,2,2,1
7134,80550302,0.575431,0.75,0.9,1,1,51.91032,4.52203,1,1.0,...,0,1,0,0,1,0,0,3,3,1
4457,237371423,0.765076,0.92,0.99,8,8,52.38586,4.87644,2,1.0,...,0,1,0,0,0,0,0,4,4,1


In [39]:
OBJECT_CATEGORICAL_VARIABLES

Index(['host_response_time', 'host_is_superhost', 'host_neighbourhood',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'property_type', 'room_type',
       'has_availability', 'instant_bookable', 'city', 'has_license',
       'is_bathroom_private', 'host_verifications_phone',
       'host_verifications_email', 'host_verifications_jumio',
       'host_verifications_reviews', 'host_verifications_government_id',
       'amenities_Wifi', 'amenities_Essentials', 'amenities_Smoke alarm',
       'amenities_Heating', 'amenities_Hangers',
       'amenities_Long term stays allowed', 'amenities_Kitchen',
       'amenities_Hair dryer', 'amenities_Hot water', 'amenities_Iron'],
      dtype='object')

In [40]:
categorical_features = list(OBJECT_CATEGORICAL_VARIABLES) + ["host_id"]
categorical_features

['host_response_time',
 'host_is_superhost',
 'host_neighbourhood',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood_cleansed',
 'property_type',
 'room_type',
 'has_availability',
 'instant_bookable',
 'city',
 'has_license',
 'is_bathroom_private',
 'host_verifications_phone',
 'host_verifications_email',
 'host_verifications_jumio',
 'host_verifications_reviews',
 'host_verifications_government_id',
 'amenities_Wifi',
 'amenities_Essentials',
 'amenities_Smoke alarm',
 'amenities_Heating',
 'amenities_Hangers',
 'amenities_Long term stays allowed',
 'amenities_Kitchen',
 'amenities_Hair dryer',
 'amenities_Hot water',
 'amenities_Iron',
 'host_id']

In [41]:
features_dir = Path("data") / "features"
features_dir.mkdir(parents=True, exist_ok=True)

In [42]:
import json

X_train.to_csv(features_dir / "X_train.csv", index=False)
X_test.to_csv(features_dir / "X_test.csv", index=False)
y_train.to_csv(features_dir / "y_train.csv", index=False)
y_test.to_csv(features_dir / "y_test.csv", index=False)

with open(features_dir / "meta.json", "w") as f:
    json.dump({
        "categorical_features": categorical_features
    }, f)