In [10]:
# Kennedi Todd
# August 8, 2024
# Chapter 13: Tree-based Methods
# Use gradient boosting to predict a numeric target: the nightly Airbnb fee

# libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error

# load the listings table; the trailing expression displays (rows, columns)
listings_file = 'listings.csv'
df = pd.read_csv(listings_file)
df.shape

(22552, 16)

In [11]:
# columns excluded from modelling
unused_columns = [
    'id', 'name', 'host_name', 'calculated_host_listings_count', 'last_review',
    'availability_365', 'longitude', 'latitude', 'neighbourhood',
]
df = df.drop(columns=unused_columns)
df.shape

(22552, 7)

In [12]:
# replace each categorical column with one indicator column per category
categorical_columns = ['neighbourhood_group', 'room_type']
df = pd.get_dummies(df, columns=categorical_columns)
df.shape

(22552, 20)

In [13]:
# count missing values per column (nothing is removed in this cell)
df.isna().sum()

host_id                                            0
price                                              0
minimum_nights                                     0
number_of_reviews                                  0
reviews_per_month                               3914
neighbourhood_group_Charlottenburg-Wilm.           0
neighbourhood_group_Friedrichshain-Kreuzberg       0
neighbourhood_group_Lichtenberg                    0
neighbourhood_group_Marzahn - Hellersdorf          0
neighbourhood_group_Mitte                          0
neighbourhood_group_Neukölln                       0
neighbourhood_group_Pankow                         0
neighbourhood_group_Reinickendorf                  0
neighbourhood_group_Spandau                        0
neighbourhood_group_Steglitz - Zehlendorf          0
neighbourhood_group_Tempelhof - Schöneberg         0
neighbourhood_group_Treptow - Köpenick             0
room_type_Entire home/apt                          0
room_type_Private room                             0

In [14]:
# remove every row that has at least one missing value in any column
df = df.dropna()
df.shape

(18638, 20)

In [15]:
# target is the price column; every remaining column is a predictor
# NOTE(review): host_id is still a column of df here, so it is used as a
# predictor -- confirm that is intended
y = df['price']
X = df.drop(columns = ['price'])

# shuffled 70/30 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.3,
    shuffle = True,
    random_state = 10,
)

# set algorithm: gradient-boosted regression trees fitted with the Huber loss
model = ensemble.GradientBoostingRegressor(
    n_estimators = 350,        # number of boosting stages
    learning_rate = 0.1,       # shrinkage applied to each stage's contribution
    max_depth = 5,             # depth limit of each individual tree
    min_samples_split = 4,     # samples required before a node may be split
    min_samples_leaf = 6,      # samples required in every leaf
    max_features = 0.6,        # fraction of features considered at each split
    loss = 'huber',
    # max_features < 1.0 makes the feature choice at each split random; fix the
    # seed (same value as the train/test split) so the reported MAE values are
    # reproducible from one run to the next
    random_state = 10
)

# train the booster on the training partition
model.fit(X_train, y_train)

# mean absolute error on the same data the model was fitted to
train_predictions = model.predict(X_train)
mae_train = mean_absolute_error(y_train, train_predictions)
print('MAE: %.2f' % mae_train)

MAE: 21.60


In [16]:
# mean absolute error on the held-out test partition
test_predictions = model.predict(X_test)
mae_test = mean_absolute_error(y_test, test_predictions)
print('MAE: %.2f' % mae_test)

MAE: 27.26
