# Prediction of the price of airbnb using ML methods

Dataset: https://www.kaggle.com/datasets/kritikseth/us-airbnb-open-data

We study the price of airbnb in US 2020 using the ML methods which are lightgbm and xgboost, respectively. We compare their performance.

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

Load data

In [2]:
train_X = pd.read_csv('train_X.csv', index_col=0)
test_X = pd.read_csv('test_X.csv', index_col=0)
val_X = pd.read_csv('val_X.csv', index_col=0)

train_y = pd.read_csv('train_y.csv', index_col=0)
test_y = pd.read_csv('test_y.csv', index_col=0)
val_y = pd.read_csv('val_y.csv', index_col=0)

# LGBM Regressor

In [3]:
steps=[('clf',lgb.LGBMRegressor(n_jobs=7, random_state=21))]
pipeline=Pipeline(steps)

In [4]:
param_dist={'clf__n_estimators':np.arange(100,1000,50),
            'clf__max_depth':np.arange(3,12,1),
            'clf__num_leaves':np.arange(20,3000,50),
            'clf__min_data_in_leaf':np.arange(100,200,50),
            'clf__learning_rate':np.arange(0.01,3,0.05)}

In [5]:
mae = make_scorer(mean_absolute_error)
mse = make_scorer(mean_squared_error)
r2 = make_scorer(r2_score)

In [6]:
num_iteration = 300
clf_cv = RandomizedSearchCV(pipeline, param_dist, n_iter=num_iteration, scoring=r2)

In [7]:
%%capture --no-display
clf_cv.fit(train_X, train_y)

RandomizedSearchCV(estimator=Pipeline(steps=[('clf',
                                              LGBMRegressor(n_jobs=7,
                                                            random_state=21))]),
                   n_iter=300,
                   param_distributions={'clf__learning_rate': array([0.01, 0.06, 0.11, 0.16, 0.21, 0.26, 0.31, 0.36, 0.41, 0.46, 0.51,
       0.56, 0.61, 0.66, 0.71, 0.76, 0.81, 0.86, 0.91, 0.96, 1.01, 1.06,
       1.11, 1.16, 1.21, 1.26, 1.31, 1.36, 1.41, 1.46, 1.51, 1.56, 1.61,
       1.66, 1.71, 1.76, 1.81, 1.86, 1.91,...
       750, 800, 850, 900, 950]),
                                        'clf__num_leaves': array([  20,   70,  120,  170,  220,  270,  320,  370,  420,  470,  520,
        570,  620,  670,  720,  770,  820,  870,  920,  970, 1020, 1070,
       1120, 1170, 1220, 1270, 1320, 1370, 1420, 1470, 1520, 1570, 1620,
       1670, 1720, 1770, 1820, 1870, 1920, 1970, 2020, 2070, 2120, 2170,
       2220, 2270, 2320, 2370, 2420, 2470, 2520, 2570

In [8]:
train_pred = clf_cv.predict(train_X)
test_pred = clf_cv.predict(test_X)
val_pred = clf_cv.predict(val_X)

In [9]:
dict = {}

In [10]:
def performance(truedata, prediction, name=''):
    print(name)
    print('='*15)
    rmse = round(mean_squared_error(truedata, prediction,  squared=False),2)
    mae = round(mean_absolute_error(truedata, prediction),2)
    r2 = round(r2_score(truedata, prediction),2)
    print('rmse:', rmse)
    print('mae: ', mae)
    print('r2:  ', r2)
    return rmse, mae, r2

In [11]:
_ , _ , _ = performance(train_y, train_pred, 'Train set') 

Train set
rmse: 214.2
mae:  87.7
r2:   0.51


In [12]:
_ , _ , _ = performance(val_y, val_pred, 'Validation set') 

Validation set
rmse: 199.63
mae:  100.58
r2:   0.37


In [13]:
dict['lgm'] = performance(test_y, test_pred, 'Test set') 

Test set
rmse: 340.98
mae:  107.46
r2:   0.31


In [14]:
%%capture --no-display
num_leaves = clf_cv.best_params_['clf__num_leaves']
n_estimators = clf_cv.best_params_['clf__n_estimators']
max_depth = clf_cv.best_params_['clf__max_depth']
min_data_in_leaf = clf_cv.best_params_['clf__min_data_in_leaf']
learning_rate = clf_cv.best_params_['clf__learning_rate']

lgb_model = lgb.LGBMRegressor(num_leaves=num_leaves, 
                              n_estimators=n_estimators, 
                              max_depth=max_depth,
                              min_data_in_leaf=min_data_in_leaf,
                              learning_rate=learning_rate,
                              random_state=21)

lgb_model.fit(train_X, train_y)

LGBMRegressor(learning_rate=0.11, max_depth=10, min_data_in_leaf=100,
              n_estimators=450, num_leaves=720, random_state=21)

In [15]:
feature = pd.DataFrame(index=train_X.columns)
feature['lgm feature'] = list(lgb_model.feature_importances_)

# XGBoost Regressor

In [16]:
steps=[('clf',xgb.XGBRegressor(objective='reg:squarederror',
                               tree_method='hist',
                               verbosity = 0, 
                               random_state=21, 
                               nthread=7))]
pipeline=Pipeline(steps)

In [17]:
param_dist={'clf__n_estimators':np.arange(100,1000,50),
            'clf__max_depth':np.arange(3,10,1),
            'clf__learning_rate':np.arange(0.01,0.3,0.05),
            'clf__colsample_bytree':np.arange(0.5,1,0.1),
            'clf__subsample':np.arange(0.6,1,0.1)
            }

In [18]:
num_iteration = 300
clf_cv2 = RandomizedSearchCV(pipeline, 
                             param_dist, 
                             n_iter=num_iteration, 
                             scoring=r2)

In [19]:
%%capture --no-display
clf_cv2.fit(train_X, train_y)

RandomizedSearchCV(estimator=Pipeline(steps=[('clf',
                                              XGBRegressor(base_score=None,
                                                           booster=None,
                                                           colsample_bylevel=None,
                                                           colsample_bynode=None,
                                                           colsample_bytree=None,
                                                           enable_categorical=False,
                                                           gamma=None,
                                                           gpu_id=None,
                                                           importance_type=None,
                                                           interaction_constraints=None,
                                                           learning_rate=None,
                                                           max_delta_step=

In [20]:
train_pred = clf_cv2.predict(train_X)
test_pred = clf_cv2.predict(test_X)
val_pred = clf_cv2.predict(val_X)

In [21]:
_ , _ , _ = performance(train_y, train_pred, 'Train set') 

Train set
rmse: 69.24
mae:  46.32
r2:   0.95


In [22]:
_ , _ , _ = performance(val_y, val_pred, 'Validation set') 

Validation set
rmse: 201.46
mae:  94.91
r2:   0.36


In [23]:
dict['xgb'] = performance(test_y, test_pred, 'Test set') 

Test set
rmse: 285.4
mae:  98.47
r2:   0.51


In [24]:
%%capture --no-display
n_estimators = clf_cv2.best_params_['clf__n_estimators']
max_depth = clf_cv2.best_params_['clf__max_depth']
learning_rate = clf_cv2.best_params_['clf__learning_rate']
colsample_bytree = clf_cv2.best_params_['clf__colsample_bytree']
subsample = clf_cv2.best_params_['clf__subsample']

xgb_model = xgb.XGBRegressor(n_estimators=n_estimators, 
                             max_depth=max_depth, 
                             learning_rate=learning_rate,
                             colsample_bytree=colsample_bytree,
                             subsample=subsample,
                             random_state=21)

xgb_model.fit(train_X, train_y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.060000000000000005,
             max_delta_step=0, max_depth=8, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=850, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=21,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=0.7999999999999999, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [25]:
feature['xgb feature'] = list(xgb_model.feature_importances_)

# Performance

In [26]:
result = pd.DataFrame.from_dict(dict)
result.index = ['rmse', 'mae', 'r2']

In [27]:
result

Unnamed: 0,lgm,xgb
rmse,340.98,285.4
mae,107.46,98.47
r2,0.31,0.51


# Feature Importance

LightGBM

In [28]:
feature.nlargest(10, 'lgm feature')['lgm feature']

hashname_29                           459
hashneigh_0                           414
hashname_76                           382
x * y                                 376
hashname_91                           368
hashname_5                            335
hashname_54                           319
minimum_nights * number_of_reviews    256
availability_365                      251
duration * availability_365           226
Name: lgm feature, dtype: int64

XGBoost

In [29]:
feature.nlargest(10, 'xgb feature')['xgb feature']

hashname_29                                0.033512
hashneigh_0                                0.031696
hashname_13                                0.028767
room_type_Private room                     0.026555
neighbourhood_group_City of Los Angeles    0.020427
city_Portland                              0.018700
room_type_Shared room                      0.018186
x * z                                      0.016924
hashname_53                                0.014965
hashneigh_9                                0.014454
Name: xgb feature, dtype: float64

# Summary

<li> Xgboost model gives a higher R^2.
<li> From the models, the description of name, neighbourhood and the room type are useful factors related to the price.