# Реализация регрессии градиентного бустинга

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [26]:
# Read the Airbnb listings CSV and print the per-column overview
# (dtype and non-null count).
listings_path = '../../datasets/airbnb.csv'
airbnb = pd.read_csv(filepath_or_buffer=listings_path)
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13759 entries, 0 to 13758
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              13759 non-null  int64  
 1   name                            13759 non-null  object 
 2   host_id                         13759 non-null  int64  
 3   host_name                       13750 non-null  object 
 4   neighbourhood_group             13759 non-null  object 
 5   neighbourhood                   13759 non-null  object 
 6   latitude                        13759 non-null  float64
 7   longitude                       13759 non-null  float64
 8   room_type                       13759 non-null  object 
 9   price                           8821 non-null   float64
 10  minimum_nights                  13759 non-null  int64  
 11  number_of_reviews               13759 non-null  int64  
 12  last_review                     

In [27]:
# Number of missing values in each column of the loaded frame.
missing_per_column = airbnb.isna().sum()
missing_per_column

id                                   0
name                                 0
host_id                              0
host_name                            9
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                             4938
minimum_nights                       0
number_of_reviews                    0
last_review                       3238
reviews_per_month                 3238
calculated_host_listings_count       0
availability_365                     0
number_of_reviews_ltm                0
license                           4956
dtype: int64

In [28]:
# Columns removed from the frame before modelling.
# NOTE(review): `host_id` is not in this list, so it stays in the frame as a
# numeric column — confirm that using an identifier as a feature is intended.
unused_columns = [
    'id',
    'name',
    'host_name',
    'last_review',
    'calculated_host_listings_count',
    'availability_365',
    'latitude',
    'longitude',
    'neighbourhood',
    'license',
]
airbnb = airbnb.drop(columns=unused_columns)
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13759 entries, 0 to 13758
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   host_id                13759 non-null  int64  
 1   neighbourhood_group    13759 non-null  object 
 2   room_type              13759 non-null  object 
 3   price                  8821 non-null   float64
 4   minimum_nights         13759 non-null  int64  
 5   number_of_reviews      13759 non-null  int64  
 6   reviews_per_month      10521 non-null  float64
 7   number_of_reviews_ltm  13759 non-null  int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 860.1+ KB


In [11]:
# Number of missing values in each column of the current `airbnb` frame.
missing_after_drop = airbnb.isna().sum()
missing_after_drop

host_id                     0
neighbourhood_group         0
room_type                   0
price                    4938
minimum_nights              0
number_of_reviews           0
reviews_per_month        3238
number_of_reviews_ltm       0
dtype: int64

In [29]:
# Keep only the rows that have no missing value in any column.
airbnb = airbnb.dropna(axis='index', how='any')

In [30]:
# One-hot encode the two categorical columns; each category becomes its own
# `<column>_<category>` indicator column.
categorical_columns = ['neighbourhood_group', 'room_type']
airbnb = pd.get_dummies(data=airbnb, columns=categorical_columns)

In [83]:
# Separate the target (`price`) from the feature matrix (every other column).
# The features are passed to the model unscaled.
target_column = 'price'
y = airbnb[target_column]
X = airbnb.drop(columns=[target_column])

In [84]:
# Hold out 30% of the rows for testing; the fixed seed keeps the shuffled
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=13,
)

In [97]:
# Baseline gradient-boosting regressor with hand-picked hyper-parameters.
# `max_features=0.6` makes every split consider a random 60% subset of the
# features, so the fit is stochastic; `random_state` pins that randomness so
# the metrics reported below are reproducible on Restart & Run All.
model = GradientBoostingRegressor(
    n_estimators=350,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features=0.6,
    loss='huber',
    random_state=13,
)
model.fit(X_train, y_train)

In [98]:
# Predictions of the baseline model on the held-out and the training rows.
y_predict = model.predict(X_test)
y_train_predict = model.predict(X_train)

In [99]:
# Report MAE and MSE of the baseline model on both splits.
# scikit-learn metrics take (y_true, y_pred) in that order. MAE/MSE are
# symmetric so the numbers are unaffected, but keeping the documented order
# avoids wrong results if an asymmetric metric is added later.
print(f'MAE in train data = {mean_absolute_error(y_train, y_train_predict)}')
print(f'MSE in train data = {mean_squared_error(y_train, y_train_predict)}')
print(f'MAE in test data = {mean_absolute_error(y_test, y_predict)}')
print(f'MSE in test data = {mean_squared_error(y_test, y_predict)}')

MAE in train data = 44.993227209993854
MSE in train data = 12464.143145827307
MAE in test data = 75.72731902428602
MSE in test data = 302687.406200033


In [110]:
# Base estimator handed to the grid search. `random_state` is fixed so that
# repeated searches (and the refit best estimator) give the same results.
model_grid = GradientBoostingRegressor(
    n_estimators=350,
    max_depth=5,
    min_samples_leaf=5,
    loss='huber',
    random_state=13,
)

In [109]:
# Hyper-parameter search over `model_grid`.
# The grid must contain at least one entry: with an empty dict GridSearchCV
# evaluates a single candidate (the base estimator as-is) and `best_params_`
# is `{}`. Every combination is refit once per CV fold, so widen the grid
# with care. Candidates considered earlier and left out here to keep the run
# time manageable: n_estimators [100, 200, 300], max_depth [3, 5, 10],
# min_samples_split [2, 3, 4], min_samples_leaf [3, 5, 6].
param_grid = {
    'loss': ['absolute_error', 'huber'],
}
search_best = GridSearchCV(
    model_grid,
    param_grid,
    # Select on MAE (reported below) instead of the regressor's default R^2.
    scoring='neg_mean_absolute_error',
    cv=5,
)
search_best.fit(X_train, y_train)
print(search_best.best_params_)

{'loss': 'huber'}


In [111]:
# Predictions of the estimator refit by the grid search, on both splits.
best_model = search_best.best_estimator_
y_train_best_predict = best_model.predict(X_train)
y_predict_best = best_model.predict(X_test)

In [112]:
# Report MAE and MSE of the grid-search estimator on both splits.
# scikit-learn metrics take (y_true, y_pred) in that order. MAE/MSE are
# symmetric so the numbers are unaffected, but keeping the documented order
# avoids wrong results if an asymmetric metric is added later.
print(f'MAE in train data = {mean_absolute_error(y_train, y_train_best_predict)}')
print(f'MSE in train data = {mean_squared_error(y_train, y_train_best_predict)}')
print(f'MAE in test data = {mean_absolute_error(y_test, y_predict_best)}')
print(f'MSE in test data = {mean_squared_error(y_test, y_predict_best)}')

MAE in train data = 43.143318835419485
MSE in train data = 11755.663411251617
MAE in test data = 74.80351682346854
MSE in test data = 302091.1744271777


In [134]:
# Describe the listing to price by column name instead of by position, so the
# vector cannot silently drift out of sync with the one-hot column order
# produced by `pd.get_dummies`. Only non-zero features are listed; every other
# training column (the remaining one-hot indicators) defaults to 0.
new_property_features = {
    'host_id': 2217,
    'minimum_nights': 4,
    'number_of_reviews': 118,
    'reviews_per_month': 3.76,
    'number_of_reviews_ltm': 20,
    'neighbourhood_group_Mitte': 1,
    'room_type_Entire home/apt': 1,
}

# Fail loudly on a misspelled or unknown column rather than dropping it.
unknown_columns = set(new_property_features) - set(X_train.columns)
if unknown_columns:
    raise KeyError(f'Unknown feature columns: {sorted(unknown_columns)}')

# Single-row 2-D float array laid out in the training column order.
new_property = (
    pd.Series(new_property_features, dtype=float)
    .reindex(X_train.columns, fill_value=0.0)
    .to_numpy()
    .reshape(1, -1)
)
print(new_property.shape)
print(X_train.columns)

(1, 21)
Index(['host_id', 'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'number_of_reviews_ltm', 'neighbourhood_group_Charlottenburg-Wilm.',
       'neighbourhood_group_Friedrichshain-Kreuzberg',
       'neighbourhood_group_Lichtenberg',
       'neighbourhood_group_Marzahn - Hellersdorf',
       'neighbourhood_group_Mitte', 'neighbourhood_group_Neukölln',
       'neighbourhood_group_Pankow', 'neighbourhood_group_Reinickendorf',
       'neighbourhood_group_Spandau',
       'neighbourhood_group_Steglitz - Zehlendorf',
       'neighbourhood_group_Tempelhof - Schöneberg',
       'neighbourhood_group_Treptow - Köpenick', 'room_type_Entire home/apt',
       'room_type_Hotel room', 'room_type_Private room',
       'room_type_Shared room'],
      dtype='object')


In [136]:
# Label the single-row array with the training column names, then price it
# with the estimator selected by the grid search.
new_property_frame = pd.DataFrame(data=new_property, columns=X_train.columns)
y_predict_value = search_best.best_estimator_.predict(new_property_frame)

In [137]:
# Display the predicted price for the new listing (array with one element).
y_predict_value

array([237.78645697])