In [35]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
data = pd.read_csv('hotel_bookings.csv')
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [36]:
data.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [37]:
round(data.isna().sum()/(data.count()+data.isna().sum()), 2) * 100

hotel                              0.0
is_canceled                        0.0
lead_time                          0.0
arrival_date_year                  0.0
arrival_date_month                 0.0
arrival_date_week_number           0.0
arrival_date_day_of_month          0.0
stays_in_weekend_nights            0.0
stays_in_week_nights               0.0
adults                             0.0
children                           0.0
babies                             0.0
meal                               0.0
country                            0.0
market_segment                     0.0
distribution_channel               0.0
is_repeated_guest                  0.0
previous_cancellations             0.0
previous_bookings_not_canceled     0.0
reserved_room_type                 0.0
assigned_room_type                 0.0
booking_changes                    0.0
deposit_type                       0.0
agent                             14.0
company                           94.0
days_in_waiting_list     

In [38]:
#обработка пропусков и удаление неифнормативных столбцов
data.drop(data[data['country'].isnull()].index, inplace=True)
data.drop(data[data['children'].isnull()].index, inplace=True)
data.drop('agent', inplace=True, axis=1)
data.drop('company', inplace=True, axis=1)
data.drop('market_segment', inplace=True, axis=1)
data.drop('distribution_channel', inplace=True, axis=1)
data.drop('reservation_status_date', inplace=True, axis=1)
data.drop('country', inplace=True, axis=1)
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,C,C,4,No Deposit,0,Transient,0.0,0,0,Check-Out
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,A,C,0,No Deposit,0,Transient,75.0,0,0,Check-Out
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,A,A,0,No Deposit,0,Transient,75.0,0,0,Check-Out
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,A,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out


In [39]:
data.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
days_in_waiting_list                int64
customer_type                      object
adr                               float64
required_car_parking_spaces       

In [40]:
# Кодирование категориальных признаков
hotel = LabelEncoder()
nhotel = hotel.fit_transform(data["hotel"])
data["hotel"] = nhotel
data = data.astype({"hotel":"int64"})
np.unique(nhotel)

arrival_date_month = LabelEncoder()
n_arrival_date_month = arrival_date_month.fit_transform(data["arrival_date_month"])
data["arrival_date_month"] = n_arrival_date_month
data = data.astype({"arrival_date_month":"int64"})
np.unique(n_arrival_date_month)

deposit_type = LabelEncoder()
n_deposit_type = hotel.fit_transform(data["deposit_type"])
data["deposit_type"] = n_deposit_type
data = data.astype({"deposit_type":"int64"})
np.unique(n_deposit_type)

customer_type = LabelEncoder()
n_customer_type = customer_type.fit_transform(data["customer_type"])
data["customer_type"] = n_customer_type
data = data.astype({"customer_type":"int64"})
np.unique(n_customer_type)

reserved_room_type = LabelEncoder()
n_reserved_room_type = reserved_room_type.fit_transform(data["reserved_room_type"])
data["reserved_room_type"] = n_reserved_room_type
data = data.astype({"reserved_room_type":"int64"})
np.unique(n_reserved_room_type)

assigned_room_type = LabelEncoder()
n_assigned_room_type = assigned_room_type.fit_transform(data["assigned_room_type"])
data["assigned_room_type"] = n_assigned_room_type
data = data.astype({"assigned_room_type":"int64"})
np.unique(n_assigned_room_type)

meal = LabelEncoder()
nmeal = meal.fit_transform(data["meal"])
data["meal"] = nmeal
data = data.astype({"meal":"int64"})
np.unique(nmeal)

reservation_status = LabelEncoder()
n_reservation_status = reservation_status.fit_transform(data["reservation_status"])
data["reservation_status"] = n_reservation_status
data = data.astype({"reservation_status":"int64"})
np.unique(n_reservation_status)

data.dtypes

hotel                               int64
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                  int64
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                                int64
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                  int64
assigned_room_type                  int64
booking_changes                     int64
deposit_type                        int64
days_in_waiting_list                int64
customer_type                       int64
adr                               float64
required_car_parking_spaces       

In [41]:
#Разделение выборки на обучающую и тестовую
X_train, X_test, y_train, y_test = train_test_split(data, data.hotel,random_state=1)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((89173, 26), (89173,), (29725, 26), (29725,))

In [46]:
# Логистическая регрессия
from sklearn.linear_model import LogisticRegression
model_logistic = LogisticRegression()
model_logistic.fit(X_train, y_train)
targ_logistic = model_logistic.predict(X_test)

R2_LR = r2_score(y_test, model_logistic.predict(X_test))
RMSE_LR = mean_squared_error(y_test, model_logistic.predict(X_test), squared=True)
print("Коэфф. детерминации: {}".format(R2_LR))
print("Среднеквадратическая ошибка (RMSE): {}".format(RMSE_LR))

Коэфф. детерминации: 0.05370677742085006
Среднеквадратическая ошибка (RMSE): 0.21170731707317073


In [45]:
# Градиентный бустинг
GB = GradientBoostingRegressor(n_estimators=10, random_state=1)
GB.fit(X_train, y_train)
R2_GB = r2_score(y_test, GB.predict(X_test))
RMSE_GB= mean_squared_error(y_test, GB.predict(X_test), squared=True)
print("Коэфф. детерминации: {}".format(R2_GB))
print("Среднеквадратическая ошибка (RMSE): {}".format(RMSE_GB))

Коэфф. детерминации: 0.8784004094642676
Среднеквадратическая ошибка (RMSE): 0.027204594152488278


In [None]:
# Видно, что модель градиентного бустинга обладает более высоким коеф детерминации. Среднекватическая ошибка достаточно мала в обоих методах.