In [15]:
pip install lightgbm

Collecting lightgbmNote: you may need to restart the kernel to use updated packages.
  Downloading lightgbm-3.1.1-py2.py3-none-win_amd64.whl (754 kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.1.1



In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.metrics import roc_curve, auc, average_precision_score


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

data = pd.read_csv('data/hotel_bookings.csv')
data.shape

(119390, 32)

In [2]:
label=['is_canceled']
feature = ['hotel','is_canceled','lead_time','Direct(bool)','precancel(bool)','prebook(bool)','change_room(bool)','reserve_change(bool)', 'parkinglot(bool)','special_request(bool)','customer_type(label)']

## feature 컬럼 전처리

In [3]:
# Direct/non direct
data.loc[data["distribution_channel"]=='Direct', 'Direct(bool)'] = 1
data.loc[data["distribution_channel"]!='Direct', 'Direct(bool)'] = 0

#previous_cancel = 0 or <0
data['precancel(bool)'] = 1
data.loc[data['previous_cancellations'] == 0, 'precancel(bool)'] = 0

#previous_book_not_cancel = 0 or <0
data['prebook(bool)'] = 1
data.loc[data['previous_bookings_not_canceled'] == 0, 'prebook(bool)'] = 0

#change_room (방이 바뀌었으면 1)
data['change_room(bool)'] = 0
data.loc[data['assigned_room_type'] != data['reserved_room_type'], 'change_room(bool)'] = 1

#예약 사항 변경(booking changes =0 or <0)
data['reserve_change(bool)'] = 0
data.loc[data['previous_cancellations'] > 0, 'reserve_change(bool)'] = 1 

#주차 공간 요구
data.loc[data['required_car_parking_spaces'] == 0, 'parkinglot(bool)'] = 0
data.loc[data['required_car_parking_spaces'] > 0,'parkinglot(bool)'] = 1

#특별 요청 사항 요구
data.loc[data['total_of_special_requests'] == 0, 'special_request(bool)'] = 0
data.loc[data['total_of_special_requests'] > 0,'special_request(bool)'] = 1

#customer type 라벨 인코딩
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(data["customer_type"])
label_encoded = le.transform(data["customer_type"])
data["customer_type(label)"] = label_encoded.reshape((-1, 1))

In [4]:
ml_data = data[feature]

In [5]:
ml_resort_data = ml_data[ml_data["hotel"]=="Resort Hotel"]
ml_city_data = ml_data[ml_data["hotel"]=="City Hotel"]

In [6]:
ml_resort_data = ml_resort_data.drop(['hotel'], axis=1)
ml_city_data = ml_city_data.drop(['hotel'], axis=1)
ml_city_data = ml_city_data.reset_index(drop=True)

## city hotel

In [7]:
x = ml_city_data.drop("is_canceled", 1)
y = ml_city_data.loc[:,['is_canceled']]

In [8]:
#셋분리
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [9]:
d_train = lgb.Dataset(x_train, label=y_train)

params = {}
params['learning_rate'] = 0.05
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'

clf = lgb.train(params, d_train)

[LightGBM] [Info] Number of positive: 26510, number of negative: 36954
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281
[LightGBM] [Info] Number of data points in the train set: 63464, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.417717 -> initscore=-0.332152
[LightGBM] [Info] Start training from score -0.332152


In [10]:
#예측
#Prediction
y_pred=clf.predict(x_test)
#convert into binary values
for i in range(len(y_pred)):
    if y_pred[i]>=.36:
        y_pred[i]=1
    else:
        y_pred[i]=0

AttributeError: 'Booster' object has no attribute 'predict_proba'

In [15]:
# 분류모델평가: 정확도, 재현율, 정밀도, f1-점수와 roc_auc 점수
from sklearn.metrics import accuracy_score,recall_score, precision_score, f1_score, roc_auc_score
print(f"accuracy_score(정확도):{accuracy_score(y_test, y_pred)}")
print(f"recall_score(재현율):{recall_score(y_test, y_pred)}")
print(f"precision_score(정밀도):{precision_score(y_test, y_pred)}")
print(f"f1_score(f1점수):{f1_score(y_test, y_pred)}")
print(f"roc_auc_score:{roc_auc_score(y_test, y_pred)}")  #0.5면 랜덤예측 1.0이면 완벽예측

accuracy_score(정확도):0.7954745997730998
recall_score(재현율):0.7381346994123851
precision_score(정밀도):0.7647517951920075
f1_score(f1점수):0.7512075442766234
roc_auc_score:0.787422534449935


## resort hotel

In [18]:
X = ml_resort_data.drop("is_canceled", 1)
y = ml_resort_data.loc[:,['is_canceled']]

In [19]:
#셋분리
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [20]:
d_train = lgb.Dataset(X_train, label=y_train)

params = {}
params['learning_rate'] = 0.05
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'

clf = lgb.train(params, d_train)

[LightGBM] [Info] Number of positive: 8899, number of negative: 23149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 281
[LightGBM] [Info] Number of data points in the train set: 32048, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.277677 -> initscore=-0.956013
[LightGBM] [Info] Start training from score -0.956013


In [21]:
#예측
#Prediction
y_pred=clf.predict(X_test)
#convert into binary values
for i in range(len(y_pred)):
    if y_pred[i]>=.32:
        y_pred[i]=1
    else:
        y_pred[i]=0

In [22]:
# 분류모델평가: 정확도, 재현율, 정밀도, f1-점수와 roc_auc 점수
from sklearn.metrics import accuracy_score,recall_score, precision_score, f1_score, roc_auc_score
print(f"accuracy_score(정확도):{accuracy_score(y_test, y_pred)}")
print(f"recall_score(재현율):{recall_score(y_test, y_pred)}")
print(f"precision_score(정밀도):{precision_score(y_test, y_pred)}")
print(f"f1_score(f1점수):{f1_score(y_test, y_pred)}")
print(f"roc_auc_score:{roc_auc_score(y_test, y_pred)}")  #0.5면 랜덤예측 1.0이면 완벽예측

accuracy_score(정확도):0.7606090863704443
recall_score(재현율):0.8466036887089519
precision_score(정밀도):0.5440878866724487
f1_score(f1점수):0.6624428018303413
roc_auc_score:0.7870952456327623


In [23]:
print(confusion_matrix(y_test, y_pred))

[[4212 1577]
 [ 341 1882]]


In [24]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math

#설명력 R2 : 얼마나 잘 설명하느냐 ? 
print(r2_score(y_test, y_pred))

#예측력 MSE 
print(mean_squared_error(y_test, y_pred))

# RMSE 예측력 : 해석을 위해서 > 얼마나 잘 맞추었냐 ? 
print(math.sqrt(mean_squared_error(y_test, y_pred)))

# MAE : 해석을 위해서
print(mean_absolute_error(y_test, y_pred))

-0.19411603762141527
0.23939091362955567
0.48927590746894095
0.23939091362955567


# 기본데이터 resort

In [25]:
ml_data = pd.read_csv('data/hotel_bookings.csv')
ml_data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [26]:
ml_hotel_basic = ml_data[ml_data["hotel"]=="Resort Hotel"]

In [27]:
label = ml_hotel_basic['is_canceled']
feature = ml_hotel_basic.drop(['hotel','is_canceled'],axis = 1)

In [28]:
numerical_columns = [column for column in feature.columns if feature.dtypes[column] != 'object']

In [39]:
numerical_columns

['lead_time',
 'arrival_date_year',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'booking_changes',
 'agent',
 'company',
 'days_in_waiting_list',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests']

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(feature[numerical_columns],label,test_size=0.2, stratify=label,random_state=42)

In [30]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [31]:
d_train = lgb.Dataset(X_train, label=y_train)

params = {}
params['learning_rate'] = 0.05
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'

clf = lgb.train(params, d_train)

[LightGBM] [Info] Number of positive: 8898, number of negative: 23150
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1015
[LightGBM] [Info] Number of data points in the train set: 32048, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.277646 -> initscore=-0.956168
[LightGBM] [Info] Start training from score -0.956168


In [33]:
#예측
#Prediction
y_pred=clf.predict(X_test)
#convert into binary values
for i in range(len(y_pred)):
    if y_pred[i]>=.28:
        y_pred[i]=1
    else:
        y_pred[i]=0

In [34]:
# 분류모델평가: 정확도, 재현율, 정밀도, f1-점수와 roc_auc 점수
from sklearn.metrics import accuracy_score,recall_score, precision_score, f1_score, roc_auc_score
print(f"accuracy_score(정확도):{accuracy_score(y_test, y_pred)}")
print(f"recall_score(재현율):{recall_score(y_test, y_pred)}")
print(f"precision_score(정밀도):{precision_score(y_test, y_pred)}")
print(f"f1_score(f1점수):{f1_score(y_test, y_pred)}")
print(f"roc_auc_score:{roc_auc_score(y_test, y_pred)}")  #0.5면 랜덤예측 1.0이면 완벽예측

accuracy_score(정확도):0.8017973040439341
recall_score(재현율):0.8295863309352518
precision_score(정밀도):0.6041257367387033
f1_score(f1점수):0.6991284577491473
roc_auc_score:0.8103529443204249


# 기본데이터 city

In [43]:
ml_hotel_basic = ml_data[ml_data["hotel"]=="City Hotel"]

In [44]:
label = ml_hotel_basic['is_canceled']
feature = ml_hotel_basic.drop(['hotel','is_canceled'],axis = 1)

In [45]:
numerical_columns = [column for column in feature.columns if feature.dtypes[column] != 'object']

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(feature[numerical_columns],label,test_size=0.2, stratify=label,random_state=42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [47]:
d_train = lgb.Dataset(X_train, label=y_train)

params = {}
params['learning_rate'] = 0.05
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'

clf = lgb.train(params, d_train)

[LightGBM] [Info] Number of positive: 26482, number of negative: 36982
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1128
[LightGBM] [Info] Number of data points in the train set: 63464, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.417276 -> initscore=-0.333966
[LightGBM] [Info] Start training from score -0.333966


In [48]:
#예측
#Prediction
y_pred=clf.predict(X_test)
#convert into binary values
for i in range(len(y_pred)):
    if y_pred[i]>=.4:
        y_pred[i]=1
    else:
        y_pred[i]=0

In [49]:
# 분류모델평가: 정확도, 재현율, 정밀도, f1-점수와 roc_auc 점수
from sklearn.metrics import accuracy_score,recall_score, precision_score, f1_score, roc_auc_score
print(f"accuracy_score(정확도):{accuracy_score(y_test, y_pred)}")
print(f"recall_score(재현율):{recall_score(y_test, y_pred)}")
print(f"precision_score(정밀도):{precision_score(y_test, y_pred)}")
print(f"f1_score(f1점수):{f1_score(y_test, y_pred)}")
print(f"roc_auc_score:{roc_auc_score(y_test, y_pred)}")  #0.5면 랜덤예측 1.0이면 완벽예측

accuracy_score(정확도):0.8193621580738687
recall_score(재현율):0.7765861027190333
precision_score(정밀도):0.7875306372549019
f1_score(f1점수):0.7820200790994828
roc_auc_score:0.8132876436156274
