In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from catboost import CatBoostClassifier, Pool, metrics, cv

## Load data

In [2]:
# Load the raw hotel-booking dataset into `df`.
#
# A missing CSV is treated as fatal: every later cell indexes columns of
# `df`, so substituting an empty DataFrame would only postpone the failure
# to a harder-to-diagnose KeyError further down. Fail here instead and report
# the path that was tried.
csv_path = "../data/raw/hotel_booking.csv"

print("Loading data")
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"CSV not found: {csv_path!r} (path is resolved relative to the "
        "notebook's working directory)"
    ) from exc

Loading data


In [3]:
# Columns fed to the model as numeric inputs.
num_features = [
    "lead_time",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "adults",
    "children",
    "babies",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "adr",
]

# Columns fed to the model as categorical inputs.
cat_features = [
    "hotel",
    "agent",
    "arrival_date_month",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "deposit_type",
    "customer_type",
]

## Clean data

In [4]:
# Work on an independent copy so the cleaning steps below never touch `df`.
data = df.copy(deep=True)

In [5]:
# Categorical columns that were parsed as floats (e.g. `agent`) are cast to
# integers so they can be passed to the model as categorical features.
# NOTE(review): CatBoost is understood to reject float-valued categorical
# features -- confirm against the CatBoost documentation.
for col in cat_features:
    print(col, data[col].dtype)
    # is_float_dtype matches every float width (float32 as well as float64),
    # whereas comparing the dtype against 'float' only matches float64.
    if pd.api.types.is_float_dtype(data[col]):
        # Missing values are mapped to the sentinel category 0 before the
        # cast, because NaN cannot be represented in an integer column.
        data[col] = data[col].fillna(0).astype(int)
        print('changed', data[col].dtype)

hotel object
agent float64
changed int64
arrival_date_month object
meal object
market_segment object
distribution_channel object
reserved_room_type object
deposit_type object
customer_type object


In [6]:
# Replace every remaining missing value with the sentinel -999.
data = data.fillna(value=-999)

## Split data

In [7]:
# Feature matrix: numeric columns first, then categorical columns.
X = data.loc[:, num_features + cat_features]
# Target column.
y = data["is_canceled"]

In [8]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Train model

In [9]:
# Positional indices of the categorical columns within X_train.
cat_features_indices = np.array(
    list(map(X_train.columns.get_loc, cat_features))
)
cat_features_indices

array([14, 15, 16, 17, 18, 19, 20, 21, 22])

In [10]:

# Shallow gradient-boosted classifier; accuracy is tracked as an extra metric
# alongside the training loss.
model = CatBoostClassifier(
    # Model capacity
    iterations=300,
    max_depth=3,
    # Additional metric to track
    custom_loss=[metrics.Accuracy()],
    # Reproducibility and logging
    random_seed=42,
    logging_level='Silent',
)

In [12]:
# Fit on the training split while monitoring the validation split.
# The per-call logging level is 'Verbose', unlike the 'Silent' level given to
# the constructor, and plot=True requests the interactive metric plot.
# NOTE(review): the same validation split is reused later for the reported
# accuracy / F1. If CatBoost selects the best iteration from `eval_set`,
# those numbers may be optimistic -- confirm, or hold out a separate test set.
model.fit(
    X=X_train,
    y=y_train,
    cat_features=cat_features_indices,
    eval_set=(X_valid, y_valid),
    plot=True,
    logging_level='Verbose',
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.159629
0:	learn: 0.6126239	test: 0.6121990	best: 0.6121990 (0)	total: 120ms	remaining: 35.9s
1:	learn: 0.5645013	test: 0.5633498	best: 0.5633498 (1)	total: 155ms	remaining: 23s
2:	learn: 0.5314239	test: 0.5302182	best: 0.5302182 (2)	total: 177ms	remaining: 17.6s
3:	learn: 0.4987297	test: 0.4976279	best: 0.4976279 (3)	total: 203ms	remaining: 15s
4:	learn: 0.4835451	test: 0.4823367	best: 0.4823367 (4)	total: 227ms	remaining: 13.4s
5:	learn: 0.4733848	test: 0.4720620	best: 0.4720620 (5)	total: 254ms	remaining: 12.4s
6:	learn: 0.4609952	test: 0.4594196	best: 0.4594196 (6)	total: 279ms	remaining: 11.7s
7:	learn: 0.4498571	test: 0.4483967	best: 0.4483967 (7)	total: 296ms	remaining: 10.8s
8:	learn: 0.4425021	test: 0.4409839	best: 0.4409839 (8)	total: 312ms	remaining: 10.1s
9:	learn: 0.4368493	test: 0.4352210	best: 0.4352210 (9)	total: 327ms	remaining: 9.47s
10:	learn: 0.4311638	test: 0.4294865	best: 0.4294865 (10)	total: 350ms	remaining: 9.2s
11:	learn: 0.4273553	test: 

<catboost.core.CatBoostClassifier at 0x7f7949d19ac0>

## Evaluate model

In [13]:
# Predict class labels for the validation split.
predictions = model.predict(X_valid)

# Score the predictions against the validation labels.
acc = accuracy_score(y_true=y_valid, y_pred=predictions)
f1_sc = f1_score(y_true=y_valid, y_pred=predictions)

print(f"Accuracy: {acc}")
print(f"F1 score: {f1_sc}")

Accuracy: 0.8333193734818661
F1 score: 0.7481225213062189
