In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from catboost import CatBoostClassifier, Pool, metrics, cv

## Load data

In [2]:
try:
    print("Loading data")
    df = pd.read_csv("../data/raw/hotel_booking.csv")
except FileNotFoundError:
    print("CSV not found")
    df = pd.DataFrame()

Loading data


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 36 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

## Clean data

In [4]:
data = df.copy()

In [5]:
# Fill missing values
null_value_stats = data.isnull().sum(axis=0)
null_values_columns = null_value_stats[null_value_stats != 0].to_frame().reset_index().rename(columns={0: 'num_null_values',
                                                                                               'index': 'column'})
print(null_values_columns)

data_size = len(df)
num_null_allowed = int(data_size * 0.6)
columns_to_remove = null_values_columns[null_values_columns.num_null_values > num_null_allowed].column.to_list()
print(columns_to_remove)

     column  num_null_values
0  children                4
1   country              488
2     agent            16340
3   company           112593
['company']


In [6]:
for col in columns_to_remove:
    data = data.drop(col, axis=1)

In [7]:
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,Transient,0.0,0,0,Check-Out,2015-07-01,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,Transient,0.0,0,0,Check-Out,2015-07-01,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,Transient,75.0,0,0,Check-Out,2015-07-02,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,Transient,75.0,0,0,Check-Out,2015-07-02,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,Transient,98.0,0,1,Check-Out,2015-07-03,Linda Hines,LHines@verizon.com,713-226-5883,************5498


In [8]:
data.agent.fillna(0, inplace=True)

data.fillna(-999, inplace=True)

In [9]:
# Feature engineering

features_to_extract_date = ['reservation_status_date']

def extract_date(data, col):
    data[col] = pd.to_datetime(data[col])
    
    data[col + '_year'] = data[col].dt.year

    data[col + '_month'] = data[col].dt.month
    
    data[col + '_day'] = data[col].dt.day
    
    data = data.drop(col, axis=1)
    
    return data    

for feature in features_to_extract_date:
    data = extract_date(data, feature)

In [10]:
cat_features = ['hotel',
 'arrival_date_month',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'is_repeated_guest',
 'reserved_room_type',
 'assigned_room_type',
 'deposit_type',
 'agent',
 'customer_type',
 'reservation_status',
 'name',
 'email',
 'phone-number',
 'credit_card',
]

print(len(cat_features))

17


In [11]:

# Convert all cat columns to integer type
for col in cat_features:
    print(col, data[col].dtype)
    if data[col].dtype == 'float':
        data[col] = data[col].astype(int)
        print('changed', data[col].dtype)


hotel object
arrival_date_month object
meal object
country object
market_segment object
distribution_channel object
is_repeated_guest int64
reserved_room_type object
assigned_room_type object
deposit_type object
agent float64
changed int64
customer_type object
reservation_status object
name object
email object
phone-number object
credit_card object


In [12]:
data['agent'].head()

0      0
1      0
2      0
3    304
4    240
Name: agent, dtype: int64

In [13]:
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,required_car_parking_spaces,total_of_special_requests,reservation_status,name,email,phone-number,credit_card,reservation_status_date_year,reservation_status_date_month,reservation_status_date_day
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,0,0,Check-Out,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322,2015,7,1
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,0,0,Check-Out,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157,2015,7,1
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,0,0,Check-Out,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734,2015,7,2
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,0,0,Check-Out,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677,2015,7,2
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,0,1,Check-Out,Linda Hines,LHines@verizon.com,713-226-5883,************5498,2015,7,3


## Split data

In [26]:
#X = data.drop('is_canceled', axis=1)

num_features = ['lead_time', 'arrival_date_year', 'arrival_date_week_number','stays_in_week_nights', 
                   'previous_cancellations','previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list',
                    'adr', 'required_car_parking_spaces', 'total_of_special_requests',
                  ]
X = data[num_features + cat_features]
y = data.is_canceled

In [27]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 28 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   lead_time                       119390 non-null  int64  
 1   arrival_date_year               119390 non-null  int64  
 2   arrival_date_week_number        119390 non-null  int64  
 3   stays_in_week_nights            119390 non-null  int64  
 4   previous_cancellations          119390 non-null  int64  
 5   previous_bookings_not_canceled  119390 non-null  int64  
 6   booking_changes                 119390 non-null  int64  
 7   days_in_waiting_list            119390 non-null  int64  
 8   adr                             119390 non-null  float64
 9   required_car_parking_spaces     119390 non-null  int64  
 10  total_of_special_requests       119390 non-null  int64  
 11  hotel                           119390 non-null  object 
 12  arrival_date_mon

In [28]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.2,
                                                      random_state=42,
                                                      stratify=y)

## Train model

In [29]:
cat_features_indices = np.array([X.columns.get_loc(col) for col in cat_features])
cat_features_indices

array([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27])

In [30]:

model = CatBoostClassifier(
    max_depth=3,
    iterations=300,
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent'
)

In [31]:
model.fit(
    X_train, y_train,
    cat_features=cat_features_indices,
    eval_set=(X_valid, y_valid),
    logging_level='Verbose', 
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.164982
0:	learn: 0.1872694	test: 0.1872744	best: 0.1872744 (0)	total: 83ms	remaining: 24.8s
1:	learn: 0.0520100	test: 0.0519230	best: 0.0519230 (1)	total: 126ms	remaining: 18.7s
2:	learn: 0.0180277	test: 0.0179309	best: 0.0179309 (2)	total: 166ms	remaining: 16.4s
3:	learn: 0.0071433	test: 0.0070623	best: 0.0070623 (3)	total: 207ms	remaining: 15.3s
4:	learn: 0.0029233	test: 0.0028561	best: 0.0028561 (4)	total: 240ms	remaining: 14.1s
5:	learn: 0.0013368	test: 0.0012764	best: 0.0012764 (5)	total: 280ms	remaining: 13.7s
6:	learn: 0.0013196	test: 0.0012629	best: 0.0012629 (6)	total: 310ms	remaining: 13s
7:	learn: 0.0013065	test: 0.0012534	best: 0.0012534 (7)	total: 355ms	remaining: 13s
8:	learn: 0.0012992	test: 0.0012468	best: 0.0012468 (8)	total: 386ms	remaining: 12.5s
9:	learn: 0.0012992	test: 0.0012468	best: 0.0012468 (9)	total: 436ms	remaining: 12.6s
10:	learn: 0.0012992	test: 0.0012467	best: 0.0012467 (10)	total: 468ms	remaining: 12.3s
11:	learn: 0.0012992	test: 

<catboost.core.CatBoostClassifier at 0x7fad29043fd0>

## Evaludate model

In [32]:
predictions = model.predict(X_valid)

acc = accuracy_score(y_valid, predictions)
f1_sc = f1_score(y_valid, predictions)
    
print(f"Accuracy: {acc}")
print(f"F1 score: {f1_sc}")

Accuracy: 1.0
F1 score: 1.0


In [None]:
#model.fit(X_train, y_train)

#predictions = model_pipeline.predict(X_valid)

#acc = accuracy_score(y_valid, predictions)
#f1_sc = f1_score(y_valid, predictions)
    
#print(f"Accuracy: {acc}")
#print(f"F1 score: {f1_sc}")