In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, metrics

In [18]:
# Load the raw hotel bookings table from the project's data directory.
print("Loading data")
try:
    df = pd.read_csv("../data/raw/hotel_booking.csv")
except FileNotFoundError:
    # NOTE(review): falling back to an empty frame keeps this cell running,
    # but later column selections such as df[features] will then raise
    # KeyError -- consider failing here instead.
    print("CSV not found")
    df = pd.DataFrame()

Loading data


In [19]:
# Columns passed to the numeric branch of the preprocessor.
num_features = [
    'lead_time',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'adr',
]

# Columns passed to the categorical (one-hot) branch of the preprocessor.
cat_features = [
    'hotel',
    'agent',
    'company',
    'arrival_date_month',
    'meal',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'deposit_type',
    'customer_type',
]


In [None]:
# Cast float-typed categorical columns to int. Only columns whose dtype
# compares equal to 'float' are touched; every other column is just reported.
# Missing values in the converted columns become 0, and df is modified in place.
for col in cat_features:
    column = df[col]
    print(col, column.dtype)
    if column.dtype != 'float':
        continue
    df[col] = column.fillna(0).astype(int)
    print('changed', df[col].dtype)

In [20]:
# Full model input: numeric columns first, then categorical columns.
features = [*num_features, *cat_features]

In [21]:
# Take independent copies of the feature matrix and the cancellation target.
X = df.loc[:, features].copy()
y = df["is_canceled"].copy()

In [23]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [24]:
# Numeric branch: constant-value imputation using SimpleImputer's default fill value.
num_transformer = SimpleImputer(strategy='constant')

# Categorical branch: fill gaps with the literal 'Unknown', then one-hot encode.
# handle_unknown='ignore' is set so categories not seen during fit do not raise
# at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

preprocessor

In [25]:
# End-to-end model: preprocessing followed by a random forest classifier.
# The forest is seeded (same value as the train/test split) so that refitting
# after a kernel restart reproduces the same model and metrics; without a
# random_state every run trains a different forest.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

In [26]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)

In [27]:
# Score the fitted pipeline on the held-out split.
predictions = pipeline.predict(X_test)

acc = accuracy_score(y_true=y_test, y_pred=predictions)
f1_sc = f1_score(y_true=y_test, y_pred=predictions)

# One line per metric, in the same order as before.
print(f"Accuracy: {acc}", f"F1 score: {f1_sc}", sep="\n")

Accuracy: 0.8640031270067287
F1 score: 0.8040706327179116
