In [1]:
from random import random
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from category_encoders import CountEncoder

from sklearn.pipeline import Pipeline
from sklearn. compose import ColumnTransformer

In [2]:
try:
    print("Loading data")
    df = pd.read_csv("../data/raw/hotel_booking.csv")
except FileNotFoundError:
    print("CSV not found")
    df = pd.DataFrame()

Loading data


In [3]:
data = df.copy()

features_to_drop = ['name', 'email', 'phone-number', 'credit_card', 'company']
data = data.drop(features_to_drop, axis=1)

In [4]:
features_to_extract_date = ['reservation_status_date']

def extract_date(data, col):
    data[col] = pd.to_datetime(data[col])
    
    data[col + '_year'] = data[col].dt.year

    data[col + '_month'] = data[col].dt.month
    
    data[col + '_day'] = data[col].dt.day
    
    data = data.drop(col, axis=1)
    
    return data    

for feature in features_to_extract_date:
    data = extract_date(data, feature)

In [5]:
features_to_onehot_encode = ['hotel', 'arrival_date_month', 'meal', 'deposit_type', 'customer_type', 'reservation_status']

features_to_count_encode = ['country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type']

features_to_impute = ['children', 'agent']

In [6]:
X = data.drop('is_canceled', axis=1)
y = data.is_canceled

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      random_state=1,
                                                      train_size=0.8)

In [8]:
imputer = SimpleImputer(fill_value=0)
onehot_enc = OneHotEncoder()
count_enc = CountEncoder()

In [9]:
preprocessor = ColumnTransformer(
    transformers=[('imputer', imputer, features_to_impute),
            ('onehot', onehot_enc, features_to_onehot_encode),
            ('count_encode', count_enc, features_to_count_encode)
])

In [10]:
params = {
        "n_estimators": 10,
        "max_depth": 5
    }

model = LogisticRegression()

In [11]:
model_pipeline = Pipeline(
    steps=[
       ('preprocessor', preprocessor),
       ('model', model)
])

In [12]:
model_pipeline.fit(X_train, y_train)

predictions = model_pipeline.predict(X_valid)

acc = accuracy_score(y_valid, predictions)
f1_sc = f1_score(y_valid, predictions)
    
print(f"Accuracy: {acc}")
print(f"F1 score: {f1_sc}")

Accuracy: 0.6781556244241561
F1 score: 0.550663626264398
