## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split  # [web:1]
%matplotlib inline


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)       
pd.set_option('display.max_colwidth', None)   


## Cria√ß√£o do DataFrame

In [21]:
load_dotenv() 
database_url = os.getenv("NEON_DATABASE_URL")

engine = create_engine(database_url)
query = "SELECT * FROM hotel_bookings"

df = pd.read_sql(query, engine)

### Separa√ß√£o dos Dados para Streaming

In [None]:
df_modeling, df_validation = train_test_split(
    df, test_size=0.15, stratify=df["is_canceled"], random_state=42
)

df_validation.to_csv("../data/hotel_bookings_validation.csv", index=False)
print(f"Valida√ß√£o salva: {len(df_validation)} linhas")

df = df_modeling


Valida√ß√£o salva: 17909 linhas


### EDA

"Preprocessing includes lumping infrequent categories of the categorical predictor
Country (originally with 126 levels or countries) into 11 levels (CN, DEU, ESP, FRA, GBR,
IRL, NLD, PRT, USA, NULL and OTHER). The categorical predictor ReservedRoomType has
11 levels, and the categorical predictor AssignedRoomType has 10 levels; the levels of these
two predictors were lumped into a total of seven categories to increase the counts of
infrequent levels."

In [8]:
def preprocess_hotel_categories(df):
    """
    Replica o pr√©-processamento do paper:
    - Country: 126 n√≠veis -> 11 n√≠veis (CN, DEU, ESP, FRA, GBR, IRL, NLD, PRT, USA, NULL, OTHER)
    - ReservedRoomType (11 n√≠veis) e AssignedRoomType (10 n√≠veis) -> 7 categorias totais combinadas
    """
    
    df_processed = df.copy()
    
    priority_countries = ['CN', 'DEU', 'ESP', 'FRA', 'GBR', 'IRL', 'NLD', 'PRT', 'USA']
    
    def lump_country(country):
        if pd.isna(country):
            return 'NULL'
        elif country in priority_countries:
            return country
        else:
            return 'OTHER'
    
    df_processed['country'] = df_processed['country'].apply(lump_country)
    
    common_room_types = ['A', 'B', 'C', 'D', 'E', 'F']  # Baseado em datasets t√≠picos
    
    def lump_room_type(room):
        if pd.isna(room):
            return 'NULL'
        elif room in common_room_types:
            return room
        else:
            return 'OTHER'
    
    df_processed['reserved_room_type'] = df_processed['reserved_room_type'].apply(lump_room_type)
    df_processed['assigned_room_type'] = df_processed['assigned_room_type'].apply(lump_room_type)
    
    return df_processed

# APLICAR O PR√â-PROCESSAMENTO
print("=== ANTES DO PR√â-PROCESSAMENTO ===")
print("Country n√≠veis √∫nicos:", df['country'].nunique(), df['country'].unique()[:10])
print("ReservedRoomType n√≠veis √∫nicos:", df['reserved_room_type'].nunique())
print("AssignedRoomType n√≠veis √∫nicos:", df['assigned_room_type'].nunique())

# Executar pr√©-processamento
df = preprocess_hotel_categories(df)

print("\n=== DEPOIS DO PR√â-PROCESSAMENTO ===")
print("Country n√≠veis √∫nicos:", df['country'].nunique(), sorted(df['country'].unique()))
print("ReservedRoomType n√≠veis √∫nicos:", df['reserved_room_type'].nunique(), sorted(df['reserved_room_type'].unique()))
print("AssignedRoomType n√≠veis √∫nicos:", df['assigned_room_type'].nunique(), sorted(df['assigned_room_type'].unique()))

print("\nDistribui√ß√£o Country (top 11):")
print(df['country'].value_counts().head(11))


=== ANTES DO PR√â-PROCESSAMENTO ===
Country n√≠veis √∫nicos: 172 ['SWE' 'NZL' 'PRT' 'FRA' 'GBR' 'NLD' 'USA' 'DEU' 'ESP' 'ISR']
ReservedRoomType n√≠veis √∫nicos: 10
AssignedRoomType n√≠veis √∫nicos: 12

=== DEPOIS DO PR√â-PROCESSAMENTO ===
Country n√≠veis √∫nicos: 11 ['CN', 'DEU', 'ESP', 'FRA', 'GBR', 'IRL', 'NLD', 'NULL', 'OTHER', 'PRT', 'USA']
ReservedRoomType n√≠veis √∫nicos: 7 ['A', 'B', 'C', 'D', 'E', 'F', 'OTHER']
AssignedRoomType n√≠veis √∫nicos: 7 ['A', 'B', 'C', 'D', 'E', 'F', 'OTHER']

Distribui√ß√£o Country (top 11):
country
PRT      41369
OTHER    19572
GBR      10319
FRA       8808
ESP       7262
DEU       6202
IRL       2875
NLD       1788
USA       1781
CN        1088
NULL       417
Name: count, dtype: int64


### Pre Processamento

In [9]:
df['arrival_date_month'] = pd.Categorical(df['arrival_date_month'], 
                                          categories=["January","February","March",
                                                      "April","May","June","July","August",
                                                      "September","October","November","December"], 
                                          ordered=True)

df['arrival_date_day_of_month'] = df['arrival_date_day_of_month'].astype(int)


### Divis√£o de Treino e Teste
    "75% training set and a 25% test set to estimate the performance of the machine learning algorithms"

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [11]:
X = df.drop(columns=['is_canceled'])
y = df['is_canceled']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (81184, 31), y_train: (81184,)
X_test: (20297, 31), y_test: (20297,)


### Normaliza√ß√£o

In [14]:
# 1. DIAGN√ìSTICO: Identificar qual coluna tem datas
print("üîç COLUNAS COM MUITAS CATEGORIAS √öNICAS:")
for col in X_train.select_dtypes(include=['object']).columns:
    unique_count = X_train[col].nunique()
    if unique_count > 20:  # Suspeita de datas
        print(f"‚ùå {col}: {unique_count} n√≠veis √∫nicos")
        print(f"   Amostra: {X_train[col].unique()[:5]}")
        print()

# 2. TRATAR COLUNAS DE DATA PROBLEM√ÅTICAS
def fix_date_columns(df):
    df_fixed = df.copy()
    
    # Converter colunas suspeitas de data para num√©ricas ou remover
    date_suspect_cols = []
    for col in df_fixed.select_dtypes(include=['object']).columns:
        if df_fixed[col].nunique() > 20:  # Muitas categorias = provavelmente data
            try:
                # Tentar converter para datetime e extrair features
                pd.to_datetime(df_fixed[col], errors='coerce')
                print(f"üìÖ Convertendo {col} para num√©rico...")
                df_fixed[col] = pd.to_datetime(df_fixed[col], errors='coerce').dt.dayofyear
            except:
                # Se n√£o conseguir, remover a coluna
                print(f"üóëÔ∏è Removendo {col} (demasiadas categorias)")
                df_fixed = df_fixed.drop(columns=[col])
    
    return df_fixed

# APLICAR TRATAMENTO
X_train_fixed = fix_date_columns(X_train)
X_test_fixed = fix_date_columns(X_test)

# 3. AGORA O PREPROCESSADOR FUNCIONA
continuous_cols = X_train_fixed.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train_fixed.select_dtypes(include=['object']).columns.tolist()

print(f"\n‚úÖ Ap√≥s tratamento:")
print("Cont√≠nuas:", continuous_cols)
print("Categ√≥ricas:", categorical_cols)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), continuous_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols)
    ])

# FIT E TRANSFORM
X_train_processed = preprocessor.fit_transform(X_train_fixed)
X_test_processed = preprocessor.transform(X_test_fixed)

print(f"\n‚úÖ X_train processado: {X_train_processed.shape}")
print(f"‚úÖ X_test processado: {X_test_processed.shape}")


üîç COLUNAS COM MUITAS CATEGORIAS √öNICAS:
‚ùå reservation_status_date: 913 n√≠veis √∫nicos
   Amostra: ['2016-10-06' '2015-07-02' '2016-04-27' '2015-10-16' '2016-09-27']

üìÖ Convertendo reservation_status_date para num√©rico...
üìÖ Convertendo reservation_status_date para num√©rico...

‚úÖ Ap√≥s tratamento:
Cont√≠nuas: ['lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'agent', 'company', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'reservation_status_date']
Categ√≥ricas: ['hotel', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status']

‚úÖ X_train processado: (81184, 65)
‚úÖ X_test processado: (20297, 65)


### Treinamento dos Modelos e Otimiza√ß√£o dos hiperpar√¢mtros

#### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

numeric_cols = X_train.select_dtypes(include=[np.number]).columns
X_train_numeric = X_train[numeric_cols]
X_test_numeric = X_test[numeric_cols]


rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=rf_param_grid,
    cv=5,
    n_jobs=-1
)

rf_grid.fit(X_train_numeric, y_train)
print(f"‚úÖ Melhores par√¢metros RandomForest: {rf_grid.best_params_}")
print(f"‚úÖ Score CV: {rf_grid.best_score_:.3f}")



‚úÖ Melhores par√¢metros RandomForest: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
‚úÖ Score CV: 0.864


#### XGBoost

In [11]:
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
X_train_numeric = X_train[numeric_cols]
X_test_numeric = X_test[numeric_cols]

xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_grid = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    param_grid=xgb_param_grid,
    cv=5,
    n_jobs=-1
)

xgb_grid.fit(X_train_numeric, y_train)
print(f"‚úÖ Melhores par√¢metros XGBoost: {xgb_grid.best_params_}")
print(f"‚úÖ Score CV: {xgb_grid.best_score_:.3f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Melhores par√¢metros XGBoost: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 200, 'subsample': 0.8}
‚úÖ Score CV: 0.858


#### SVM

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # preencher NaN com m√©dia da coluna
    ('scaler', StandardScaler()),                  # escalar dados
    ('svm', SVC(random_state=42))
])

pipeline.fit(X_train_numeric, y_train)
print(f"‚úÖ Score treino: {pipeline.score(X_train_numeric, y_train):.3f}")
print(f"‚úÖ Score teste: {pipeline.score(X_test_numeric, y_test):.3f}")


‚úÖ Score treino: 0.786
‚úÖ Score teste: 0.782


### Terceiro Ciclo (novo dataframe)

In [35]:
database_url = os.getenv("NEON_DATABASE_URL")

engine = create_engine(database_url)
query = "SELECT * FROM hotel_bookings"

df1 = pd.read_sql(query, engine)
df1.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


#### Resort Hotel

In [36]:
hotel_alvo = "Resort Hotel" 

df_hotel = df1[df1["hotel"] == hotel_alvo].copy()
print(hotel_alvo, df_hotel.shape)


Resort Hotel (40060, 32)


In [37]:
y = df_hotel['is_canceled']

cols_to_drop = ['is_canceled', 'reservation_status', 'reservation_status_date']
X = df_hotel.drop(columns=cols_to_drop)

print("y value_counts:")
print(y.value_counts())
print("\nX shape:", X.shape)
print("X columns:", X.columns.tolist())

y value_counts:
is_canceled
0    28938
1    11122
Name: count, dtype: int64

X shape: (40060, 29)
X columns: ['hotel', 'lead_time', 'arrival_date_year', 'arrival_date_month', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'meal', 'country', 'market_segment', 'distribution_channel', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'reserved_room_type', 'assigned_room_type', 'booking_changes', 'deposit_type', 'agent', 'company', 'days_in_waiting_list', 'customer_type', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']


In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print("Shapes ap√≥s split:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test :", X_test.shape, "y_test :", y_test.shape)


Shapes ap√≥s split:
X_train: (30045, 29) y_train: (30045,)
X_test : (10015, 29) y_test : (10015,)


In [39]:
def fix_date_columns(df_raw):
    df_fixed = df_raw.copy()
    for col in df_fixed.select_dtypes(include=['object']).columns:
        if df_fixed[col].nunique() > 20:
            try:
                conv = pd.to_datetime(df_fixed[col], errors='coerce')
                if conv.notna().sum() > 0:
                    print(f"üìÖ Convertendo {col} para dayofyear...")
                    df_fixed[col] = conv.dt.dayofyear
            except Exception:
                print(f"üóëÔ∏è Removendo {col}")
                df_fixed = df_fixed.drop(columns=[col])
    return df_fixed

X_train_fixed = fix_date_columns(X_train)
X_test_fixed = fix_date_columns(X_test)

continuous_cols = X_train_fixed.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train_fixed.select_dtypes(include=['object']).columns.tolist()

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), continuous_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols)
    ]
)

X_train_processed = preprocessor.fit_transform(X_train_fixed)
X_test_processed = preprocessor.transform(X_test_fixed)

print("X_train_processed:", X_train_processed.shape)
print("X_test_processed :", X_test_processed.shape)


  conv = pd.to_datetime(df_fixed[col], errors='coerce')
  conv = pd.to_datetime(df_fixed[col], errors='coerce')


X_train_processed: (30045, 182)
X_test_processed : (10015, 182)


### Random Forest

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf_param_grid = {
    'n_estimators': [200],
    'max_depth': [10, 15],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
    'bootstrap': [True],
    'class_weight': ['balanced']
}

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=rf_param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1
)

rf_grid.fit(X_train_processed, y_train)
rf_best = rf_grid.best_estimator_
print("RF best:", rf_grid.best_params_)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
RF best: {'bootstrap': True, 'class_weight': 'balanced', 'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}


In [42]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def metrics_paper(y_true, y_pred):
    oa = float(accuracy_score(y_true, y_pred))
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=[1, 0]
    )
    return {
        "OA": float(oa),
        "P_1": float(precision[0]),
        "R_1": float(recall[0]),
        "F1_1": float(f1[0]),
        "P_0": float(precision[1]),
        "R_0": float(recall[1]),
        "F1_0": float(f1[1]),
    }

y_rf_train = rf_best.predict(X_train_processed)
y_rf_test = rf_best.predict(X_test_processed)

m_train = metrics_paper(y_train, y_rf_train)
m_test = metrics_paper(y_test, y_rf_test)

print("RF Treino:")
for k, v in m_train.items():
    print(f"  {k}: {v:.3f}")

print("\nRF Teste:")
for k, v in m_test.items():
    print(f"  {k}: {v:.3f}")


RF Treino:
  OA: 0.893
  P_1: 0.749
  R_1: 0.925
  F1_1: 0.827
  P_0: 0.968
  R_0: 0.881
  F1_0: 0.922

RF Teste:
  OA: 0.876
  P_1: 0.728
  R_1: 0.881
  F1_1: 0.797
  P_0: 0.950
  R_0: 0.874
  F1_0: 0.910


In [46]:
print(f"Random Forest - OA teste: {m_test['OA']:.3f}")

Random Forest - OA teste: 0.876


### XGBoost

In [44]:
xgb_param_grid = {
    'n_estimators': [300, 500],
    'max_depth': [3, 6],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_grid = GridSearchCV(
    estimator=XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ),
    param_grid=xgb_param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1
)


y_xgb_train = xgb_best.predict(X_train_processed)
y_xgb_test = xgb_best.predict(X_test_processed)

m_train_xgb = metrics_paper(y_train, y_xgb_train)
m_test_xgb = metrics_paper(y_test, y_xgb_test)

print("XGB Treino:")
for k, v in m_train_xgb.items():
    print(f"  {k}: {v:.3f}")

print("\nXGB Teste:")
for k, v in m_test_xgb.items():
    print(f"  {k}: {v:.3f}")

XGB Treino:
  OA: 0.963
  P_1: 0.932
  R_1: 0.933
  F1_1: 0.933
  P_0: 0.974
  R_0: 0.974
  F1_0: 0.974

XGB Teste:
  OA: 0.907
  P_1: 0.838
  R_1: 0.825
  F1_1: 0.831
  P_0: 0.933
  R_0: 0.939
  F1_0: 0.936


In [47]:
print(f"XGBoost - OA teste: {m_test_xgb['OA']:.3f}")


XGBoost - OA teste: 0.907


### SVM

In [49]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

X_train_svm = imputer.fit_transform(X_train_processed)
X_test_svm = imputer.transform(X_test_processed)

print("NaNs em X_train_svm:", np.isnan(X_train_svm).sum())
print("NaNs em X_test_svm :", np.isnan(X_test_svm).sum())

NaNs em X_train_svm: 0
NaNs em X_test_svm : 0


In [50]:
svm_param_grid = {
    'C': [1, 5],
    'gamma': ['scale', 0.1, 0.01],
    'kernel': ['rbf']
}

svm_grid = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=svm_param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1
)

svm_grid.fit(X_train_svm, y_train)
svm_best = svm_grid.best_estimator_
print("SVM best:", svm_grid.best_params_)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
SVM best: {'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}


In [51]:
y_svm_train = svm_best.predict(X_train_svm)
y_svm_test = svm_best.predict(X_test_svm)

m_train_svm = metrics_paper(y_train, y_svm_train)
m_test_svm = metrics_paper(y_test, y_svm_test)

print("SVM Treino:")
for k, v in m_train_svm.items():
    print(f"  {k}: {v:.3f}")

print("\nSVM Teste:")
for k, v in m_test_svm.items():
    print(f"  {k}: {v:.3f}")


SVM Treino:
  OA: 0.901
  P_1: 0.839
  R_1: 0.795
  F1_1: 0.817
  P_0: 0.923
  R_0: 0.941
  F1_0: 0.932

SVM Teste:
  OA: 0.878
  P_1: 0.803
  R_1: 0.743
  F1_1: 0.772
  P_0: 0.904
  R_0: 0.930
  F1_0: 0.917


#### City Hotel

In [65]:
hotel_alvo = "City Hotel" 

df_hotel = df1[df1["hotel"] == hotel_alvo].copy()
print(hotel_alvo, df_hotel.shape)

City Hotel (79330, 32)


In [66]:
y = df_hotel['is_canceled']

cols_to_drop = ['is_canceled', 'reservation_status', 'reservation_status_date']
X = df_hotel.drop(columns=cols_to_drop)

print("y value_counts:")
print(y.value_counts())
print("\nX shape:", X.shape)
print("X columns:", X.columns.tolist())

y value_counts:
is_canceled
0    46228
1    33102
Name: count, dtype: int64

X shape: (79330, 29)
X columns: ['hotel', 'lead_time', 'arrival_date_year', 'arrival_date_month', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'meal', 'country', 'market_segment', 'distribution_channel', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'reserved_room_type', 'assigned_room_type', 'booking_changes', 'deposit_type', 'agent', 'company', 'days_in_waiting_list', 'customer_type', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']


In [67]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print("Shapes ap√≥s split:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test :", X_test.shape, "y_test :", y_test.shape)


Shapes ap√≥s split:
X_train: (59497, 29) y_train: (59497,)
X_test : (19833, 29) y_test : (19833,)


In [77]:
def fix_date_columns(df_raw):
    df_fixed = df_raw.copy()
    for col in df_fixed.select_dtypes(include=['object']).columns:
        if df_fixed[col].nunique() > 20:
            try:
                conv = pd.to_datetime(df_fixed[col], errors='coerce')
                if conv.notna().sum() > 0:
                    print(f"üìÖ Convertendo {col} para dayofyear...")
                    df_fixed[col] = conv.dt.dayofyear
            except Exception:
                print(f"üóëÔ∏è Removendo {col}")
                df_fixed = df_fixed.drop(columns=[col])
    return df_fixed

X_train_fixed = fix_date_columns(X_train)
X_test_fixed = fix_date_columns(X_test)


  conv = pd.to_datetime(df_fixed[col], errors='coerce')
  conv = pd.to_datetime(df_fixed[col], errors='coerce')


### Random Forest

In [86]:

rf_param_grid = {
    'n_estimators': [200],
    'max_depth': [10, 15],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
    'bootstrap': [True],
    'class_weight': ['balanced']
}

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=rf_param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1
)

rf_grid.fit(X_train_processed, y_train)
rf_best = rf_grid.best_estimator_
print("RF best (City):", rf_grid.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
RF best (City): {'bootstrap': True, 'class_weight': 'balanced', 'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}


In [87]:
y_rf_train = rf_best.predict(X_train_processed)
y_rf_test = rf_best.predict(X_test_processed)

m_train_rf = metrics_paper(y_train, y_rf_train)
m_test_rf = metrics_paper(y_test, y_rf_test)

print("RF Treino (City):")
for k, v in m_train_rf.items():
    print(f"  {k}: {v:.3f}")

print("\nRF Teste (City):")
for k, v in m_test_rf.items():
    print(f"  {k}: {v:.3f}")

RF Treino (City):
  OA: 0.865
  P_1: 0.880
  R_1: 0.784
  F1_1: 0.829
  P_0: 0.857
  R_0: 0.924
  F1_0: 0.889

RF Teste (City):
  OA: 0.858
  P_1: 0.878
  R_1: 0.766
  F1_1: 0.818
  P_0: 0.846
  R_0: 0.924
  F1_0: 0.883


In [78]:
continuous_cols = X_train_fixed.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train_fixed.select_dtypes(include=['object']).columns.tolist()

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), continuous_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols)
    ]
)

X_train_processed = preprocessor.fit_transform(X_train_fixed)
X_test_processed = preprocessor.transform(X_test_fixed)

print("X_train_processed:", X_train_processed.shape)
print("X_test_processed :", X_test_processed.shape)


X_train_processed: (59497, 225)
X_test_processed : (19833, 225)




In [80]:
print(f"Random Forest - OA teste: {m_test['OA']:.3f}")

Random Forest - OA teste: 0.858


In [None]:
xgb_param_grid = {
    'n_estimators': [300, 500],
    'max_depth': [3, 6],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_grid = GridSearchCV(
    estimator=XGBClassifier(
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ),
    param_grid=xgb_param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1
)

xgb_grid.fit(X_train_processed, y_train)
xgb_best = xgb_grid.best_estimator_
print("XGB best:", xgb_grid.best_params_)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
XGB best: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 500, 'subsample': 0.8}


### XGBoost

In [81]:
y_xgb_train = xgb_best.predict(X_train_processed)
y_xgb_test = xgb_best.predict(X_test_processed)

m_train_xgb = metrics_paper(y_train, y_xgb_train)
m_test_xgb = metrics_paper(y_test, y_xgb_test)

print("XGB Treino (City):")
for k, v in m_train_xgb.items():
    print(f"  {k}: {v:.3f}")

print("\nXGB Teste (City):")
for k, v in m_test_xgb.items():
    print(f"  {k}: {v:.3f}")

XGB Treino (City):
  OA: 0.911
  P_1: 0.909
  R_1: 0.873
  F1_1: 0.891
  P_0: 0.911
  R_0: 0.938
  F1_0: 0.924

XGB Teste (City):
  OA: 0.882
  P_1: 0.881
  R_1: 0.828
  F1_1: 0.854
  P_0: 0.882
  R_0: 0.920
  F1_0: 0.900


In [82]:
print(f"XGBoost - OA teste: {m_test_xgb['OA']:.3f}")

XGBoost - OA teste: 0.882


### SVM

In [83]:
imputer = SimpleImputer(strategy='median')

X_train_svm = imputer.fit_transform(X_train_processed)
X_test_svm = imputer.transform(X_test_processed)

print("NaNs em X_train_svm:", np.isnan(X_train_svm).sum())
print("NaNs em X_test_svm :", np.isnan(X_test_svm).sum())


NaNs em X_train_svm: 0
NaNs em X_test_svm : 0


In [84]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

svm_param_grid = {
    'C': [1, 5],
    'gamma': ['scale', 0.1, 0.01],
    'kernel': ['rbf']
}

svm_grid = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=svm_param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1
)

svm_grid.fit(X_train_svm, y_train)
svm_best = svm_grid.best_estimator_
print("SVM best (City):", svm_grid.best_params_)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
SVM best (City): {'C': 5, 'gamma': 'scale', 'kernel': 'rbf'}


In [85]:
y_svm_train = svm_best.predict(X_train_svm)
y_svm_test = svm_best.predict(X_test_svm)

m_train_svm = metrics_paper(y_train, y_svm_train)
m_test_svm = metrics_paper(y_test, y_svm_test)

print("SVM Treino:")
for k, v in m_train_svm.items():
    print(f"  {k}: {v:.3f}")

print("\nSVM Teste:")
for k, v in m_test_svm.items():
    print(f"  {k}: {v:.3f}")

SVM Treino:
  OA: 0.869
  P_1: 0.879
  R_1: 0.794
  F1_1: 0.834
  P_0: 0.862
  R_0: 0.922
  F1_0: 0.891

SVM Teste:
  OA: 0.849
  P_1: 0.859
  R_1: 0.763
  F1_1: 0.808
  P_0: 0.843
  R_0: 0.910
  F1_0: 0.875


### Configura√ß√£o MLflow

In [89]:
import mlflow
from dotenv import load_dotenv
import os

# Carrega vari√°veis do .env (se tiver)
load_dotenv()

# Configura tracking local (cria pasta ./mlruns na raiz do projeto)
mlflow.set_tracking_uri("file:../mlruns")

# Cria/seleciona experimento
mlflow.set_experiment("hotel_booking_cancellation_paper_baseline")

print("‚úÖ MLflow configurado!")
print("üìÅ Pasta de tracking: ./mlruns")
print("üî¨ Experimento: hotel_booking_cancellation_paper_baseline")


2025/12/01 13:37:35 INFO mlflow.tracking.fluent: Experiment with name 'hotel_booking_cancellation_paper_baseline' does not exist. Creating a new experiment.


‚úÖ MLflow configurado!
üìÅ Pasta de tracking: ./mlruns
üî¨ Experimento: hotel_booking_cancellation_paper_baseline


### Ensemble Class

In [104]:
from sklearn.ensemble import VotingClassifier

imputer = SimpleImputer(strategy='median')
X_train_svm = imputer.fit_transform(X_train_processed)
X_test_svm = imputer.transform(X_test_processed)

ensemble = VotingClassifier(
    estimators=[
        ('rf', rf_best),
        ('svm', svm_best),
        ('xgb', xgb_best),
        ('mlp', mlp_best)
    ],
    voting='hard'
)

# usar os dados imputados, sem NaN
ensemble.fit(X_train_svm, y_train)

y_ens_train = ensemble.predict(X_train_svm)
y_ens_test = ensemble.predict(X_test_svm)

m_train_ens = metrics_paper(y_train, y_ens_train)
m_test_ens = metrics_paper(y_test, y_ens_test)

print("Ensemble Treino:")
for k, v in m_train_ens.items():
    print(f"  {k}: {v:.3f}")

print("\nEnsemble Teste:")
for k, v in m_test_ens.items():
    print(f"  {k}: {v:.3f}")


Ensemble Treino:
  OA: 0.888
  P_1: 0.915
  R_1: 0.805
  F1_1: 0.857
  P_0: 0.872
  R_0: 0.947
  F1_0: 0.908

Ensemble Teste:
  OA: 0.870
  P_1: 0.902
  R_1: 0.772
  F1_1: 0.832
  P_0: 0.852
  R_0: 0.940
  F1_0: 0.894


In [105]:
print(f"RF   - OA teste: {m_test_rf['OA']:.3f}")
print(f"SVM  - OA teste: {m_test_svm['OA']:.3f}")
print(f"XGB  - OA teste: {m_test_xgb['OA']:.3f}")
print(f"MLP  - OA teste: {m_test_mlp['OA']:.3f}")
print(f"ENS  - OA teste: {m_test_ens['OA']:.3f}")

RF   - OA teste: 0.858
SVM  - OA teste: 0.849
XGB  - OA teste: 0.882
MLP  - OA teste: 0.855
ENS  - OA teste: 0.870


### Contribui√ß√£o para o paper

In [96]:
from sklearn.impute import SimpleImputer
import numpy as np

imputer = SimpleImputer(strategy='median')

X_train_mlp = imputer.fit_transform(X_train_processed)
X_test_mlp = imputer.transform(X_test_processed)

print("NaNs em X_train_mlp:", np.isnan(X_train_mlp).sum())
print("NaNs em X_test_mlp :", np.isnan(X_test_mlp).sum())

NaNs em X_train_mlp: 0
NaNs em X_test_mlp : 0


In [98]:
mlp_param_grid = {
    'hidden_layer_sizes': [(64,), (64, 32)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [1e-4, 1e-3],
    'learning_rate_init': [0.001],
    'max_iter': [100],
    'batch_size': [256]
}

mlp_grid = GridSearchCV(
    estimator=MLPClassifier(random_state=42),
    param_grid=mlp_param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1
)

# CORRE√á√ÉO: usar X_train_mlp, n√£o X_train_processed
mlp_grid.fit(X_train_mlp, y_train)
mlp_best = mlp_grid.best_estimator_
print("MLP best:", mlp_grid.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
MLP best: {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 256, 'hidden_layer_sizes': (64,), 'learning_rate_init': 0.001, 'max_iter': 100, 'solver': 'adam'}


In [100]:
y_mlp_train = mlp_best.predict(X_train_mlp)
y_mlp_test = mlp_best.predict(X_test_mlp)

m_train_mlp = metrics_paper(y_train, y_mlp_train)
m_test_mlp = metrics_paper(y_test, y_mlp_test)

print("MLP Treino:")
for k, v in m_train_mlp.items():
    print(f"  {k}: {v:.3f}")

print("\nMLP Teste:")
for k, v in m_test_mlp.items():
    print(f"  {k}: {v:.3f}")



MLP Treino:
  OA: 0.879
  P_1: 0.869
  R_1: 0.835
  F1_1: 0.852
  P_0: 0.885
  R_0: 0.910
  F1_0: 0.897

MLP Teste:
  OA: 0.855
  P_1: 0.844
  R_1: 0.802
  F1_1: 0.822
  P_0: 0.863
  R_0: 0.894
  F1_0: 0.878
