<a href="https://colab.research.google.com/github/Juanda16/prediccion_cancelaciones_hoteles/blob/main/proyecto_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
## install kagle to download the DB directly
!pip install kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d mojtaba142/hotel-booking
!unzip hotel-booking.zip

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Dataset URL: https://www.kaggle.com/datasets/mojtaba142/hotel-booking
License(s): copyright-authors
Downloading hotel-booking.zip to /content
  0% 0.00/4.40M [00:00<?, ?B/s]
100% 4.40M/4.40M [00:00<00:00, 578MB/s]
Archive:  hotel-booking.zip
  inflating: hotel_booking.csv       


In [3]:
## set the DB in Memory as 'df' and show the DB info
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

#  --- 1. Load Data into Pandas DataFrame ---
file_name = 'hotel_booking.csv'
try:
    fetched_df = pd.read_csv(file_name)
    print(f"\nDataset '{os.path.basename(file_name)}' loaded into DataFrame successfully!\n")
except FileNotFoundError:
    print(f"Error: CSV file not found at {file_name} after unzipping. Please check the unzipped contents.")
    exit()

# Display initial info to understand data types and non-null counts
print("Initial DataFrame Info:")
fetched_df.info()
print("\n")


Dataset 'hotel_booking.csv' loaded into DataFrame successfully!

Initial DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 36 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  f

In [4]:

# Reduce the dataset size by sampling a percentage of rows
percentage_to_keep = 0.1  # @param {type:"slider", min:0.01, max:1.0, step:0.01}

if percentage_to_keep < 1.0:
    print(f"Reducing dataset size to {percentage_to_keep*100:.2f}% of original rows.")
    df = fetched_df.sample(frac=percentage_to_keep, random_state=42) # Use random_state for reproducibility
    print(f"New dataset size: {df.shape[0]} rows.")

# Separate features (X) and target (y) from the potentially reduced DataFrame
X = df.drop('is_canceled', axis=1)
y = df['is_canceled']

print("\nDataFrame after potential size reduction:")
print(X.head())
print(f"Target variable distribution after potential size reduction:\n{y.value_counts(normalize=True)}\n")

# Re-identify numerical and categorical features based on the potentially reduced DataFrame if necessary,
# though column types should remain consistent.
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

print(f"Numerical features identified: {numerical_features}")
print(f"Categorical features identified: {categorical_features}\n")


Reducing dataset size to 10.00% of original rows.
New dataset size: 11939 rows.

DataFrame after potential size reduction:
               hotel  lead_time  arrival_date_year arrival_date_month  \
30946   Resort Hotel        203               2016           December   
40207     City Hotel         82               2015               July   
103708    City Hotel         25               2016           December   
85144     City Hotel          1               2016              March   
109991    City Hotel         70               2017              April   

        arrival_date_week_number  arrival_date_day_of_month  \
30946                         49                          2   
40207                         29                         16   
103708                        53                         27   
85144                         11                          9   
109991                        16                         16   

        stays_in_weekend_nights  stays_in_week_nights  adul

In [5]:
## Initial Data Cleaning & Feature Engineering ---

#Se limpia country por fuera del pipeline para que no quede agregado en el dataset final como categórico
#db['country'].fillna(db['country'].mode()[0], inplace=True)
# Se eliminan las columnas que no tienen sentido para el entrenamiento
# La característica company se elimina porque solo existen 6797 valores de el total (119390) lo que corresponde a un porcentaje de faltante del 94.3%
# La característica reservation_status se elimina porque es una variable de salida
# La característica reservation_status_date se elimina porque es una fecha
#db = db.drop(columns=['company'])
#db.info()

# Handle missing values
df['children'] = df['children'].fillna(0)
df['agent'] = df['agent'].fillna(0)
df['company'] = df['company'].fillna(0)


# Handle 'adr' (Average Daily Rate): Remove rows with 0 or negative ADR, as they are often data errors.
df = df[df['adr'] >= 0]
df.replace([np.inf, -np.inf], np.nan, inplace=True) # Replace any inf with NaN if they exist
df.dropna(subset=['adr'], inplace=True) # Drop rows where adr might have become NaN after inf replacement

# Remove rows where adults, children, and babies are all zero.
initial_rows = df.shape[0]
df = df[df['adults'] + df['children'] + df['babies'] > 0]
print(f"Removed {initial_rows - df.shape[0]} rows with 0 total guests.\n")

# # Feature Engineering
# df['total_nights_stay'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
# df['total_guests'] = df['adults'] + df['children'] + df['babies']
# df['room_type_changed'] = (df['assigned_room_type'] != df['reserved_room_type']).astype(int)



# # Drop features that are leakage or less relevant after engineering
# df = df.drop(columns=['reservation_status', 'reservation_status_date',
#                       'stays_in_weekend_nights', 'stays_in_week_nights',
#                       'adults', 'children', 'babies',
#                       'assigned_room_type', 'reserved_room_type'])

# Remove 'reservation_status' and 'reservation_status_date' as they directly indicate cancellation,
# and are leakage if used for predicting 'is_canceled'.
df = df.drop(columns=['reservation_status', 'reservation_status_date'])

# Convert month names to numbers for consistency if needed later, or handle as categorical
# For now, let's keep 'arrival_date_month' as categorical since OneHotEncoder will handle it.
# If you want numerical month, you would do:
# month_map = {'January':1, ..., 'December':12}
# df['arrival_date_month'] = df['arrival_date_month'].map(month_map)

# Separate features (X) and target (y)
X = df.drop('is_canceled', axis=1)
y = df['is_canceled']


print("DataFrame after initial cleaning and feature engineering:")
print(X.head())
print(f"Target variable distribution:\n{y.value_counts(normalize=True)}\n")

# Identify numerical and categorical features for preprocessing pipelines
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

print(f"Numerical features identified: {numerical_features}")
print(f"Categorical features identified: {categorical_features}\n")

Removed 26 rows with 0 total guests.

DataFrame after initial cleaning and feature engineering:
               hotel  lead_time  arrival_date_year arrival_date_month  \
30946   Resort Hotel        203               2016           December   
40207     City Hotel         82               2015               July   
103708    City Hotel         25               2016           December   
85144     City Hotel          1               2016              March   
109991    City Hotel         70               2017              April   

        arrival_date_week_number  arrival_date_day_of_month  \
30946                         49                          2   
40207                         29                         16   
103708                        53                         27   
85144                         11                          9   
109991                        16                         16   

        stays_in_weekend_nights  stays_in_week_nights  adults  children  ...  \
30946 

In [6]:
# --- 3. Define Preprocessing Steps using Pipelines ---

# Numerical features will be scaled
numerical_transformer = StandardScaler()

# Categorical features will be One-Hot Encoded
# handle_unknown='ignore' will set unknown categories to zeros, preventing errors during prediction
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a preprocessor using ColumnTransformer
# This applies different transformers to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough' # Keep columns not specified (e.g., if there were others)
)

print("Preprocessing pipeline defined successfully.\n")

Preprocessing pipeline defined successfully.



In [7]:
# --- 4. Model Training and Evaluation Setup ---
# Stratified K-Fold Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to evaluate a model
def evaluate_model(model, X_data, y_data, model_name="Model"):
    f1_scores, auc_roc_scores = [], []
    accuracy_scores, precision_scores, recall_scores = [], [], []

    print(f"\n--- Evaluating {model_name} ---")
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_data, y_data)):
        X_train, X_val = X_data.iloc[train_idx], X_data.iloc[val_idx]
        y_train, y_val = y_data.iloc[train_idx], y_data.iloc[val_idx]

        # Fit the pipeline (includes preprocessing and SMOTE on training data)
        model.fit(X_train, y_train)

        # Make predictions on the validation set
        y_pred = model.predict(X_val)
        y_proba = model.predict_proba(X_val)[:, 1] # Probability for the positive class

        # Calculate metrics
        accuracy_scores.append(accuracy_score(y_val, y_pred))
        precision_scores.append(precision_score(y_val, y_pred))
        recall_scores.append(recall_score(y_val, y_pred))
        f1_scores.append(f1_score(y_val, y_pred))
        auc_roc_scores.append(roc_auc_score(y_val, y_proba))

        print(f"  Fold {fold+1}: F1 = {f1_scores[-1]:.3f}, AUC = {auc_roc_scores[-1]:.3f}")

    print(f"\n{model_name} - Average Results:")
    print(f"  Accuracy: {np.mean(accuracy_scores):.3f} +/- {np.std(accuracy_scores)*2:.3f} (95% CI)")
    print(f"  Precision: {np.mean(precision_scores):.3f} +/- {np.std(precision_scores)*2:.3f} (95% CI)")
    print(f"  Recall: {np.mean(recall_scores):.3f} +/- {np.std(recall_scores)*2:.3f} (95% CI)")
    print(f"  F1-Score: {np.mean(f1_scores):.3f} +/- {np.std(f1_scores)*2:.3f} (95% CI)")
    print(f"  AUC-ROC: {np.mean(auc_roc_scores):.3f} +/- {np.std(auc_roc_scores)*2:.3f} (95% CI)")

    return {
        'F1-Score': np.mean(f1_scores),
        'AUC-ROC': np.mean(auc_roc_scores),
        'F1-CI': np.std(f1_scores) * 2,
        'AUC-CI': np.std(auc_roc_scores) * 2
    }

In [None]:
# --- . Model Implementations ---

# All models will be part of an ImbPipeline that includes:
# 1. Preprocessing (scaling numerical, one-hot encoding categorical)
# 2. SMOTE (applied only to training data in each fold)
# 3. The classifier itself

print("\n--- Starting Model Training and Evaluation ---")

### Model 1: Logistic Regression
from sklearn.linear_model import LogisticRegression


print("\n--- Model 1: Logistic Regression ---")

# Logistic Regression model implementation and evaluation

class DenseTransformer():
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.toarray()

pipeline_lr = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(random_state=42, solver='saga', penalty='l1'))
])

param_grid_lr = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100] # C values to test
}
grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=cv, scoring='f1', n_jobs=-1, verbose=1)
grid_search_lr.fit(X, y)

print(f"Best parameters for Logistic Regression: {grid_search_lr.best_params_}")

best_lr_model = grid_search_lr.best_estimator_

results_lr = evaluate_model(best_lr_model, X, y, "Optimized Logistic Regression")


--- Starting Model Training and Evaluation ---

--- Model 1: Gaussian Naive Bayes ---
Fitting 5 folds for each of 6 candidates, totalling 30 fits




Best parameters for Logistic Regression: {'classifier__C': 1}

--- Evaluating Optimized Logistic Regression ---
  Fold 1: F1 = 0.747, AUC = 0.882


In [12]:


### Model 2: K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

print("\n--- Model 2: K-Nearest Neighbors (KNN) ---")
pipeline_knn = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', KNeighborsClassifier())
])

param_grid_knn = {
    'classifier__n_neighbors': [3, 5, 7, 9, 11],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__p': [1, 2] # 1 for Manhattan distance, 2 for Euclidean distance
}

grid_search_knn = GridSearchCV(pipeline_knn, param_grid_knn, cv=cv, scoring='f1', n_jobs=-1, verbose=1)
grid_search_knn.fit(X, y)

print(f"Best parameters for KNN: {grid_search_knn.best_params_}")
best_knn_model = grid_search_knn.best_estimator_
results_knn = evaluate_model(best_knn_model, X, y, "Optimized KNN")




--- Model 2: K-Nearest Neighbors (KNN) ---
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters for KNN: {'classifier__n_neighbors': 11, 'classifier__p': 1, 'classifier__weights': 'distance'}

--- Evaluating Optimized KNN ---
  Fold 1: F1 = 0.726, AUC = 0.873
  Fold 2: F1 = 0.764, AUC = 0.892
  Fold 3: F1 = 0.739, AUC = 0.876
  Fold 4: F1 = 0.727, AUC = 0.874
  Fold 5: F1 = 0.761, AUC = 0.886

Optimized KNN - Average Results:
  Accuracy: 0.814 +/- 0.024 (95% CI)
  Precision: 0.779 +/- 0.038 (95% CI)
  Recall: 0.711 +/- 0.031 (95% CI)
  F1-Score: 0.744 +/- 0.033 (95% CI)
  AUC-ROC: 0.880 +/- 0.015 (95% CI)


In [None]:
### Model 3: Random Forest (Ensemble of Decision Trees)
from sklearn.ensemble import RandomForestClassifier

print("\n--- Model 3: Random Forest ---")
pipeline_rf = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid_rf = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__max_features': ['sqrt','log2',0.5],
}

# Note: Make sure X and y are defined and represent your features and target
# (These should be defined in previous data loading/cleaning steps)
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=cv, scoring='f1', n_jobs=-1, verbose=1)
grid_search_rf.fit(X, y) # Fit on the full data after cleaning and feature engineering

print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")
best_rf_model = grid_search_rf.best_estimator_
# Evaluate on the full data for final results
results_rf = evaluate_model(best_rf_model, X, y, "Optimized Random Forest")


--- Model 3: Random Forest ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__n_estimators': 200}

--- Evaluating Optimized Random Forest ---
  Fold 1: F1 = 0.722, AUC = 0.913
  Fold 2: F1 = 0.775, AUC = 0.931
  Fold 3: F1 = 0.757, AUC = 0.918
  Fold 4: F1 = 0.737, AUC = 0.918
  Fold 5: F1 = 0.762, AUC = 0.925

Optimized Random Forest - Average Results:
  Accuracy: 0.840 +/- 0.021 (95% CI)
  Precision: 0.910 +/- 0.023 (95% CI)
  Recall: 0.639 +/- 0.045 (95% CI)
  F1-Score: 0.751 +/- 0.037 (95% CI)
  AUC-ROC: 0.921 +/- 0.013 (95% CI)


In [None]:
### Model 4: Artificial Neural Network (ANN) - Multi-layer Perceptron (MLP)
from sklearn.neural_network import MLPClassifier

print("\n--- Model 4: Artificial Neural Network (MLP) ---")
pipeline_mlp = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', MLPClassifier(random_state=42, max_iter=500)) # Increased max_iter for convergence
])

param_grid_mlp = {
    'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50), (100,50)],
    'classifier__activation': ['relu', 'tanh'],
    'classifier__solver': ['adam', 'sgd'],
    'classifier__alpha': [0.0001, 0.001, 0.01], # L2 regularization
    'classifier__learning_rate': ['constant', 'adaptive'] # Adaptive learning rate
}

grid_search_mlp = GridSearchCV(pipeline_mlp, param_grid_mlp, cv=cv, scoring='f1', n_jobs=-1, verbose=1)
grid_search_mlp.fit(X, y)

print(f"Best parameters for MLP: {grid_search_mlp.best_params_}")
best_mlp_model = grid_search_mlp.best_estimator_
results_mlp = evaluate_model(best_mlp_model, X, y, "Optimized MLP")




--- Model 4: Artificial Neural Network (MLP) ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:

### Model 5: Support Vector Machine (SVM)
from sklearn.svm import SVC

# SVMs can be computationally expensive with larger datasets.
# Using a linear kernel or a smaller subset of data might be necessary if it's too slow.
# For simplicity, let's use a smaller C range and a linear/rbf kernel.
print("\n--- Model 5: Support Vector Machine (SVM) ---")
pipeline_svc = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', SVC(random_state=42, probability=True)) # probability=True needed for predict_proba
])

param_grid_svc = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf', 'poly'],
    'classifier__gamma': ['scale', 'auto', 0.1, 1]
}

grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=cv, scoring='f1', n_jobs=-1, verbose=1)
grid_search_svc.fit(X, y)

print(f"Best parameters for SVM: {grid_search_svc.best_params_}")
best_svc_model = grid_search_svc.best_estimator_
results_svc = evaluate_model(best_svc_model, X, y, "Optimized SVM")



In [None]:

print("\n--- All Model Evaluations Complete ---")

# You can collect all results here to generate your summary table
all_results = {
    "Gaussian Naive Bayes": results_gnb,
    "KNN": results_knn,
    "Random Forest": results_rf,
    "MLP": results_mlp,
    "SVM": results_svc
}

print("\n--- Summary of Best Model Performance ---")
for model_name, metrics in all_results.items():
    print(f"{model_name}: F1-Score = {metrics['F1-Score']:.3f} +/- {metrics['F1-CI']:.3f}, AUC-ROC = {metrics['AUC-ROC']:.3f} +/- {metrics['AUC-CI']:.3f}")
