# Hotel Reservation Cancellation Prediction

This notebook demonstrates an end-to-end machine learning workflow for predicting hotel booking cancellations. The process includes data loading, cleaning, basic exploratory data analysis (EDA), model training, evaluation, and generating predictions for a test dataset. The goal is to predict whether a given reservation will be cancelled (1) or not cancelled (0).



## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Enable inline plots for Jupyter notebooks
%matplotlib inline


## Load Data

In [None]:
# Locations of the two CSV inputs, relative to the notebook's working directory
train_path = 'train.csv'
test_path = 'test.csv'

# Read both files into DataFrames in one pass
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Keep each frame's (rows, columns) tuple and show both as the cell output
train_df_shape, test_df_shape = train_df.shape, test_df.shape
train_df_shape, test_df_shape


## Data Cleaning

In [None]:
# Drop exact duplicate rows from the training data only. The test frame is left
# untouched so that one prediction is produced for every row of the test file,
# in its original order.
train_df = train_df.drop_duplicates().reset_index(drop=True)

# Learn one fill value per column from the training data and reuse it for the
# test data, so both frames are imputed with the same values:
#   * numeric columns       -> training median
#   * text (object) columns -> most frequent training value
fill_values = {}
for col in train_df.select_dtypes(include=[np.number]).columns:
    median = train_df[col].median()
    if pd.notna(median):  # an all-missing column has no median; leave it as-is
        fill_values[col] = median
for col in train_df.select_dtypes(include=['object']).columns:
    modes = train_df[col].mode()
    if not modes.empty:  # an all-missing column has no mode; leave it as-is
        fill_values[col] = modes.iloc[0]

# DataFrame.fillna with a dict fills each listed column with its own value and
# returns a new frame; keys that are not columns of the frame are skipped.
# Assigning the result back replaces the chained
# `df[col].fillna(..., inplace=True)` pattern, which recent pandas versions
# warn about and which does not update the parent frame under Copy-on-Write.
train_df = train_df.fillna(value=fill_values)
test_df = test_df.fillna(value=fill_values)

# Show summary of cleaned training dataset
train_df.info()


## Exploratory Data Analysis

In [None]:
# Preview the top five rows of the training frame (rendered as the cell output)
train_df_head = train_df.head(n=5)
train_df_head


In [None]:
# Summary statistics of the training frame, with the quartiles spelled out
# (these are the percentiles pandas reports by default)
train_df_describe = train_df.describe(percentiles=[0.25, 0.5, 0.75])
train_df_describe


In [None]:
# Count how many rows fall into each value of the target column
class_counts = train_df['booking_status'].value_counts()
print("Class distribution (counts):")
print(class_counts)

# Draw the same counts as a bar chart using the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(class_counts.index.astype(str), class_counts.values)
ax.set_title("Target class distribution")
ax.set_xlabel("booking_status")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap
numeric_cols = train_df.select_dtypes(include=[np.number]).columns

if len(numeric_cols) <= 1:
    # Fewer than two numeric columns: nothing to correlate
    print("Not enough numeric features for a correlation matrix.")
else:
    corr = train_df[numeric_cols].corr()
    tick_positions = range(len(numeric_cols))

    fig, ax = plt.subplots(figsize=(8, 6))
    heatmap = ax.imshow(corr, interpolation='nearest', aspect='auto')
    fig.colorbar(heatmap, ax=ax)

    # One tick per column on both axes, labelled with the column names
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(numeric_cols, rotation=45, ha='right')
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(numeric_cols)

    ax.set_title("Correlation heatmap (numeric features)")
    fig.tight_layout()
    plt.show()


## Model Training and Evaluation

In [None]:
# One seed shared by the split and the model so this cell is reproducible
RANDOM_STATE = 42

# Separate features and target
X = train_df.drop(columns=['booking_status'])
y = train_df['booking_status']

# Object-dtype columns are treated as categorical; every other column is
# passed to the model unchanged
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# One-hot encode the categorical columns. handle_unknown='ignore' keeps
# prediction from failing on a category that was not present during fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ]
)

# Hold out 20% of the rows for validation; stratify=y keeps the class
# proportions the same in both parts
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Preprocessing and model live in one pipeline, so the encoder is fitted only
# on the rows passed to fit()
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier(
            n_estimators=300,
            random_state=RANDOM_STATE,
            class_weight='balanced',
            n_jobs=-1
        ))
    ]
)

# Train the model
clf.fit(X_train, y_train)

# Predict on validation set
val_preds = clf.predict(X_val)

# Compute evaluation metrics (precision/recall/F1 use scikit-learn's default
# binary settings)
val_accuracy = accuracy_score(y_val, val_preds)
val_precision = precision_score(y_val, val_preds)
val_recall = recall_score(y_val, val_preds)
val_f1 = f1_score(y_val, val_preds)

# Display metrics
print("Validation Accuracy:", val_accuracy)
print("Validation Precision:", val_precision)
print("Validation Recall:", val_recall)
print("Validation F1 Score:", val_f1)

# The report is a multi-line table: print it below its label so the header
# row stays aligned with the columns underneath it
print("Classification Report:")
print(classification_report(y_val, val_preds))


## Train on Full Data and Predict on Test Set

In [None]:
# Refit the pipeline, this time on every labelled row
clf.fit(X, y)

# Score the cleaned test frame
preds = clf.predict(test_df)

# Translate each numeric prediction into its text label
label_map = {1: 'Canceled', 0: 'Not_Canceled'}
pred_labels = [label_map[code] for code in map(int, preds)]

# Single-column submission frame, written to disk without the index
submission_df = pd.DataFrame({'booking_status': pred_labels})
submission_df.to_csv('submission.csv', index=False)

# Show the top of the submission as a quick sanity check
submission_df.head()
