# Hotel Booking ML Project
## CRISP-DM Phase 1-2: Business & Data Understanding

This notebook covers:
1. Business Understanding
2. Data Loading & EDA
3. Data Cleaning & Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration.
# Show up to 50 columns in DataFrame output before pandas truncates the view.
pd.set_option('display.max_columns', 50)
# NOTE(review): the 'seaborn-v0_8-*' style names appear to exist only in newer
# Matplotlib releases (3.6+) — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
# Render figures inline in the notebook output area.
%matplotlib inline

ModuleNotFoundError: No module named 'pandas'

## 1. Business Understanding

**Business Questions:**
1. Which bookings are likely to be cancelled?
2. What will be the demand for rooms next month?

**Success Criteria:**
- Cancellation prediction accuracy ≥ 85%
- Demand forecast MAPE < 15%

## 2. Data Loading

Dataset: Hotel Booking Demand from Kaggle

In [None]:
# Read the raw bookings CSV into the working frame used by the cells below.
df = pd.read_csv('../data/raw/hotel_bookings.csv')

# Report the frame's dimensions, then (after a blank line) every column name.
shape_line = f"Dataset shape: {df.shape}"
columns_line = f"\nColumns: {df.columns.tolist()}"
print(shape_line, columns_line, sep="\n")

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Data types and missing values: print pandas' per-column summary
# (dtype and non-null count) so columns with gaps stand out.
df.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max).
# With default arguments pandas limits this to the numeric columns.
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Overall cancellation rate. `is_canceled` is a 0/1 flag, so its mean is the
# cancelled share of all bookings.
cancel_rate = df['is_canceled'].mean() * 100
print(f"Overall Cancellation Rate: {cancel_rate:.2f}%")

# Two panels: rate per hotel type (left) and overall class split (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Cancellation by hotel type.
# Tick labels are taken from the groupby index instead of a hard-coded list,
# so they cannot mislabel (or raise) if the hotel categories ever differ.
rate_by_hotel = df.groupby('hotel')['is_canceled'].mean()
rate_by_hotel.plot(kind='bar', ax=axes[0], color=['steelblue', 'coral'], rot=0)
axes[0].set_title('Cancellation Rate by Hotel Type')
axes[0].set_ylabel('Cancellation Rate')

# Cancellation distribution.
# sort_index() fixes the wedge order by class value (0, then 1) so each colour
# and label is tied to the class itself rather than to whichever class happens
# to be more frequent.
status_counts = df['is_canceled'].value_counts().sort_index()
status_names = {0: 'Not Cancelled', 1: 'Cancelled'}
status_colors = {0: 'lightgreen', 1: 'salmon'}
status_counts.plot(
    kind='pie',
    ax=axes[1],
    autopct='%1.1f%%',
    labels=[status_names.get(value, str(value)) for value in status_counts.index],
    colors=[status_colors.get(value, 'lightgray') for value in status_counts.index],
)
axes[1].set_title('Cancellation Distribution')
# Drop the automatic y-label pandas puts next to the pie; it adds no information.
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Lead time distribution, split by cancellation status.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# Select each class with a single .loc call instead of chained indexing.
lead_not_cancelled = df.loc[df['is_canceled'] == 0, 'lead_time']
lead_cancelled = df.loc[df['is_canceled'] == 1, 'lead_time']

# Both histograms must share the same bin edges; otherwise each subset gets its
# own 50 bins over its own range and the overlaid bars are not comparable.
shared_bins = np.histogram_bin_edges(df['lead_time'].dropna(), bins=50)

# Lead time by cancellation status
lead_not_cancelled.hist(ax=axes[0], bins=shared_bins, alpha=0.7, label='Not Cancelled')
lead_cancelled.hist(ax=axes[0], bins=shared_bins, alpha=0.7, label='Cancelled')
axes[0].set_title('Lead Time Distribution by Cancellation Status')
axes[0].set_xlabel('Lead Time (days)')
axes[0].set_ylabel('Number of Bookings')
axes[0].legend()

# Lead time boxplot
df.boxplot(column='lead_time', by='is_canceled', ax=axes[1])
# boxplot(by=...) adds a figure-level "Boxplot grouped by ..." suptitle that
# clutters a multi-panel figure; clear it and rely on the per-axes titles.
fig.suptitle('')
axes[1].set_title('Lead Time by Cancellation')
axes[1].set_xlabel('Cancelled (0 = No, 1 = Yes)')
axes[1].set_ylabel('Lead Time (days)')

plt.tight_layout()
plt.show()

In [None]:
# Number of bookings per arrival month, displayed in calendar order.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# groupby orders the month names alphabetically; reindex restores calendar order.
monthly = df.groupby('arrival_date_month').size().reindex(month_order)

fig, ax = plt.subplots(figsize=(12, 4))
monthly.plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Bookings by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Bookings')
# Tilt the month names so the twelve labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap.
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr = df[numeric_cols].corr()

# Boolean mask covering the diagonal and upper triangle, so each pair of
# features is drawn only once (in the lower triangle).
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr, mask=mask, annot=False, cmap='RdBu_r', center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
plt.show()

## 4. Missing Values Analysis

In [None]:
# Null count per column and the same figure as a percentage of all rows.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Place both measures side by side, one row per column of `df`.
missing_df = pd.concat(
    [missing.rename('Missing'), missing_pct.rename('Percentage')],
    axis=1,
)
# Display only the columns that actually have gaps, worst offender first.
missing_df.loc[missing_df['Missing'] > 0].sort_values(by='Percentage', ascending=False)

## 5. Save Cleaned Data

In [None]:
import sys
from pathlib import Path

# Make the project's src/ directory importable. The membership check keeps
# re-runs of this cell from prepending duplicate entries to sys.path.
if '../src' not in sys.path:
    sys.path.insert(0, '../src')

from preprocessing import clean_data, engineer_features

# Clean and engineer features using the project's preprocessing helpers.
df_clean = clean_data(df)
df_final = engineer_features(df_clean)

# Report how many rows survived the pipeline.
print(f"Original: {len(df)} rows")
print(f"After cleaning: {len(df_final)} rows")

# Save processed data. Create the output directory first so the write does not
# fail on a fresh checkout where data/processed/ does not exist yet.
processed_path = Path('../data/processed/hotel_bookings_processed.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(processed_path, index=False)
print("\nProcessed data saved!")