# Classification Model Training
## CRISP-DM Phase 4: Modeling - Cancellation Prediction

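This notebook trains two classification models (Random Forest and XGBoost) to predict hotel booking cancellations, evaluates and compares them on a held-out test set, and saves the XGBoost model to disk.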

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc
import sys
sys.path.insert(0, '../src')

from preprocessing import load_data, clean_data, engineer_features, prepare_classification_data
from classification_model import CancellationClassifier

# Apply the seaborn "whitegrid" look to every figure in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names only exist in newer
# matplotlib releases (3.6+, per the matplotlib style-sheet docs) — confirm
# the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

## 1. Load and Prepare Data

In [None]:
# Build the modelling frame: read the raw bookings CSV, then run it through
# the cleaning and feature-engineering steps from the preprocessing module.
raw_bookings_csv = '../data/raw/hotel_bookings.csv'
df = engineer_features(clean_data(load_data(raw_bookings_csv)))

# Derive the train/test features and targets (plus the feature names)
# used by every model cell below.
(
    X_train,
    X_test,
    y_train,
    y_test,
    feature_names,
) = prepare_classification_data(df)

# Sanity check: split sizes and the relative frequency of each target value
# in the training set.
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nTarget distribution (train):")
print(y_train.value_counts(normalize=True))

## 2. Train Models

In [None]:
# Fit the Random Forest variant of the cancellation classifier on the
# training split.
print("Training Random Forest...")
rf_clf = CancellationClassifier(model_type='random_forest')
rf_clf.train(X_train, y_train)

# Evaluate on the test split; the returned metrics are reused by the
# model-comparison cell later in the notebook.
divider = "=" * 50
print("\n" + divider)
print("Random Forest Results:")
rf_metrics = rf_clf.evaluate(X_test, y_test)

In [None]:
# Fit the XGBoost variant of the cancellation classifier on the same
# training split used for the Random Forest.
print("Training XGBoost...")
xgb_clf = CancellationClassifier(model_type='xgboost')
xgb_clf.train(X_train, y_train)

# Evaluate on the test split; the returned metrics are reused by the
# comparison and final-summary cells later in the notebook.
divider = "=" * 50
print("\n" + divider)
print("XGBoost Results:")
xgb_metrics = xgb_clf.evaluate(X_test, y_test)

## 3. Model Comparison

In [None]:
# Side-by-side metrics table: after transposing, each model is a row and
# each metric returned by evaluate() is a column.
metrics_by_model = {
    'Random Forest': rf_metrics,
    'XGBoost': xgb_metrics,
}
comparison = pd.DataFrame(metrics_by_model).transpose()

# axis=0 applies the colour gradient column by column, so the two models are
# compared within each metric. Left as the last expression so the notebook
# renders the styled table.
comparison.style.background_gradient(cmap='Greens', axis=0)

In [None]:
# ROC curves: overlay both classifiers on one axes, using test-set scores.
fig, ax = plt.subplots(figsize=(8, 6))

for name, clf in [('Random Forest', rf_clf), ('XGBoost', xgb_clf)]:
    y_proba = np.asarray(clf.predict_proba(X_test))
    # roc_curve expects exactly one score per sample. If the wrapper returns
    # a scikit-learn style (n_samples, n_classes) matrix, keep only the last
    # column; a 1-D result passes through unchanged.
    # NOTE(review): the last column is assumed to be the "cancelled" class —
    # confirm against CancellationClassifier.predict_proba.
    if y_proba.ndim == 2:
        y_proba = y_proba[:, -1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

# Diagonal reference line: the expected curve of a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves Comparison')
ax.legend()
plt.tight_layout()
plt.show()

## 4. Feature Importance

In [None]:
# Horizontal bar chart of the first 15 rows of the XGBoost importance table.
# NOTE(review): head(15) only yields the "top 15" if get_feature_importance()
# returns rows already sorted by descending importance — confirm.
importance = xgb_clf.get_feature_importance()
top_features = importance.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['importance'], color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance')
ax.set_title('Top 15 Most Important Features (XGBoost)')
# Flip the y-axis so the first row of the table is drawn at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 5. Confusion Matrix

In [None]:
# Confusion matrix of the XGBoost predictions on the test split, drawn as an
# annotated heatmap of raw counts (rows labelled Actual, columns Predicted).
y_pred = xgb_clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# NOTE(review): the tick labels assume the matrix order is
# [not cancelled, cancelled] — confirm the target encoding.
class_labels = ['Not Cancelled', 'Cancelled']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix (XGBoost)')
fig.tight_layout()
plt.show()

## 6. Save Best Model

In [None]:
# Persist the XGBoost classifier and print its test-set metrics.
# NOTE(review): XGBoost is hard-coded as the model to keep; nothing here
# checks it against rf_metrics — confirm with the comparison table before
# shipping the artefact.
from pathlib import Path

model_path = '../models/cancellation_model.pkl'
# Create the output directory up front so the save does not fail on a fresh
# checkout where ../models/ does not exist yet.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
xgb_clf.save(model_path)

print("Model saved successfully!")
print("\nFinal XGBoost Metrics:")
for metric, value in xgb_metrics.items():
    print(f"  {metric}: {value:.4f}")