# Airbnb Revenue Prediction with HistGradientBoostingRegressor
-------------------------------------------------------------

This script complements the **XGBoost pipeline** by providing an alternative 
implementation using scikit-learn’s **HistGradientBoostingRegressor (HGBR)**.

It includes:
- Preprocessing of numerical and categorical features
- A HistGradientBoostingRegressor with tuned hyperparameters
- Model evaluation (train/validation MAE)

The purpose of this script is to **compare performance and interpretability** 
between HGBR and XGBoost. While XGBoost provides built-in feature importance scores, 
HGBR relies on **permutation importance** for model-agnostic interpretability.


## Imports 

In [1]:
import logging
import json
import zipfile

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.inspection import permutation_importance

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

from sklearn.ensemble import HistGradientBoostingRegressor

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## Defining a baseline function containing HistGradientBoostingRegressor

In [2]:
def _build_preprocessor(seed):
    """Return the unfitted ColumnTransformer used by every HGB pipeline in this script."""
    numerical_columns = [
        "lat", "lon", "bathrooms", "rooms", "guests",
        "num_reviews", "rating", "min_nights",
    ]
    categorical_columns = ["room_type", "cancellation"]

    return ColumnTransformer(
        transformers=[
            # Numerical features - impute then scale
            ("numerical", Pipeline(steps=[
                ('imputer', IterativeImputer(random_state=seed, max_iter=10)),
                ('scaler', StandardScaler()),
            ]), numerical_columns),

            # Categorical features - impute then one-hot encode
            ("categorical", Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
            ]), categorical_columns),
        ],
        # Columns not listed above are not passed to the model.
        remainder='drop',
    )


def hist_baseline(train_path: str = "train.json", test_path: str = "test.json",
                  seed: int = 123, *, tune: bool = False, predictions_path=None):
    """
    Train a Histogram Gradient Boosting Regressor on the Airbnb dataset.

    The target column ``revenue`` is modelled in ``log1p`` space; predictions
    are mapped back with ``expm1`` before the MAE is computed, so the logged
    MAE values are in the original revenue units.

    Parameters:
    -----------
    train_path : Path of the JSON (records orient) training file.
    test_path : Path of the JSON (records orient) test file. It is only read
        when ``predictions_path`` is given.
    seed : Random state used for the train/validation split, the iterative
        imputer and the regressor.
    tune : If True, run a 5-fold GridSearchCV over the hyperparameter grid
        instead of fitting the fixed, pre-tuned configuration.
        Warning: VERY computationally expensive.
    predictions_path : If not None, predict the test set and write the result
        as ``predicted.json`` inside a zip archive at this path
        (e.g. ``"baseline.zip"``).

    Returns:
    --------
    best_model : Fitted pipeline with preprocessing & HistGradientBoostingRegressor.
    grid_search : Grid search object if tuning was run, else None

    """
    label = 'revenue'

    logging.info("Reading training data from %s", train_path)
    full_train = pd.read_json(train_path, orient='records')

    # Split train into train and validation
    train, valid = train_test_split(full_train, test_size=1/3, random_state=seed)

    X_train = train.drop([label], axis=1)
    y_train = np.log1p(train[label].values)

    preprocess = _build_preprocessor(seed)

    if tune:
        logging.info("Starting grid search for hyperparameter tuning...")

        # Pipeline with a basic model; the searched parameters are set by the grid.
        hgb_regressor = Pipeline(steps=[
            ('preprocess', preprocess),
            ('HGB', HistGradientBoostingRegressor(
                validation_fraction=0.15,
                early_stopping=True,
                n_iter_no_change=10,
                random_state=seed,
            )),
        ])

        param_grid = {
            'HGB__learning_rate': [0.01, 0.05, 0.1],
            'HGB__max_depth': [2, 3, 4],
            'HGB__l2_regularization': [0.01, 0.1, 1.0],
            'HGB__max_iter': [100, 200, 300],
            'HGB__min_samples_leaf': [10, 20, 30],
        }

        grid_search = GridSearchCV(
            hgb_regressor,
            param_grid,
            cv=5,
            scoring='neg_mean_absolute_error',
            n_jobs=-1,
            verbose=1,
        )
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        print(f'Best parameters: {grid_search.best_params_}')
        print(f'Best CV score (neg MAE): {grid_search.best_score_:.4f}')
    else:
        # Fixed configuration (previously tuned hyperparameters).
        hgb_regressor = Pipeline(steps=[
            ('preprocess', preprocess),
            ('HGB', HistGradientBoostingRegressor(
                learning_rate=0.1,
                max_iter=300,
                max_depth=3,
                min_samples_leaf=30,
                l2_regularization=1.0,
                max_features=0.7,
                validation_fraction=0.15,
                early_stopping=True,
                n_iter_no_change=15,
                random_state=seed,
            )),
        ])
        hgb_regressor.fit(X_train, y_train)
        best_model = hgb_regressor
        grid_search = None

    logging.info("Evaluating model performance...")

    for split_name, split in [("Train", train), ("Valid", valid)]:
        X_split = split.drop([label], axis=1)
        y_true = split[label].values

        # Predict and reverse log transform
        y_pred = np.expm1(best_model.predict(X_split))

        mae = mean_absolute_error(y_true, y_pred)
        logging.info("%5s - MAE: %.3f", split_name, mae)

    if predictions_path is not None:
        logging.info("Generating test predictions...")
        test = pd.read_json(test_path, orient='records')

        # Reverse the log transform and ensure no negative predictions.
        pred_test = np.maximum(np.expm1(best_model.predict(test)), 0)
        predicted = [{label: float(value)} for value in pred_test]

        logging.info("Saving predictions to %s...", predictions_path)
        with zipfile.ZipFile(predictions_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            zipf.writestr("predicted.json", json.dumps(predicted, indent=2))

        logging.info("Pipeline completed successfully!")

    return best_model, grid_search


## Defining a function to analyze permutation importance

In [3]:

def permutation_feature_importance(pipeline_model, X, y, seed=123, n_repeats=10):
    """
    Compute permutation feature importance for the final estimator of a fitted pipeline.

    The pipeline must expose the named steps ``'preprocess'`` and ``'HGB'``.
    ``X`` is run through the preprocessing step once, and the importance is
    then measured on the ``'HGB'`` step alone, so each row of the result
    corresponds to one transformed (post-preprocessing) feature.

    Parameters:
    -----------
    pipeline_model : Fitted pipeline with 'preprocess' and 'HGB' steps.
    X : Raw feature data accepted by the 'preprocess' step.
    y : Target values passed to ``permutation_importance`` together with the
        transformed features.
    seed : Random state forwarded to ``permutation_importance``.
    n_repeats : Number of permutation repeats per feature.

    Returns:
    --------
    importances : DataFrame with columns 'feature', 'importance_mean' and
        'importance_std', sorted by 'importance_mean' in descending order.

    """
    steps = pipeline_model.named_steps
    transformer = steps['preprocess']

    # Permute the already-transformed columns and score the estimator alone.
    result = permutation_importance(
        steps['HGB'],
        transformer.transform(X),
        y,
        n_repeats=n_repeats,
        random_state=seed,
        n_jobs=-1,
    )

    table = pd.DataFrame({
        "feature": transformer.get_feature_names_out(),
        "importance_mean": result.importances_mean,
        "importance_std": result.importances_std,
    })
    return table.sort_values("importance_mean", ascending=False)


## Executing HistGradientBoostingRegressor 

In [4]:
if __name__ == "__main__":
    best_model, _ = hist_baseline()

    # Rebuild the validation split for feature importance: reloading the same
    # file and splitting with the same test_size and random_state as
    # hist_baseline reproduces the identical train/validation partition.
    train, valid = train_test_split(
        pd.read_json("train.json", orient="records"),
        test_size=1/3,
        random_state=123,
    )

    X_valid = valid.drop(columns=["revenue"])
    # Log-transform the target to stay consistent with how the model was trained.
    y_valid = np.log1p(valid["revenue"].values)

    fi = permutation_feature_importance(best_model, X_valid, y_valid)
    print(fi.head(10))


2025-08-20 16:10:22,983 - INFO - Reading train and test files
2025-08-20 16:10:23,483 - INFO - Evaluating model performance...
2025-08-20 16:10:23,725 - INFO - Train - MAE: 8698.420
2025-08-20 16:10:23,777 - INFO - Valid - MAE: 8801.490


                                           feature  importance_mean  \
1                                   numerical__lon         0.326759   
5                           numerical__num_reviews         0.285784   
0                                   numerical__lat         0.122436   
6                                numerical__rating         0.086098   
3                                 numerical__rooms         0.051818   
8               categorical__room_type_entire_home         0.043824   
4                                numerical__guests         0.035536   
2                             numerical__bathrooms         0.022467   
7                            numerical__min_nights         0.010880   
19  categorical__cancellation_Super Strict 60 Days         0.002624   

    importance_std  
1         0.008016  
5         0.007269  
0         0.006749  
6         0.006236  
3         0.002772  
8         0.003783  
4         0.002102  
2         0.001827  
7         0.001257  
19      