# US Residential Housing Return Prediction

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score
import optuna
from optuna import visualization as vis
import numpy as np
import xgboost as xgb

%load_ext autoreload
%autoreload 2

## Modeling

In [None]:
# Shared evaluation setup reused by the tuning objectives in this notebook.
# 5-fold shuffled CV; the fixed seed keeps the fold assignment identical for every model.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# MAE is an error metric, so greater_is_better=False makes the scorer return the
# negated MAE: scores are <= 0 and values closer to 0 are better.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

### Random Forest

#### Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective for the Random Forest regressor.

    Samples one hyperparameter set from ``trial``, builds a
    ``RandomForestRegressor`` with it and cross-validates it on
    ``X_train`` / ``y_train`` using the notebook-level ``scorer`` and ``cv``.

    Returns:
        Mean of the per-fold scores. ``scorer`` wraps ``mean_absolute_error``
        with ``greater_is_better=False``, so this is the negated MAE
        (closer to 0 is better).
    """
    # Search space (dict entries are evaluated top to bottom).
    sampled_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }

    # Candidate model for this trial; the seed is fixed so only the
    # hyperparameters differ between trials.
    forest = RandomForestRegressor(random_state=42, **sampled_params)

    # One negated-MAE value per CV fold.
    fold_scores = cross_val_score(forest, X_train, y_train, scoring=scorer, cv=cv, n_jobs=-1)

    return np.mean(fold_scores)

In [None]:
# Tune the Random Forest. `objective` must be the Random Forest objective defined
# just above (the name `objective` is reused for other models later in the notebook).
# Direction is 'maximize' because the objective returns the negated MAE
# (a higher score means a lower error).
# The sampler is seeded so the search is reproducible, consistent with the
# random_state=42 used for the models and the CV splitter.
study_rf = optuna.create_study(
    direction='maximize',
    study_name='rf_tuning',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective, n_trials=15, show_progress_bar=True)

In [None]:
# Search diagnostics for the Random Forest study: objective value per trial,
# then the estimated importance of each hyperparameter.
for make_figure in (vis.plot_optimization_history, vis.plot_param_importances):
    make_figure(study_rf).show()

#### Model Training

In [None]:
# Hyperparameters of the best trial in the Random Forest study
# (a dict keyed by the names passed to trial.suggest_* in the objective).
best_params_rf = study_rf.best_params

In [None]:
# Rebuild the Random Forest from the tuned values. Only the five hyperparameters
# listed here are forwarded from the study result; the seed stays fixed at 42.
rf = RandomForestRegressor(
    random_state=42,
    **{
        name: best_params_rf[name]
        for name in (
            'n_estimators',
            'max_depth',
            'min_samples_split',
            'min_samples_leaf',
            'max_features',
        )
    },
)

In [None]:
# Fit the tuned Random Forest on X_train / y_train
rf.fit(X_train, y_train)

#### Model Evaluation

In [None]:
# Predictions of the fitted Random Forest on the training and test features,
# kept for the model comparison at the end of the notebook.
y_pred_rf_train, y_pred_rf_test = map(rf.predict, (X_train, X_test))

### AdaBoost

#### Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective for AdaBoost with a decision-tree base estimator.

    Samples hyperparameters for both the base ``DecisionTreeRegressor`` and the
    ``AdaBoostRegressor`` wrapper from ``trial``, then cross-validates the
    ensemble on ``X_train`` / ``y_train`` using the notebook-level ``scorer``
    and ``cv``.

    Returns:
        Mean of the per-fold scores. ``scorer`` wraps ``mean_absolute_error``
        with ``greater_is_better=False``, so this is the negated MAE
        (closer to 0 is better).
    """
    # Hyperparameter search space for the base Decision Tree
    max_depth = trial.suggest_int('max_depth', 1, 10)  # Depth of the tree
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)  # Minimum samples to split a node
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)  # Minimum samples in a leaf node
    max_features = trial.suggest_categorical('max_features',
                                             [None, 'sqrt', 'log2'])  # Features to consider for splitting

    # Hyperparameter search space for AdaBoost
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)

    # Define the base estimator (Decision Tree)
    base_estimator = DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # Define the AdaBoost model.
    # Note: `algorithm` is an AdaBoostClassifier-only option; AdaBoostRegressor
    # does not accept it and raises TypeError if it is passed.
    model = AdaBoostRegressor(
        estimator=base_estimator,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=42
    )

    # One negated-MAE value per CV fold
    scores = cross_val_score(model, X_train, y_train, scoring=scorer, cv=cv, n_jobs=-1)

    # Return the mean negated MAE across folds
    return scores.mean()

In [None]:
# Tune AdaBoost. `objective` must be the AdaBoost objective defined just above
# (the name `objective` is reused for each model in this notebook).
# Direction is 'maximize' because the objective returns the negated MAE
# (a higher score means a lower error).
# The sampler is seeded so the search is reproducible, consistent with the
# random_state=42 used for the models and the CV splitter.
study_ada = optuna.create_study(
    direction='maximize',
    study_name='ada_tuning',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_ada.optimize(objective, n_trials=15, show_progress_bar=True)

In [None]:
# Search diagnostics for the AdaBoost study: objective value per trial,
# then the estimated importance of each hyperparameter.
for make_figure in (vis.plot_optimization_history, vis.plot_param_importances):
    make_figure(study_ada).show()

#### Model Training

In [None]:
# Hyperparameters of the best trial in the AdaBoost study
# (a dict keyed by the names passed to trial.suggest_* in the objective).
best_params_ada = study_ada.best_params

In [None]:
# Create the AdaBoost model with best hyperparameters.
# The tree-level values configure the base estimator; n_estimators and
# learning_rate configure the boosting wrapper.
dt = DecisionTreeRegressor(
    max_depth=best_params_ada['max_depth'],
    min_samples_split=best_params_ada['min_samples_split'],
    min_samples_leaf=best_params_ada['min_samples_leaf'],
    max_features=best_params_ada['max_features'],
    random_state=42
)

# Note: `algorithm` is an AdaBoostClassifier-only option; AdaBoostRegressor
# does not accept it and raises TypeError if it is passed.
ab = AdaBoostRegressor(
    estimator=dt,
    n_estimators=best_params_ada['n_estimators'],
    learning_rate=best_params_ada['learning_rate'],
    random_state=42
)

In [None]:
# Fit the tuned AdaBoost ensemble on X_train / y_train
ab.fit(X_train, y_train)

#### Model Evaluation

In [None]:
# Predictions of the fitted AdaBoost model on the training and test features,
# kept for the model comparison at the end of the notebook.
y_pred_ada_train, y_pred_ada_test = map(ab.predict, (X_train, X_test))

### XGBoost

#### Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective for the XGBoost regressor.

    Samples one hyperparameter set from ``trial``, builds an
    ``xgb.XGBRegressor`` with it and cross-validates it on
    ``X_train`` / ``y_train`` using the notebook-level ``scorer`` and ``cv``.

    Returns:
        Mean of the per-fold scores. ``scorer`` wraps ``mean_absolute_error``
        with ``greater_is_better=False``, so this is the negated MAE
        (closer to 0 is better).
    """
    # Define the hyperparameter search space.
    # scale_pos_weight is deliberately not tuned: it re-weights the positive
    # class for imbalanced classification and has no meaningful role for a
    # continuous regression target, so searching over it only wastes trials.
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 10),  # L1 regularization
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),  # L2 regularization
    }

    # Initialize XGBoost model
    model = xgb.XGBRegressor(random_state=42, **param)

    # One negated-MAE value per CV fold
    scores = cross_val_score(model, X_train, y_train, scoring=scorer, cv=cv, n_jobs=-1)

    # Return the mean negated MAE across folds
    return scores.mean()

In [None]:
# Tune XGBoost. `objective` must be the XGBoost objective defined just above
# (the name `objective` is reused for each model in this notebook).
# Direction is 'maximize' because the objective returns the negated MAE
# (a higher score means a lower error).
# The sampler is seeded so the search is reproducible, consistent with the
# random_state=42 used for the models and the CV splitter.
study_xg = optuna.create_study(
    direction='maximize',
    study_name='xg_tuning',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xg.optimize(objective, n_trials=15, show_progress_bar=True)

In [None]:
# Search diagnostics for the XGBoost study: objective value per trial,
# then the estimated importance of each hyperparameter.
for make_figure in (vis.plot_optimization_history, vis.plot_param_importances):
    make_figure(study_xg).show()

#### Model Training

In [None]:
# Hyperparameters of the best trial in the XGBoost study
# (a dict keyed by the names passed to trial.suggest_* in the objective).
best_params_xg = study_xg.best_params

In [None]:
# Initialize the final XGBoost model with the best hyperparameters.
# This must be a regressor: the search above tunes xgb.XGBRegressor and the
# notebook evaluates with task='regression'. A classifier with a
# binary:logistic objective would not match the tuned model or the target.
xgb_model = xgb.XGBRegressor(
    random_state=42,
    **best_params_xg
)

In [None]:
# Fit the tuned XGBoost model on X_train / y_train
xgb_model.fit(X_train, y_train)

#### Model Evaluation

In [None]:
# Predictions of the fitted XGBoost model on the training and test features,
# kept for the model comparison at the end of the notebook.
y_pred_xgb_train, y_pred_xgb_test = map(xgb_model.predict, (X_train, X_test))

## Conclusion

In [None]:
# Collect each model's (train, test) predictions under its display name, then
# split them into the parallel lists expected by the evaluation helper.
predictions_by_model = {
    'Random Forest': (y_pred_rf_train, y_pred_rf_test),
    'AdaBoost': (y_pred_ada_train, y_pred_ada_test),
    'XGBoost': (y_pred_xgb_train, y_pred_xgb_test),
}
models = list(predictions_by_model)
predictions_train = [train_pred for train_pred, _ in predictions_by_model.values()]
predictions_test = [test_pred for _, test_pred in predictions_by_model.values()]

# NOTE(review): `utils` and `X` are not defined in the cells visible here;
# confirm they are imported / created earlier so this runs on a fresh kernel.
utils.evaluate_models(
    models=models,
    predictions_train=predictions_train,
    predictions_test=predictions_test,
    X=X,
    y_train=y_train,
    y_test=y_test,
    task='regression',
)