 # Hyperparameter Search for MPF Boosted Model



 In this notebook we use helper functions imported from `utils` that evaluate one hyperparameter candidate at a time and perform a randomized search in parallel. We then run the random search function on the training data.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# are reloaded before each cell execution (edits to local .py files take effect
# without restarting the kernel).
%load_ext autoreload
%autoreload 2


In [2]:
import numpy as np
import mpf_py
from utils import gen_data, true_model3, plot_2d_model_predictions  # Adjust import according to your project structure


 ### Example Usage for MPF



 Here we generate training data (using a hypothetical `gen_data` and `true_model3` from your utils) and run the random hyperparameter search.

In [3]:
# Generate data
def true_model(x):
    """Evaluate 2*x1 + x0 - 0.5*x0*x1 + 34 on the first two columns of ``x``.

    NOTE(review): the ``gen_data`` call below is given ``true_model3`` (imported
    from utils), not this function -- confirm which one is intended.
    """
    x0, x1 = x[:, 0], x[:, 1]
    return 2 * x1 + x0 - 0.5 * x0 * x1 + 34


x, y = gen_data(n=10000, seed=3, model=true_model3)

# Rows [0, 5000) form the training set; everything from row 5000 on is held out for testing.
x_train, x_test = x[:5000], x[5000:]
y_train, y_test = y[:5000], y[5000:]


In [11]:
# Fit the best MPF model
# `randint` is imported here because this cell runs before the XGBoost cell
# that also imports it; without this the lambdas below raise NameError on a
# fresh kernel.
from scipy.stats import randint

from utils import random_hyperparam_search_parallel


# Each entry of `param_distributions` is a zero-argument callable returning one
# sampled value. scipy's randint(low, high) draws integers from [low, high),
# i.e. the upper bound is exclusive.
best_model, best_fr, best_params, best_error = random_hyperparam_search_parallel(
    x_train,
    y_train,
    n_splits=2,
    n_candidates=50,
    n_jobs=3,
    param_distributions={
        "epochs": lambda: randint(2, 9).rvs(),        # 2 to 8 inclusive
        "n_iter": lambda: randint(5, 101).rvs(),      # 5 to 100 inclusive
        "split_try": lambda: randint(5, 21).rvs(),    # 5 to 20 inclusive
        "B": lambda: randint(10, 101).rvs(),          # 10 to 100 inclusive
        "colsample_bytree": lambda: 1.0,              # held fixed
        "identified": lambda: False,                  # held fixed
    },
)
print("Best hyperparameters for MPF:", best_params)
print("Best CV MSE for MPF:", best_error)


In [12]:
# Show the selected MPF hyperparameters and their CV error as the cell output.
best_params, best_error


 # Hyperparameter Search for XGBoost



 In this section we use scikit‑learn’s RandomizedSearchCV with continuous and discrete ranges. We use 4‑fold cross‑validation (the `cv` argument below) and random sampling over the following ranges:



 - `max_depth`: integers from 3 to 9,

 - `learning_rate`: continuous values in [0.001, 0.6],

 - `n_estimators`: integers from 200 to 800.



 We then print the best hyperparameters and CV MSE, and retrieve the best model.

In [13]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Search space for the randomized search.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) spans [loc, loc + scale].
param_distributions = dict(
    max_depth=randint(3, 10),               # integers 3..9
    learning_rate=uniform(0.001, 0.599),    # floats in [0.001, 0.6]
    n_estimators=randint(200, 801),         # integers 200..800
)

# Base estimator handed to the search.
xgb_model = XGBRegressor(random_state=42)

# Randomized search: 100 sampled candidates, each scored with 4-fold CV.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions,
    n_iter=100,
    cv=4,
    scoring='neg_mean_squared_error',  # negated MSE, so larger is better
    random_state=42,
    n_jobs=-1,                         # all available cores
    verbose=1,
)

# Fit on the training split, keep the winning estimator, then report it.
random_search.fit(x_train, y_train)
best_model_xgboost = random_search.best_estimator_

print("Best xgboost hyperparameters:", random_search.best_params_)
# The score is a negated MSE, so flip the sign to report the MSE itself.
print("Best xgboost CV MSE:", -random_search.best_score_)


 # Comparing Models



 Finally, we plot the true model, the best MPF model, and the best XGBoost model using the plotting function `plot_2d_model_predictions` imported from `utils`.

In [15]:
# Plot the reference model and both tuned models with `plot_2d_model_predictions`
# (imported from utils), one figure per (callable, title) pair.
for surface_fn, surface_title in (
    (true_model3, "True model"),
    (lambda x: best_model.predict(x), "Best MPF model"),
    (lambda x: best_model_xgboost.predict(x), "Best XGBoost model"),
):
    plot_2d_model_predictions(surface_fn, title=surface_title)


In [17]:
# Plot the best MPF model and report its MSE on the held-out test split.
# The hyperparameter search fixes "identified" to False, so the labels say so
# (they previously claimed identified=True).
plot_2d_model_predictions(lambda x: best_model.predict(x), title="Best MPF model with identified=False")
test_preds = best_model.predict(x_test)
# Flatten both sides so an (n, 1) vs (n,) shape mismatch cannot silently
# broadcast to an (n, n) matrix and distort the mean.
test_error = np.mean((np.ravel(y_test) - np.ravel(test_preds)) ** 2)
print(f"Test MSE for MPF with identified=False: {test_error}")


In [18]:
# Build one mpf_py.TreeGrid per tree-grid family from its combined tree grid.
mpf_tree_grid_rep = [
    mpf_py.TreeGrid(family.combined_tree_grid)
    for family in best_model.tree_grid_families
]


def pred_function(x):
    """Return the sum of ``predict(x)`` over every tree grid in ``mpf_tree_grid_rep``."""
    total = 0
    for grid in mpf_tree_grid_rep:
        total = total + grid.predict(x)
    return total


plot_2d_model_predictions(pred_function, title="Identified MPF model predictions")

# Mean squared error of the summed tree-grid predictions on the test split.
test_preds_identified = pred_function(x_test)
test_error_identified = np.mean((y_test - test_preds_identified) ** 2)
print(f"Test MSE for MPF with identified=True: {test_error_identified}")


In [19]:
# For every tree grid: plot its components, then plot its own prediction surface.
for tg in mpf_tree_grid_rep:
    tg.plot_components()
    # Bind `tg` as a default argument so the callable keeps pointing at this
    # iteration's grid even if the plotting helper were to evaluate it later
    # (a plain closure would see whatever `tg` is at call time).
    plot_2d_model_predictions(lambda x, tg=tg: tg.predict(x), title=f"TG scaled: {tg.scaling}")
