<a href="https://colab.research.google.com/github/jmohsbeck1/jpmc_mle/blob/week-Apr.-11/JM_HyperParams_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#
# John Mohsbeck
# 4-11-2023
# 
# Hyperparameter Optimization: Grid Search vs. Random Search vs. Bayesian Optimization in Action

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Fetch the diamonds table straight from GitHub
url = "https://raw.githubusercontent.com/fenago/datasets/main/diamonds.csv"
diamonds = pd.read_csv(url)

# Integer-encode the three categorical columns. A single encoder object is
# refit for each column, so afterwards it only holds the classes of 'clarity'.
label_encoder = LabelEncoder()
for column in ('cut', 'color', 'clarity'):
    diamonds[column] = label_encoder.fit_transform(diamonds[column])

# Target is the encoded 'cut'; every other column is kept as a feature
y = diamonds['cut']
X = diamonds.drop(columns='cut')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features: the statistics are learned on the training split
# only and then applied unchanged to the test split
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## RandomForestClassifier model

### GridSearchCV

In [2]:
# Create a RandomForestClassifier model.
# random_state is fixed so the forest is built identically on every run;
# without it the "best" hyperparameters and the test accuracy can change from
# one Restart & Run All to the next.
rf = RandomForestClassifier(random_state=42)

# Define hyperparameters to be tuned (4 * 4 * 3 = 48 combinations)
hyperparameters = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Create the GridSearchCV object: every combination in the grid is scored with
# 5-fold cross-validation, using all available CPU cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=rf, param_grid=hyperparameters, cv=5, n_jobs=-1, verbose=1)

# Fit the model on the training set
grid_search.fit(X_train, y_train)

# Get the best hyperparameters found by GridSearchCV
best_params = grid_search.best_params_
print("Best hyperparameters found by GridSearchCV:", best_params)

# Evaluate the model on the test set (held out from the search above)
test_score = grid_search.score(X_test, y_test)
print("Test set accuracy with best hyperparameters:", test_score)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best hyperparameters found by GridSearchCV: {'max_depth': 30, 'min_samples_split': 10, 'n_estimators': 200}
Test set accuracy with best hyperparameters: 0.7868001483129403


## RandomForestRegressor model

### RandomizedSearchCV

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Re-download the diamonds table so this section starts from the raw data
url = "https://raw.githubusercontent.com/fenago/datasets/main/diamonds.csv"
diamonds = pd.read_csv(url)

# Replace the categorical columns with integer codes. The same encoder object
# is refit per column, so it ends up holding only the 'clarity' classes.
label_encoder = LabelEncoder()
categorical_columns = ('cut', 'color', 'clarity')
for name in categorical_columns:
    diamonds[name] = label_encoder.fit_transform(diamonds[name])

# Regression target is 'price'; all other columns become features
y = diamonds['price']
X = diamonds.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
# Create a RandomForestRegressor model.
# The search below already seeds which candidates are sampled; seeding the
# estimator as well makes the fitted forests, and therefore the reported
# scores, repeatable across runs.
rf = RandomForestRegressor(random_state=42)

# Define hyperparameters to be tuned
hyperparameters = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Create the RandomizedSearchCV object: only n_iter=10 combinations are drawn
# from the grid above, each scored with 5-fold cross-validation
random_search = RandomizedSearchCV(estimator=rf, param_distributions=hyperparameters, n_iter=10, cv=5, n_jobs=-1, verbose=1, random_state=42)

# Fit the model on the training set
random_search.fit(X_train, y_train)

# Get the best hyperparameters found by RandomizedSearchCV
best_params = random_search.best_params_
print("Best hyperparameters found by RandomizedSearchCV:", best_params)

# Evaluate the model on the test set
# NOTE(review): the recorded run reports R^2 of about 0.99997, which is
# unusually high for a price model. Possibly a feature leaks the target
# (e.g. a row-index column in the CSV) -- TODO confirm by inspecting X.columns.
test_score = random_search.score(X_test, y_test)
print("Test set R^2 score with best hyperparameters:", test_score)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters found by RandomizedSearchCV: {'n_estimators': 50, 'min_samples_split': 2, 'max_depth': 30}
Test set R^2 score with best hyperparameters: 0.9999650617633047


## Bayesian Optimization

### XGBRegressor model

In [5]:
# Pinned to the release this notebook was last run with; %pip installs into
# the environment of the running kernel.
%pip install -q bayesian-optimization==1.4.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.2-py3-none-any.whl (17 kB)
Collecting colorama>=0.4.6
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.2 colorama-0.4.6


In [9]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from bayes_opt import BayesianOptimization

# Pull a fresh copy of the diamonds table for the XGBoost section
url = "https://raw.githubusercontent.com/fenago/datasets/main/diamonds.csv"
diamonds = pd.read_csv(url)

# Turn the categorical columns into integer codes, reusing one encoder that is
# refit on each column in turn
label_encoder = LabelEncoder()
for feature in ('cut', 'color', 'clarity'):
    diamonds[feature] = label_encoder.fit_transform(diamonds[feature])

# Here the regression target is 'carat'; the rest of the columns are features
y = diamonds['carat']
X = diamonds.drop(columns='carat')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling statistics come from the training split and are reused for the test split
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Function to optimize
def xgb_cv(n_estimators, max_depth, gamma, min_child_weight, subsample, data, target):
    """Score one XGBRegressor configuration with 5-fold cross-validation.

    Args:
        n_estimators: Number of boosting rounds; truncated to int because the
            optimizer proposes floats.
        max_depth: Maximum tree depth; truncated to int for the same reason.
        gamma: Passed through to XGBRegressor unchanged.
        min_child_weight: Passed through to XGBRegressor unchanged.
        subsample: Passed through to XGBRegressor unchanged.
        data: Feature matrix handed to cross_val_score.
        target: Target values handed to cross_val_score.

    Returns:
        The mean of the five fold scores under 'neg_mean_squared_error'
        (negated MSE, so a larger value means a better fit).
    """
    # Imported locally so the function works no matter which cells ran before
    # it; the notebook otherwise imports this name only in a later cell.
    from sklearn.model_selection import cross_val_score

    estimator = xgb.XGBRegressor(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        gamma=gamma,
        min_child_weight=min_child_weight,
        subsample=subsample,
        random_state=42,
    )
    cval = cross_val_score(estimator, data, target, scoring='neg_mean_squared_error', cv=5)
    return cval.mean()

# BayesianOptimization
def optimize_xgb(data, target, init_points=5, n_iter=10, random_state=42):
    """Search XGBRegressor hyperparameters with Bayesian optimization.

    Args:
        data: Feature matrix forwarded to xgb_cv for every evaluation.
        target: Target values forwarded to xgb_cv for every evaluation.
        init_points: Value passed to ``maximize`` as ``init_points``.
            Defaults to 5, the value previously hard-coded.
        n_iter: Value passed to ``maximize`` as ``n_iter``. Defaults to 10,
            the value previously hard-coded.
        random_state: Seed handed to BayesianOptimization. Defaults to 42,
            the value previously hard-coded.

    Returns:
        ``optimizer.max`` -- the best result found; the caller reads the
        winning hyperparameters from its ``"params"`` entry.
    """
    def xgb_crossval(n_estimators, max_depth, gamma, min_child_weight, subsample):
        # The optimizer supplies only the hyperparameters; data and target
        # are taken from the enclosing call.
        return xgb_cv(
            n_estimators=n_estimators,
            max_depth=max_depth,
            gamma=gamma,
            min_child_weight=min_child_weight,
            subsample=subsample,
            data=data,
            target=target,
        )

    # (lower, upper) bounds for every hyperparameter being searched
    search_space = {
        "n_estimators": (50, 500),
        "max_depth": (3, 10),
        "gamma": (0, 1),
        "min_child_weight": (0, 10),
        "subsample": (0.5, 1),
    }

    optimizer = BayesianOptimization(
        f=xgb_crossval,
        pbounds=search_space,
        random_state=random_state,
        verbose=2,
    )
    optimizer.maximize(init_points=init_points, n_iter=n_iter)
    return optimizer.max

In [11]:
from sklearn.model_selection import cross_val_score

# Run the Bayesian search on the training split
best_params = optimize_xgb(X_train, y_train)
print("Best hyperparameters found by Bayesian Optimization:", best_params)

# Rebuild an XGBoost regressor from the winning configuration. The two integer
# hyperparameters are truncated with int(), the same way xgb_cv does during
# the search.
tuned = best_params["params"]
best_xgb = xgb.XGBRegressor(
    n_estimators=int(tuned["n_estimators"]),
    max_depth=int(tuned["max_depth"]),
    gamma=tuned["gamma"],
    min_child_weight=tuned["min_child_weight"],
    subsample=tuned["subsample"],
    random_state=42,
)

# Train on the full training split
best_xgb.fit(X_train, y_train)

# Score the refit model on the held-out test split
from sklearn.metrics import mean_squared_error, r2_score

y_pred = best_xgb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Test set Mean Squared Error with best hyperparameters:", mse)
print("Test set R^2 score with best hyperparameters:", r2)

|   iter    |  target   |   gamma   | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-0.000653[0m | [0m0.3745   [0m | [0m9.655    [0m | [0m7.32     [0m | [0m319.4    [0m | [0m0.578    [0m |
| [0m2        [0m | [0m-0.000715[0m | [0m0.156    [0m | [0m3.407    [0m | [0m8.662    [0m | [0m320.5    [0m | [0m0.854    [0m |
| [95m3        [0m | [95m-0.000400[0m | [95m0.02058  [0m | [95m9.789    [0m | [95m8.324    [0m | [95m145.6    [0m | [95m0.5909   [0m |
| [0m4        [0m | [0m-0.000543[0m | [0m0.1834   [0m | [0m5.13     [0m | [0m5.248    [0m | [0m244.4    [0m | [0m0.6456   [0m |
| [0m5        [0m | [0m-0.000813[0m | [0m0.6119   [0m | [0m3.976    [0m | [0m2.921    [0m | [0m214.9    [0m | [0m0.728    [0m |
| [95m6        [0m | [95m-0.000394[0m | [95m0.02288  [0m | [95m6.018    [0m | [95m4.267    [0m | [95