<a href="https://colab.research.google.com/github/jmohsbeck1/jpmc_mle/blob/week-Apr.-11/JM_HyperParams_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#
# John Mohsbeck
# 4-11-2023
# 
# Hyperparameter Optimization: Grid Search vs. Random Search vs. Bayesian Optimization in Action

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Load the dataset
url = "https://raw.githubusercontent.com/fenago/datasets/main/diamonds.csv"
diamonds = pd.read_csv(url)

# Preprocessing
label_encoder = LabelEncoder()
diamonds['cut'] = label_encoder.fit_transform(diamonds['cut'])
diamonds['color'] = label_encoder.fit_transform(diamonds['color'])
diamonds['clarity'] = label_encoder.fit_transform(diamonds['clarity'])

# Split the dataset into training and test sets
X = diamonds.drop('cut', axis=1)
y = diamonds['cut']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

RandomForestClassifier model

GridSearchCV

In [2]:
# Create a RandomForestClassifier model
rf = RandomForestClassifier()

# Define hyperparameters to be tuned
hyperparameters = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=hyperparameters, cv=5, n_jobs=-1, verbose=1)

# Fit the model on the training set
grid_search.fit(X_train, y_train)

# Get the best hyperparameters found by GridSearchCV
best_params = grid_search.best_params_
print("Best hyperparameters found by GridSearchCV:", best_params)

# Evaluate the model on the test set
test_score = grid_search.score(X_test, y_test)
print("Test set accuracy with best hyperparameters:", test_score)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best hyperparameters found by GridSearchCV: {'max_depth': 30, 'min_samples_split': 10, 'n_estimators': 200}
Test set accuracy with best hyperparameters: 0.7868001483129403


RandomForestRegressor model 

RandomizedSearchCV

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Load the dataset
url = "https://raw.githubusercontent.com/fenago/datasets/main/diamonds.csv"
diamonds = pd.read_csv(url)

# Preprocessing
label_encoder = LabelEncoder()
diamonds['cut'] = label_encoder.fit_transform(diamonds['cut'])
diamonds['color'] = label_encoder.fit_transform(diamonds['color'])
diamonds['clarity'] = label_encoder.fit_transform(diamonds['clarity'])

# Split the dataset into training and test sets
X = diamonds.drop('price', axis=1)
y = diamonds['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
# Create a RandomForestRegressor model
rf = RandomForestRegressor()

# Define hyperparameters to be tuned
hyperparameters = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=rf, param_distributions=hyperparameters, n_iter=10, cv=5, n_jobs=-1, verbose=1, random_state=42)

# Fit the model on the training set
random_search.fit(X_train, y_train)

# Get the best hyperparameters found by RandomizedSearchCV
best_params = random_search.best_params_
print("Best hyperparameters found by RandomizedSearchCV:", best_params)

# Evaluate the model on the test set
test_score = random_search.score(X_test, y_test)
print("Test set R^2 score with best hyperparameters:", test_score)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters found by RandomizedSearchCV: {'n_estimators': 50, 'min_samples_split': 2, 'max_depth': 30}
Test set R^2 score with best hyperparameters: 0.9999650617633047
