<h2 align="center" style="color:blue">Codebasics ML Course: Optuna Tutorial</h2>

In [6]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification

In [7]:
np.logspace(-4, 4, 10)

array([1.00000000e-04, 7.74263683e-04, 5.99484250e-03, 4.64158883e-02,
       3.59381366e-01, 2.78255940e+00, 2.15443469e+01, 1.66810054e+02,
       1.29154967e+03, 1.00000000e+04])

### Generate Dataset

In [10]:
X, y = make_classification(
    n_features=10, 
    n_samples=1000, 
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2, 
    random_state=42
)

### Hyperparameter Tunning Using GridSearchCV

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = LogisticRegression(max_iter=2000)

# Define the parameter grid
param_grid = {
    'C': [1, 2, 3, 4, 5],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']  # Different solvers
}

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=1, scoring='accuracy')

# Perform the grid search
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Test set evaluation:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best parameters found:  {'C': 1, 'solver': 'liblinear'}
Best cross-validation score: 0.72
Test set evaluation:
              precision    recall  f1-score   support

           0       0.73      0.62      0.67       106
           1       0.63      0.73      0.68        94

    accuracy                           0.68       200
   macro avg       0.68      0.68      0.67       200
weighted avg       0.68      0.68      0.67       200



### Hyperparameter Tunning Using Optuna

In [36]:
import optuna
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [33]:
# Define the objective function to be optimized
def objective(trial):
    # Suggest values for the hyperparameters
    C = trial.suggest_float("C", 1, 5)
    solver = trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear", "sag", "saga"])
    
    # Create the model with suggested hyperparameters
    model = LogisticRegression(C=C, solver=solver, max_iter=2000)

    # Evaluate the model: the negative mean accuracy is returned because Optuna looks for minimum
    score = cross_val_score(model, X, y, n_jobs=-1, cv=3)
    accuracy = np.mean(score)

    return accuracy

# Create a study object and specify the optimization direction
study = optuna.create_study(direction='maximize')

# Perform the optimization
study.optimize(objective, n_trials=10)

# Results
print("Best parameters:", study.best_params)
print("Best cross-validation accuracy:", study.best_value)

[I 2024-07-17 17:41:30,682] A new study created in memory with name: no-name-9911cc24-d88a-4ead-bc7a-b754e769276b
[I 2024-07-17 17:41:30,716] Trial 0 finished with value: 0.687996379613146 and parameters: {'C': 1.4219065255338297, 'solver': 'sag'}. Best is trial 0 with value: 0.687996379613146.
[I 2024-07-17 17:41:30,732] Trial 1 finished with value: 0.687996379613146 and parameters: {'C': 4.570354157570347, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.687996379613146.
[I 2024-07-17 17:41:30,748] Trial 2 finished with value: 0.687996379613146 and parameters: {'C': 3.4362183020084025, 'solver': 'liblinear'}. Best is trial 0 with value: 0.687996379613146.
[I 2024-07-17 17:41:30,764] Trial 3 finished with value: 0.687996379613146 and parameters: {'C': 3.014939817825534, 'solver': 'sag'}. Best is trial 0 with value: 0.687996379613146.
[I 2024-07-17 17:41:30,780] Trial 4 finished with value: 0.687996379613146 and parameters: {'C': 3.891678000949935, 'solver': 'saga'}. Best is trial 0 w

Best parameters: {'C': 1.0564542420205165, 'solver': 'newton-cg'}
Best cross-validation accuracy: 0.6889973806141471


In [34]:
best_model = LogisticRegression(**study.best_params, max_iter=2000)
best_model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = best_model.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.62      0.67       106
           1       0.63      0.73      0.68        94

    accuracy                           0.68       200
   macro avg       0.68      0.68      0.67       200
weighted avg       0.68      0.68      0.67       200

