In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [25]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [5]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']

In [6]:
df = pd.read_csv(url, names=columns)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
cols_with_sparse = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_sparse] = df[cols_with_sparse].replace(0, np.nan)

In [10]:
df.fillna(df.mean(), inplace=True)

In [12]:
df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [13]:
x = df.drop('Outcome', axis=1)
y = df['Outcome']

In [17]:
x_train, x_test, y_train, y_test = train_test_split(
      x,
      y,
      test_size=0.3,
      random_state=42
)

In [18]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [19]:
x_train.shape, x_test.shape

((537, 8), (231, 8))

In [21]:
def objective(trail):

  n_estimators = trail.suggest_int('n_estimators', 50, 200)
  max_depth = trail.suggest_int('max_depth', 3, 20)

  model = RandomForestClassifier(
      n_estimators = n_estimators,
      max_depth = max_depth,
      random_state = 42
  )

  score = cross_val_score(model, x_train, y_train, cv=5).mean()

  return score

In [22]:
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler())
study.optimize(objective, n_trials=50)

[I 2025-01-18 23:19:41,824] A new study created in memory with name: no-name-11c36626-c9a5-4ba6-b1bb-66b0c59df514
[I 2025-01-18 23:19:45,426] Trial 0 finished with value: 0.7503980616130149 and parameters: {'n_estimators': 111, 'max_depth': 5}. Best is trial 0 with value: 0.7503980616130149.
[I 2025-01-18 23:19:48,623] Trial 1 finished with value: 0.7522499134648667 and parameters: {'n_estimators': 120, 'max_depth': 17}. Best is trial 1 with value: 0.7522499134648667.
[I 2025-01-18 23:19:49,885] Trial 2 finished with value: 0.7541190723433713 and parameters: {'n_estimators': 78, 'max_depth': 12}. Best is trial 2 with value: 0.7541190723433713.
[I 2025-01-18 23:19:52,387] Trial 3 finished with value: 0.7708722741433023 and parameters: {'n_estimators': 176, 'max_depth': 16}. Best is trial 3 with value: 0.7708722741433023.
[I 2025-01-18 23:19:53,015] Trial 4 finished with value: 0.7652474904811353 and parameters: {'n_estimators': 63, 'max_depth': 20}. Best is trial 3 with value: 0.7708722

In [24]:
print(f"Best Score: {study.best_value}")
print(f"Best Params: {study.best_params}")

Best Score: 0.7727068189685011
Best Params: {'n_estimators': 109, 'max_depth': 9}


In [27]:
# use of best hyeprparameters
best_model = RandomForestClassifier(**study.best_trial.params, random_state = 42)

best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7445887445887446


In [29]:
optuna.visualization.plot_optimization_history(study).show()
optuna.visualization.plot_param_importances(study).show()