In [38]:
import seaborn as sns
import pandas as pd

In [39]:
healthexp = sns.load_dataset('healthexp')

In [40]:
healthexp.head(3)

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9


In [41]:
healthexp = pd.get_dummies(healthexp)

In [42]:
healthexp.head()

Unnamed: 0,Year,Spending_USD,Life_Expectancy,Country_Canada,Country_France,Country_Germany,Country_Great Britain,Country_Japan,Country_USA
0,1970,252.311,70.6,False,False,True,False,False,False
1,1970,192.143,72.2,False,True,False,False,False,False
2,1970,123.993,71.9,False,False,False,True,False,False
3,1970,150.437,72.0,False,False,False,False,True,False
4,1970,326.961,70.9,False,False,False,False,False,True


In [43]:
X = healthexp.drop(columns=['Life_Expectancy'])
y = healthexp['Life_Expectancy']

In [44]:
from sklearn.model_selection import train_test_split 


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=54)


In [46]:
from sklearn.ensemble import RandomForestRegressor

In [47]:
rfr = RandomForestRegressor(random_state = 52)

In [48]:
rfr.fit(X_train,y_train)

In [49]:
y_pred = rfr.predict(X_test)

In [50]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

In [51]:
mean_absolute_error(y_test, y_pred)

0.31665454545453275

In [52]:
root_mean_squared_error(y_test, y_pred)

0.4000090908057808

In [53]:
r2_score(y_test,y_pred)

0.9831296317338122

In [54]:
import optuna 

In [55]:
from sklearn.model_selection import cross_val_score

In [67]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators',100,1000)
    max_depth = trial.suggest_int('max_depth',10,50)
    min_samples_split = trial.suggest_int('min_samples_split',2,32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf',1,32)
    
    model = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf)

    score = cross_val_score(model, X, y, n_jobs=-1, cv=5, scoring='neg_mean_squared_error').mean()

    return score

In [68]:
study = optuna.create_study(direction='maximize', sampler = optuna.samplers.RandomSampler(seed=42))

[I 2025-03-30 13:22:39,198] A new study created in memory with name: no-name-019dc8ce-a2b5-40f9-86a3-e7c90a430475


In [69]:
study.optimize(objective, n_trials = 100)

[I 2025-03-30 13:22:42,867] Trial 0 finished with value: -4.482089322232224 and parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 24, 'min_samples_leaf': 20}. Best is trial 0 with value: -4.482089322232224.
[I 2025-03-30 13:22:44,158] Trial 1 finished with value: -5.153424416411042 and parameters: {'n_estimators': 240, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 28}. Best is trial 0 with value: -4.482089322232224.
[I 2025-03-30 13:22:44,822] Trial 2 finished with value: -5.552603343848067 and parameters: {'n_estimators': 641, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 32}. Best is trial 0 with value: -4.482089322232224.
[I 2025-03-30 13:22:45,783] Trial 3 finished with value: -2.9918169184720242 and parameters: {'n_estimators': 850, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 6}. Best is trial 3 with value: -2.9918169184720242.
[I 2025-03-30 13:22:46,194] Trial 4 finished with value: -3.7935390283773507 and param

In [70]:
study.best_params

{'n_estimators': 112,
 'max_depth': 14,
 'min_samples_split': 3,
 'min_samples_leaf': 2}

In [71]:
best_params = study.best_params

In [72]:
import matplotlib.pyplot as plt

In [73]:
optuna.visualization.plot_optimization_history(study)

In [75]:
optuna.visualization.plot_parallel_coordinate(study)

In [76]:
optuna.visualization.plot_slice(study,params=['n_estimators','max_depth','min_samples_split','min_samples_leaf'])

In [77]:
rfr_new = RandomForestRegressor(**best_params)

In [78]:
rfr_new.fit(X_train,y_train)
y_new_preds = rfr_new.predict(X_test)

In [79]:
r2_score(y_test,y_new_preds)

0.9775438170812873

In [80]:
mean_absolute_error(y_test,y_new_preds)

0.38100259611419796