#### Predict life expectancy

In [1]:
import seaborn as sb
import pandas as pd

# Load seaborn's bundled 'healthexp' example dataset; df_raw is kept as the untouched source frame.
df_raw = sb.load_dataset('healthexp')
print(df_raw.shape)
# DataFrame.info() prints its own report and returns None, so it is called
# directly: wrapping it in print() appends a stray "None" line to the output.
# No bare `df_raw.tail()` here: in the middle of a cell its result is discarded.
df_raw.info()

(274, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274 entries, 0 to 273
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             274 non-null    int64  
 1   Country          274 non-null    object 
 2   Spending_USD     274 non-null    float64
 3   Life_Expectancy  274 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 8.7+ KB
None


In [8]:
# Inspect the categorical column: distinct countries, then rows per country.
country_col = df_raw['Country']
print(country_col.unique())
print(country_col.value_counts())

['Germany' 'France' 'Great Britain' 'Japan' 'USA' 'Canada']
Country
Japan            51
USA              51
Germany          50
Canada           44
Great Britain    43
France           35
Name: count, dtype: int64


In [2]:
# One-hot encode the only non-numeric column (Country). pd.get_dummies builds a
# new frame, so df_raw stays untouched without an explicit copy() first.
df = pd.get_dummies(df_raw)
print(df.head())
# info() prints its own report and returns None; print(df.info()) would add a stray "None" line.
df.info()

   Year  Spending_USD  Life_Expectancy  Country_Canada  Country_France  \
0  1970       252.311             70.6           False           False   
1  1970       192.143             72.2           False            True   
2  1970       123.993             71.9           False           False   
3  1970       150.437             72.0           False           False   
4  1970       326.961             70.9           False           False   

   Country_Germany  Country_Great Britain  Country_Japan  Country_USA  
0             True                  False          False        False  
1            False                  False          False        False  
2            False                   True          False        False  
3            False                  False           True        False  
4            False                  False          False         True  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274 entries, 0 to 273
Data columns (total 9 columns):
 #   Column        

In [3]:
# Target is life expectancy; every remaining column of the encoded frame is a feature.
target_col = 'Life_Expectancy'
y = df[target_col]
X = df.drop(columns=[target_col])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=54)
X_train, X_test, y_train, y_test = split_parts

In [5]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: a seeded random forest with otherwise default hyperparameters.
baseline_seed = 34
model = RandomForestRegressor(random_state=baseline_seed)
model.fit(X_train, y_train)

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the baseline model on the held-out split, one line per metric.
y_hat = model.predict(X_test)
for label, metric_fn in (('MAE', mean_absolute_error),
                         ('MSE', mean_squared_error),
                         ('R2', r2_score)):
    print(label + ': ' + str(metric_fn(y_test, y_hat)))

MAE: 0.31138181818180044
MSE: 0.1553235999999905
R2: 0.9836234548107303


In [7]:
from sklearn.model_selection import cross_val_score

#### Define the objective function for Optuna

In [8]:
%pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [9]:
import optuna

In [10]:
def objective(trial, random_state=34):
    """Optuna objective: cross-validated score of a random forest on the training split.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the four integer hyperparameters below via ``suggest_int``.
    random_state : int, default 34
        Seed passed to the forest. Without it, identical hyperparameters score
        differently from run to run, so the study is not reproducible even when
        the sampler itself is seeded. The default matches the baseline model's seed.

    Returns
    -------
    float
        Mean of the 5-fold ``neg_mean_squared_error`` scores on
        ``(X_train, y_train)``. Values are negated MSE, so higher (closer to
        zero) is better and the study should maximize it.
    """
    # Sampling order is kept stable (dict literals evaluate top to bottom) so a
    # seeded sampler draws the same value for each parameter as before.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
    }

    model = RandomForestRegressor(**params, random_state=random_state)
    scores = cross_val_score(model, X_train, y_train, cv=5,
                             scoring='neg_mean_squared_error', n_jobs=-1)
    return scores.mean()

In [11]:
# Seeded random search; the direction is 'maximize' because the objective is scored with neg_mean_squared_error.
random_sampler = optuna.samplers.RandomSampler(seed=34)
study = optuna.create_study(direction='maximize', sampler=random_sampler)


[I 2025-08-26 23:40:52,354] A new study created in memory with name: no-name-eb9a7c42-2ef1-4e05-8acf-94727f8eab28


In [None]:
# Small smoke-test budget; raise it (e.g. to 100) for a real search.
n_trials = 10
study.optimize(objective, n_trials=n_trials)

[I 2025-08-26 23:41:03,810] Trial 0 finished with value: -2.3408834582712954 and parameters: {'n_estimators': 134, 'max_depth': 41, 'min_samples_split': 4, 'min_samples_leaf': 21}. Best is trial 0 with value: -2.3408834582712954.
[I 2025-08-26 23:41:05,975] Trial 1 finished with value: -1.4650895403551494 and parameters: {'n_estimators': 112, 'max_depth': 48, 'min_samples_split': 15, 'min_samples_leaf': 9}. Best is trial 1 with value: -1.4650895403551494.
[I 2025-08-26 23:41:06,703] Trial 2 finished with value: -2.370234670841821 and parameters: {'n_estimators': 271, 'max_depth': 27, 'min_samples_split': 2, 'min_samples_leaf': 21}. Best is trial 1 with value: -1.4650895403551494.
[I 2025-08-26 23:41:09,021] Trial 3 finished with value: -2.7968509597964326 and parameters: {'n_estimators': 883, 'max_depth': 34, 'min_samples_split': 27, 'min_samples_leaf': 27}. Best is trial 1 with value: -1.4650895403551494.
[I 2025-08-26 23:41:10,011] Trial 4 finished with value: -0.8101955818150598 and

In [14]:
# Hyperparameter dict of the study's best trial (keys are the names passed to suggest_int in objective).
best_params = study.best_params

In [15]:
%pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [16]:
import matplotlib.pyplot as plt
#import plotly

In [17]:
# Optimization-history plot for the study; as the cell's last expression, the returned figure is the cell output.
optuna.visualization.plot_optimization_history(study)

In [18]:
# Parallel-coordinate plot of the study's trials; the returned figure is the cell output.
optuna.visualization.plot_parallel_coordinate(study)

In [19]:
# Hyperparameter-importance plot for the study; the returned figure is the cell output.
# NOTE(review): only 10 trials were run, so the importances are presumably rough — confirm with a larger budget.
optuna.visualization.plot_param_importances(study)

In [20]:
# Pull each tuned hyperparameter out of best_params into its own name.
best_n_estimators, best_max_depth, best_min_samples_split, best_min_samples_leaf = (
    best_params[key]
    for key in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
)

In [21]:
# Rebuild the forest with the tuned hyperparameters. random_state is fixed (same
# seed as the baseline model) so refitting gives the same model and the reported
# test metrics are reproducible; left unset, every run yields different numbers.
best_model = RandomForestRegressor(n_estimators=best_n_estimators,
                                   max_depth=best_max_depth,
                                   min_samples_split=best_min_samples_split,
                                   min_samples_leaf=best_min_samples_leaf,
                                   random_state=34)

In [22]:
# Fit the tuned forest on the training split; as the last expression, the fitted estimator is the cell output.
best_model.fit(X_train, y_train)

In [23]:
# Predict on the held-out test split with the tuned model.
# NOTE(review): this rebinds y_hat, replacing the baseline model's predictions from the earlier cell.
y_hat = best_model.predict(X_test)

In [24]:
# Report the same three metrics as for the baseline, now on the tuned model's predictions.
for label, metric_fn in (('MAE', mean_absolute_error),
                         ('MSE', mean_squared_error),
                         ('R2', r2_score)):
    print(label + ': ' + str(metric_fn(y_test, y_hat)))

MAE: 0.6279181937666369
MSE: 0.5442458801560739
R2: 0.9426174306386774
