# AutoML

In [1]:
# Uncomment to install H2O into this kernel's environment (H2O also needs a Java runtime).
#%pip install h2o

In [2]:
# Importando bibliotecas
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Load the Titanic CSV from the relative datasets directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../../datasets/data_titanic.csv")

In [4]:
# Discard the columns considered irrelevant for modelling.
df = df.drop(['Name', 'Ticket', 'Cabin'], axis="columns")

In [5]:
# Impute missing values in the numeric columns with each column's mean;
# non-numeric columns are not touched by this step.
# The result is rebound to `df` instead of using `inplace=True`, so the cell
# is an explicit "compute, then assign" step.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split, so test rows influence the imputed values.
numeric_means = df.select_dtypes(include=['number']).mean()
df = df.fillna(numeric_means)

In [6]:
# Separate the target vector from the predictor matrix.
y = df.loc[:, 'Survived']
X = df.drop('Survived', axis="columns")

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Start H2O: connect to a running local instance or launch a new local server.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK Client VM Temurin-17.0.14+7 (build 17.0.14+7, mixed mode, emulated-client)
  Starting server from C:\Users\josaf\AppData\Local\Programs\Python\Python310\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\josaf\AppData\Local\Temp\tmpl_kekhya
  JVM stdout: C:\Users\josaf\AppData\Local\Temp\tmpl_kekhya\h2o_josaf_started_from_python.out
  JVM stderr: C:\Users\josaf\AppData\Local\Temp\tmpl_kekhya\h2o_josaf_started_from_python.err


  Please download the latest 64-bit Java SE JDK from Oracle.

  warn("  You have a 32-bit version of Java. H2O works best with 64-bit Java.\n"


  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,4 months and 17 days
H2O_cluster_name:,H2O_from_python_josaf_70ner5
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,232.7 Mb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [9]:
# Re-attach the target to each feature table and upload both as H2O Frames.
train = h2o.H2OFrame(pd.concat((X_train, y_train), axis="columns"))
test = h2o.H2OFrame(pd.concat((X_test, y_test), axis="columns"))


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [10]:
# Name the response column and collect the predictor column names for AutoML.
# NOTE(review): every column left in X_train is used as a predictor; if the
# CSV carries an identifier column (e.g. a passenger id) it is included too
# - TODO confirm this is intended.
x_cols = list(X_train.columns)
y_col = 'Survived'

In [11]:
# Configure AutoML (5-minute time budget, fixed seed) and train on the training frame.
# NOTE(review): the response column is numeric, so H2O fits a regression
# model (see the "_response param" warnings in this cell's output).
# Converting the target with `.asfactor()` would yield a binary classifier,
# but the downstream prediction handling would then have to cope with the
# extra per-class probability columns.
aml = H2OAutoML(seed=42, max_runtime_secs=300)
aml.train(training_frame=train, y=y_col, x=x_cols)

AutoML progress: |
00:02:31.787: AutoML: XGBoost is not available; skipping it.
00:02:31.814: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.


00:02:32.145: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.


00:02:32.587: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
00:02:32.757: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical befor

key,value
Stacking strategy,cross_validation
Number of base models (used / total),4/9
# GBM base models (used / total),3/5
# DRF base models (used / total),0/2
# GLM base models (used / total),1/1
# DeepLearning base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,129.13356,20.302544,102.76249,148.52917,116.777336,127.899506,149.69933
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,0.2681302,0.0201668,0.2362,0.2880757,0.2619855,0.2752789,0.2791109
mean_residual_deviance,0.1338722,0.0183631,0.1099662,0.1501236,0.1253603,0.1295038,0.154407
mse,0.1338722,0.0183631,0.1099662,0.1501236,0.1253603,0.1295038,0.154407
null_deviance,33.467896,1.5589635,34.25749,33.65469,30.801163,33.8017,34.824444
r2,0.4283454,0.0745193,0.537285,0.3526767,0.4476085,0.4399465,0.3642103
residual_deviance,19.072008,2.7627828,15.835136,21.767918,16.923637,18.907558,21.925789
rmse,0.365189,0.0252287,0.3316115,0.3874578,0.3540625,0.3598664,0.3929465
rmsle,0.2571369,0.0175964,0.2324718,0.269924,0.2492402,0.2568071,0.2772415


In [12]:
# Retrieve the AutoML leader model and print its summary.
best_model = aml.leader
print(best_model)

Model Details
H2OStackedEnsembleEstimator : Stacked Ensemble
Model Key: StackedEnsemble_AllModels_2_AutoML_1_20250320_00231


Model Summary for Stacked Ensemble: 
key                                        value
-----------------------------------------  ----------------
Stacking strategy                          cross_validation
Number of base models (used / total)       4/9
# GBM base models (used / total)           3/5
# DRF base models (used / total)           0/2
# GLM base models (used / total)           1/1
# DeepLearning base models (used / total)  0/1
Metalearner algorithm                      GLM
Metalearner fold assignment scheme         Random
Metalearner nfolds                         5
Metalearner fold_column
Custom metalearner hyperparameters         None

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 0.07343390858004101
RMSE: 0.2709869158834814
MAE: 0.194524673287602
RMSLE: 0.18983555368941396
Mean Residual Deviance: 0.07343390858004101


In [16]:
# Score the held-out test frame with the leader model.
# Only the `predict` column is extracted. Flattening the whole prediction
# frame works only while it has a single column; a classifier also returns
# per-class probability columns, which `.values.flatten()` would interleave
# with the predictions.
pred_frame = best_model.predict(test)
preds = pred_frame.as_data_frame()['predict'].to_numpy()

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%





In [19]:
# Turn the continuous predictions into 0/1 class labels using a 0.5 cut-off.
preds = np.where(preds >= 0.5, 1, 0)

In [20]:
# Evaluate: report the accuracy of the predicted labels against the test labels.
accuracy = accuracy_score(y_test, preds)
print(f"Acurácia H2O AutoML: {accuracy}")

Acurácia H2O AutoML: 0.8156424581005587
