In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets, ensemble, model_selection, linear_model, preprocessing, metrics
import mlflow
import h2o
from h2o.automl import H2OAutoML

## Dataset: MNIST

In [2]:
X, y = datasets.load_digits(return_X_y=True, as_frame=True)

In [3]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)

### Data Preprocessing

In [4]:
scaler = preprocessing.StandardScaler()

In [5]:
X_train_preprocessed = scaler.fit_transform(X_train)

### Model Training

In [6]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.1" 2018-10-16 LTS; OpenJDK Runtime Environment Zulu11.2+3 (build 11.0.1+13-LTS); OpenJDK 64-Bit Server VM Zulu11.2+3 (build 11.0.1+13-LTS, mixed mode)
  Starting server from /home/hadrian/anaconda3/envs/py36/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp178_h9ih
  JVM stdout: /tmp/tmp178_h9ih/h2o_hadrian_started_from_python.out
  JVM stderr: /tmp/tmp178_h9ih/h2o_hadrian_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Asia/Manila
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.1.2
H2O cluster version age:,4 months and 10 days !!!
H2O cluster name:,H2O_from_python_hadrian_18ibwu
H2O cluster total nodes:,1
H2O cluster free memory:,1.881 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [7]:
aml = H2OAutoML(max_models=10, balance_classes=True, seed=1)

In [23]:
train_h2o = h2o.H2OFrame(
    X_train_preprocessed,
    column_names=X_train.columns.tolist()
)

train_cols = train_h2o.columns

train_h2o = train_h2o.cbind(
    h2o.H2OFrame(y_train.values,
                column_names=['label']),
)

test_col = 'label'

train_h2o[test_col] = train_h2o[test_col].asfactor()

test_h2o = h2o.H2OFrame(
        scaler.transform(X_test),
        column_names=X_train.columns.to_list()
)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [24]:
aml.train(train_cols, test_col, train_h2o)

AutoML progress: |
21:11:51.195: New models will be added to existing leaderboard AutoML_20200727_193156733@@label (leaderboard frame=null) with already 12 models.

███████████████████
21:15:43.400: StackedEnsemble_BestOfFamily_AutoML_20200727_211151 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.
21:15:44.402: StackedEnsemble_AllModels_AutoML_20200727_211151 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.

█████████████████████████████████████| 100%


### Results

In [25]:
preds = aml.predict(test_h2o)['predict'].as_data_frame()

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [26]:
print(
    metrics.classification_report(y_test, preds, digits=4)
)

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        53
           1     0.9804    1.0000    0.9901        50
           2     1.0000    1.0000    1.0000        47
           3     1.0000    0.9630    0.9811        54
           4     1.0000    0.9833    0.9916        60
           5     0.9552    0.9697    0.9624        66
           6     0.9811    0.9811    0.9811        53
           7     1.0000    0.9818    0.9908        55
           8     0.9545    0.9767    0.9655        43
           9     0.9667    0.9831    0.9748        59

    accuracy                         0.9833       540
   macro avg     0.9838    0.9839    0.9837       540
weighted avg     0.9836    0.9833    0.9834       540



In [27]:
aml.leaderboard

model_id,mean_per_class_error,logloss,rmse,mse
StackedEnsemble_BestOfFamily_AutoML_20200727_193156,0.0287552,0.10405,0.161265,0.0260063
StackedEnsemble_AllModels_AutoML_20200727_193156,0.0295688,0.555905,0.420909,0.177165
GBM_4_AutoML_20200727_193156,0.0367784,0.118776,0.175849,0.0309229
GBM_4_AutoML_20200727_211151,0.0367784,0.118776,0.175849,0.0309229
GBM_2_AutoML_20200727_211151,0.0369126,0.118037,0.17532,0.0307372
GBM_2_AutoML_20200727_193156,0.0369126,0.118037,0.17532,0.0307372
GBM_3_AutoML_20200727_211151,0.0377522,0.119373,0.179114,0.0320818
GBM_3_AutoML_20200727_193156,0.0377522,0.119373,0.179114,0.0320818
GLM_1_AutoML_20200727_211151,0.0382547,0.119251,0.178784,0.0319636
GLM_1_AutoML_20200727_193156,0.0382547,0.119251,0.178784,0.0319636


