**Задание.** Попробуйте построить stacking-ансамбль из модели случайного леса `my_rf` и лучшей (по ROC AUC) модели градиентного бустинга, найденной при помощи Grid Search на наборе данных *Higgs*.

В качестве примера посмотрите ниже, как объединяются модели `my_rf` и `my_gbm`.

In [None]:
!pip install h2o
import h2o
# nthreads=-1 asks H2O to use every core on this machine;
# max_mem_size caps the memory (in GB) that may be allocated to H2O.
h2o.init(nthreads=-1, max_mem_size=8)

Датасет: https://archive.ics.uci.edu/ml/datasets/HIGGS

In [None]:
# Alternative source, currently disabled:
# higgs = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/testng/higgs_train_5k.csv")

# Load the HIGGS training sample and the frame used for validation.
train = h2o.import_file(
    path="https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv"
)
valid = h2o.import_file(
    path="https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv"
)

In [None]:
# Name of the target column; every other column is used as a predictor.
response = "response"
# Build a fresh list instead of mutating whatever `train.columns` returns.
predictors = [col for col in train.columns if col != response]

# For binary classification the response should be a factor (categorical).
# Convert it in BOTH frames: `valid` is later passed as validation_frame and
# to predict() / model_performance(), so its response column should have the
# same type as the one in `train`.
train[response] = train[response].asfactor()
valid[response] = valid[response].asfactor()

# Случайный лес

Описание: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/drf.html

In [None]:
from h2o.estimators.random_forest import H2ORandomForestEstimator

# Random forest with 5-fold cross-validation.
# fold_assignment="Modulo" and keep_cross_validation_predictions=True match
# the settings of my_gbm: both models are later used as base_models of the
# stacked ensemble, which (per the H2O stacked-ensemble docs) expects base
# models with identical folds and stored cross-validation predictions.
my_rf = H2ORandomForestEstimator(
    ntrees=50,
    nfolds=5,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    seed=123,
)

# Fit on `train`; `valid` is supplied as the validation frame.
my_rf.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

In [None]:
# Performance metrics of the random forest; no frame is passed here.
# NOTE(review): per the H2O docs this returns the training metrics - confirm.
perf_rf = my_rf.model_performance()

# Score the validation frame with the trained forest (if necessary).
pred_rf = my_rf.predict(test_data=valid)

In [None]:
# AUC of the random forest on the training and validation frames;
# the cross-validated AUC is explicitly left out.
my_rf.auc(
    train=True,
    valid=True,
    xval=False,
)

# Градиентный бустинг

Описание: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/gbm.html

In [None]:
from h2o.estimators import H2OGradientBoostingEstimator

# Gradient boosting machine with 5-fold cross-validation, using the same
# seed, "Modulo" fold assignment and stored CV predictions as my_rf.
my_gbm = H2OGradientBoostingEstimator(
    nfolds=5,
    seed=123,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)

# Fit on `train`; `valid` is supplied as the validation frame.
my_gbm.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

# Performance metrics of the GBM; no frame is passed here.
perf_gbm = my_gbm.model_performance()

# Score the validation frame with the trained GBM (if necessary).
pred_gbm = my_gbm.predict(test_data=valid)

# Feature-interaction statistics extracted from the trained GBM.
feature_interactions = my_gbm.feature_interaction()

In [None]:
# AUC of the GBM on the training and validation frames;
# the cross-validated AUC is explicitly left out.
my_gbm.auc(
    train=True,
    valid=True,
    xval=False,
)

# Объединение моделей

Описание: https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/stacked-ensembles.html

In [None]:
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

# Stack the random forest (my_rf) and the GBM (my_gbm) trained above
# into a single ensemble model.
ensemble = H2OStackedEnsembleEstimator(
    model_id="my_ensemble_binomial",
    base_models=[my_rf, my_gbm],
)

# Fit the ensemble on `train`; `valid` is supplied as the validation frame.
ensemble.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

In [None]:
# Metrics of the stacked ensemble computed on the `valid` frame.
perf_stack_test = ensemble.model_performance(test_data=valid)

In [None]:
# AUC of the stacked ensemble on the training and validation frames;
# the cross-validated AUC is explicitly left out.
ensemble.auc(
    train=True,
    valid=True,
    xval=False,
)

# Использование Grid Search

In [None]:
from h2o.grid.grid_search import H2OGridSearch

# Hyperparameter values to combine: 2 * 3 * 2 * 3 = 36 GBM configurations.
gbm_params1 = {
    "learn_rate": [0.01, 0.1],
    "max_depth": [3, 5, 9],
    "sample_rate": [0.8, 1.0],
    "col_sample_rate": [0.2, 0.5, 1.0],
}

# Cartesian grid of GBMs over gbm_params1.
# NOTE: the base estimator is created with default arguments, i.e. without
# the nfolds / fold_assignment / keep_cross_validation_predictions / seed
# settings that were used for my_rf and my_gbm.
gbm_grid1 = H2OGridSearch(
    model=H2OGradientBoostingEstimator(),
    grid_id="gbm_grid1",
    hyper_params=gbm_params1,
)

# Train every grid model on `train`, validating each one on `valid`.
gbm_grid1.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)

In [None]:
# Grid results sorted by AUC, best model first.
gbm_gridperf1 = gbm_grid1.get_grid(sort_by="auc", decreasing=True)
# Print the ranking explicitly: a bare expression in the middle of a cell
# is not displayed by the notebook.
gbm_gridperf1.show()

# Top-ranked GBM of the grid.
best_gbm1 = gbm_gridperf1.models[0]

# Metrics of the best model on `valid`.
# NOTE: `valid` is the same frame that was passed as validation_frame to the
# grid, so this AUC is computed on the data used to rank the models and is an
# optimistic estimate; score a separate hold-out frame for an unbiased one.
best_gbm_perf1 = best_gbm1.model_performance(valid)

# Last expression of the cell, so the notebook displays the AUC value.
best_gbm_perf1.auc()