In [1]:
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch

In [2]:
h2o.init(nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_131"; OpenJDK Runtime Environment (build 1.8.0_131-8u131-b11-2ubuntu1.16.04.3-b11); OpenJDK 64-Bit Server VM (build 25.131-b11, mixed mode)
  Starting server from /usr/local/lib/python3.5/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp1wgdaj2a
  JVM stdout: /tmp/tmp1wgdaj2a/h2o_ivan_started_from_python.out
  JVM stderr: /tmp/tmp1wgdaj2a/h2o_ivan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,05 secs
H2O cluster version:,3.10.4.8
H2O cluster version age:,4 months and 27 days !!!
H2O cluster name:,H2O_from_python_ivan_oepv5q
H2O cluster total nodes:,1
H2O cluster free memory:,860 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [3]:
# import data
data = h2o.import_file("train.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [5]:
# get labels for x and y
x = data.columns
y = "target"
x.remove("id"); x.remove("target")

In [6]:
# splitting data
train, test = data.split_frame(ratios=[.7])

In [7]:
# get lables and encode as factors
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [8]:
# Specify hyperparameters for the grid
hyper_params = {"learn_rate": [0.01, 0.02, 0.03],
                "max_depth": [2, 3, 4, 5, 6, 9],
                "sample_rate": [0.7, 0.8, 0.9, 1.0],
                "col_sample_rate": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}
search_criteria = {"strategy": "RandomDiscrete", "max_models": 5, "seed": 42}

# number of folds
nfolds = 5

In [9]:
# train the grid
grid = H2OGridSearch(model=H2OGradientBoostingEstimator(ntrees=10,
                                                        seed=42,
                                                        nfolds=nfolds,
                                                        balance_classes=True,
                                                        fold_assignment="Modulo",
                                                        keep_cross_validation_predictions=True),
                     hyper_params=hyper_params,
                     search_criteria=search_criteria,
                     grid_id="gbm_grid_binomial")
grid.train(x=x, y=y, training_frame=train)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [10]:
# train stacked ensemble using the GBM grid
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_gbm_grid_binomial",
                                       base_models=grid.model_ids)
ensemble.train(x=x, y=y, training_frame=train)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [11]:
# evaluate ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test)

In [12]:
# compare to base learner performance on the test set
baselearner_best_auc_test = max([h2o.get_model(model).model_performance(test_data=test).auc() for model in grid.model_ids])
stack_auc_test = perf_stack_test.auc()
print("Best Base-learner Test AUC:  {0}".format(baselearner_best_auc_test))
print("Ensemble Test AUC:  {0}".format(stack_auc_test))

Best Base-learner Test AUC:  0.6199055178451305
Ensemble Test AUC:  0.6240018411902022


In [14]:
gini = (2*stack_auc_test)-1
print(gini)

0.24800368238040438


In [15]:
# import final test data
finalTestData = h2o.import_file("test.csv")
# drop id column
x = finalTestData.columns
x.remove("id")
# create final test features
finalTest = finalTestData[x]

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [16]:
# generate predictions on final test set (optional)
pred = ensemble.predict(finalTest)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [17]:
print(pred)

predict,p0,p1
0,0.975766,0.024234
0,0.969836,0.0301643
0,0.964803,0.0351974
0,0.982321,0.0176795
0,0.967571,0.0324289
0,0.958124,0.0418758
0,0.969621,0.0303794
0,0.963274,0.0367261
0,0.946973,0.0530271
0,0.943516,0.056484





In [18]:
# extract predictions of pred
predFinal=pred.drop("p0")
predFinal=predFinal.drop("predict")
predFinal.set_names(["target"])

# create id column data frame
idcol = "id"
idDF = finalTestData[idcol]
# combine to submissions df
subdf = idDF.cbind(predFinal)
subdf.head()

id,target
0,0.024234
1,0.0301643
2,0.0351974
3,0.0176795
4,0.0324289
5,0.0418758
6,0.0303794
8,0.0367261
10,0.0530271
11,0.056484




In [19]:
# check shape
subdf.shape

(892816, 2)

In [20]:
# write submission to disk
h2o.export_file(subdf, force = True, path = "sub.csv")

Export File progress: |███████████████████████████████████████████████████| 100%


In [21]:
# shut down the cluster
h2o.shutdown(prompt=True) 

    >>> h2o.shutdown(prompt=True)
        ^^^^ Deprecated, use ``h2o.cluster().shutdown()``.
Are you sure you want to shutdown the H2O instance running at http://127.0.0.1:54321 (Y/N)? y
H2O session _sid_b630 closed.
