## Stacked Ensemble

In [8]:
import h2o
import os
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch

# h2o.init()

In [3]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) Client VM (build 25.241-b07, mixed mode)
  Starting server from C:\Users\jmatney\AppData\Local\Continuum\anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\jmatney\AppData\Local\Temp\tmpe9n9hgab
  JVM stdout: C:\Users\jmatney\AppData\Local\Temp\tmpe9n9hgab\h2o_jmatney_started_from_python.out
  JVM stderr: C:\Users\jmatney\AppData\Local\Temp\tmpe9n9hgab\h2o_jmatney_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,1 month and 7 days
H2O_cluster_name:,H2O_from_python_jmatney_e7fxn2
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,247.5 Mb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [9]:
path = "C:\\Users\jmatney\Documents\GitHub\IndianaRisk"
os.chdir(path)
# data
IN_df = pd.read_excel("data\model_data\IN_Risk_Model.xlsx")
IN_mod = IN_df.drop('subwatershed', 1)

predictors = list(IN_mod.loc[:, IN_mod.columns != "claims_total_building_insurance_coverage_avg"].columns)
response = "claims_total_building_insurance_coverage_avg"

scaler = MinMaxScaler()
IN_norm = scaler.fit_transform(IN_mod)
IN_norm = pd.DataFrame(IN_norm, columns=IN_mod.columns)
hf = h2o.H2OFrame(IN_norm)
train, test = hf.split_frame(ratios=[.8])

# # Import a sample binary outcome train/test set into H2O
# train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
# test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
# Identify predictors and response
x = predictors
y = response

In [12]:
# # For binary classification, response should be a factor
# train[y] = train[y].asfactor()
# test[y] = test[y].asfactor()

In [13]:
# Number of CV folds (to generate level-one data for stacking)
nfolds = 5

In [14]:
# There are a few ways to assemble a list of models to stack together:
# 1. Train individual models and put them in a list
# 2. Train a grid of models
# 3. Train several grids of models
# Note: All base models must have the same cross-validation folds and
# the cross-validated predicted values must be kept.


# 1. Generate a 2-model ensemble (GBM + RF)

# Train and cross-validate a GBM
my_gbm = H2OGradientBoostingEstimator(distribution="gaussian",
                                      ntrees=10,
                                      max_depth=3,
                                      min_rows=2,
                                      learn_rate=0.2,
                                      nfolds=nfolds,
                                      fold_assignment="Modulo",
                                      keep_cross_validation_predictions=True,
                                      seed=1)
my_gbm.train(x=x, y=y, training_frame=train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [15]:
# Train and cross-validate a RF
my_rf = H2ORandomForestEstimator(ntrees=50,
                                 nfolds=nfolds,
                                 fold_assignment="Modulo",
                                 keep_cross_validation_predictions=True,
                                 seed=1)
my_rf.train(x=x, y=y, training_frame=train)


drf Model Build progress: |███████████████████████████████████████████████| 100%


In [16]:
# Train a stacked ensemble using the GBM and GLM above
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial",
                                       base_models=[my_gbm, my_rf])
ensemble.train(x=x, y=y, training_frame=train)

# Eval ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [20]:
# Compare to base learner performance on the test set
perf_gbm_test = my_gbm.model_performance(test)
perf_rf_test = my_rf.model_performance(test)
baselearner_best_rmse_test = max(perf_gbm_test.rmse(), perf_rf_test.rmse())
stack_rmse_test = perf_stack_test.rmse()
print("Best Base-learner Test RMSE:  {0}".format(baselearner_best_rmse_test))
print("Ensemble Test RMSE:  {0}".format(stack_rmse_test))

# Generate predictions on a test set (if neccessary)
pred = ensemble.predict(test)


# 2. Generate a random grid of models and stack them together

# Specify GBM hyperparameters for the grid
hyper_params = {"learn_rate": [0.01, 0.03],
                "max_depth": [3, 4, 5, 6, 9],
                "sample_rate": [0.7, 0.8, 0.9, 1.0],
                "col_sample_rate": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}
search_criteria = {"strategy": "RandomDiscrete", "max_models": 3, "seed": 1}

# Train the grid
grid = H2OGridSearch(model=H2OGradientBoostingEstimator(ntrees=10,
                                                        seed=1,
                                                        nfolds=nfolds,
                                                        fold_assignment="Modulo",
                                                        keep_cross_validation_predictions=True),
                     hyper_params=hyper_params,
                     search_criteria=search_criteria,
                     grid_id="gbm_grid_binomial")
grid.train(x=x, y=y, training_frame=train)

Best Base-learner Test RMSE:  0.0816973251268041
Ensemble Test RMSE:  0.07909706987951522
stackedensemble prediction progress: |████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [59]:
# Train a stacked ensemble using the GBM grid
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_gbm_grid_binomial",
                                       base_models=grid.model_ids)
ensemble.train(x=x, y=y, training_frame=train)

# Eval ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test)

# Compare to base learner performance on the test set
baselearner_best_rmse_test = max([h2o.get_model(model).model_performance(test_data=test).rmse() for model in grid.model_ids])
stack_rmse_test = perf_stack_test.rmse()
print("Best Base-learner Test RMSE:  {0}".format(baselearner_best_rmse_test))
print("Ensemble Test RMSE:  {0}".format(stack_rmse_test))

# Generate predictions on a test set (if neccessary)
pred = ensemble.predict(test)

stackedensemble Model Build progress: |███████████████████████████████████| 100%
Best Base-learner Test RMSE:  0.0915179695188364
Ensemble Test RMSE:  0.08237816341351313
stackedensemble prediction progress: |████████████████████████████████████| 100%


In [75]:
test_df=pd.DataFrame(test)

In [56]:
IN_norm_pred = IN_norm.drop(response,1)
IN_norm_pred
# IN_norm_pred = pd.concat([IN_norm_pred, pred], axis=1)
ensemble_pred = scaler.inverse_transform(IN_norm)

In [79]:
test_df['pred'] = pred

ValueError: Length of values does not match length of index

In [78]:
test

circulatory_ratio,relief,avg_slope,elongation_ratio,drainage_density,shape_factor,relief_ratio,ruggedness,aae_area,buildings_aae_count,buildings_x_count,water_bodies_area,dams_count,bridges_count,streets_km,railroads_km,population_density.y,avg_median_income,population_change,dependent_population_pct,dist_to_stream_avg,dist_to_stream_stdev,avg_impervious_percent,orb100yr24ha_am,policy_total_building_coverage_avg,claims_total_building_insurance_coverage_avg,pred
0.370504,0.396078,0.37801,0.20201,0.0780372,0.47826,0.0584717,0.133649,0.0103152,0.0,0.0,0.0166534,0.0,0.157895,0.0492896,0.0,0.059749,0.307027,0.152239,0.677509,0.300349,0.297989,0.00164684,0.629405,0.0239904,0.00734987,0.0314349
0.68782,0.196078,0.12063,0.47225,0.0537523,0.0743733,0.0783795,0.0369731,0.0596353,0.00027115,0.0,0.0640993,0.0,0.0350877,0.0572177,0.0776773,0.0272733,0.291042,0.098242,0.681679,0.438647,0.318731,0.0214104,0.661533,0.191268,0.0168889,0.0744265
0.338717,1.0,0.0756813,0.00359214,0.224889,0.157397,0.361068,0.204099,0.0280562,0.00352495,0.00124224,0.00671512,0.0,0.140351,0.0235931,0.0370407,0.0697913,0.26236,0.105146,0.660828,0.0993074,0.131111,0.00918841,0.0505093,0.231573,0.181974,0.121476
0.286453,1.0,0.0503826,0.000163836,0.0785721,0.181445,0.245664,0.0618294,0.224674,0.00704989,0.0,0.125045,0.0555556,0.0877193,0.169132,0.0434581,0.100183,0.359951,0.121237,0.605922,0.264137,0.328865,0.018555,0.50734,0.156472,0.0100489,0.0740823
0.314167,1.0,0.0824951,1.2519e-05,0.161695,0.595491,0.154186,0.169066,0.104269,0.145065,0.25383,0.0550284,0.0,0.403509,0.40289,0.0464773,0.897396,0.348481,0.275324,0.587355,0.117396,0.142658,0.562907,0.195469,0.275572,0.174372,0.150049
0.573953,0.992157,0.0676116,0.000146867,0.0726052,0.0476021,0.54854,0.0332539,0.189561,0.00027115,0.0,0.0883516,0.0,0.0175439,0.0461597,0.0,0.0995287,0.349792,0.119928,0.684112,0.256903,0.348784,0.0111334,0.320479,0.220565,0.0475702,0.0848812
0.55545,0.839216,0.0532834,0.0163343,0.0980909,0.34521,0.196791,0.0720188,0.0133463,0.0,0.0,0.00147171,0.0,0.0175439,0.0339389,0.0552258,0.0152914,0.248845,0.109432,0.574646,0.30282,0.382909,0.0192923,0.628165,0.000219567,0.000163509,0.0176791
0.241575,1.0,0.194195,0.0773651,0.123475,0.250686,0.265366,0.146475,0.0452715,0.00135575,0.0,0.012223,0.0,0.175439,0.0375883,0.0199904,0.0871282,0.315551,0.116669,0.510703,0.22367,0.376204,0.012974,0.571579,0.24302,0.214561,0.0923651
0.2037,1.0,0.0478791,0.00264658,0.113671,0.715039,0.268202,0.139038,0.157109,0.0517896,0.0248447,0.0228536,0.0555556,0.122807,0.0800988,0.0108722,0.0395927,0.448952,0.101476,0.611343,0.145391,0.150922,0.0236951,0.250313,0.0377362,0.00348469,0.0440728
0.239653,1.0,0.0797956,0.0028892,0.0887042,0.181992,0.517934,0.0403847,0.011746,0.0027115,0.0,0.0146958,0.0,0.0526316,0.040651,0.0380523,0.153913,0.262706,0.102015,0.69836,0.156347,0.223877,0.012001,0.33478,0.21455,0.1731,0.0860869




In [35]:
pd.DataFrame(ensemble_pred)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.299470,1.010000e+02,11.031308,0.307262,0.412036,9.386736,3.610590e-03,0.050787,0.697929,0.0,...,193.107921,58111.000000,1673.0,41.366667,884.262553,503.506692,0.146502,6789.381348,3212.713978,1180.826665
1,0.485784,5.000000e+01,4.080327,0.718304,0.283812,1.484138,4.839883e-03,0.014050,4.034926,1.0,...,104.643085,56771.333333,-431.0,41.466667,1261.467019,535.669076,1.180721,6862.660156,25613.870480,2713.363580
2,0.322675,2.600000e+01,2.030253,0.815297,0.647139,1.778003,2.980456e-03,0.017181,2.977041,0.0,...,321.678527,48506.750000,-243.0,41.675000,584.613292,297.702490,1.084871,6175.916504,27111.136390,28374.903350
3,0.326203,3.710738e-09,2.053226,0.000588,0.308209,1.027938,4.436303e-13,0.009651,4.357636,25.0,...,366.175097,61766.200000,38.0,42.000000,1087.881755,964.682414,4.389772,5982.229004,33976.722030,43.018237
4,0.379072,7.600000e+01,10.546125,0.172761,0.501159,3.978998,4.231477e-03,0.068939,18.411886,10.0,...,256.032460,55698.400000,-833.0,40.660000,555.426454,409.885301,1.661841,5676.336426,63568.611420,9695.319736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758,0.240000,3.710000e+01,3.470000,0.440000,0.250000,3.500000,0.000000e+00,0.010000,3.830000,82.0,...,359.640000,51922.500000,-261.0,42.930000,912.910000,796.990000,0.150000,6789.380000,37197.085120,8582.128831
759,0.390000,1.949000e+01,1.160000,0.790000,0.490000,2.020000,0.000000e+00,0.010000,0.000000,0.0,...,173.150000,53627.670000,-684.0,41.600000,865.100000,516.570000,1.390000,6639.860000,17181.581280,23632.585480
760,0.380000,1.949000e+01,1.160000,0.580000,0.820000,1.930000,0.000000e+00,0.020000,34.480000,122.0,...,95.140000,52547.500000,57.0,38.500000,450.170000,253.230000,1.390000,6639.860000,27942.486560,2902.555628
761,0.280000,5.801000e+01,3.540000,0.460000,0.350000,4.970000,0.000000e+00,0.020000,12.670000,250.0,...,602.750000,74185.500000,2295.0,40.160000,1093.700000,788.000000,9.340000,7600.010000,21593.433150,18142.836830
