# Spark + H2O

- Spark 2.0.2
- Scala 2.11

In [3]:
import pickle
import pandas as pd
import numpy as np
from pysparkling import *
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
from utils import SEED

ImportError: No module named 'pyspark'

In [2]:
with open('./rebalanced_data/train_set_sru.pickle', mode='rb') as f:
    train_set = pickle.load(f)

In [4]:
with open('./rebalanced_data/test_set.pickle', mode='rb') as f:
    test_set = pickle.load(f)

In [5]:
df_train = pd.DataFrame(train_set['x'], columns=train_set['x'].columns)
df_train['Results'] = train_set['y']

Conversión de dataframe de Pandas a dataframe de Spark.

In [6]:
spark_df_train = spark.createDataFrame(df_train)

Creación del H2O Context

In [7]:
hc = H2OContext.getOrCreate(spark)

Connecting to H2O server at http://192.168.1.48:54321... successful.


0,1
H2O cluster uptime:,10 secs
H2O cluster version:,3.14.0.2
H2O cluster version age:,11 days
H2O cluster name:,sparkling-water-jose_local-1504378951094
H2O cluster total nodes:,1
H2O cluster free memory:,770 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://192.168.1.48:54321


In [8]:
df_train_h2o = hc.as_h2o_frame(spark_df_train, "phishing_websites_train")

In [9]:
for col in list(df_train.columns):
    df_train_h2o[col] = df_train_h2o[col].asfactor()

In [32]:
df_test = pd.DataFrame(test_set['x'], columns=test_set['x'].columns)
df_test['Results'] = test_set['y']

spark_df_test = spark.createDataFrame(df_test)

df_test_h2o = hc.as_h2o_frame(spark_df_test, "phishing_websites_test")

for col in list(df_test.columns):
    df_test_h2o[col] = df_test_h2o[col].asfactor()

## Grandient Boosting Machine

In [13]:
gbm_model = H2OGradientBoostingEstimator(
                                         distribution = "bernoulli",
                                         nfolds=5,
                                         seed=SEED
                                        )

tuned_parameters = {'ntrees': [10, 50, 70, 80, 90, 100],
                    'max_depth': [4, 8, 15],
                    'learn_rate': [0.1, 0.2, 0.5],
                    'min_split_improvement': [0.002, 0.005, 0.01],
                    'col_sample_rate_per_tree': [5/len(train_set['x'].columns), 
                                                8/len(train_set['x'].columns), 
                                                10/len(train_set['x'].columns)
                                               ],
                    'col_sample_rate_change_per_level': [1, 0.5, 0.2]}


search_criteria = {
  "strategy": "RandomDiscrete", #RandomDiscrete para pruebas en local, Cartesian cuando se despliegue
  "max_runtime_secs": 60, # Solo para pruebas en local
  "stopping_metric": "AUC",
  "stopping_tolerance": 0.00001,
  "seed": SEED # Solo para pruebas en local
}

# Mirar consola para ver resultados de CV Grid-Search
gs = H2OGridSearch(gbm_model, tuned_parameters, search_criteria=search_criteria)

gs.train(x=list(range(0, (len(df_train.columns)-1) ))
         ,y=29, training_frame=df_train_h2o)

grid = gs.get_grid(sort_by='Precision', decreasing=True)

best_model_id = grid.model_ids[0]
print(gs.get_hyperparams(best_model_id))
best_model_gbm = h2o.get_model(best_model_id)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%
Hyperparameters: [col_sample_rate_per_tree, col_sample_rate_change_per_level, min_split_improvement, learn_rate, ntrees, max_depth]
[0.3333333333333333, 0.5, 0.002, 0.1, 90, 15]


In [14]:
print(gs.summary())


Grid Summary:



0,1,2,3,4,5,6,7,8,9
Model Id,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
Grid_GBM_py_30_sid_95f8_model_python_1504378979143_1_model_1,50.0,50.0,4727.0,0.0,5.0,0.92,1.0,12.0,2.36
Grid_GBM_py_30_sid_95f8_model_python_1504378979143_1_model_2,70.0,70.0,6632.0,0.0,4.0,1.0428572,1.0,9.0,2.3857143
Grid_GBM_py_30_sid_95f8_model_python_1504378979143_1_model_3,31.0,31.0,9935.0,4.0,9.0,6.483871,7.0,34.0,20.516129
Grid_GBM_py_30_sid_95f8_model_python_1504378979143_1_model_0,10.0,10.0,6761.0,6.0,12.0,8.5,9.0,143.0,49.0


In [15]:
print(gs.sorted_metric_table())

Unnamed: 0,Unnamed: 1,col_sample_rate_change_per_level,col_sample_rate_per_tree,learn_rate,max_depth,min_split_improvement,ntrees,model_ids,logloss
0,,0.5,0.1666666666666666,0.5,15,0.005,50,Grid_GBM_py_30_sid_95f8_model_python_1504378979143_1_model_1,0.1738462359283464
1,,0.2,0.3333333333333333,0.2,4,0.01,70,Grid_GBM_py_30_sid_95f8_model_python_1504378979143_1_model_2,0.1876629879720593
2,,0.5,0.3333333333333333,0.1,15,0.002,31,Grid_GBM_py_30_sid_95f8_model_python_1504378979143_1_model_3,0.2109936482523133
3,,1.0,0.3333333333333333,0.2,15,0.01,10,Grid_GBM_py_30_sid_95f8_model_python_1504378979143_1_model_0,0.2885591318165788


In [16]:
df_test = pd.DataFrame(test_set['x'], columns=test_set['x'].columns)
df_test['Results'] = test_set['y']

spark_df_test = spark.createDataFrame(df_test)

df_test_h2o = hc.as_h2o_frame(spark_df_test, "phishing_websites_test")

for col in list(df_test.columns):
    df_test_h2o[col] = df_test_h2o[col].asfactor()

In [17]:
performance_test = best_model_gbm.model_performance(df_test_h2o)

print(performance_test)


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.052761437454541774
RMSE: 0.22969857956579046
LogLoss: 0.21191216433150972
Mean Per-Class Error: 0.0524875705174348
AUC: 0.9870059594556138
Gini: 0.9740119189112275
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5324480853633458: 


0,1,2,3,4
,-1.0,1.0,Error,Rate
-1,882.0,58.0,0.0617,(58.0/940.0)
1,55.0,1216.0,0.0433,(55.0/1271.0)
Total,937.0,1274.0,0.0511,(113.0/2211.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.5324481,0.9555992,212.0
max f2,0.3288422,0.9668721,266.0
max f0point5,0.5866812,0.9585951,196.0
max accuracy,0.5324481,0.9488919,212.0
max precision,0.9687464,1.0,0.0
max recall,0.1169056,1.0,334.0
max specificity,0.9687464,1.0,0.0
max absolute_mcc,0.5324481,0.8954004,212.0
max min_per_class_accuracy,0.5522096,0.9436170,206.0


Gains/Lift Table: Avg response rate: 57.49 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0108548,0.9569769,1.7395751,1.7395751,1.0,1.0,0.0188828,0.0188828,73.9575138,73.9575138
,2,0.0203528,0.9542140,1.7395751,1.7395751,1.0,1.0,0.0165224,0.0354052,73.9575138,73.9575138
,3,0.0307553,0.9514840,1.7395751,1.7395751,1.0,1.0,0.0180960,0.0535012,73.9575138,73.9575138
,4,0.0402533,0.9481567,1.7395751,1.7395751,1.0,1.0,0.0165224,0.0700236,73.9575138,73.9575138
,5,0.0502035,0.9462129,1.7395751,1.7395751,1.0,1.0,0.0173092,0.0873328,73.9575138,73.9575138
,6,0.1004071,0.9347821,1.7395751,1.7395751,1.0,1.0,0.0873328,0.1746656,73.9575138,73.9575138
,7,0.1501583,0.9217809,1.7395751,1.7395751,1.0,1.0,0.0865460,0.2612116,73.9575138,73.9575138
,8,0.2003618,0.9075689,1.7395751,1.7395751,1.0,1.0,0.0873328,0.3485445,73.9575138,73.9575138
,9,0.3007689,0.8769666,1.7160674,1.7317274,0.9864865,0.9954887,0.1723053,0.5208497,71.6067366,73.1727430







In [18]:
print(performance_test.accuracy()[0][1])

0.9488919041157847

## Random Forest

In [26]:
rf_model = H2ORandomForestEstimator(
                                     nfolds=5,
                                     seed=SEED,
                                     fold_assignment='Stratified'
                                    )

tuned_parameters = {'ntrees': [50, 100, 200],
                    'mtries': [-1],
                    'max_depth': [4, 8, 10, 15],
                    'min_rows': [0.02, 0.05, 0.1]
                   }

search_criteria = {
  "strategy": "RandomDiscrete", #RandomDiscrete para pruebas en local, Cartesian cuando se despliegue
  "max_runtime_secs": 60, # Solo para pruebas en local
  "stopping_metric": "AUC",
  "stopping_tolerance": 0.00001,
  "seed": SEED # Solo para pruebas en local
}

In [27]:
# Mirar consola para ver resultados de CV Grid-Search
gs = H2OGridSearch(rf_model, tuned_parameters, search_criteria=search_criteria)

In [28]:
gs.train(x=list(range(0, (len(df_train.columns)-1) )), y=29, training_frame=df_train_h2o)

drf Grid Build progress: |████████████████████████████████████████████████| 100%


In [29]:
grid = gs.get_grid(sort_by='Precision', decreasing=True)

best_model_id = grid.model_ids[0]
print(gs.get_hyperparams(best_model_id))
best_model_rf = h2o.get_model(best_model_id)

Hyperparameters: [min_rows, mtries, ntrees, max_depth]
[0.1, -1, 200, 15]


In [30]:
print(gs.summary())


Grid Summary:



0,1,2,3,4,5,6,7,8,9
Model Id,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
Grid_DRF_py_30_sid_95f8_model_python_1504378979143_3605_model_0,200.0,200.0,897909.0,15.0,15.0,15.0,256.0,477.0,352.815
Grid_DRF_py_30_sid_95f8_model_python_1504378979143_3605_model_1,39.0,39.0,51507.0,8.0,8.0,8.0,68.0,145.0,100.0512850


In [31]:
print(gs.sorted_metric_table())

Unnamed: 0,Unnamed: 1,max_depth,min_rows,mtries,ntrees,model_ids,logloss
0,,15,0.1,-1,200,Grid_DRF_py_30_sid_95f8_model_python_1504378979143_3605_model_0,0.1000103779586281
1,,8,0.02,-1,39,Grid_DRF_py_30_sid_95f8_model_python_1504378979143_3605_model_1,0.1660300607537654


In [33]:
performance_test = best_model_gbm.model_performance(df_test_h2o)

print(performance_test)


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.052761437454541774
RMSE: 0.22969857956579046
LogLoss: 0.21191216433150972
Mean Per-Class Error: 0.0524875705174348
AUC: 0.9870059594556138
Gini: 0.9740119189112275
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5324480853633458: 


0,1,2,3,4
,-1.0,1.0,Error,Rate
-1,882.0,58.0,0.0617,(58.0/940.0)
1,55.0,1216.0,0.0433,(55.0/1271.0)
Total,937.0,1274.0,0.0511,(113.0/2211.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.5324481,0.9555992,212.0
max f2,0.3288422,0.9668721,266.0
max f0point5,0.5866812,0.9585951,196.0
max accuracy,0.5324481,0.9488919,212.0
max precision,0.9687464,1.0,0.0
max recall,0.1169056,1.0,334.0
max specificity,0.9687464,1.0,0.0
max absolute_mcc,0.5324481,0.8954004,212.0
max min_per_class_accuracy,0.5522096,0.9436170,206.0


Gains/Lift Table: Avg response rate: 57.49 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0108548,0.9569769,1.7395751,1.7395751,1.0,1.0,0.0188828,0.0188828,73.9575138,73.9575138
,2,0.0203528,0.9542140,1.7395751,1.7395751,1.0,1.0,0.0165224,0.0354052,73.9575138,73.9575138
,3,0.0307553,0.9514840,1.7395751,1.7395751,1.0,1.0,0.0180960,0.0535012,73.9575138,73.9575138
,4,0.0402533,0.9481567,1.7395751,1.7395751,1.0,1.0,0.0165224,0.0700236,73.9575138,73.9575138
,5,0.0502035,0.9462129,1.7395751,1.7395751,1.0,1.0,0.0173092,0.0873328,73.9575138,73.9575138
,6,0.1004071,0.9347821,1.7395751,1.7395751,1.0,1.0,0.0873328,0.1746656,73.9575138,73.9575138
,7,0.1501583,0.9217809,1.7395751,1.7395751,1.0,1.0,0.0865460,0.2612116,73.9575138,73.9575138
,8,0.2003618,0.9075689,1.7395751,1.7395751,1.0,1.0,0.0873328,0.3485445,73.9575138,73.9575138
,9,0.3007689,0.8769666,1.7160674,1.7317274,0.9864865,0.9954887,0.1723053,0.5208497,71.6067366,73.1727430







In [34]:
print(performance_test.accuracy()[0][1])

0.9488919041157847