Skip to content

Commit

Permalink
Add h2o automl model
Browse files Browse the repository at this point in the history
  • Loading branch information
FavioVazquez committed Oct 11, 2018
1 parent 18d7d6c commit 5d4eece
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 0 deletions.
26 changes: 26 additions & 0 deletions optimus/ml/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
from optimus.helpers.functions import parse_columns
from optimus.ml.feature import string_to_index, vector_assembler

from pysparkling import *
from pysparkling.ml import H2OAutoML
from pyspark.sql.functions import *
from optimus.spark import Spark


class ML:
@staticmethod
Expand Down Expand Up @@ -120,3 +125,24 @@ def gbt(df, columns, input_col):
gbt_model = model.fit(df)
df_model = gbt_model.transform(df)
return df_model, gbt_model

@staticmethod
def h2o_automl(df, label, columns, **kargs):
    """
    Train an H2O AutoML model on a Spark dataframe and score that same dataframe.

    :param df: Spark dataframe containing the label and the feature columns.
    :param label: Name of the label column. It is run through string_to_index and
        label + "_index" is handed to H2OAutoML as predictionCol (presumably the
        column string_to_index produces; verify against that helper).
    :param columns: Feature columns passed to vector_assembler.
    :param kargs: Extra keyword arguments forwarded to H2OAutoML. They take
        precedence over the defaults set below, so e.g. maxRuntimeSecs or maxModels
        can be overridden by the caller.
    :return: Tuple (scored dataframe with a 0.0/1.0 "prediction" column, fitted model).
    """

    # The returned context is not used directly; the call is kept because the
    # H2O context has to exist for the Spark session before fitting.
    hc = H2OContext.getOrCreate(Spark.spark)

    df_sti = string_to_index(df, input_cols=label)
    df_va = vector_assembler(df_sti, input_cols=columns)

    # Defaults live in a dict so caller-supplied kargs override them instead of
    # raising "got multiple values for keyword argument".
    automl_params = {
        "convertUnknownCategoricalLevelsToNa": True,
        "maxRuntimeSecs": 60,  # 1 minute
        "seed": 1,
        "maxModels": 3,
        "predictionCol": label + "_index",
    }
    automl_params.update(kargs)
    automl = H2OAutoML(**automl_params)

    model = automl.fit(df_va)
    df_raw = model.transform(df_va)

    # NOTE(review): thresholding at 0.5 assumes a binary target - confirm before
    # using this with multiclass or regression labels.
    df_pred = df_raw.withColumn(
        "prediction",
        when(df_raw.prediction_output["value"] > 0.5, 1.0).otherwise(0.0))

    return df_pred, model
9 changes: 9 additions & 0 deletions tests/test_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pyspark.ml import feature, classification
from nose.tools import assert_equal
import pyspark
import py_sparkling

from optimus import Optimus

Expand Down Expand Up @@ -89,3 +90,11 @@ def test_gbt():
assert_spark_df(df_model)

assert isinstance(rf_model, pyspark.ml.classification.GBTClassificationModel), "Not a GBT model"


def test_h2o_automl():
    """Run op.ml.h2o_automl on the cancer dataframe and check both returned values."""
    result = op.ml.h2o_automl(df_cancer, "diagnosis", columns)
    scored_df, fitted_model = result

    # First element must pass the shared dataframe check.
    assert_spark_df(scored_df)

    # Second element must be the fitted AutoML model.
    expected_type = py_sparkling.ml.models.H2OAutoMLModel
    assert isinstance(fitted_model, expected_type), "Not a H2OAutoMLModel"

0 comments on commit 5d4eece

Please sign in to comment.