In [None]:
!sudo apt update
!sudo apt install default-jdk
!sudo apt install default-jre
!pip install h2o

In [5]:
# All imports first, then side effects.
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from katonic.ml import MyClient
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Connect to an H2O server (the recorded output shows one being started
# locally when none is found at http://localhost:54321).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.20.1" 2023-08-24; OpenJDK Runtime Environment (build 11.0.20.1+1-post-Ubuntu-0ubuntu120.04); OpenJDK 64-Bit Server VM (build 11.0.20.1+1-post-Ubuntu-0ubuntu120.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp9m11nlo5
  JVM stdout: /tmp/tmp9m11nlo5/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp9m11nlo5/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,8 days
H2O_cluster_name:,H2O_from_python_unknownUser_bv5wjh
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.812 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [6]:
# Build the platform client once and pull out the two handles used below.
# NOTE(review): `myclient.mlflow` is used later like the `mlflow` module
# (set_experiment / start_run / log_metric) — presumably pre-configured with
# the tracking server; confirm against the katonic documentation.
myclient = MyClient()
mlflow, client = myclient.mlflow, myclient.client

In [7]:
def metric(actual, pred):
    """Score predictions against ground-truth labels.

    Args:
        actual: True labels, in any form accepted by the sklearn metric
            functions.
        pred: Predicted labels, aligned element-wise with ``actual``.

    Returns:
        A 4-tuple ``(accuracy, recall, f1, precision)``. Recall, F1 and
        precision are computed with ``average='weighted'``. Note the order:
        F1 comes before precision.
    """
    averaging = {"average": "weighted"}

    accuracy = accuracy_score(actual, pred)
    weighted_recall = recall_score(actual, pred, **averaging)
    weighted_precision = precision_score(actual, pred, **averaging)
    weighted_f1 = f1_score(actual, pred, **averaging)

    return accuracy, weighted_recall, weighted_f1, weighted_precision

In [21]:
# --- MLflow experiment -------------------------------------------------------
exp_name = "mlflow-test-h2o"
mlflow.set_experiment(exp_name)
# Not used further in this cell; kept because later cells may read it.
exp_details = mlflow.get_experiment_by_name(exp_name)

# --- Data --------------------------------------------------------------------
prostate = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv")

# convert columns to factors
prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()
prostate['RACE'] = prostate['RACE'].asfactor()
prostate['DCAPS'] = prostate['DCAPS'].asfactor()
prostate['DPROS'] = prostate['DPROS'].asfactor()

# set the predictor and response columns
predictors = ["AGE", "RACE", "VOL", "GLEASON"]
response_col = "CAPSULE"

# GLM hyperparameters, keyed by the names logged to MLflow. The estimator
# below is built from this dict so the logged parameters always match the
# model that was actually trained.
kwargs = {
    'family':'binomial',
    'lambda':0,
    'compute_p_values':True,
}

# split into train and testing sets (seeded so the split is reproducible)
train, test = prostate.split_frame(ratios = [0.8], seed = 1234)

# --- Train, evaluate, log ----------------------------------------------------
with mlflow.start_run(run_name=exp_name):
    glm_model = H2OGeneralizedLinearEstimator(
        family=kwargs['family'],
        lambda_=kwargs['lambda'],  # the estimator's keyword is `lambda_`
        compute_p_values=kwargs['compute_p_values'],
    )
    # Fit on the training split only. Fitting on the full `prostate` frame
    # would let the model see the rows in `test`, inflating the metrics
    # computed below.
    glm_model.train(x=predictors, y=response_col, training_frame=train)

    # Score the held-out rows; the 'predict' column holds the predicted class.
    y_pred = glm_model.predict(test)

    # `metric` returns (accuracy, recall, f1, precision) in that order.
    (acc_score, recall, f1_scr, precision_scr) = metric(
        test[response_col].as_data_frame(),
        y_pred['predict'].as_data_frame(),
    )

    model_metrics = {
        "accuracy_score": acc_score,
        "recall": recall,
        "f1_score": f1_scr,
        "precision_score": precision_scr
    }
    for k, v in kwargs.items():
        mlflow.log_param(k, v)
    for metric_name, score in model_metrics.items():
        mlflow.log_metric(metric_name, score)

    # Store the fitted model as a run artifact under "model".
    model_info = mlflow.h2o.log_model(h2o_model=glm_model, artifact_path="model")
    

2023/10/25 05:54:15 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-test-h2o' does not exist. Creating a new experiment.


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
