In [9]:
import kfp
import kfp.dsl as dsl
from kfp import components

from kubeflow.katib import ApiClient
from kubeflow.katib import V1beta1ExperimentSpec
from kubeflow.katib import V1beta1AlgorithmSpec
from kubeflow.katib import V1beta1EarlyStoppingSpec
from kubeflow.katib import V1beta1EarlyStoppingSetting
from kubeflow.katib import V1beta1ObjectiveSpec
from kubeflow.katib import V1beta1ParameterSpec
from kubeflow.katib import V1beta1FeasibleSpace
from kubeflow.katib import V1beta1TrialTemplate
from kubeflow.katib import V1beta1TrialParameterSpec

In [10]:
# Identity of the Katib Experiment: its name and the namespace it is created in.
experiment_name = "test-katib"
experiment_namespace = "kubeflow-user-example-com"

# Trial budget passed to the Experiment spec.
max_trial_count = 5
max_failed_trial_count = 3
parallel_trial_count = 2

# Objective: minimize the "loss" metric, with a goal value of 0.001.
objective = V1beta1ObjectiveSpec(type="minimize", goal=0.001, objective_metric_name="loss")

# Search algorithm: "random".
algorithm = V1beta1AlgorithmSpec(algorithm_name="random")

# Search space: learning rate ("lr") and momentum, both of type "double".
# Each row is (parameter name, min, max); the bounds are passed as strings.
parameters = [
    V1beta1ParameterSpec(
        name=param_name,
        parameter_type="double",
        feasible_space=V1beta1FeasibleSpace(min=lower_bound, max=upper_bound),
    )
    for param_name, lower_bound, upper_bound in (
        ("lr", "0.01", "0.05"),
        ("momentum", "0.5", "0.9"),
    )
]

In [11]:
# Trial template body: a PyTorchJob with one Master and one Worker replica.
# Both roles run the same container, so the replica spec is written once and
# built per role; every iteration evaluates the literal again, so the two roles
# never share nested dict/list objects.
# The ${trialParameters.*} placeholders in the command are the names declared
# as trial parameters on the trial template.
# TODO (andreyvelich): Use community image for the mnist example.
trial_spec = {
    "apiVersion": "kubeflow.org/v1",
    "kind": "PyTorchJob",
    "spec": {
        "pytorchReplicaSpecs": {
            replica_role: {
                "replicas": 1,
                "restartPolicy": "OnFailure",
                "template": {
                    "spec": {
                        "containers": [
                            {
                                "name": "pytorch",
                                "image": "docker.io/170642/pytorch-mnist",
                                "command": [
                                    "python3",
                                    "/opt/pytorch-mnist/mnist.py",
                                    "--epochs=1",
                                    "--lr=${trialParameters.learningRate}",
                                    "--momentum=${trialParameters.momentum}",
                                ],
                            }
                        ]
                    }
                },
            }
            for replica_role in ("Master", "Worker")
        }
    },
}


# Trial template: wraps trial_spec and declares the trial parameters whose
# names appear in its ${trialParameters.<name>} placeholders. Each `reference`
# is the name of a search-space parameter ("lr", "momentum").
# primary_container_name matches the container name "pytorch" used in trial_spec.
trial_template = V1beta1TrialTemplate(
    primary_container_name="pytorch",
    trial_parameters=[
        V1beta1TrialParameterSpec(
            name="learningRate",
            description="Learning rate for the training model",
            reference="lr"
        ),
        V1beta1TrialParameterSpec(
            name="momentum",
            # Was "Batch size for the model": a copy-paste error, since this
            # parameter references the "momentum" search-space entry.
            description="Momentum for the training model",
            reference="momentum"
        ),
    ],
    trial_spec=trial_spec
)


In [12]:
# Assemble the Experiment spec from the pieces defined in the earlier cells:
# what to optimize and how to search, followed by the trial budget.
experiment_spec = V1beta1ExperimentSpec(
    objective=objective,
    algorithm=algorithm,
    parameters=parameters,
    trial_template=trial_template,
    max_trial_count=max_trial_count,
    max_failed_trial_count=max_failed_trial_count,
    parallel_trial_count=parallel_trial_count,
)

In [13]:
# Load the Katib launcher component definition over HTTP.
# NOTE(review): the URL points at the `master` branch, so the component
# definition is not pinned to a fixed revision — consider pinning to a commit.
katib_experiment_launcher_op = components.load_component_from_url(
    "https://raw.githubusercontent.com/kubeflow/pipelines/master/"
    "components/kubeflow/katib-launcher/component.yaml"
)

@dsl.pipeline(
    name="Mnist katib",
    description="A Mnist katib"
)
def mnist_tune_test():
    """Launch the Katib Experiment and echo the launcher's output.

    Reads the module-level experiment_name, experiment_namespace and
    experiment_spec; takes no pipeline parameters.
    """
    # The launcher is given the serialized form of the SDK model object, not
    # the model object itself.
    serialized_spec = ApiClient().sanitize_for_serialization(experiment_spec)

    katib_task = katib_experiment_launcher_op(
        experiment_name=experiment_name,
        experiment_namespace=experiment_namespace,
        experiment_spec=serialized_spec,
        experiment_timeout_minutes=60,
        delete_finished_experiment=False,
    )

    # Second step: a bash container that prints the launcher's output.
    # The ContainerOp is created for its effect on the pipeline being defined;
    # the returned object is not needed afterwards.
    echo_command = "echo Best HyperParameters: %s" % katib_task.output
    dsl.ContainerOp(
        name="best-hp",
        image="library/bash:4.4.23",
        command=["sh", "-c"],
        arguments=[echo_command],
    )

In [14]:
import sys
# Put the parent directory at the front of the module search path; this must
# happen before the two imports below.
# NOTE(review): the insert is unconditional, so re-running this cell adds ".."
# to sys.path again, and ".." is resolved against the current working directory.
sys.path.insert(0, "..")
# presumably `constants` and `utils` live in the parent directory — verify.
from constants import NAMESPACE, HOST
from utils.auth import get_session_cookie

In [15]:
# Obtain a session cookie, then build the KFP client against the pipeline
# endpoint under HOST. The cookie is sent as `authservice_session`.
session_cookie = get_session_cookie()
client = kfp.Client(
    namespace=NAMESPACE,
    host=f"{HOST}/pipeline",
    cookies=f"authservice_session={session_cookie}",
)

In [16]:
# Submit a run of the pipeline function with no arguments. The call is left as
# the cell's last expression so its result is displayed.
# NOTE(review): the run uses experiment_namespace while the client was created
# with NAMESPACE — confirm the two are meant to be the same namespace.
client.create_run_from_pipeline_func(
    mnist_tune_test,
    arguments={},
    namespace=experiment_namespace,
)

RunPipelineResult(run_id=a73c192c-04ce-47a3-9746-814919a7fd26)