Skip to content

Commit

Permalink
[SDK] Enable resource specification for trial containers
Browse files Browse the repository at this point in the history
Co-authored-by: shipengcheng1230 <shipengcheng1230@gmail.com>
Co-authored-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
  • Loading branch information
3 people committed Aug 4, 2023
1 parent c749d27 commit 052fd34
Showing 1 changed file with 27 additions and 3 deletions.
30 changes: 27 additions & 3 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import multiprocessing
import textwrap
import time
from typing import Any, Callable, Dict, List, Optional
from typing import Any, Callable, Dict, List, Optional, Union

import grpc
import kubeflow.katib.katib_api_pb2 as katib_api_pb2
Expand Down Expand Up @@ -144,6 +144,7 @@ def tune(
max_trial_count: int = None,
parallel_trial_count: int = None,
max_failed_trial_count: int = None,
resources_per_trial: Union[dict, client.V1ResourceRequirements, None] = None,
retain_trials: bool = False,
packages_to_install: List[str] = None,
pip_index_url: str = "https://pypi.org/simple",
Expand Down Expand Up @@ -177,6 +178,21 @@ def tune(
values check this doc: https://www.kubeflow.org/docs/components/katib/experiment/#configuration-spec.
parallel_trial_count: Number of Trials that Experiment runs in parallel.
max_failed_trial_count: Maximum number of Trials allowed to fail.
resources_per_trial: A parameter that lets you specify how much
resources each trial container should have. You can either specify a
kubernetes.client.V1ResourceRequirements object (documented here:
https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md)
or a dictionary that includes one or more of the following keys:
`cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate
values for these keys are documented here:
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/.
For example:
{
"cpu": "1",
"gpu": "1",
"memory": "2Gi",
}
This parameter is optional and defaults to None.
retain_trials: Whether Trials' resources (e.g. pods) are deleted after Succeeded state.
packages_to_install: List of Python packages to install in addition
to the base image packages. These packages are installed before
Expand Down Expand Up @@ -280,6 +296,15 @@ def tune(
+ exec_script
)

if isinstance(resources_per_trial, dict):
if "gpu" in resources_per_trial:
resources_per_trial["nvidia.com/gpu"] = resources_per_trial.pop("gpu")

resources_per_trial = client.V1ResourceRequirements(
requests=resources_per_trial,
limits=resources_per_trial,
)

# Create Trial specification.
trial_spec = client.V1Job(
api_version="batch/v1",
Expand All @@ -297,6 +322,7 @@ def tune(
image=base_image,
command=["bash", "-c"],
args=[exec_script],
resources=resources_per_trial,
)
],
),
Expand Down Expand Up @@ -640,7 +666,6 @@ def wait_for_experiment_condition(
namespace = namespace or self.namespace

for _ in range(round(timeout / polling_interval)):

# We should get Experiment only once per cycle and check the statuses.
experiment = self.get_experiment(name, namespace, apiserver_timeout)

Expand Down Expand Up @@ -1175,7 +1200,6 @@ def get_trial_metrics(
)

with katib_api_pb2.beta_create_DBManager_stub(channel) as client:

try:
# When metric name is empty, we select all logs from the Katib DB.
observation_logs = client.GetObservationLog(
Expand Down

0 comments on commit 052fd34

Please sign in to comment.