# Explore Kubeflow Katib Hyperparameter Optimization API

In [1]:
import sys

# Show which Python interpreter the kernel is running on.
interpreter_version = sys.version
print(interpreter_version)

3.12.8 (main, Dec  3 2024, 18:42:41) [Clang 16.0.0 (clang-1600.0.26.4)]


In [17]:
# Apply the Katib standalone install (kustomize manifests) to the cluster kubectl
# currently points at. The recorded output shows the CRDs, controller, DB manager,
# MySQL and UI resources being created in the `kubeflow` namespace.
# NOTE(review): `?ref=master` follows a moving branch, so re-running this cell later
# may apply different manifests — consider pinning a release tag.
!kubectl apply -k "github.com/kubeflow/katib.git/manifests/v1beta1/installs/katib-standalone?ref=master"

namespace/kubeflow configured
customresourcedefinition.apiextensions.k8s.io/experiments.kubeflow.org created
customresourcedefinition.apiextensions.k8s.io/suggestions.kubeflow.org created
customresourcedefinition.apiextensions.k8s.io/trials.kubeflow.org created
serviceaccount/katib-controller created
serviceaccount/katib-ui created
clusterrole.rbac.authorization.k8s.io/katib-controller created
clusterrole.rbac.authorization.k8s.io/katib-ui created
clusterrolebinding.rbac.authorization.k8s.io/katib-controller created
clusterrolebinding.rbac.authorization.k8s.io/katib-ui created
configmap/katib-config created
configmap/trial-templates created
secret/katib-mysql-secrets created
secret/katib-webhook-cert created
service/katib-controller created
service/katib-db-manager created
service/katib-mysql created
service/katib-ui created
persistentvolumeclaim/katib-mysql created
deployment.apps/katib-controller created
deployment.apps/katib-db-manager created
deployment.apps/katib-mysql created
dep

In [3]:
%pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1 --break-system-packages
%pip install git+https://github.com/kubeflow/trainer.git@pr-2576#egg=trainer-sdk&subdirectory=sdk/python --break-system-packages

Collecting git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1
  Cloning https://github.com/kubeflow/katib.git to /private/var/folders/_z/49hgcx6x4db7pjpfzjcphlcm0000gn/T/pip-req-build-kz6279aq
  Running command git clone --filter=blob:none --quiet https://github.com/kubeflow/katib.git /private/var/folders/_z/49hgcx6x4db7pjpfzjcphlcm0000gn/T/pip-req-build-kz6279aq
  Resolved https://github.com/kubeflow/katib.git to commit dd4acfc2ce8fb4e4e0095a49d012443e26c70b4e
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting certifi>=14.05.14 (from kubeflow-katib==0.18.0rc0)
  Using cached certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Collecting setuptools>=21.0.0 (from kubeflow-katib==0.18.0rc0)
  Using cached setuptools-79.0.1-py3-none-any.whl.metadata (6.5 kB)
Collecting urllib3>=1.15.1 (from kubeflow-katib==0.18.0rc0)
  Downloading urlli

In [7]:
# Manually install the Hugging Face libraries imported further down (transformers, peft).
# TODO(review): the author assumes this manual step becomes unnecessary once the
# package updates in this PR are in place — confirm, then drop this cell.
# The install is unpinned; the recorded run resolved transformers 4.51.3 and peft 0.15.2.
# NOTE(review): the recorded `tune` call later ends with
# "TypeError: Object of type LoraRuntimeConfig is not JSON serializable"; this may be
# tied to the peft version picked up here — confirm before pinning.
%pip install transformers peft --break-system-packages

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.2.5-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
  Using cached tqdm-4.67.1-py3-none

In [20]:
from kubeflow.katib import TrainerResources


# Resources requested for each worker: no GPU, one CPU, 1G of memory.
worker_resources = {"gpu": 0, "cpu": 1, "memory": "1G"}

# Each trial runs one worker with one process on that worker.
resources_per_trial = TrainerResources(
    num_workers=1,
    num_procs_per_worker=1,
    resources_per_worker=worker_resources,
)

In [8]:
import kubeflow.katib as katib
from kubeflow.katib import KatibClient

from transformers import AutoModelForSequenceClassification, TrainingArguments
from peft import LoraConfig

from kubeflow.storage_initializer.hugging_face import (
    HuggingFaceModelParams,
    HuggingFaceDatasetParams,
    HuggingFaceTrainerParams,
)

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Base model to fine-tune, loaded through transformers' AutoModelForSequenceClassification.
hf_model = HuggingFaceModelParams(
    model_uri="hf://meta-llama/Llama-3.2-1B",
    transformer_type=AutoModelForSequenceClassification,
)

# Train the model on a single movie review from IMDB (`train[:1]`).
# Dataset card: https://huggingface.co/datasets/stanfordnlp/imdb
hf_dataset = HuggingFaceDatasetParams(repo_id="imdb", split="train[:1]")

# learning_rate is a fixed value here; the search-space form kept for reference is
#   katib.search.double(min=1e-05, max=5e-05)
training_args = TrainingArguments(
    output_dir="results",
    save_strategy="no",
    hub_strategy="all_checkpoints",
    learning_rate=1e-05,
    num_train_epochs=1,
)

# LoRA adapter settings passed to the trainer alongside the training arguments.
lora_settings = LoraConfig(r=8, lora_alpha=8, lora_dropout=0.1, bias="none")

hf_tuning_parameters = HuggingFaceTrainerParams(
    training_parameters=training_args,
    lora_config=lora_settings,
)

In [16]:
# Katib client bound to the `kubeflow` namespace, the namespace the install cell configured.
KATIB_NAMESPACE = "kubeflow"
cl = KatibClient(namespace=KATIB_NAMESPACE)

In [25]:
# Fine-tuning for binary classification.
exp_name = "testllm"

cl.tune(
    name=exp_name,
    # What to run: the model, dataset, trainer settings and per-trial resources
    # built in the earlier cells.
    model_provider_parameters=hf_model,
    dataset_provider_parameters=hf_dataset,
    trainer_parameters=hf_tuning_parameters,
    resources_per_trial=resources_per_trial,
    # What to optimize: minimize `train_loss` using the `random` algorithm.
    objective_metric_name="train_loss",
    objective_type="minimize",
    algorithm_name="random",
    # Trial budget: at most 10 trials, 2 running in parallel.
    max_trial_count=10,
    parallel_trial_count=2,
)

# Wait on the experiment condition (client defaults) before reading results.
cl.wait_for_experiment_condition(name=exp_name)

# Report the best hyperparameters found.
best_hyperparameters = cl.get_optimal_hyperparameters(exp_name)
print(best_hyperparameters)

Thank you for using `tune` API for LLM hyperparameter optimization. This feature is in the alpha stage. Kubeflow community is looking for your feedback. Please share your experience via #kubeflow-katib Slack channel or the Kubeflow Katib GitHub.
PVC 'testllm' already exists in namespace kubeflow.


TypeError: Object of type LoraRuntimeConfig is not JSON serializable