In [None]:
import kfp

from typing import NamedTuple

from google.cloud import aiplatform

from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import component
from kfp.v2.google.client import AIPlatformClient

from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component)

In [None]:
# Google Cloud project and region used when initialising the Vertex AI SDK.
PROJECT_ID = "jk-mlops-dev"
REGION = "us-central1"
USER = "test"

# Cloud Storage staging bucket and, beneath it, the folder used as the
# pipeline's default root.
STAGING_BUCKET = "gs://jk-vertex-us-central1"
PIPELINE_ROOT = f"{STAGING_BUCKET}/pipeline_root/pipeline_runs"

# Service account passed to `pipeline_job.run` when the job is submitted.
VERTEX_SA = f"vertex-sa@{PROJECT_ID}.iam.gserviceaccount.com"

# Echo the resolved pipeline root as the cell output.
PIPELINE_ROOT

In [None]:
# Initialise the Vertex AI SDK with the project, region and staging bucket
# chosen in the configuration cell.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

In [None]:
@component
def preprocess(
    # Plain string input parameter.
    message: str,
    # Output artifact of type `Dataset`, received as a handle that exposes
    # `.path` and `.metadata`.
    output_dataset_one: Output[Dataset],
    # Second `Dataset` output artifact, received only as a local file path.
    output_dataset_two_path: OutputPath("Dataset"),
    # String output parameter, received as a local file path to write to.
    output_parameter_path: OutputPath(str),
):
    """Mock preprocessing step that copies `message` to every output.

    Tags the first dataset with a metadata entry, then writes the unchanged
    message to both dataset artifacts and to the output-parameter file.
    """
    output_dataset_one.metadata["hello"] = "there"

    # Every destination is a local file path: the artifact handle provides one
    # through `.path` (its `.uri` would give the actual URI instead), while the
    # `OutputPath` arguments already are plain paths.
    destinations = (
        output_dataset_one.path,
        output_dataset_two_path,
        output_parameter_path,
    )
    for destination in destinations:
        with open(destination, "w") as sink:
            sink.write(message)

In [None]:
@component(
    base_image="python:3.9",  # Run this step on a different base image.
)
def train(
    # Plain string input parameter.
    message: str,
    # Input `Dataset` artifact, received only as a locally accessible path.
    dataset_one_path: InputPath("Dataset"),
    # Input `Dataset` artifact, received as a metadata-rich handle.
    dataset_two: Input[Dataset],
    # A further input `Dataset` artifact, also received as a handle.
    imported_dataset: Input[Dataset],
    # Output artifact of type `Model`.
    model: Output[Model],
    # Integer input parameter with a default value.
    num_steps: int = 3,
    # The NamedTuple return annotation declares additional outputs, which may
    # be parameters or artifacts. An artifact returned this way is given as
    # its contents, so it is assumed to fit in memory.
) -> NamedTuple(
    "Outputs",
    [
        ("output_message", str),  # Returned parameter.
        ("generic_artifact", Artifact),  # Returned generic Artifact.
    ],
):
    """Mock training step.

    Reads both dataset inputs, writes a placeholder model file, prints the
    contents of the imported dataset and attaches example metadata to the
    output model.

    Returns:
        A 2-tuple `(output_message, generic_artifact)`: `message` repeated
        `num_steps` times joined by single spaces, and the contents of
        dataset one and dataset two separated by a newline.
    """

    # The path argument lets the passed-in GCS URI be read as a local file
    # (per the original author's note, via GCSFuse).
    with open(dataset_one_path, "r") as first_source:
        first_contents = first_source.read()

    # For an artifact handle, `.path` is the local file path (GCSFuse again);
    # `.uri` would give the GCS URI instead.
    with open(dataset_two.path, "r") as second_source:
        second_contents = second_source.read()

    # The model artifact only receives placeholder text.
    with open(model.path, "w") as model_file:
        model_file.write("My Model")

    with open(imported_dataset.path, "r") as imported_source:
        imported_contents = imported_source.read()
    print("Imported Dataset:", imported_contents)

    # `model.metadata` is a dictionary for arbitrary metadata about the output
    # artifact. Per the original author's note it is recorded in Managed
    # Metadata, can be queried later and shows up in the UI.
    example_metadata = {
        "accuracy": 0.9,
        "framework": "Tensorflow",
        "time_to_train_in_seconds": 257,
    }
    for key, value in example_metadata.items():
        model.metadata[key] = value

    repeated_message = " ".join([message] * num_steps)
    combined_contents = f"{first_contents}\n{second_contents}"
    return (repeated_message, combined_contents)

In [None]:
@component
def read_artifact_input(
    generic: Input[Artifact],
):
    """Print the text contents of a generic input artifact.

    Args:
        generic: Input artifact whose local `.path` is opened for reading.
    """
    with open(generic.path, "r") as artifact_file:
        contents = artifact_file.read()
    print(f"generic contents: {contents}")

In [None]:
@dsl.pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root=PIPELINE_ROOT,
    # A name for the pipeline. Use to determine the pipeline Context.
    name="metadata-pipeline-v2",
)
def pipeline(message: str):
    """Chain the import, preprocess, train and read steps.

    An existing text file is imported as a `Dataset` artifact, `preprocess`
    writes `message` to its outputs, `train` consumes those outputs together
    with the imported dataset, and `read_artifact_input` prints the generic
    artifact returned by `train`.

    Args:
        message: String forwarded to the `preprocess` step.
    """
    # Bring an existing file into the pipeline as a `Dataset` artifact.
    importer = kfp.dsl.importer(
        artifact_uri="gs://ml-pipeline-playground/shakespeare1.txt",
        artifact_class=Dataset,
        reimport=False,
    )
    preprocess_task = preprocess(message=message)
    # Inputs and outputs are addressed without the `_path` suffix carried by
    # the matching `InputPath`/`OutputPath` function parameters.
    train_task = train(
        dataset_one=preprocess_task.outputs["output_dataset_one"],
        dataset_two=preprocess_task.outputs["output_dataset_two"],
        imported_dataset=importer.output,
        message=preprocess_task.outputs["output_parameter"],
        num_steps=5,
    )
    # Pass the artifact by keyword, like every other component call here;
    # newer KFP SDK releases reject positional arguments to components.
    # The returned task is not needed, so it is not bound to a name.
    read_artifact_input(generic=train_task.outputs["generic_artifact"])

In [None]:
# File that receives the compiled pipeline definition; the job submission
# cell reads the same path as its template.
package_path = "hw_pipeline_job.json"

# Compile the `pipeline` function into that file.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=package_path,
)


In [None]:
# Display name given to the pipeline job.
job_name = "test_pipeline_run"

# Build the job from the compiled template, with caching disabled.
pipeline_job = aiplatform.PipelineJob(
    template_path=package_path,
    display_name=job_name,
    enable_caching=False,
)

# Run the job under the Vertex service account defined in the config cell.
pipeline_job.run(service_account=VERTEX_SA)