<a href="https://colab.research.google.com/github/joahofmann/gcp-notebooks/blob/main/vertex_test_wine_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Get started

### Install Vertex AI SDK for Python and other required packages

In [1]:
import os

# Install or upgrade the Vertex AI SDK, the Cloud Storage client, the KFP SDK and
# the Google Cloud pipeline components. No versions are pinned, so the latest
# releases are installed.
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                 google-cloud-storage \
                                 kfp \
                                 google-cloud-pipeline-components

# matplotlib is only installed when the IS_TESTING environment variable is set.
if os.getenv("IS_TESTING"):
    # NOTE(review): USER_FLAG is not defined anywhere in this notebook; presumably it
    # is expected to come from the environment — confirm or remove it.
    ! pip3 install --upgrade matplotlib $USER_FLAG -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/269.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m266.2/269.1 kB[0m [31m56.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.1/269.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [2]:
import sys

# On Colab only: grab the running IPython application so that the next cell can
# restart the kernel and pick up the freshly installed packages.
if "google.colab" in sys.modules:
    import IPython

    print("Restarting kernel...")
    app = IPython.Application.instance()
    print("Done.")


Restarting kernel...
Done.
Done.


In [None]:
import sys

# Restart the kernel on Colab only. The guard keeps this cell from raising a
# NameError elsewhere, where no IPython application handle was created.
if "google.colab" in sys.modules:
    import IPython

    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.

In [3]:
import sys

# Authenticate the user only when running on Colab; other environments skip this cell.
if "google.colab" in sys.modules:

    # Imported inside the guard because the module is only expected to exist on Colab.
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [4]:
# Google Cloud project and region used by every later cell (bucket creation,
# SDK initialisation and pipeline defaults). Edit these before running.
PROJECT_ID = "vertex-test-id"  # @param {type:"string"}
LOCATION = "europe-west3"  # @param {type:"string"}

#### UUID

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on the resources created, generate a UUID for each session and append it to the names of the resources you create in this tutorial.

In [5]:
import random
import string


def generate_uuid(length: int = 8) -> str:
    """Return a random identifier of ``length`` characters (default 8).

    Characters are drawn with replacement from lowercase ASCII letters and digits.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


UUID = generate_uuid()

### Create a Cloud Storage bucket

Create a storage bucket to store intermediate artifacts such as datasets.

In [6]:
# Cloud Storage bucket for pipeline artifacts; the name embeds the project ID and region.
BUCKET_URI = f"gs://test-bucket-name-{PROJECT_ID}-{LOCATION}-roh"

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [7]:
# Create the bucket in LOCATION under PROJECT_ID. Run this only if the bucket does not exist yet.
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://test-bucket-name-vertex-test-id-europe-west3-roh/...


### Service Account

**If you don't know your service account**, try to get your service account using `gcloud` command by executing the second cell below.

In [8]:
# Service account that is granted bucket access below. Leave it empty (or as
# "[your-service-account]") to let the next cell look it up with gcloud.
SERVICE_ACCOUNT = "219162896674-compute@developer.gserviceaccount.com"  # @param {type:"string"}

In [9]:
import sys

# True when the notebook runs on Colab (the google.colab module is loaded).
IS_COLAB = "google.colab" in sys.modules

# Only look the account up when SERVICE_ACCOUNT was left empty or as the placeholder.
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        # Takes the third output line and strips the "*" marker — presumably the
        # active account row of `gcloud auth list`; verify against the real output.
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    if IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        # Parses the value after ":" on the last output line — presumably the
        # `projectNumber` field; verify against the real output.
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        # Builds the Compute Engine default service-account address from the project number.
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

#### Set service account access for Vertex AI Pipelines

Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step. You only need to run these once per service account.

In [10]:
# Allow the service account to create objects in the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

# Allow the service account to read objects from the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

### Import libraries and define constants

In [11]:
import google.cloud.aiplatform as aip
from kfp import compiler, dsl
from kfp.dsl import ClassificationMetrics, Metrics, Output, component

In [12]:
import kfp.dsl as dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component,
                        OutputPath,
                        InputPath)

from typing import NamedTuple
from datetime import datetime
import os # Import os for path manipulation if needed
import json

# --- Global configuration ---
# PROJECT_ID, LOCATION and BUCKET_URI are defined in the earlier setup cells;
# this cell only derives the values the pipeline definition needs from them.
# REGION is an alias of LOCATION so that pipeline defaults and the pipeline
# job use the same region.
REGION = LOCATION             # e.g., "us-central1" or "europe-west1"
# The artifact root (PIPELINE_ROOT) is defined in a later cell, from BUCKET_URI.
# Ensure that bucket exists and your service account has write permissions.

# Generate a timestamp for unique display names for pipeline runs
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
DISPLAY_NAME = f'pipeline-winequality-job-{TIMESTAMP}'

# Create pipeline

We create 4 components:  
- Load data   
- Train a  model
- Evaluate the model
- Deploy the model

The components depend on `pandas` and `scikit-learn`; the deployment component additionally depends on `google-cloud-aiplatform`.

#### Vertex AI constants

Set up the following constants for Vertex AI pipelines:
- `PIPELINE_NAME`: Set name for the pipeline.
- `PIPELINE_ROOT`: Cloud Storage bucket path to store pipeline artifacts.

In [13]:
# Name and artifact root for the wine-quality pipeline. The name matches the one
# passed to @dsl.pipeline further down; the previous values ("metrics-pipeline-v2",
# ".../iris") were leftovers from a different tutorial.
PIPELINE_NAME = "pipeline-winequality"
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/winequality"

Let's look at our data.

In [14]:
import pandas as pd
# Load the white-wine quality dataset (semicolon-delimited CSV) and preview the first rows.
df_wine = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", delimiter=";")
df_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [15]:
# Summary statistics of the `quality` column, from which the pipeline derives its binary target.
df_wine.quality.describe()

Unnamed: 0,quality
count,4898.0
mean,5.877909
std,0.885639
min,3.0
25%,5.0
50%,6.0
75%,6.0
max,9.0


## Initialize Vertex AI SDK for Python

To get started using Vertex AI, you must [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

In [16]:
# Initialise the Vertex AI SDK. `location` is passed explicitly so that SDK calls
# use the same region as the staging bucket instead of the SDK's default region.
aip.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

### Define pipeline components using scikit-learn

In this section, you define some Python function-based components that use scikit-learn to train some classifiers and produce evaluations that can be visualized.

Note the use of the `@component()` decorator in the definitions below. Optionally, you can specify a list of packages for the component to install, the base image to use (if omitted, KFP falls back to its default Python image; the components below pin `python:3.9`), and the name of a component YAML file to generate, so that the component definition can be shared and reused.

In [17]:
@dsl.component(
    # NumPy is capped below 2.0 because the pinned scikit-learn 1.2 release
    # predates NumPy 2. NOTE(review): an incompatible NumPy is a likely cause of
    # the failed `get-wine-data` task — confirm against the task logs.
    packages_to_install=["pandas", "pyarrow", "scikit-learn==1.2", "numpy<2"],
    base_image="python:3.9",
    output_component_file="load_data_component.yaml"
)
def get_wine_data(
    url: str,
    dataset_train: Output[Dataset],
    dataset_test: Output[Dataset]
):
    """Download the wine-quality CSV, derive a binary target and write a train/test split.

    Args:
        url: Location of the semicolon-delimited wine-quality CSV.
        dataset_train: Output artifact; the training split is written to
            ``dataset_train.path + ".csv"``.
        dataset_test: Output artifact; the test split is written to
            ``dataset_test.path + ".csv"``.
    """
    # Imports live inside the function because only the function body runs in
    # the component container.
    import pandas as pd
    from sklearn.model_selection import train_test_split as tts

    df_wine = pd.read_csv(url, delimiter=";")
    # Wines rated 7 or higher are the positive class.
    df_wine["target"] = (df_wine["quality"] >= 7).astype(int)
    df_wine = df_wine.drop(columns=["quality", "total sulfur dioxide"])

    # Fixed seed so that repeated pipeline runs produce the same split.
    train, test = tts(df_wine, test_size=0.3, random_state=42)

    # Downstream components read these files back using the same ".csv" suffix.
    train.to_csv(
        dataset_train.path + ".csv",
        index=False,
        encoding='utf-8-sig',
    )
    test.to_csv(
        dataset_test.path + ".csv",
        index=False,
        encoding='utf-8-sig',
    )

  @dsl.component(


In [18]:
@component(
    packages_to_install = [
        "pandas",
        "scikit-learn"
    ],
    base_image="python:3.9",
    output_component_file="model_training_component.yml",
)
def train_winequality(
    dataset:  Input[Dataset],
    model: Output[Model],
):
    """Train a random-forest classifier on the training split and pickle it.

    Args:
        dataset: Input artifact; the CSV is read from ``dataset.path + ".csv"``
            and must contain a ``target`` column.
        model: Output artifact; the fitted model is pickled to
            ``model.path + ".pkl"`` and ``metadata["framework"]`` is set to "RF".
    """
    import pickle

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    data = pd.read_csv(dataset.path + ".csv")

    # Fixed seed so that repeated runs on the same data yield the same model.
    model_rf = RandomForestClassifier(n_estimators=10, random_state=42)
    model_rf.fit(
        data.drop(columns=["target"]),
        data["target"],
    )

    model.metadata["framework"] = "RF"
    # NOTE(review): scikit-learn is unpinned here while the pipeline's default
    # serving image is an sklearn 0.24 container; pickles are not guaranteed to
    # load across versions — confirm the two match before deploying.
    file_name = model.path + ".pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model_rf, file)

  @component(


In [30]:
# Prints the service-account e-mail attached to a Compute Engine VM.
# YOUR_VM_INSTANCE_NAME and YOUR_VM_ZONE are placeholders that must be replaced, and
# gcloud needs a project (for example via the --project flag) or the command fails.
!gcloud compute instances describe YOUR_VM_INSTANCE_NAME --zone=YOUR_VM_ZONE --format="value(serviceAccounts[0].email)"

[1;31mERROR:[0m (gcloud.compute.instances.describe) The required property [project] is not currently set.
It can be set on a per-command basis by re-running your command with the [--project] flag.

You may set it for your current workspace by running:

  $ gcloud config set project VALUE

or it can be set temporarily by the environment variable [CLOUDSDK_CORE_PROJECT]


In [19]:
@component(
    packages_to_install = [
        "pandas",
        "scikit-learn"
    ],
    base_image="python:3.9",
    output_component_file="model_evaluation_component.yml",
)
def winequality_evaluation(
    test_set:  Input[Dataset],
    rf_winequality_model: Input[Model],
    thresholds_dict_str: str,
    metrics: Output[ClassificationMetrics],
    kpi: Output[Metrics]
) -> NamedTuple("output", [("deploy", str)]):
    """Evaluate the trained model on the test split and decide whether to deploy it.

    Args:
        test_set: Input artifact; the CSV is read from ``test_set.path + ".csv"``
            and must contain a ``target`` column.
        rf_winequality_model: Input artifact; the pickled model is read from
            ``rf_winequality_model.path + ".pkl"``. Its ``accuracy`` metadata
            entry is set by this component.
        thresholds_dict_str: JSON object whose ``"roc"`` entry is the minimum
            accuracy required for deployment (for example ``'{"roc":0.8}'``).
        metrics: Output artifact receiving the ROC curve and confusion matrix.
        kpi: Output artifact receiving the scalar ``accuracy`` metric.

    Returns:
        A one-element tuple ``(deploy,)`` where ``deploy`` is ``"true"`` when the
        accuracy reaches the threshold and ``"false"`` otherwise.
    """
    import json
    import pickle

    import pandas as pd
    from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve

    data = pd.read_csv(test_set.path + ".csv")
    features = data.drop(columns=["target"])
    labels = data["target"]

    # The model file is produced by the upstream training component of this pipeline.
    file_name = rf_winequality_model.path + ".pkl"
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

    y_pred = model.predict(features)
    # Probability of the positive class (second column of predict_proba).
    y_scores = model.predict_proba(features)[:, 1]

    fpr, tpr, thresholds = roc_curve(
        y_true=labels.to_numpy(),
        y_score=y_scores,
        pos_label=1,
    )
    metrics.log_roc_curve(
        fpr.tolist(),
        tpr.tolist(),
        thresholds.tolist()
    )

    metrics.log_confusion_matrix(
        ["False", "True"],
        confusion_matrix(labels, y_pred).tolist(),
    )

    accuracy = float(accuracy_score(labels, y_pred))
    rf_winequality_model.metadata["accuracy"] = accuracy
    kpi.log_metric("accuracy", accuracy)

    # The threshold must stay a float: converting it with int() truncated 0.8 to
    # 0, which made every model pass the deployment gate.
    # NOTE(review): the key is named "roc" but it is compared with accuracy.
    threshold = float(json.loads(thresholds_dict_str)["roc"])
    deploy = "true" if accuracy >= threshold else "false"
    return (deploy,)

  @component(


In [20]:
@dsl.component(
    packages_to_install=["google-cloud-aiplatform", "scikit-learn",  "kfp"],
    base_image="python:3.9",
    output_component_file="model_winequality_component.yml"
)
def deploy_winequality(
    model: Input[Model],
    project: str,
    region: str,
    serving_container_image_uri : str,
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model]
):
    """Upload the trained model to Vertex AI (if needed) and deploy it to an endpoint.

    An endpoint named "winequality_endpoint" and a model named "winequality"
    are reused when they already exist; otherwise they are created.

    Args:
        model: Input artifact produced by the training component; the directory
            containing it is used as the upload ``artifact_uri``.
        project: Google Cloud project ID.
        region: Vertex AI region.
        serving_container_image_uri: Serving container image used when the
            model has to be uploaded.
        vertex_endpoint: Output artifact; its ``uri`` is set to the endpoint
            resource name.
        vertex_model: Output artifact; its ``uri`` is set to the Vertex AI model
            resource name.
    """
    from google.cloud import aiplatform
    aiplatform.init(project=project, location=region)

    DISPLAY_NAME  = "winequality"
    MODEL_NAME = "winequality-rf"
    ENDPOINT_NAME = "winequality_endpoint"

    def create_endpoint():
        """Return the newest endpoint named ENDPOINT_NAME, creating one if none exists."""
        # Filter on the endpoint display name only; whether the model is already
        # deployed is checked separately further down.
        endpoints = aiplatform.Endpoint.list(
            filter='display_name="{}"'.format(ENDPOINT_NAME),
            order_by='create_time desc',
            project=project,
            location=region,
        )
        if len(endpoints) > 0:
            print(f"Endpoint {ENDPOINT_NAME} exists. Reusing it.")
            return endpoints[0]  # most recently created
        print(f"Endpoint {ENDPOINT_NAME} does not exist. Creating endpoint...")
        return aiplatform.Endpoint.create(
            display_name=ENDPOINT_NAME, project=project, location=region
        )

    endpoint = create_endpoint()

    # Reuse an existing model with the same display name, otherwise upload one.
    # NOTE(review): because an existing model is reused, a newly trained model is
    # not uploaded on later runs — confirm this is intended.
    models = aiplatform.Model.list(
        filter='display_name="{}"'.format(DISPLAY_NAME),
        order_by='create_time desc',
        project=project,
        location=region,
    )
    if len(models) > 0:
        model_upload = models[0]  # most recently created model with same display name
        print(f"Model {DISPLAY_NAME} already exists. Using existing model: {model_upload.resource_name}")
    else:
        print(f"Model {DISPLAY_NAME} does not exist. Uploading model...")
        # Use the directory that holds the artifact. Only the last path segment
        # is removed, so a "model" substring elsewhere in the URI is untouched.
        artifact_dir = model.uri.rsplit("/", 1)[0]
        model_upload = aiplatform.Model.upload(
            display_name=DISPLAY_NAME,
            artifact_uri=artifact_dir,
            serving_container_image_uri=serving_container_image_uri,
            serving_container_health_route=f"/v1/models/{MODEL_NAME}",
            serving_container_predict_route=f"/v1/models/{MODEL_NAME}:predict",
            serving_container_environment_variables={
                "MODEL_NAME": MODEL_NAME,
            },
        )
        print(f"Model uploaded: {model_upload.resource_name}")

    # Skip deployment when this model is already served by the endpoint. Any
    # "@version" suffix is ignored when comparing resource names.
    model_already_deployed = any(
        deployed.model.split("@")[0] == model_upload.resource_name
        for deployed in endpoint.list_models()
    )

    if model_already_deployed:
        print(f"Model {DISPLAY_NAME} is already deployed on endpoint {ENDPOINT_NAME}.")
    else:
        print(f"Deploying model {DISPLAY_NAME} to endpoint {ENDPOINT_NAME}...")
        model_upload.deploy(
            machine_type="n1-standard-4",
            endpoint=endpoint,
            traffic_split={"0": 100},
            deployed_model_display_name=DISPLAY_NAME,
        )
        print(f"Model deployed to endpoint: {endpoint.resource_name}")

    # Save data to the output params
    vertex_model.uri = model_upload.resource_name
    vertex_endpoint.uri = endpoint.resource_name

  @dsl.component(


In [21]:
# Display name of the pipeline run. The hyphen before the timestamp keeps it
# identical to the DISPLAY_NAME format defined in the configuration cell.
DISPLAY_NAME = 'pipeline-winequality-job-{}'.format(TIMESTAMP)

In [22]:
@dsl.pipeline(
    pipeline_root=BUCKET_URI,
    name="pipeline-winequality",
)
def pipeline(
    url: str = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
    project: str = PROJECT_ID,
    region: str = REGION,
    display_name: str = DISPLAY_NAME,
    api_endpoint: str = REGION+"-aiplatform.googleapis.com",
    thresholds_dict_str: str = '{"roc":0.8}',
    serving_container_image_uri: str = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.0-24:latest"
    ):
    """Load the data, train, evaluate and conditionally deploy the wine-quality model.

    Args:
        url: Location of the wine-quality CSV.
        project: Google Cloud project ID passed to the deployment step.
        region: Vertex AI region passed to the deployment step.
        display_name: Not used by any step; kept for interface compatibility.
        api_endpoint: Not used by any step; kept for interface compatibility.
        thresholds_dict_str: JSON object with the deployment threshold, passed
            to the evaluation step.
        serving_container_image_uri: Serving image passed to the deployment step.
            NOTE(review): the default is an sklearn 0.24 image while the training
            component installs an unpinned scikit-learn — confirm they match.
    """
    # Step 1: download the data and split it into train and test sets.
    data_op = get_wine_data(url=url)

    # Step 2: train on the training split produced by step 1.
    train_model_op = train_winequality(dataset=data_op.outputs["dataset_train"])

    # Step 3: evaluate the trained model on the test split.
    model_evaluation_op = winequality_evaluation(
        test_set=data_op.outputs["dataset_test"],
        rf_winequality_model=train_model_op.outputs["model"],
        # We deploy the model only if the model performance is above the threshold
        thresholds_dict_str=thresholds_dict_str,
    )

    # Step 4: deploy only when the evaluation step returns "true".
    # dsl.If replaces the deprecated dsl.Condition.
    with dsl.If(
        model_evaluation_op.outputs["deploy"] == "true",
        name="deploy-winequality",
    ):
        deploy_winequality(
            model=train_model_op.outputs["model"],
            project=project,
            region=region,
            serving_container_image_uri=serving_container_image_uri,
        )

  with dsl.Condition(


In [23]:
# Compile the pipeline function into a pipeline specification file that the
# PipelineJob in the next cell loads through `template_path`.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='ml_winequality.json',
)

In [24]:
# Create the Vertex AI pipeline job from the compiled specification. The job
# is created in REGION and with caching switched off.
pipeline_job = aip.PipelineJob(
    display_name=DISPLAY_NAME, # Use the dynamically generated display name
    template_path="ml_winequality.json",
    enable_caching=False,
    location=REGION,
)

In [25]:
# Submit the job; the recorded run raised a RuntimeError when the job failed.
# NOTE(review): SERVICE_ACCOUNT, configured earlier and granted bucket access, is
# not passed here — confirm that running as the default service account is intended.
pipeline_job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob


RuntimeError: Job failed with:
code: 9
message: " The DAG failed because some tasks failed. The failed tasks are: [get-wine-data].; Job (project_id = vertex-test-id, job_id = 2898715072073302016) is failed due to the above error.; Failed to handle the job: {project_number = 219162896674, job_id = 2898715072073302016}"


In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
import os
import tempfile
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)

# Helper function containing the core logic, without the KFP component decorator
def _get_wine_data_logic(url: str, train_output_path: str, test_output_path: str):
    """Fetch the wine CSV, add a binary ``target`` column and write a 70/30 split.

    The training split goes to ``train_output_path + ".csv"`` and the test
    split to ``test_output_path + ".csv"``. Read and write failures are logged
    and re-raised.
    """
    logging.info(f"Attempting to read data from URL: {url}")
    try:
        df_wine = pd.read_csv(url, delimiter=";")
        logging.info(f"Successfully read data from URL. Shape: {df_wine.shape}")
    except Exception as e:
        logging.error(f"Error reading data from URL: {e}")
        raise # Propagate so the caller fails as well

    # Quality scores of 7 and above form the positive class.
    quality_flags = [int(score >= 7) for score in df_wine.quality]
    df_wine["best_quality"] = quality_flags
    df_wine["target"] = df_wine["best_quality"]
    prepared = df_wine.drop(
        columns=["quality", "total sulfur dioxide", "best_quality"],
    )
    train, test = tts(prepared, test_size=0.3)

    # Both destinations are resolved up front, then written with shared logic.
    outputs = (
        ("train", train, train_output_path + ".csv"),
        ("test", test, test_output_path + ".csv"),
    )
    for label, frame, file_path in outputs:
        logging.info(f"Writing {label} data to: {file_path}")
        try:
            frame.to_csv(file_path, index=False, encoding="utf-8-sig")
            logging.info(f"{label.capitalize()} data written successfully.")
        except Exception as e:
            logging.error(f"Error writing {label} data to {file_path}: {e}")
            raise # Propagate so the caller fails as well


# --- Testing the helper function ---
# Runs the helper against a temporary directory, so the files are removed
# once the `with` block exits.
with tempfile.TemporaryDirectory() as tmpdir:
    # Paths are passed without an extension; the helper appends ".csv" itself.
    train_path = os.path.join(tmpdir, 'dataset_train')
    test_path = os.path.join(tmpdir, 'dataset_test')

    print("Testing _get_wine_data_logic with temporary files...")
    _get_wine_data_logic(
        url="http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
        train_output_path=train_path,
        test_output_path=test_path
    )

    # Report where the files are expected; their existence is only checked
    # implicitly by the read-back below.
    print(f"Train dataset written to: {train_path}.csv")
    print(f"Test dataset written to: {test_path}.csv")

    # Optional: Load and display the first few rows of the created files
    try:
        df_train_test = pd.read_csv(train_path + ".csv")
        print("\nTrain data head:")
        display(df_train_test.head())

        df_test_test = pd.read_csv(test_path + ".csv")
        print("\nTest data head:")
        display(df_test_test.head())

    except FileNotFoundError:
        print("Error: Output files were not created.")
    except Exception as e:
        print(f"An error occurred while reading output files: {e}")

print("\nHelper function test complete.")

Testing _get_wine_data_logic with temporary files...
Train dataset written to: /tmp/tmppjjglxmk/dataset_train.csv
Test dataset written to: /tmp/tmppjjglxmk/dataset_test.csv

Train data head:


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,density,pH,sulphates,alcohol,target
0,7.0,0.15,0.38,2.2,0.047,33.0,0.9928,3.13,0.39,10.4,1
1,6.0,0.33,0.2,1.8,0.031,49.0,0.9919,3.41,0.53,11.0,0
2,6.9,0.28,0.4,8.2,0.036,15.0,0.9944,3.17,0.33,10.2,0
3,5.9,0.3,0.29,1.1,0.036,23.0,0.9904,3.19,0.38,11.3,0
4,5.7,0.21,0.24,2.3,0.047,60.0,0.995,3.65,0.72,10.1,0



Test data head:


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,density,pH,sulphates,alcohol,target
0,7.8,0.43,0.49,13.0,0.033,37.0,0.9955,3.14,0.35,11.3,0
1,6.8,0.23,0.31,2.8,0.047,40.0,0.99126,3.06,0.64,10.9,1
2,8.0,0.27,0.57,10.4,0.053,18.0,0.99732,3.12,0.68,9.0,0
3,6.7,0.26,0.29,5.8,0.025,26.0,0.9929,3.28,0.53,11.0,0
4,6.7,0.27,0.12,1.3,0.041,62.0,0.9921,3.21,0.42,10.0,0



Helper function test complete.


In [28]:
# The component body has to be self-contained: KFP runs only this function's own
# source inside the container, so notebook-level helpers such as
# `_get_wine_data_logic` are not available there. Re-run the pipeline definition
# and compile cells after this one so that the pipeline picks up this version.
@dsl.component(
    # NumPy is capped below 2.0 because the pinned scikit-learn 1.2 release
    # predates NumPy 2. NOTE(review): an incompatible NumPy is a likely cause of
    # the failed `get-wine-data` task — confirm against the task logs.
    packages_to_install=["pandas", "pyarrow", "scikit-learn==1.2", "numpy<2"],
    base_image="python:3.9",
    output_component_file="load_data_component.yaml"
)
def get_wine_data(
    url: str,
    dataset_train: Output[Dataset],
    dataset_test: Output[Dataset]
):
    """Download the wine-quality CSV, derive a binary target and write a train/test split.

    Args:
        url: Location of the semicolon-delimited wine-quality CSV.
        dataset_train: Output artifact; the training split is written to
            ``dataset_train.path + ".csv"``.
        dataset_test: Output artifact; the test split is written to
            ``dataset_test.path + ".csv"``.
    """
    import logging

    import pandas as pd
    from sklearn.model_selection import train_test_split as tts

    logging.basicConfig(level=logging.INFO)

    logging.info("Attempting to read data from URL: %s", url)
    df_wine = pd.read_csv(url, delimiter=";")
    logging.info("Successfully read data from URL. Shape: %s", df_wine.shape)

    # Wines rated 7 or higher are the positive class.
    df_wine["target"] = (df_wine["quality"] >= 7).astype(int)
    df_wine = df_wine.drop(columns=["quality", "total sulfur dioxide"])

    # Fixed seed so that repeated pipeline runs produce the same split.
    train, test = tts(df_wine, test_size=0.3, random_state=42)

    # Downstream components read these files back using the same ".csv" suffix.
    for label, frame, artifact in (
        ("train", train, dataset_train),
        ("test", test, dataset_test),
    ):
        file_path = artifact.path + ".csv"
        logging.info("Writing %s data to: %s", label, file_path)
        frame.to_csv(file_path, index=False, encoding="utf-8-sig")

  @dsl.component(
