In [1]:
# import libraries
import os

# load all environment variables
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Settings that must be present in the environment (or the .env file loaded
# above). They are checked together so that a single error names every
# missing variable instead of stopping at the first bare KeyError.
_REQUIRED_ENV_VARS = (
    'PIPELINE_ROOT',
    'PROJECT_ID',
    'REGION',
    'SERVICE_ACCOUNT',
    'ARTIFACT_REPO',
    'MODEL_DISPLAY_NAME',
    'MODEL_NAME',
    'ENDPOINT_NAME',
    'GCS_URL',
    'TRAIN_RATIO',
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if name not in os.environ]
if _missing_env_vars:
    # Same exception type as a direct os.environ[...] lookup would raise.
    raise KeyError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

pipeline_root = os.environ['PIPELINE_ROOT']
# Optional setting: this is None when CONTAINER_IMAGE is not set.
base_image = os.environ.get("CONTAINER_IMAGE")
project_id = os.environ['PROJECT_ID']
region = os.environ['REGION']
service_account = os.environ['SERVICE_ACCOUNT']
artifact_repo = os.environ['ARTIFACT_REPO']
model_display_name = os.environ['MODEL_DISPLAY_NAME']
model_name = os.environ['MODEL_NAME']
endpoint_name = os.environ['ENDPOINT_NAME']
gcs_url = os.environ['GCS_URL']
# Environment values are strings; the ratio is converted to a float here.
train_ratio = float(os.environ['TRAIN_RATIO'])

In [2]:
# %%writefile ../components/load_data.py

from kfp.v2 import dsl
from kfp.v2.dsl import Dataset, Output
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

container_image = os.environ.get("CONTAINER_IMAGE", "python:3.8") # Fallback to a default image if not set

# Use the image resolved just above (including its fallback) instead of a
# `base_image` global defined in a different cell, so this cell is
# self-contained when written out as a standalone module.
# NOTE(review): the fallback is a plain Python image, while the component
# imports google-cloud-storage and pandas -- confirm the image actually used
# provides both packages.
@dsl.component(base_image=container_image)
def load_data(
    gcs_url: str, 
    output_dataset: Output[Dataset]
):
    """Download a CSV file from a GCS URL and save it as the output Dataset.

    Args:
        gcs_url: Location of the CSV object, in the form
            ``gs://<bucket>/<object path>``.
        output_dataset: Output artifact; the CSV is written to its ``path``
            and its metadata records the format.

    Raises:
        ValueError: If ``gcs_url`` does not start with ``gs://`` or does not
            contain both a bucket name and an object path.
    """
    
    # Logic-specific Imports
    from google.cloud import storage
    import pandas as pd

    # Extract bucket and blob info from GCS URL
    scheme = "gs://"
    if not gcs_url.startswith(scheme):
        raise ValueError("Invalid GCS URL format")
    bucket_name, _, blob_name = gcs_url[len(scheme):].partition("/")
    # Reject URLs with no separator as well as ones with an empty bucket or
    # empty object path (e.g. "gs://bucket/" or "gs:///object").
    if not bucket_name or not blob_name:
        raise ValueError("Invalid GCS URL format")

    # Create a GCS client
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    # Read the contents into Pandas DataFrame; the context manager closes the
    # blob reader once parsing is finished.
    with blob.open("rb") as source_file:
        df = pd.read_csv(source_file)

    # Save to the specified path as Dataset
    df.to_csv(output_dataset.path, index=False)
    output_dataset.metadata['dataset_metadata'] = {'format': 'csv'}

  from kfp.v2 import dsl


In [3]:
# %%writefile ../components/mlplatform_pipeline.py

from kfp.v2 import dsl
from kfp.v2 import compiler
import os

# Pipeline definition. Its only step so far is the load_data component, which
# receives the pipeline's gcs_url parameter.
@dsl.pipeline(
    pipeline_root=pipeline_root,
    name="Data Loading and Preprocessing",
    description="A pipeline that loads data, preprocesses it, and deploys the best model.",
)
def mlplatform_pipeline(gcs_url: str = gcs_url):
    # The parameter default is the gcs_url value read from the environment.
    load_data_task = load_data(gcs_url=gcs_url)


In [4]:
# Compile the pipeline function into pipeline.json, the file used to run it.
compiler.Compiler().compile(
    pipeline_func=mlplatform_pipeline,
    package_path="pipeline.json",
)

In [5]:
from datetime import datetime
from google.cloud import aiplatform, aiplatform_v1

# Second-resolution timestamp used as the suffix of the job id.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# Values for the pipeline's runtime parameters. Only gcs_url is passed; the
# other settings read earlier (train ratio, model and endpoint names,
# container image) are not passed as parameters here.
pipeline_parameters = {"gcs_url": gcs_url}

# Define the Vertex AI pipeline job from the compiled pipeline.json.
api_client = aiplatform.PipelineJob(
    display_name="ml-pipeline",
    template_path="pipeline.json",
    job_id=f"ml-pipeline-{TIMESTAMP}",
    parameter_values=pipeline_parameters,
    enable_caching=False,
    project=project_id,
    location=region,
)

# Submit the job, passing the configured service account.
api_client.submit(service_account=service_account)

ImportError: cannot import name 'aiplatform' from 'google.cloud' (unknown location)