In [None]:
import kfp
from kfp.v2.compiler import Compiler
from kfp import components as comp
from kfp.v2.google.client import AIPlatformClient
from google.cloud.aiplatform.pipeline_jobs import PipelineJob
from google.cloud.aiplatform import Model

# Google Cloud project ID for this demo (it also appears in the service
# account e-mail used when the job is run).
project_id = "cde-ds-enablement-8k1r"
# Region handed to PipelineJob as `location`.
region = "europe-west1"
# GCS prefix used as the pipeline root, both in the @kfp.dsl.pipeline
# decorator and when the PipelineJob is created.
pipeline_root_path = "gs://cde-dse-test-artifacts/penguin-model"


def _train_model():
    """Train a penguin sex classifier and write it to Cloud Storage.

    Reads the public penguins table from BigQuery, fits a one-hot-encoder +
    random-forest pipeline that predicts ``sex`` (MALE -> 0, FEMALE -> 1) from
    ``species`` and ``island``, and dumps the fitted pipeline with joblib to
    ``/gcs/cde-dse-test-outputs/model.pkl``.

    Every import is local on purpose: this function is handed to
    ``create_component_from_func``, which packages it as a self-contained
    component, so it must not rely on module-level names.

    Raises:
        KeyError: if ``CLOUD_ML_PROJECT_ID`` is missing from the environment.
    """
    import os
    from pathlib import Path

    from google.cloud import bigquery

    import joblib
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.pipeline import Pipeline

    feature_columns = ["species", "island"]
    label_column = "sex"
    label_encoding = {"MALE": 0, "FEMALE": 1}

    client = bigquery.Client(project=os.environ["CLOUD_ML_PROJECT_ID"])
    # Only pull the three columns the model uses rather than the whole table.
    data = client.query(
        "SELECT species, island, sex FROM `bigquery-public-data`.ml_datasets.penguins"
    ).to_dataframe()

    # Keep complete rows whose label is one we can encode. The raw table uses
    # "." as a placeholder for unknown sex; filtering on the known labels drops
    # it and also guards against any other unexpected value, which would
    # otherwise become NaN in .map() and make astype(int) fail.
    train = data[feature_columns + [label_column]].dropna()
    train = train[train[label_column].isin(list(label_encoding))]

    x = train[feature_columns]
    y = train[label_column].map(label_encoding).astype(int)

    model = Pipeline(
        steps=[
            (
                "feature_engineering",
                ColumnTransformer(
                    transformers=[("one_hot", OneHotEncoder(), feature_columns)],
                    remainder="drop",
                ),
            ),
            (
                "model",
                # Fixed seed so repeated pipeline runs produce the same model.
                RandomForestClassifier(random_state=42),
            ),
        ]
    )

    model.fit(x, y)

    # Persist the fitted pipeline. The /gcs/<bucket>/ prefix is expected to be
    # the Cloud Storage FUSE mount inside the Vertex AI container, so this
    # lands in the `cde-dse-test-outputs` bucket rather than on ephemeral
    # local disk -- TODO confirm for the execution environment in use.
    with Path("/gcs/cde-dse-test-outputs/model.pkl").open("wb") as file_:
        joblib.dump(model, file_)


# Wrap the training function as a lightweight KFP component. The listed
# packages are pip-installed inside the component's container before the
# function runs, so everything the function imports must appear here.
train_model = comp.create_component_from_func(
    _train_model,
    packages_to_install=[
        "scikit-learn",
        # Imported directly by the component; listed explicitly instead of
        # relying on it arriving as a scikit-learn dependency.
        "joblib",
        "google-cloud-bigquery",
        # Required by google-cloud-bigquery 3.x for QueryJob.to_dataframe().
        "db-dtypes",
        "google-cloud-storage",
        "pandas",
        "pyarrow",
    ],
)
    

@kfp.dsl.pipeline(
    pipeline_root=pipeline_root_path,
    name="penguins",
)
def pipeline():
    """Single-step pipeline that runs the ``train_model`` component."""
    train_model()

    
# The compiled job spec is written to local disk and then submitted, so the
# path is named once and shared by both steps.
pipeline_package_path = "pipeline.json"

Compiler().compile(
    pipeline_func=pipeline,
    package_path=pipeline_package_path,
)


# Create the Vertex AI pipeline job from the compiled spec. `project` is
# passed explicitly so the job is created in `project_id` instead of whatever
# default project the ambient credentials resolve to.
job = PipelineJob(
    display_name="My first penguin pipeline",
    enable_caching=False,
    template_path=pipeline_package_path,
    parameter_values={},
    pipeline_root=pipeline_root_path,
    project=project_id,
    location=region,
)

# Execute the job under the dedicated pipeline service account.
job.run(service_account="cde-dse-test-pipeline@cde-ds-enablement-8k1r.iam.gserviceaccount.com")

In [None]:
%pip install kfp google-cloud-aiplatform