In [1]:
from google.cloud import bigquery

client = bigquery.Client()

In [5]:
job = client.query("SELECT * FROM `bigquery-public-data`.ml_datasets.penguins")
df = job.to_dataframe()

In [6]:
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie Penguin (Pygoscelis adeliae),Dream,36.6,18.4,184.0,3475.0,FEMALE
1,Adelie Penguin (Pygoscelis adeliae),Dream,39.8,19.1,184.0,4650.0,MALE
2,Adelie Penguin (Pygoscelis adeliae),Dream,40.9,18.9,184.0,3900.0,MALE
3,Chinstrap penguin (Pygoscelis antarctica),Dream,46.5,17.9,192.0,3500.0,FEMALE
4,Adelie Penguin (Pygoscelis adeliae),Dream,37.3,16.8,192.0,3000.0,FEMALE


In [8]:
from sklearn.pipeline import Pipeline

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

train = df[["species", "island", "sex"]].dropna().loc[lambda row: row.sex != "."]

x = train.drop("sex", axis=1)
y = train["sex"].map({'MALE':0, 'FEMALE':1}).astype(int)

model = Pipeline(
    steps=[
        (
            "feature_engineering", 
            ColumnTransformer(
              transformers=[("one_hot", OneHotEncoder(), ["species", "island"])],
              remainder="drop"
            )
        ),
        (
            "model",
            RandomForestClassifier()
        )       
    ]
)

model.fit(x, y)
model.predict(x)

array([0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [61]:
import kfp
from kfp.v2.compiler import Compiler
from kfp import components as comp
from kfp.v2.google.client import AIPlatformClient
from google.cloud.aiplatform.pipeline_jobs import PipelineJob
from google.cloud.aiplatform import Model

project_id = "cde-ds-enablement-8k1r"
region = "europe-west1"
pipeline_root_path = "gs://cde-dse-test-artifacts/penguin-model"


def _train_model():
    """Trains our penguin model."""
    import os
    
    # from google.cloud import storage
    from google.cloud import bigquery
    
    import joblib
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.pipeline import Pipeline

    client = bigquery.Client(project=os.environ["CLOUD_ML_PROJECT_ID"])    
    data = client.query("SELECT * FROM `bigquery-public-data`.ml_datasets.penguins").to_dataframe()
    
    train = data[["species", "island", "sex"]].dropna().loc[lambda row: row.sex != "."]

    x = train.drop("sex", axis=1)
    y = train["sex"].map({'MALE':0, 'FEMALE':1}).astype(int)

    model = Pipeline(
        steps=[
            (
                "feature_engineering", 
                ColumnTransformer(
                  transformers=[("one_hot", OneHotEncoder(), ["species", "island"])],
                  remainder="drop"
                )
            ),
            (
                "model",
                RandomForestClassifier()
            )       
        ]
    )

    model.fit(x, y)
    
    # Save model artifact to local filesystem (doesn't persist)
    with Path("/gcs/cde-dse-test-outputs/model.pkl").open("wb") as file_:
        joblib.dump(model, file_)


train_model = comp.create_component_from_func(
    _train_model,
    packages_to_install=[
        "scikit-learn", 
        "google-cloud-bigquery", 
        "google-cloud-storage", 
        "pandas", 
        "pyarrow"
    ]
)
    

@kfp.dsl.pipeline(
    name="penguins",
    pipeline_root=pipeline_root_path)
def pipeline():
    train_model()

    
Compiler().compile(
    pipeline_func=pipeline, 
    package_path='pipeline.json'
)


job = PipelineJob(
    display_name="My first penguin pipeline",
    enable_caching=False,
    template_path="pipeline.json",
    parameter_values={},
    pipeline_root=pipeline_root_path,
    location=region,
)

job.run(service_account="cde-dse-test-pipeline@cde-ds-enablement-8k1r.iam.gserviceaccount.com")

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/409590257761/locations/europe-west1/pipelineJobs/penguins-20211021063307
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/409590257761/locations/europe-west1/pipelineJobs/penguins-20211021063307')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/europe-west1/pipelines/runs/penguins-20211021063307?project=409590257761
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/409590257761/locations/europe-west1/pipelineJobs/penguins-20211021063307 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/409590257761/locations/europe-west1/pipelineJobs/penguins-20211021063307 curr

RuntimeError: Job failed with:
code: 9
message: "The DAG failed because some tasks failed. The failed tasks are: [train-model].; Job (project_id = cde-ds-enablement-8k1r, job_id = 1949495688698003456) is failed due to the above error.; Failed to handle the job: {project_number = 409590257761, job_id = 1949495688698003456}"


In [60]:
Model.upload

<bound method Model.upload of <class 'google.cloud.aiplatform.models.Model'>>