In [1]:
import kfp.dsl as dsl
from kfp.v2 import compiler
from kfp.v2.google.client import AIPlatformClient
from kfp.v2.dsl import component,Model,Output,Dataset
from typing import List,Union,Tuple,NamedTuple
from kfp.v2.dsl import Input, Model, Output, Dataset, Metrics, ClassificationMetrics, component, Artifact

In [50]:
# Notebook-wide configuration (the real values are redacted placeholders).
PROJECT_ID = 'XXXXXX'  # project identifier, passed to the job as a pipeline parameter
BUCKET_NAME = 'XXXXXX'  # bucket name, passed to the job as a pipeline parameter
PIPELINE_NAME = 'pipeline-test-metadata'  # used as the `name` of the dsl.pipeline
PIPELINE_ROOT = 'gs://XXXXXX/test_metadata'  # used as `pipeline_root` of the dsl.pipeline

In [68]:
@component(base_image='gcr.io/deeplearning-platform-release/sklearn-cpu:latest')
def get_data_op(
        output_dataset:Output[Dataset]
)->None:
    """Fetch the California housing features and store them in the output artifact.

    The feature matrix is written as ``x.csv`` inside a directory created at
    ``output_dataset.path``. The target values are not exported.

    Args:
        output_dataset: Output Dataset artifact; its ``path`` is used as the
            destination directory for the CSV file.
    """
    # Imports stay inside the function body, as in the original component.
    import os
    import pandas as pd
    from sklearn.datasets import fetch_california_housing

    california = fetch_california_housing()
    features = california.data

    # Treat the artifact path as a directory and create it before writing.
    os.makedirs(output_dataset.path, exist_ok=True)
    # Write the CSV file directly into that directory.
    pd.DataFrame(features).to_csv(os.path.join(output_dataset.path, 'x.csv'), index=False)
    print(f'File path: {output_dataset.path}')


In [69]:
@component(base_image='gcr.io/deeplearning-platform-release/sklearn-cpu:latest')
def train(
        input_dataset: Input[Dataset], # receives its input as a Dataset artifact
)->None:
    """Read ``x.csv`` from the input artifact and print a preview of it.

    Args:
        input_dataset: Input Dataset artifact; ``x.csv`` is looked up inside
            the directory at ``input_dataset.path``.
    """
    import os
    import pandas as pd

    # Log the artifact object and its path before touching the file.
    print('input_dataset:',input_dataset)
    print('input_dataset.path:',input_dataset.path)

    csv_path = os.path.join(input_dataset.path, 'x.csv')
    features = pd.read_csv(csv_path)
    print('df_input.head(): ',features.head())

In [70]:
# Build the pipeline
@dsl.pipeline(
    name=PIPELINE_NAME,
    description='XXXXXX_test_metadata',
    pipeline_root=PIPELINE_ROOT
)
def pipeline(PROJECT_ID:str,BUCKET_NAME:str,PIPELINE_ROOT:str):
    # The three parameters are declared as pipeline inputs but are not read in
    # this body; the decorator above uses the module-level constants instead.

    # Step 1: produce the dataset artifact.
    data_task = get_data_op()
    dataset_artifact = data_task.outputs['output_dataset']

    # Step 2: hand that artifact to the train component.
    train(input_dataset=dataset_artifact)

In [None]:
# Compile the pipeline function into a JSON package file
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=pipeline, package_path='./XXXXXX.json')

In [72]:
from datetime import datetime

from google.cloud import aiplatform

# Second-resolution timestamp (naive local time) appended to the job id.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# One value for each parameter declared on the `pipeline` function.
job_parameters = {
    "PROJECT_ID": PROJECT_ID,
    "BUCKET_NAME": BUCKET_NAME,
    "PIPELINE_ROOT": PIPELINE_ROOT,
}

# NOTE(review): template_path is not the same string as the package_path
# written by the compile cell ('./XXXXXX.json') — confirm the compiled spec
# is actually available at this location.
pipeline_job = aiplatform.PipelineJob(
    display_name="metadata-pipeline",
    template_path="XXXXX/metadata_pipeline.json",
    job_id=f"metadata-pipeline-{TIMESTAMP}",
    parameter_values=job_parameters,
    enable_caching=True,
)

In [None]:
# Submit the PipelineJob configured in the previous cell.
pipeline_job.submit()