In [1]:
import kfp
from kfp import components
import os
from pathlib import Path

In [2]:
# Read the namespace recorded in the mounted service-account directory and
# show it as the cell's output.
NAMESPACE = Path("/var/run/secrets/kubernetes.io/serviceaccount/namespace").read_text()
NAMESPACE

'user-example-com'

In [3]:
# Two levels above the notebook's working directory.
MONITORING = Path.cwd().parent.parent
# Location of the sibling `html-viewer` component, kept as a plain string.
HTML_VIEWER = str(MONITORING / 'html-viewer')

In [4]:
# Chicago Taxi dataset component, fetched from a commit-specific URL in the
# kubeflow/pipelines repository.
CHICAGO_TAXI_COMPONENT_URL = 'https://raw.githubusercontent.com/kubeflow/pipelines/e3337b8bdcd63636934954e592d4b32c95b49129/components/datasets/Chicago%20Taxi/component.yaml'
chicago_taxi_dataset_op = components.load_component_from_url(CHICAGO_TAXI_COMPONENT_URL)

# Component definition read from `component.yaml` in the current working directory.
data_quality_op = components.load_component_from_file('component.yaml')

# HTML viewer component from the sibling `html-viewer` directory.
# NOTE(review): not referenced by any pipeline visible in this notebook.
html_viewer_component_file = os.path.join(HTML_VIEWER, 'component.yaml')
html_viewer_op = components.load_component_from_file(html_viewer_component_file)

In [5]:
@kfp.dsl.pipeline(name='test-dq-pipeline-csv')
def test_pipeline():
    # Step 1: query up to 10000 rows of Chicago Taxi trips as CSV.
    taxi_task = chicago_taxi_dataset_op(
        where='trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-01-10"',
        select='tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total',
        limit=10000,
    )
    taxi_csv = taxi_task.output

    # Step 2: hand the CSV to the data-quality component, declaring its type.
    quality_task = data_quality_op(dataset_dir=taxi_csv, dataset_type='csv')
    # The output is bound here but no later step consumes it.
    quality_report = quality_task.output


In [6]:
# Submit the CSV pipeline as a run with no arguments; the returned run result
# is the cell's displayed value.
kfp.Client().create_run_from_pipeline_func(pipeline_func=test_pipeline, arguments={})

RunPipelineResult(run_id=b63068ae-72a8-4942-a866-0e7a7ea0a59f)

### Test the DataFrame dataset type

In [7]:
from kfp.components import create_component_from_func, InputPath, OutputPath


In [8]:
def load_dataset(
    output_dir: OutputPath(str)
) -> None:
    """Write the Iris dataset to ``output_dir`` as a pickled pandas DataFrame.

    Args:
        output_dir: Output path injected by Kubeflow Pipelines. Despite the
            name it is used as a file path: the pickle is written directly
            to it, and only its parent directory is created.
    """
    # Imports live inside the function body so the function stays
    # self-contained when it is wrapped as a component.
    from pathlib import Path

    from sklearn import datasets

    # as_frame=True makes the returned bunch expose the data as a DataFrame
    # via `.frame`.
    iris_data = datasets.load_iris(as_frame=True)

    # Make sure the parent directory of the output path exists before writing.
    Path(output_dir).parent.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the output is declared as `str` although the payload is a
    # pickle file -- confirm this matches the data-quality component's input.
    iris_data.frame.to_pickle(output_dir)
    
    
# Wrap `load_dataset` as a pipeline component that runs in the given image.
load_iris_data = create_component_from_func(
    load_dataset,
    base_image="quay.io/ibm/kubeflow-notebook-image-ppc64le:latest",
    # The PyPI name `sklearn` is a deprecated placeholder whose installation
    # fails; the distribution that provides `import sklearn` is `scikit-learn`.
    packages_to_install=['scikit-learn'],
)
    


In [9]:
from sklearn import datasets

@kfp.dsl.pipeline(name='test-dq-pipeline-dataframe')
def test_load_dataframe():
    # Step 1: produce the pickled Iris DataFrame.
    iris_task = load_iris_data()

    # Step 2: pass it to the data-quality component; no `dataset_type` is
    # given here, so the component's own default applies.
    quality_task = data_quality_op(dataset_dir=iris_task.output)

In [10]:
# Submit the DataFrame pipeline as a run with no arguments; the returned run
# result is the cell's displayed value.
kfp.Client().create_run_from_pipeline_func(pipeline_func=test_load_dataframe, arguments={})

RunPipelineResult(run_id=26cb7774-c6e8-4e6a-9d0f-44aeeaa8ce2b)