# Component Test: Create Data Drift Report

## Authors
- Adam Shedivy <Adam.Shedivy@ibm.com>
- Sebastian Lehrig <sebastian.lehrig1@ibm.com>

## License
Apache-2.0 License

In [1]:
import kfp
from kfp import components
from kfp.components import create_component_from_func, InputPath, OutputPath
import os
from pathlib import Path

%load_ext lab_black

In [2]:
# Container image used for components built from Python functions in this notebook.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"

# Namespace of the pod this notebook runs in, as published by Kubernetes in the
# mounted service-account directory.
NAMESPACE = Path(
    "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
).read_text()
NAMESPACE

'user-example-com'

## Component to be tested

In [12]:
# Load the component under test from the component.yaml in the current working directory.
create_data_drift_report_comp = components.load_component_from_file("component.yaml")

### Test CSV data type

In [13]:
# Chicago Taxi dataset component from the kubeflow/pipelines repository. The URL
# embeds a commit hash rather than a branch name, so the definition is pinned.
chicago_taxi_dataset_comp = components.load_component_from_url(
    "https://raw.githubusercontent.com/kubeflow/pipelines/e3337b8bdcd63636934954e592d4b32c95b49129/components/datasets/Chicago%20Taxi/component.yaml"
)


@kfp.dsl.pipeline(name="test-dq-pipeline-csv")
def test_pipeline():
    """Exercise the data drift report component with CSV inputs.

    Two extracts of the Chicago Taxi dataset (at most 10000 rows each) are
    fetched: trips started on 2019-01-01 up to, but excluding, 2019-01-10 as the
    dataset under test, and trips started on 2019-02-01 up to, but excluding,
    2019-02-10 as the reference dataset. Both are passed to the drift report
    component with ``dataset_type="csv"``.
    """
    # Defined once so the two extracts cannot end up with different columns
    # by accident.
    columns = "tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total"

    training_data_csv = chicago_taxi_dataset_comp(
        where='trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-01-10"',
        select=columns,
        limit=10000,
    ).output

    ref_data_csv = chicago_taxi_dataset_comp(
        where='trip_start_timestamp >= "2019-02-01" AND trip_start_timestamp < "2019-02-10"',
        select=columns,
        limit=10000,
    ).output

    # Nothing downstream consumes the report, so the task is created without
    # binding its result to a name.
    create_data_drift_report_comp(
        dataset_dir=training_data_csv, ref_dataset_dir=ref_data_csv, dataset_type="csv"
    )

In [14]:
# Submit the CSV pipeline as a run; the pipeline function takes no arguments.
kfp.Client().create_run_from_pipeline_func(test_pipeline, arguments={})

RunPipelineResult(run_id=ebcd04c2-6a0f-46d5-8e47-2957086d27d0)

### Test dataframe (pickle) data type

In [31]:
from sklearn import datasets
from pathlib import Path

# Preview rows 75 onward of the iris frame: the same slice the dataframe
# pipeline in this notebook requests as its reference dataset (start_idx=75).
datasets.load_iris(as_frame=True).frame[75:]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
75,6.6,3.0,4.4,1.4,1
76,6.8,2.8,4.8,1.4,1
77,6.7,3.0,5.0,1.7,1
78,6.0,2.9,4.5,1.5,1
79,5.7,2.6,3.5,1.0,1
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [32]:
def load_dataframe(
    output_dir: OutputPath(str), start_idx: int = None, end_idx: int = None
) -> None:
    """Pickle a row slice of the scikit-learn iris data frame.

    Args:
        output_dir: File path the pickled frame is written to. Its parent
            directory is created if it does not exist yet.
        start_idx: First row of the slice; ``None`` starts at the first row.
        end_idx: Row at which the slice stops (exclusive); ``None`` runs to
            the last row.
    """
    # NOTE(review): imports are kept function-local, presumably because the
    # function is turned into a standalone component and has to be
    # self-contained when it runs in the container; confirm before moving them.
    from sklearn import datasets
    from pathlib import Path

    Path(output_dir).parent.mkdir(parents=True, exist_ok=True)

    rows = slice(start_idx, end_idx)
    selected = datasets.load_iris(as_frame=True).frame[rows]
    selected.to_pickle(output_dir)


# Wrap load_dataframe as a pipeline component that runs on BASE_IMAGE.
load_iris_data = create_component_from_func(
    load_dataframe,
    base_image=BASE_IMAGE,
)


@kfp.dsl.pipeline(name="test-dq-pipeline-dataframe")
def test_load_dataframe():
    """Exercise the data drift report component with pickled data frames.

    Rows 0-74 of the iris frame serve as the dataset under test and rows 75
    onward as the reference dataset. No ``dataset_type`` is passed, so the
    component's own default applies.
    """
    iris_data = load_iris_data(start_idx=0, end_idx=75).output

    ref_data = load_iris_data(start_idx=75, end_idx=None).output

    # Nothing downstream consumes the report, so the task is created without
    # binding it to a name.
    create_data_drift_report_comp(dataset_dir=iris_data, ref_dataset_dir=ref_data)

In [33]:
# Submit the dataframe pipeline as a run; the pipeline function takes no arguments.
kfp.Client().create_run_from_pipeline_func(test_load_dataframe, arguments={})

RunPipelineResult(run_id=a2931f66-3b63-4652-926e-f40997b568fe)

### Test Hugging Face datasets data type

In [10]:
# Load the Hugging Face dataset loader component from a checkout under the
# user's home directory. Path.home() replaces os.getenv("HOME") so that an
# unset HOME cannot silently produce the bogus path "None/kubeflow-ppc64le-components/...".
load_huggingface_dataset_comp = components.load_component_from_file(
    str(
        Path.home()
        / "kubeflow-ppc64le-components/data-collection/load-huggingface-dataset/component.yaml"
    )
)


@kfp.dsl.pipeline(name="test-dq-pipeline-huggingface")
def test_load_huggingface():
    """Exercise the data drift report component with a Hugging Face dataset.

    The "train" split of the "mrpc" configuration of "glue" is loaded and
    passed to the drift report component with ``dataset_type="huggingface"``.
    No reference dataset is passed in this test.
    """
    huggingface_dataset = load_huggingface_dataset_comp(
        path="glue", configuration="mrpc", split="train"
    ).outputs["dataset_dir"]

    # Nothing downstream consumes the report, so the task is created without
    # binding it to a name.
    create_data_drift_report_comp(
        dataset_dir=huggingface_dataset,
        dataset_type="huggingface",
    )

In [11]:
# Submit the Hugging Face pipeline as a run; the pipeline function takes no arguments.
kfp.Client().create_run_from_pipeline_func(test_load_huggingface, arguments={})

RunPipelineResult(run_id=aaa308e9-89de-4138-bd2b-1732572d0588)