# Component Test: Create Data Quality Report with Evidently

## Authors
- Adam Shedivy <Adam.Shedivy@ibm.com>
- Sebastian Lehrig <sebastian.lehrig1@ibm.com>

## License
Apache-2.0 License

In [13]:
import kfp
from kfp import components
from kfp.components import create_component_from_func, OutputPath
import os

%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [14]:
# Container image used for components that this notebook builds from Python functions.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"

# Read the namespace from the service-account files mounted into the pod.
# The value is stripped so that a trailing newline or stray whitespace in the
# file cannot end up inside the namespace name.
with open(
    "/var/run/secrets/kubernetes.io/serviceaccount/namespace", encoding="utf-8"
) as f:
    NAMESPACE = f.read().strip()
NAMESPACE

'user-example-com'

## Component to be tested

In [15]:
# Load the component under test from component.yaml in the current working directory.
create_data_quality_report_comp = components.load_component_from_file("component.yaml")

### Test CSV data type

In [16]:
# Load the Chicago Taxi dataset component from a URL pinned to a specific
# kubeflow/pipelines commit; it supplies the input data for the CSV test pipeline.
chicago_taxi_dataset_comp = components.load_component_from_url(
    "https://raw.githubusercontent.com/kubeflow/pipelines/e3337b8bdcd63636934954e592d4b32c95b49129/components/datasets/Chicago%20Taxi/component.yaml"
)


@kfp.dsl.pipeline(name="test-dq-pipeline-csv")
def test_pipeline():
    """Pipeline: query Chicago Taxi trips and build a data quality report (CSV input).

    The report step is called only to add its task to the pipeline; nothing
    downstream consumes its result, so no output is read from it.
    """
    training_data_csv = chicago_taxi_dataset_comp(
        where='trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-01-10"',
        select="tips,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tolls,extras,trip_total",
        limit=10000,
    ).output

    create_data_quality_report_comp(dataset_dir=training_data_csv, dataset_type="csv")

In [18]:
# Submit the CSV test pipeline as a run; it is started with no pipeline arguments.
kfp.Client().create_run_from_pipeline_func(test_pipeline, arguments={})

RunPipelineResult(run_id=fa2dd235-082c-4c59-98cb-e0f89b112106)

### Test dataframe (pickle) data type

In [19]:
def load_dataframe(output_dir: OutputPath(str)) -> None:
    """Pickle the scikit-learn Iris dataset, as a DataFrame, to ``output_dir``.

    Args:
        output_dir: Output path to write the pickled DataFrame to. Its parent
            directory is created if it does not exist yet.
    """
    # Imports stay inside the body: this function is handed to
    # create_component_from_func, so it has to be self-contained.
    from pathlib import Path
    from sklearn import datasets

    iris_frame = datasets.load_iris(as_frame=True).frame

    destination_parent = Path(output_dir).parent
    destination_parent.mkdir(parents=True, exist_ok=True)

    iris_frame.to_pickle(output_dir)


# Turn load_dataframe into a pipeline component that runs on BASE_IMAGE.
load_iris_data = create_component_from_func(
    load_dataframe,
    base_image=BASE_IMAGE,
)


@kfp.dsl.pipeline(name="test-dq-pipeline-dataframe")
def test_load_dataframe():
    """Pipeline: load the Iris data and build a data quality report from it.

    No ``dataset_type`` is passed to the report component, so the component's
    own default for that input applies.
    """
    load_task = load_iris_data()

    create_data_quality_report_comp(dataset_dir=load_task.output)

In [21]:
# Submit the dataframe test pipeline as a run; it is started with no pipeline arguments.
kfp.Client().create_run_from_pipeline_func(test_load_dataframe, arguments={})

RunPipelineResult(run_id=479e177f-44a7-47f1-ae7b-a7493f32a87d)

### Test Hugging Face datasets data type

In [22]:
# Load the Hugging Face dataset loader component from the components directory
# under the user's home directory. expanduser("~") replaces os.getenv("HOME")
# so that an unset HOME cannot silently produce a "None/components/..." path.
load_huggingface_dataset_comp = components.load_component_from_file(
    os.path.join(
        os.path.expanduser("~"),
        "components/data-collection/load-huggingface-dataset/component.yaml",
    )
)


@kfp.dsl.pipeline(name="test-dq-pipeline-huggingface")
def test_load_huggingface():
    """Pipeline: load the GLUE "mrpc" train split and build a data quality report."""
    load_task = load_huggingface_dataset_comp(
        path="glue",
        configuration="mrpc",
        split="train",
    )

    # The loader's "dataset_dir" output is selected by name and fed to the report step.
    create_data_quality_report_comp(
        dataset_type="huggingface",
        dataset_dir=load_task.outputs["dataset_dir"],
    )

In [23]:
# Submit the Hugging Face test pipeline as a run; it is started with no pipeline arguments.
kfp.Client().create_run_from_pipeline_func(test_load_huggingface, arguments={})

RunPipelineResult(run_id=fa8ebf7d-6b38-4e53-bfb7-85107a715e80)