In [2]:
from kfp.components import create_component_from_func, InputPath, OutputPath

%load_ext lab_black

# Container image the component is built on (passed as `base_image` when the
# component is created further down in this cell).
# NOTE(review): `:latest` is a floating tag — consider pinning a specific
# version so pipeline runs are reproducible.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"


def data_drift_report(
    dataset_dir: InputPath(str),
    ref_dataset_dir: InputPath(str),
    output_dir: OutputPath(str),
    mlpipeline_ui_metadata_path: OutputPath(),
    dataset_type="df",
    additional_args: dict = None,
    column_mapping: dict = None,
):
    """Generate an Evidently data drift report for a dataset against a reference.

    Both datasets are loaded into pandas DataFrames, Evidently's
    ``DataDriftPreset`` is run on them, the report is saved as HTML to
    ``output_dir`` and the same HTML is embedded inline in the UI metadata
    file so it can be rendered by the pipeline UI.

    Args:
        dataset_dir: Path of the current dataset to analyse.
        ref_dataset_dir: Path of the reference dataset to compare against.
        output_dir: Path the HTML report is written to.
        mlpipeline_ui_metadata_path: Path the UI metadata JSON is written to.
        dataset_type: Storage format of both datasets (case-insensitive).
            One of "df" (pickled DataFrame), "df/feather", "csv" or
            "huggingface" (dataset saved to disk).
        additional_args: Optional keyword arguments forwarded to the pandas
            reader for the chosen format. Not used for "huggingface".
        column_mapping: Optional column mapping. A dict is converted to an
            Evidently ``ColumnMapping`` using its keys as field names.

    Raises:
        KeyError: If ``dataset_type`` is not one of the supported formats.
    """

    import json
    import logging
    import sys
    from pathlib import Path

    import pandas as pd
    from evidently.metric_preset import DataDriftPreset
    from evidently.report import Report

    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format="%(levelname)s %(asctime)s: %(message)s",
    )

    def _process_dataframe(dataset, opt_args=None):
        # NOTE: unpickling can execute arbitrary code; only feed this
        # component pickles that come from a trusted pipeline step.
        return pd.read_pickle(dataset, **(opt_args or {}))

    def _process_dataframe_feather(dataset, opt_args=None):
        return pd.read_feather(dataset, **(opt_args or {}))

    def _process_csv(dataset, opt_args=None):
        return pd.read_csv(dataset, **(opt_args or {}))

    def _process_huggingface(dataset, opt_args=None):
        # Imported lazily so that `datasets` is only required when this
        # dataset type is actually requested.
        from datasets import load_from_disk

        # opt_args is accepted to keep a uniform loader signature; it is
        # intentionally not forwarded here.
        return load_from_disk(dataset).to_pandas()

    DATA_TYPES = {
        "df": _process_dataframe,
        "df/feather": _process_dataframe_feather,
        "csv": _process_csv,
        "huggingface": _process_huggingface,
    }

    def process_dataset(dataset, args=None):
        """Load ``dataset`` into a DataFrame using the configured loader."""
        _dataset_type = dataset_type.lower()

        if dataset is None:
            return None
        if _dataset_type not in DATA_TYPES:
            raise KeyError(
                f"Dataset type {_dataset_type} not supported by the data drift component"
            )
        return DATA_TYPES[_dataset_type](dataset, (args or {}))

    logging.info("Preparing datasets for data drift report...")
    df = process_dataset(dataset_dir, args=additional_args)
    ref_data = process_dataset(ref_dataset_dir, args=additional_args)

    # Pipeline parameters arrive as plain dicts, while Evidently works with a
    # ColumnMapping object; convert so a user-supplied mapping is usable.
    mapping = column_mapping
    if isinstance(mapping, dict):
        from evidently.pipeline.column_mapping import ColumnMapping

        mapping = ColumnMapping(**mapping)

    report = Report(metrics=[DataDriftPreset()])

    logging.info("Generating report using Evidently...")
    report.run(current_data=df, reference_data=ref_data, column_mapping=mapping)

    logging.info("Saving report as HTML...")
    Path(output_dir).parent.mkdir(parents=True, exist_ok=True)
    report.save_html(output_dir)

    logging.info("Writing HTML content to Metadata UI...")
    # Use a context manager so the handle is closed, and read as UTF-8
    # explicitly instead of depending on the container's locale.
    with open(output_dir, "r", encoding="utf-8") as report_file:
        html_content = report_file.read()

    metadata = {
        "outputs": [
            {
                "type": "web-app",
                "storage": "inline",
                "source": html_content,
            }
        ]
    }

    with open(mlpipeline_ui_metadata_path, "w") as f:
        json.dump(metadata, f)

    logging.info("Finished.")


# Wrap the function as a reusable pipeline component. The component
# definition is also requested as a file via `output_component_file`.
data_drift_report_op = create_component_from_func(
    data_drift_report,
    base_image=BASE_IMAGE,
    packages_to_install=["evidently==0.2.6"],
    output_component_file="component.yaml",
)

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
