# In [21]:
from kfp.components import create_component_from_func, InputPath, OutputPath

# In [25]:
# Container image passed as `base_image` when the component is built below.
# NOTE(review): the `latest` tag is mutable, so the component is not pinned to
# a specific image build -- consider pinning a versioned tag or digest.
BASE_IMAGE = 'quay.io/ibm/kubeflow-notebook-image-ppc64le:latest'

def data_quality_report(
    dataset_dir: InputPath(str),
    output_dir: OutputPath(str),
    mlpipeline_ui_metadata_path: OutputPath(),
    dataset_type='df',
    ref_dataset_dir: InputPath(str) = None,
    additional_args: dict = None,
    column_mapping: dict = None
):
    """
    Generate an Evidently data quality report for a given dataset.

    The dataset (and, optionally, a reference dataset) is loaded into a
    pandas DataFrame, a ``DataQualityPreset`` report is run on it, the report
    is saved as HTML, and the same HTML is embedded inline in the ML Pipeline
    UI metadata file.

    Args:
        dataset_dir (str): Path to the dataset to analyse. Despite the name,
            the value is handed directly to the loader selected by
            `dataset_type` (e.g. `pd.read_pickle` / `pd.read_csv`).
        output_dir (str): Path of the report HTML file to write. Its parent
            directory is created if it does not exist.
        mlpipeline_ui_metadata_path (str): Path to the file where the metadata
            for the ML Pipeline UI will be saved.
        dataset_type (str, optional): Type of the dataset. Must be one of
            'df', 'csv', or 'huggingface' (case-insensitive). Defaults to 'df'.
        ref_dataset_dir (str, optional): Path to a reference dataset for
            comparison. It is loaded with the same `dataset_type` and
            `additional_args` as the current dataset. Defaults to None.
        additional_args (dict, optional): Additional keyword arguments passed
            to the dataset loading function. Defaults to None.
        column_mapping (dict, optional): Column mapping forwarded unchanged to
            `Report.run`. Defaults to None.

    Returns:
        None: The function saves the report HTML file and metadata to disk.

    Raises:
        KeyError: If the `dataset_type` argument is not one of 'df', 'csv',
            or 'huggingface'.
    """
    # Imports live inside the function body so the function stays
    # self-contained when it is turned into a pipeline component.
    import json
    from pathlib import Path

    import pandas as pd
    from evidently.metric_preset import DataQualityPreset
    from evidently.report import Report

    def _process_dataframe(dataset, opt_args=None):
        """
        Load a pickled Pandas DataFrame.

        Args:
            dataset (str): Path to the DataFrame pickle file.
            opt_args (dict, optional): Additional arguments to be passed to
                `pd.read_pickle`. Defaults to None.

        Returns:
            pd.DataFrame: The loaded DataFrame.
        """
        # NOTE(review): unpickling can execute arbitrary code; only use this
        # loader with artifacts produced by trusted pipeline steps.
        return pd.read_pickle(dataset, **(opt_args or {}))

    def _process_csv(dataset, opt_args=None):
        """
        Load a CSV file.

        Args:
            dataset (str): Path to the CSV file.
            opt_args (dict, optional): Additional arguments to be passed to
                `pd.read_csv`. Defaults to None.

        Returns:
            pd.DataFrame: The loaded DataFrame.
        """
        return pd.read_csv(dataset, **(opt_args or {}))

    def _process_huggingface(dataset, opt_args=None):
        """
        Convert a Hugging Face dataset to a DataFrame.

        Args:
            dataset: Value handed to the `pd.DataFrame` constructor.
            opt_args (dict, optional): Additional arguments to be passed to
                `pd.DataFrame`. Defaults to None.

        Returns:
            pd.DataFrame: The resulting DataFrame.
        """
        # NOTE(review): the caller passes a filesystem path (str) here, which
        # the `pd.DataFrame` constructor does not accept as data. Presumably
        # the dataset should be loaded from disk first -- confirm and fix
        # together with the component's installed packages.
        return pd.DataFrame(dataset, **(opt_args or {}))

    DATA_TYPES = {
        'df': _process_dataframe,
        'csv': _process_csv,
        'huggingface': _process_huggingface
    }

    def process_dataset(dataset, args=None):
        """
        Load a dataset with the loader matching `dataset_type`.

        Args:
            dataset (str): Path to the dataset, or None.
            args (dict, optional): Additional arguments to be passed to the
                loading function. Defaults to None.

        Returns:
            pd.DataFrame: The loaded DataFrame, or None when `dataset` is None.

        Raises:
            KeyError: If `dataset_type` is not a supported type.
        """
        _dataset_type = dataset_type.lower()

        # An absent (optional) dataset is passed through as None.
        if dataset is None:
            return None
        if _dataset_type not in DATA_TYPES:
            raise KeyError(f"Dataset type {_dataset_type} not supported by the data quality component")
        return DATA_TYPES[_dataset_type](dataset, args or {})

    # prepare datasets for DataQuality Report
    df = process_dataset(dataset_dir, args=additional_args)
    ref_data = process_dataset(ref_dataset_dir, args=additional_args)

    report = Report(metrics=[
        DataQualityPreset()
    ])

    # Generate Report using Evidently API
    report.run(current_data=df, reference_data=ref_data,
               column_mapping=column_mapping)

    # Save Report as HTML
    report_path = Path(output_dir)
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report.save_html(output_dir)

    # Read the report back for inline embedding; `read_text` closes the file
    # handle deterministically, and the encoding is explicit so the result
    # does not depend on the container locale.
    html_content = report_path.read_text(encoding='utf-8')

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'inline',
            'source': html_content,
        }]
    }

    # Output/ Endpoint: Write HTML content to Metadata UI
    with open(mlpipeline_ui_metadata_path, 'w') as f:
        json.dump(metadata, f)

# In [26]:
# Build the pipeline component from the Python function. The component
# definition is also written to `component.yaml`.
data_quality_report_op = create_component_from_func(
    data_quality_report,
    # Runtime environment: base container image plus extra pip packages.
    base_image=BASE_IMAGE,
    packages_to_install=['evidently==0.2.0'],
    # Serialized component specification.
    output_component_file='component.yaml',
)