In [2]:
from kfp.components import create_component_from_func, OutputPath, InputPath

%load_ext lab_black

# Container image passed as ``base_image`` when the component is created from
# ``plot_confusion_matrix`` further down in this file.
# NOTE(review): ``latest`` is a floating tag -- consider pinning a fixed tag or
# digest if reproducible pipeline runs are required.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"


def plot_confusion_matrix(
    predictions_dir: InputPath(str), mlpipeline_ui_metadata_path: OutputPath(str)
):
    """
    Plot a confusion matrix from true and predicted labels of one target variable.

    The labels are read from two text files, the confusion matrix is computed
    with TensorFlow, and the result is written as inline CSV inside a
    ``mlpipeline_ui_metadata`` JSON document. The plotted confusion matrix can
    be viewed via the Kubeflow UI's visualization for this component inside a
    pipeline run.

    Parameters:
        predictions_dir: Path to a directory with 2 files: true labels
            (ytrue.txt) and predicted labels (ypred.txt). Each file holds one
            label per row. If every row of both files parses as an integer the
            labels are treated as numbers, otherwise all labels are treated
            as strings.
        mlpipeline_ui_metadata_path: Path of the JSON file the confusion
            matrix metadata is written to.

    Raises:
        ValueError: If the two files contain a different number of labels.
    """
    import json
    import logging
    import os
    import sys

    import pandas as pd
    import tensorflow as tf

    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format="%(levelname)s %(asctime)s: %(message)s",
    )
    logger = logging.getLogger()

    def read_rows(file):
        """Return the rows of ``file`` in ``predictions_dir`` without newlines."""
        with open(os.path.join(predictions_dir, file)) as f:
            return [row.strip("\n") for row in f]

    def to_ints(rows):
        """Return ``rows`` converted to ints, or None if any row is not an int."""
        try:
            return [int(row) for row in rows]
        except ValueError:
            return None

    files = ("ytrue.txt", "ypred.txt")
    raw_true, raw_pred = (read_rows(file) for file in files)

    # Fail early with a clear message instead of letting TensorFlow choke on
    # inputs of inconsistent shape.
    if len(raw_true) != len(raw_pred):
        message = "Labels and Predictions have different lengths."
        logger.error(message)
        raise ValueError(message)

    # Decide on one representation for BOTH files: comparing int labels from
    # one file with string labels from the other could never match.
    int_true, int_pred = to_ints(raw_true), to_ints(raw_pred)
    if int_true is not None and int_pred is not None:
        y_true, y_pred, kind = int_true, int_pred, "numbers"
    else:
        y_true, y_pred, kind = raw_true, raw_pred, "strings"
    for file in files:
        logger.info(f"Reading {file} as {kind}.")

    # Sort for a deterministic label order; set iteration order is arbitrary.
    labels = sorted(set(y_true) | set(y_pred))
    logger.info(f"Using the labels {labels}")

    # tf.math.confusion_matrix expects integer class indices in
    # [0, num_classes), so map every label to its position in ``labels``.
    # This makes row/column i of the matrix correspond to labels[i] and also
    # supports string labels and non-contiguous integer labels.
    index_of = {label: index for index, label in enumerate(labels)}
    confusion_matrix = tf.math.confusion_matrix(
        labels=[index_of[label] for label in y_true],
        predictions=[index_of[label] for label in y_pred],
        num_classes=len(labels),
    ).numpy()

    # One (target, predicted, count) row per matrix cell.
    data = [
        (labels[target_index], labels[predicted_index], int(count))
        for target_index, target_row in enumerate(confusion_matrix)
        for predicted_index, count in enumerate(target_row)
    ]

    df = pd.DataFrame(data, columns=["target", "predicted", "count"])

    metadata = {
        "outputs": [
            {
                "type": "confusion_matrix",
                "format": "csv",
                "schema": [
                    {"name": "target", "type": "CATEGORY"},
                    {"name": "predicted", "type": "CATEGORY"},
                    {"name": "count", "type": "NUMBER"},
                ],
                "storage": "inline",
                "source": df.to_csv(
                    columns=["target", "predicted", "count"],
                    header=False,
                    index=False,
                ),
                "labels": labels,
            }
        ]
    }

    logger.info("Dumping mlpipeline_ui_metadata...")
    with open(mlpipeline_ui_metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)

    logger.info("Finished.")


# Build a pipeline component from ``plot_confusion_matrix`` that runs in
# BASE_IMAGE; ``output_component_file`` names the component definition file.
plot_confusion_matrix_comp = create_component_from_func(
    func=plot_confusion_matrix,
    output_component_file="component.yaml",
    base_image=BASE_IMAGE,
)

# Backward-compatible alias: the component was originally bound to this name,
# which does not describe what it does. Prefer ``plot_confusion_matrix_comp``.
load_dataset_comp = plot_confusion_matrix_comp

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
