# Load Huggingface Dataset

In [3]:
from kfp.components import create_component_from_func, OutputPath
from typing import Dict, List, NamedTuple

%load_ext lab_black

# Container image passed as `base_image` when the component is created from
# Load_HuggingFace_Dataset further down in this cell.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"


def Load_HuggingFace_Dataset(
    path: str,
    dataset_dir: OutputPath(str),
    configuration: str = "",
    split: str = None,
    label_columns: List[str] = None,
) -> NamedTuple("LoadDatasetOutput", [("labels", Dict[str, List[str]])]):
    """
    Load a Hugging Face dataset, embed file-backed images as bytes, and save it to disk.

            Parameters:
                    path: Path from which to load the dataset. The Hugging Face Hub for datasets is supported. Example: "Lehrig/Monkey-Species-Collection".
                    dataset_dir: Target directory where the dataset will be saved to. Created if it does not exist yet. Should be available as a mount from a PVC. Example: "/blackboard/dataset".
                    configuration: Name of the dataset configuration to load. An empty string selects no specific configuration. Example: "downsized".
                    split: Split within the dataset. If None, all splits are loaded as a DatasetDict. Example: "train".
                    label_columns: Optional list of label column names to be fetched as optional, additional output. Columns whose feature exposes no label names are skipped with a warning. Example: ["label"].
            Returns:
                    labels: Dictionary mapping label columns to associated labels, if available. Empty dictionary otherwise. Example: {"label": ["cat", "dog"]}
    """

    # Imports are kept function-local so the function stays self-contained
    # when it is handed to create_component_from_func.
    from collections import namedtuple
    from datasets import load_dataset
    from datasets.dataset_dict import DatasetDict
    import logging
    import os
    from PIL.Image import Image
    import sys

    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format="%(levelname)s %(asctime)s: %(message)s",
    )

    # The parameter defaults to "" but load_dataset is given None in that case.
    if not configuration:
        configuration = None
    logging.info(
        f"Loading dataset from '{path}' using configuration '{configuration}'..."
    )
    dataset = load_dataset(path=path, name=configuration, split=split)

    logging.info("Reading image files into bytes...")

    # see: https://huggingface.co/docs/datasets/v2.4.0/en/package_reference/main_classes#datasets.Dataset.save_to_disk
    def read_image_file(example):
        """Replace every file-backed PIL image in the example by its raw file bytes."""
        for column in example:
            value = example[column]
            if not isinstance(value, Image):
                continue
            # Images that were not opened from a file carry an empty (or no)
            # filename; there is nothing to read for them, so they are left
            # untouched instead of failing in open().
            filename = getattr(value, "filename", "")
            if not filename:
                continue
            with open(filename, "rb") as f:
                example[column] = {"bytes": f.read()}
        return example

    # note: batching in map caused caching issues, so not using it for now
    dataset = dataset.map(read_image_file)

    logging.info(f"Saving dataset to '{dataset_dir}'...")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dataset_dir, exist_ok=True)
    dataset.save_to_disk(dataset_dir)

    logging.info(f"Dataset saved. Contents of '{dataset_dir}':")
    logging.info(os.listdir(dataset_dir))

    labels = {}
    if label_columns is not None:
        # With several splits, the features of the first split are used to
        # look up the label names.
        label_source = dataset
        if isinstance(label_source, DatasetDict):
            label_source = next(iter(label_source.values()))
        for label_column in label_columns:
            logging.info(f"Fetching labels from column '{label_column}'...")
            # A column that does not exist still fails on this lookup, as before.
            feature = label_source.features[label_column]
            names = getattr(feature, "names", None)
            if names is None:
                # Feature has no label names; honour the "if available"
                # contract instead of raising AttributeError.
                logging.warning(
                    f"Column '{label_column}' provides no label names; skipping."
                )
                continue
            labels[label_column] = names

    output = namedtuple("LoadDatasetOutput", ["labels"])

    logging.info("Finished.")
    return output(labels)


# Build a Kubeflow Pipelines component from Load_HuggingFace_Dataset, using
# BASE_IMAGE as its base image; "component.yaml" is passed as the
# output_component_file for the generated component definition.
load_huggingface_dataset_comp = create_component_from_func(
    func=Load_HuggingFace_Dataset,
    output_component_file="component.yaml",
    base_image=BASE_IMAGE,
)

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
