In [2]:
from kfp.components import create_component_from_func, InputPath, OutputPath

%load_ext lab_black

# Container image handed to create_component_from_func as `base_image` below.
# NOTE(review): the `:latest` tag is not pinned, so the resolved image may
# change between runs -- confirm whether a fixed tag is wanted.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"


def Convert_Speech_to_Text_with_OpenAI_Whisper(
    audio_dir: InputPath(str),
    texts_file: OutputPath(str),
    model_type: str = "base",
    limit: int = 100,
):
    """
    Transcribes speech audio files stored as part of a Huggingface Dataset to text using the Whisper model of OpenAI.

    Every transcription is lower-cased, stripped of ASCII punctuation and
    written to ``texts_file`` on its own line. Sentence-level and corpus-level
    BLEU scores against the dataset's ``transcription`` column are logged.

            Parameters:
                    audio_dir: Directory where to load data from.
                    texts_file: File to which transcribed texts will be written, one transcription per line.
                    model_type: Whisper model to load.
                    limit: Subset size to limit the number of transcriptions. Values larger than the dataset are clamped to the dataset size.
    """

    # All imports stay inside the function: the function is turned into a
    # standalone component via create_component_from_func, so it must not rely
    # on module-level names.
    import logging
    import string
    import sys
    from statistics import mean
    import torch
    import whisper
    from datasets import load_from_disk
    from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
    from tqdm.contrib.logging import tqdm_logging_redirect

    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format="%(levelname)s %(asctime)s: %(message)s",
    )
    logger = logging.getLogger()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Running on {device}")

    dataset = load_from_disk(audio_dir)
    # Clamp the subset size so that a limit larger than the dataset (or a
    # negative one) does not ask `select` for indices that do not exist.
    subset_size = max(0, min(limit, len(dataset)))
    dataset = dataset.select(range(subset_size))
    model = whisper.load_model(model_type, device=device)
    len_ds = len(dataset)
    logger.info(f"Loaded model and data ({len_ds} records). Transcription starts...")
    texts = []
    performance = []
    references = []
    chencherry = SmoothingFunction()
    # Loop-invariant translation table that deletes ASCII punctuation.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    # tqdm_logging_redirect creates the progress bar itself and routes log
    # records through it, so no second tqdm(...) wrapper is needed.
    with tqdm_logging_redirect(dataset, total=len_ds) as progress:
        for sample in progress:
            # NOTE(review): assumes the stored audio already has the sampling
            # rate Whisper expects -- TODO confirm against the dataset.
            audio = sample["audio"]["array"].astype("float32")
            result = model.transcribe(audio=audio)
            text = result["text"].lower().translate(strip_punctuation)
            # NOTE(review): the reference is used as-is while the hypothesis is
            # normalised above; presumably the dataset transcriptions are
            # already lower-case without punctuation -- confirm.
            reference = [sample["transcription"].split()]
            references.append(reference)
            texts.append(text)
            performance.append(
                sentence_bleu(
                    reference, text.split(), smoothing_function=chencherry.method1
                )
            )

    if performance:
        logger.info(f"Average Sentence BLEU of transcriptions: {mean(performance)}")
        logger.info(
            f"Corpus BLEU across all transcriptions: {corpus_bleu(references, [text.split() for text in texts])}"
        )
    else:
        # mean() and corpus_bleu() cannot be computed over an empty subset.
        logger.warning("No records were transcribed; skipping BLEU computation.")

    # writelines() adds no separators itself, so terminate each transcription
    # explicitly; otherwise all texts run together in the output file.
    with open(texts_file, "w", encoding="utf-8") as out:
        out.writelines(f"{text}\n" for text in texts)

    logger.info(f"Finished. Transcriptions written to {texts_file}.")


# Wrap the transcription function as a pipeline component that runs in
# BASE_IMAGE; the call is also given "component.yaml" as its output file.
convert_speech_to_text_comp = create_component_from_func(
    Convert_Speech_to_Text_with_OpenAI_Whisper,
    base_image=BASE_IMAGE,
    output_component_file="component.yaml",
)

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
