# NLP Demo

In [203]:
import kfp
import kfp.components as comp

In [204]:
UPLOAD_MODEL_COMPONENT_URL = "https://raw.githubusercontent.com/lehrig/kubeflow-ppc64le-components/main/model-building/upload-model/component.yaml"
MODEL_NAME = "question-answering"
MINIO_URL = "minio-service.kubeflow:9000"
MINIO_USER = "minio"
MINIO_PASS = "minio123"

In [244]:
def load_dataset(dataset_dir: comp.OutputPath(str)):
    from datasets import load_dataset
    import os
    squad = load_dataset("squad")
    if not os.path.exists(dataset_dir):
        os.makedirs(dataset_dir)
    
    # with open(dataset_dir + "/dataset.pkl", "wb") as f:
    #     pickle.dump(squad, f)
    squad.save_to_disk(dataset_dir)

        
load_dataset_comp = kfp.components.create_component_from_func(load_dataset, "dataset.yaml",
                                                              "quay.io/jeremie_ch/transformers-component:gpu")

In [206]:
def description(dataset_dir: comp.InputPath(str)):
    import pickle
    from datasets.load import load_from_disk
    
    dataset = load_from_disk(dataset_dir)
    
    # with open(dataset_dir + "/dataset.pkl", "rb") as f:
    #     dataset = pickle.load(f)
    # dataset.describe()


description_comp = kfp.components.create_component_from_func(description, "description.yaml",
                                                             "quay.io/jeremie_ch/transformers-component:gpu")

In [207]:
def preprocess(dataset_dir: comp.InputPath(str),
               preprocess_dir: comp.OutputPath(str)):

    from transformers import AutoTokenizer
    from datasets.load import load_from_disk
    from datasets import load_dataset
    import pickle
    import os
    
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    
    # with open(dataset_dir + "/dataset.pkl", "rb") as f:
    #     squad = pickle.load(f)
    squad = load_from_disk(dataset_dir)
    
    def preprocess_function(examples):
        questions = [q.strip() for q in examples["question"]]
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=384,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )

        offset_mapping = inputs.pop("offset_mapping")
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            answer = answers[i]
            start_char = answer["answer_start"][0]
            end_char = answer["answer_start"][0] + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)

            # Find the start and end of the context
            idx = 0
            while sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # If the answer is not fully inside the context, label it (0, 0)
            if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Otherwise it's the start and end token positions
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions
        return inputs

    tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
    
    if not os.path.exists(preprocess_dir):
        os.makedirs(preprocess_dir)
    
    tokenized_squad.save_to_disk(preprocess_dir)


preprocess_comp = kfp.components.create_component_from_func(preprocess, "preprocess.yaml",
                                                            "quay.io/jeremie_ch/transformers-component:gpu")

In [262]:
def train(preprocess_dir: comp.InputPath(str),
          # model_dir: comp.OutputPath(str),
          model_path: comp.OutputPath(str),
          checkpoint_dir: comp.OutputPath(str)):

    import os
    from datasets import load_from_disk
    import pickle
    from transformers import AutoTokenizer, DefaultDataCollator, \
        AutoModelForQuestionAnswering, TrainingArguments, Trainer
    
    tokenized_squad = load_from_disk(preprocess_dir)
    
    data_collator = DefaultDataCollator()
    
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    training_args = TrainingArguments(
        output_dir=checkpoint_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1, #3,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        # train_dataset=tokenized_squad["train"],
        train_dataset=tokenized_squad["train"].select(range(1000)),
        eval_dataset=tokenized_squad["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
    
    model_dir = os.path.dirname(model_path)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    trainer.save_model(model_dir)


train_comp = kfp.components.create_component_from_func(train, "train.yaml",
                                                       "quay.io/jeremie_ch/transformers-component:gpu")

In [249]:
upload_model_comp = comp.load_component_from_url(
    UPLOAD_MODEL_COMPONENT_URL
)

In [253]:
def pipeline(dataset_dir: str, 
             preprocess_dir: str, 
             # model_dir: str,
             checkpoint_dir: str,
             model_path: str = "/model_dir/training_args.bin",
             model_name: str = "question-answering",
             minio_url: str = MINIO_URL,
             minio_user: str = MINIO_USER,
             minio_pass: str = MINIO_PASS):
    load_dataset_task = load_dataset_comp()
    description_task = description_comp(dataset_dir=load_dataset_task.output)
    preproccess_task = preprocess_comp(dataset_dir=load_dataset_task.output)
    train_task = train_comp(preprocess_dir=preproccess_task.output).set_gpu_limit(1)
    upload_model_task = upload_model_comp(
        train_task.outputs["model_path"],
        minio_url,
        minio_user,
        minio_pass,
        model_name=model_name
    )

In [263]:
arguments = {"dataset_dir": "/dataset_dir", 
             "preprocess_dir": "/preprocess_dir",
             # "model_dir": "/model_dir", 
             "model_name": MODEL_NAME,
             "checkpoint_dir": "/checkpoint_dir",
             "model_path": "/model_dir/training_args.bin",
             "minio_url": MINIO_URL,
             "minio_user": MINIO_USER,
             "minio_pass": MINIO_PASS}

NAMESPACE = "jeremie-chheang-ibm-com"
client = kfp.Client()
client.create_run_from_pipeline_func(
    pipeline,
    arguments=arguments,
    namespace=NAMESPACE
)

KeyError: 'model_path'

'/model_dir'