# Component Test: Plot Confusion Matrix

## Author
- Sebastian Lehrig <sebastian.lehrig1@ibm.com>

## License
Apache-2.0 License

## Imports & Constants

In [442]:
import kfp
from kfp.components import (
    InputPath,
    OutputPath
)
import kfp.dsl as dsl
import os

In [449]:
# Container image used as base image for the generated test components.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"

# Kubeflow Pipelines client used further down to submit the test run.
KFP_CLIENT = kfp.Client()

# The namespace is read from the service-account mount of the pod.
# Surrounding whitespace (e.g. a trailing newline) is stripped so that it can
# never become part of the namespace name passed to the KFP client.
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
    NAMESPACE = f.read().strip()
NAMESPACE

'user-example-com'

## Component to be tested

In [444]:
# Load the component under test from its specification file
# (path is relative to the current working directory).
plot_confusion_matrix_comp = kfp.components.load_component_from_file("component.yaml")

## Test Data Generators

In [445]:
def generate_test_dataset(
    dataset_dir: OutputPath(str)
):
    """Create a tiny train/test dataset and save it to ``dataset_dir``.

    The dataset consists of three rows with two feature columns
    (``feature1``, ``feature2``) and two label columns (``label1``,
    ``label2``). It is split into a ``train`` and a ``test`` part before
    being written to disk.

    Args:
        dataset_dir: Output directory the split dataset is saved to.
    """
    # Imports live inside the function because the function body is what gets
    # packaged into the pipeline component.
    from datasets import Dataset
    import os

    data = {
        "feature1": [[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]],
        "feature2": [True, False, True],
        "label1": [1, 0, 2],
        "label2": [True, False, True]
    }

    dataset = Dataset.from_dict(data)

    # A fixed seed keeps the split identical across runs, so the component
    # test always sees the same train/test rows.
    dataset = dataset.train_test_split(test_size=0.3, seed=42)

    # exist_ok avoids the separate existence check (and its race window).
    os.makedirs(dataset_dir, exist_ok=True)
    dataset.save_to_disk(dataset_dir)


# Turn the dataset generator into a pipeline component that runs on BASE_IMAGE.
generate_test_dataset_comp = kfp.components.create_component_from_func(
    generate_test_dataset, base_image=BASE_IMAGE
)

In [446]:
def generate_test_model(
    dataset_dir: InputPath(str),
    model_dir: OutputPath(str)
):
    """Train a small two-input, two-output Keras model and save it.

    The ``train`` split of the dataset stored in ``dataset_dir`` is converted
    to a TensorFlow dataset, a minimal functional model with the inputs
    ``feature1``/``feature2`` and the outputs ``label1``/``label2`` is fitted
    on it for 10 epochs, and the result is saved to ``model_dir``.

    Args:
        dataset_dir: Directory of a dataset saved with ``save_to_disk`` that
            contains a ``train`` split.
        model_dir: Output directory the trained model is saved to.
    """
    # Imports live inside the function because the function body is what gets
    # packaged into the pipeline component.
    from datasets import load_from_disk
    import os
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import (
        Concatenate,
        Dense,
        Input
    )
    from transformers import DefaultDataCollator

    dataset = load_from_disk(dataset_dir)
    print(dataset["train"])

    data_collator = DefaultDataCollator(return_tensors="tf")
    # Batching happens here; the resulting dataset already yields batches.
    train_dataset = dataset["train"].to_tf_dataset(
        columns=['feature1', 'feature2'],
        label_cols=['label1', 'label2'],
        shuffle=True,
        batch_size=32,
        collate_fn=data_collator
    )

    # Layer names match the dataset column names used above.
    input1 = Input(shape=(5,), name='feature1')
    input2 = Input(shape=(1,), name='feature2')
    # Named ``merged_inputs`` so the builtin ``input`` is not shadowed.
    merged_inputs = Concatenate()([input1, input2])
    dense1 = Dense(
        units=32,
        activation="tanh",
        kernel_initializer='random_normal'
    )(merged_inputs)
    output1 = Dense(
        units=1,
        kernel_initializer='random_normal',
        name='label1'
    )(dense1)
    output2 = Dense(
        units=1,
        kernel_initializer='random_normal',
        name='label2'
    )(dense1)

    model = Model(inputs=[input1, input2], outputs=[output1, output2])

    model.compile(loss='mse', optimizer='sgd', metrics=['accuracy'])

    # No batch_size here: the input is an already-batched dataset, and Keras
    # documents that batch_size must not be specified for dataset inputs.
    model.fit(train_dataset, epochs=10)

    # exist_ok avoids the separate existence check (and its race window).
    os.makedirs(model_dir, exist_ok=True)
    model.save(model_dir)


# Turn the model generator into a pipeline component that runs on BASE_IMAGE.
generate_test_model_comp = kfp.components.create_component_from_func(
    generate_test_model, base_image=BASE_IMAGE
)

## Create pipeline

In [447]:
@dsl.pipeline(
    name='Component Test - Plot Confusion Matrix',
    description='A simple component test'
)
def train_pipeline():
    # Arguments handed to the component under test: the feature columns and,
    # per label column, a list of label names.
    feature_names = ['feature1', 'feature2']
    label_names = {
        'label1': ['Label 1A', 'Label 1B', 'Label 1C'],
        'label2': ['Label 2A', 'Label 2B'],
    }

    # Step 1: generate the test dataset.
    dataset_task = generate_test_dataset_comp()
    dataset_output = dataset_task.outputs['dataset_dir']

    # Step 2: train a test model on that dataset.
    model_task = generate_test_model_comp(dataset_output)

    # Step 3: run the component under test on the dataset and the model.
    plot_confusion_matrix_comp(
        input_columns=feature_names,
        label_columns=label_names,
        prep_dataset_dir=dataset_output,
        model_dir=model_task.outputs['model_dir']
    )

## Run the pipeline within an experiment

In [448]:
# Submit the test pipeline (it takes no arguments) as a run in the namespace
# read above; as the last expression of the cell, the result is displayed.
KFP_CLIENT.create_run_from_pipeline_func(train_pipeline, arguments={}, namespace=NAMESPACE)

RunPipelineResult(run_id=6fbe4fb5-4fbd-48c8-af47-6981020a41a3)