In [None]:

import kfp
from kfp import dsl
import kfp.components as components
from typing import NamedTuple
from datetime import datetime

def split_data():
    """Download the raw PCB dataset from MinIO, split/resize it and upload the result.

    Steps performed:
      1. Download every object under ``datasets/pcb_defect_detection/PCB_DATASET/``
         of the ``kubeflow`` bucket into ``/tmp/PCB_DATASET``.
      2. For each class folder under ``images/``, split its ``.jpg`` files
         80/10/10 into train/val/test (``random_state=42``).
      3. Resize every image to 640x640 and rescale the bounding boxes of its XML
         annotation accordingly, writing both to ``/tmp/PCB_Resized/<split>``.
      4. Write a YOLO-format ``.txt`` label file next to every annotation.
      5. Upload ``/tmp/PCB_Resized`` to
         ``datasets/pcb_defect_detection/PCB_Resized`` in the same bucket.

    NOTE: this function is turned into a pipeline step with
    ``create_component_from_func``; imports and helpers are therefore kept
    inside the body so the function stays self-contained.

    Raises:
        FileNotFoundError: if an image has no matching XML annotation.
        ValueError: if an annotation contains boxes but no usable ``<size>``.
    """
    import os
    from glob import glob
    from minio import Minio
    from PIL import Image
    import xml.etree.ElementTree as ET
    from sklearn.model_selection import train_test_split

    target_size = 640  # images become target_size x target_size
    dataset_root = '/tmp/PCB_DATASET'
    resized_folder = '/tmp/PCB_Resized'
    split_names = ('train', 'val', 'test')

    ## minio connection (placeholders must be replaced before running)
    minio_client = Minio(
        "<minio_ep>",
        access_key="<minio_accK>",
        secret_key="<minio_secK>",
        secure=False
    )
    minio_bucket = "kubeflow"

    ## get data from minio
    print('Downloading data from minio...')
    for f in minio_client.list_objects(minio_bucket, prefix="datasets/pcb_defect_detection/PCB_DATASET/",recursive=True):
        save_path = f.object_name.replace('datasets/pcb_defect_detection', '/tmp')
        minio_client.fget_object(minio_bucket, f.object_name, save_path)

    ## split data: one list of (image_path, annotation_path) pairs per split
    samples = {name: [] for name in split_names}

    # glob() returns entries in arbitrary filesystem order; sorting makes the
    # seeded split below reproducible between runs.
    for folder in sorted(glob(f'{dataset_root}/images/*')):
        if not os.path.isdir(folder):
            continue  # ignore stray files next to the class folders
        folder_cls = os.path.basename(folder)
        image_files = sorted(glob(f'{folder}/*.jpg'))
        train_files, val_test_files = train_test_split(image_files, test_size=0.2, random_state=42)
        val_files, test_files = train_test_split(val_test_files, test_size=0.5, random_state=42)
        for split, split_files in zip(split_names, (train_files, val_files, test_files)):
            for image_file in split_files:
                stem = os.path.splitext(os.path.basename(image_file))[0]
                annotation_file = f'{dataset_root}/Annotations/{folder_cls}/{stem}.xml'
                if not os.path.exists(annotation_file):
                    raise FileNotFoundError(
                        f'Missing annotation for {image_file}: {annotation_file}')
                samples[split].append((image_file, annotation_file))

    ## save data
    def resize_xml(xml_path, output_path, target_size): # resize annotation
        """Rewrite an annotation so its size and boxes match a square resized image."""
        tree = ET.parse(xml_path)
        root = tree.getroot()

        width = height = None
        for size in root.iter('size'):
            width_node, height_node = size.find('width'), size.find('height')
            width, height = int(width_node.text), int(height_node.text)
            width_node.text = height_node.text = str(target_size)

        boxes = [box for obj in root.iter('object') for box in obj.iter('bndbox')]
        if boxes and not (width and height):
            # Without the original size the boxes cannot be rescaled.
            raise ValueError(f'{xml_path}: missing or zero <size> width/height')

        for box in boxes:
            for tag, extent in (('xmin', width), ('ymin', height),
                                ('xmax', width), ('ymax', height)):
                node = box.find(tag)
                node.text = str(int(int(node.text) * target_size / extent))

        tree.write(output_path)

    split_folders = {name: f'{resized_folder}/{name}' for name in split_names}
    for split_folder in split_folders.values():
        os.makedirs(split_folder, exist_ok=True)

    for split, pairs in samples.items():
        out_dir = split_folders[split]
        for image_path, anno_path in pairs:
            ## resize image (context manager closes the source file handle)
            with Image.open(image_path) as img:
                resized_img = img.resize((target_size, target_size))
                resized_img.save(f'{out_dir}/{os.path.basename(image_path)}')

            ## resize xml
            resize_xml(anno_path, f'{out_dir}/{os.path.basename(anno_path)}', target_size)

    ## convert xml annotation to yolo format
    def convert_xml_to_yolo(xml_path, image_width, image_height, class_mapping):
        """Return one 'class x_center y_center width height' line per known object."""
        tree = ET.parse(xml_path)
        root = tree.getroot()

        labels = []
        for obj in root.findall('object'):
            class_name = obj.find('name').text
            if class_name not in class_mapping:
                continue  # objects of unknown classes are left out of the label file

            class_id = class_mapping[class_name]
            bbox = obj.find('bndbox')
            xmin = float(bbox.find('xmin').text)
            ymin = float(bbox.find('ymin').text)
            xmax = float(bbox.find('xmax').text)
            ymax = float(bbox.find('ymax').text)

            # Coordinates are normalised by the image dimensions.
            x_center = (xmin + xmax) / 2.0 / image_width
            y_center = (ymin + ymax) / 2.0 / image_height
            width = (xmax - xmin) / image_width
            height = (ymax - ymin) / image_height

            labels.append(f"{class_id} {x_center} {y_center} {width} {height}")

        return labels

    def create_yolo_labels(source_folder, output_folder, class_mapping):
        """Write a .txt label file for every .xml annotation in source_folder."""
        for xml_file in os.listdir(source_folder):
            if not xml_file.endswith('.xml'):
                continue
            xml_path = os.path.join(source_folder, xml_file)
            stem = os.path.splitext(xml_file)[0]

            # For the folders used here the replace() is a no-op: images and
            # annotations sit side by side in the same split folder.
            image_path = os.path.join(source_folder.replace('Annotations', 'JPEGImages'), stem + '.jpg')
            with Image.open(image_path) as img:
                image_width, image_height = img.size

            labels = convert_xml_to_yolo(xml_path, image_width, image_height, class_mapping)

            output_path = os.path.join(output_folder, stem + '.txt')
            with open(output_path, 'w') as f:
                f.write('\n'.join(labels))

    # pcb defect class
    class_mapping = {'spurious_copper': 0, 'mouse_bite': 1, 'open_circuit': 2, 'missing_hole': 3, 'spur': 4, 'short': 5}

    for split_folder in split_folders.values():
        create_yolo_labels(split_folder, split_folder, class_mapping)

    def upload_local_directory_to_minio(local_path, bucket_name, minio_path):
        """Recursively upload local_path to bucket_name under the minio_path prefix."""
        assert os.path.isdir(local_path)

        # Without recursive=True, '**' matches a single level; sub-directories
        # are handled by the explicit recursion below.
        for local_file in glob(local_path + '/**'):
            local_file = local_file.replace(os.sep, "/") # Replace \ with / on Windows
            if not os.path.isfile(local_file):
                upload_local_directory_to_minio(
                    local_file, bucket_name, minio_path + "/" + os.path.basename(local_file))
            else:
                remote_path = os.path.join(
                    minio_path, local_file[1 + len(local_path):])
                remote_path = remote_path.replace(
                    os.sep, "/")  # Replace \ with / on Windows
                minio_client.fput_object(bucket_name, remote_path, local_file)

    ## upload train_dataset to minio
    print('Uploading Resized data to minio...')
    upload_local_directory_to_minio(resized_folder, minio_bucket, 'datasets/pcb_defect_detection/PCB_Resized')


def model_building(
    epochs: int = 1,
    batch_size: int = 2
) -> NamedTuple('Output', [('mlpipeline_ui_metadata', 'UI_metadata'),('mlpipeline_metrics', 'Metrics')]):
    """Train, export and evaluate a YOLOv5s PCB defect detector.

    Downloads the resized dataset from MinIO, trains with the YOLOv5 scripts
    found in ``/usr/src/app``, exports the best weights as a TensorFlow
    SavedModel, evaluates on the test split and uploads both the run folder
    and the exported model back to MinIO.

    NOTE: this function is turned into a pipeline step with
    ``create_component_from_func``; imports and helpers are therefore kept
    inside the body so the function stays self-contained.

    Args:
        epochs: Number of training epochs passed to ``train.run``.
        batch_size: Batch size used for training and for evaluation.

    Returns:
        A named tuple of two JSON strings: ``mlpipeline_ui_metadata``
        (markdown summary plus a results table) and ``mlpipeline_metrics``
        (precision and recall).
    """

    import os
    # Set before the YOLOv5 modules are imported below.
    os.environ['COMET_GIT_DIRECTORY'] = '/usr/src/app'
    import json
    import tensorflow as tf
    from glob import glob

    from minio import Minio
    import sys
    sys.path.append('/usr/src/app')  # makes the YOLOv5 train/export/val scripts importable
    import train
    import export
    import val


    minio_client = Minio(
        "<minio_ep>",
        access_key="<minio_accK>",
        secret_key="<minio_secK>",
        secure=False
    )
    minio_bucket = "kubeflow"

    ## get data from minio
    print('Downloading data from minio...')
    for f in minio_client.list_objects(minio_bucket, prefix="datasets/pcb_defect_detection/PCB_Resized/",recursive=True):
        save_path = f.object_name.replace('datasets/pcb_defect_detection', '/tmp')
        minio_client.fget_object(minio_bucket, f.object_name, save_path)

    data_yaml_content = """

train: /tmp/PCB_Resized/train
val: /tmp/PCB_Resized/val
test: /tmp/PCB_Resized/test
nc: 6
names: ['spurious_copper', 'mouse_bite', 'open_circuit', 'missing_hole', 'spur', 'short']
"""
    os.makedirs('/content/yolov5/data', exist_ok=True)
    with open('/content/yolov5/data/data.yaml', 'w') as f:
        f.write(data_yaml_content)
    ## train model
    train.run(
        data='/content/yolov5/data/data.yaml',
        imgsz=640,
        batch_size=batch_size,
        epochs=epochs,
        workers=0,
        cfg='/usr/src/app/models/yolov5s.yaml',
        weights='/content/yolov5/yolov5s.pt',
        optimizer='AdamW',
        name='pcb_defect_detection',
        project='/content/yolov5/runs/')


    ## export model
    exp = export.run(
        data='/content/yolov5/data/data.yaml',
        weights='/content/yolov5/runs/pcb_defect_detection/weights/best.pt',
        device='cpu' , # 0 for gpu
        # Must be a real tuple: ('saved_model') is just a string, which would
        # be iterated character by character.
        include=('saved_model',),
        inplace=True,
        keras=True,
        int8=True,
        nms=True,
        conf_thres=0.4)

    # exp[0] is used as the path of the exported SavedModel directory.
    model = tf.keras.models.load_model(exp[0])
    ## model summary
    stringlist = []
    model.summary(print_fn=lambda x: stringlist.append(x))
    metric_model_summary = "\n".join(stringlist)


    ## get test metrics
    result = val.run(
        data='/content/yolov5/data/data.yaml',
        weights='/content/yolov5/runs/pcb_defect_detection/weights/best.pt',
        batch_size=batch_size,
        project='/content/yolov5/runs/',
        verbose=True,
        task='test'
    )
    # The first element of the validation result holds seven values; the last
    # three are not used here.
    precision, recall ,map50, map50_95, _, _ , _ = result[0]

    ## training history for the table viewer
    # With 'storage': 'inline' the viewer expects the CSV text itself, not a
    # file path. Assumes the first row of results.csv is the column-title row
    # written by the YOLOv5 CSV logger -- verify against the installed version.
    results_csv = '/content/yolov5/runs/pcb_defect_detection/results.csv'
    results_rows = []
    if os.path.isfile(results_csv):
        with open(results_csv) as f:
            for line in f.read().splitlines()[1:]:
                if line.strip():
                    results_rows.append(','.join(cell.strip() for cell in line.split(',')))

    ## metadata
    metadata = {
        "outputs": [
            {
                'storage': 'inline',
                'source': '''# Model Overview
## Model Summary

```
{}
```

## Model Performance

**Mean Precision**: {}
**Mean Recall**: {}
**mAP50**: {}
**mAP50-95**: {}
'''.format(metric_model_summary,precision,recall,map50,map50_95),
                'type': 'markdown',
            },
            {
                'type': 'table',
                'storage': 'inline',
                'format': 'csv',
                'header': ['epoch', 'train/box_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss', 'x/lr0', 'x/lr1', 'x/lr2'],
                'source': '\n'.join(results_rows)
            }
        ]
    }

    metrics = {
      'metrics': [{
          'name': 'Precision',
          'numberValue':  float(precision),
          'format' : ""
        },{
          'name': 'Recall',
          'numberValue':  float(recall),
          'format' : "PERCENTAGE"
        }
        ]}

    def upload_local_directory_to_minio(local_path, bucket_name, minio_path):
        """Recursively upload local_path to bucket_name under the minio_path prefix."""
        assert os.path.isdir(local_path)

        # `glob` is the function imported above (from glob import glob).
        # Without recursive=True, '**' matches a single level; sub-directories
        # are handled by the explicit recursion below.
        for local_file in glob(local_path + '/**'):
            local_file = local_file.replace(os.sep, "/") # Replace \ with / on Windows
            if not os.path.isfile(local_file):
                upload_local_directory_to_minio(
                    local_file, bucket_name, minio_path + "/" + os.path.basename(local_file))
            else:
                remote_path = os.path.join(
                    minio_path, local_file[1 + len(local_path):])
                remote_path = remote_path.replace(
                    os.sep, "/")  # Replace \ with / on Windows
                minio_client.fput_object(bucket_name, remote_path, local_file)

    # save yolov5 train
    upload_local_directory_to_minio("/content/yolov5/runs/pcb_defect_detection",minio_bucket,"yolov5/runs/pcb_defect_detection/1") # 1 for version 1
    # save Model to minio
    upload_local_directory_to_minio(exp[0],minio_bucket,"models/yolov5/1") # 1 for version 1

    print("Saved model to minIO")

    from collections import namedtuple
    output = namedtuple('output', ['mlpipeline_ui_metadata', 'mlpipeline_metrics'])
    return output(json.dumps(metadata),json.dumps(metrics))


# Wrap the two step functions as pipeline components. Both use the same
# YOLOv5 CPU base image; packages_to_install lists the extra pip packages
# each step needs on top of it.
component_split_data = components.create_component_from_func(
    split_data,
    base_image="ultralytics/yolov5:latest-cpu",
    packages_to_install=['scikit-learn','minio'],
)
component_model_building = components.create_component_from_func(
    model_building,
    base_image="ultralytics/yolov5:latest-cpu",
    packages_to_install=['scikit-learn','minio','tensorflow==2.12.0'],
)


@dsl.pipeline(
    name='pcb-defect-detection',
    description='example pipeline for pcb defect detection'
)
def output_test(epochs, batch_size):
    # Pipeline graph: split data -> train/export model -> create SeldonDeployment.
    # The timestamp is taken when this function body runs and is embedded in
    # the deployment name and the resource step name.
    bucket = "kubeflow"
    run_stamp = datetime.now().strftime("%Y%m%d%H%M%S")

    split_task = component_split_data()
    train_task = component_model_building(epochs, batch_size)
    # train_task.set_gpu_limit('1') # gpu request
    train_task.after(split_task)

    # Serving graph: the model URI is the parent folder of the versioned
    # upload made by the training step (models/yolov5/1).
    model_graph = {
        "name": "classifier",
        "implementation": "TENSORFLOW_SERVER",
        "modelUri": f"s3://{bucket}/models/yolov5",
        "envSecretRefName": "seldon-init-container-secret",
    }
    predictor = {
        "name": "predictor",
        "replicas": 1,
        "graph": model_graph,
    }
    manifest = {
        "apiVersion": "machinelearning.seldon.io/v1",
        "kind": "SeldonDeployment",
        "metadata": {
            "name": f"pcb-defect-detection-{run_stamp}",
            "namespace": "kubeflow-user-example-com",
        },
        "spec": {
            "protocol": "seldon",
            "predictors": [predictor],
        },
    }

    deploy_task = dsl.ResourceOp(
        name=f'seldon-deployment-{run_stamp}',
        k8s_resource=manifest,
        action="create",
        attribute_outputs={"name": "{.metadata.name}"},
    )
    deploy_task.after(train_task)

if __name__ == "__main__":
    # Connection settings: replace both placeholders before running.
    kubeflow_gateway_endpoint = "<kubeflow-gateway-endpoint>" # e.g. 172.0.0.1
    authservice_session_cookie = "<authservice_session_cookie>"

    pipeline_host = f"https://{kubeflow_gateway_endpoint}/pipeline"
    session_cookie = f"authservice_session={authservice_session_cookie}"

    client = kfp.Client(
        host=pipeline_host,
        cookies=session_cookie,
        ssl_ca_cert="cert/tls.crt",  # need to store tls.crt before running the pipeline
    )

    # Values for the pipeline parameters of output_test.
    arguments = dict(epochs=4, batch_size=2)

    client.create_run_from_pipeline_func(
        output_test,
        arguments=arguments,
        experiment_name="pcb-defect-detection",
    )