# Train Model Job

In [None]:
from kfp.components import (
    create_component_from_func,
    InputPath,
    OutputPath
)
from typing import Dict

BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"


def train_model_job(
        dataset_directory: InputPath(str),
        train_specification: str,
        train_parameters: Dict[str, str],
        model_dir: OutputPath(str),
        train_mount: str = "/train",
        model_name: str = "my-model",
        base_image: str = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest",
        node_selector: str = "",
        pvc_name: str = "",
        pvc_size: str = "10Gi",
        cpus: str = "",
        gpus: int = 0,
        memory: str = "",
        cluster_configuration_secret: str = "",
):
    '''
    Trains a model. Once trained, the model is persisted to model_dir.

            Parameters:
                    dataset_directory: Path to the directory with training data.
                    train_specification: Training command as generated from a Python function using kfp.components.func_to_component_text.
                    train_parameters: Dictionary mapping formal to actual parameters for the training specification.
                    model_dir: Target path where the model will be stored.
                    train_mount: Optional mounting point for training data of an existing PVC. Example: "/train".
                    model_name: Optional name of the model. Must be unique for the targeted namespace and conform to Kubernetes naming conventions. Example: my-model.
                    base_image: Optional base image for model training. Example: quay.io/ibm/kubeflow-notebook-image-ppc64le:latest.
                    node_selector: Optional node selector for worker nodes. Example: nvidia.com/gpu.product: "Tesla-V100-SXM2-32GB".
                    pvc_name: Optional name of an existing persistent volume claim (pvc). If given, this pvc is mounted into the training job. Example: "music-genre-classification-j4ssf-training-pvc".
                    pvc_size: Optional size of the storage during model training. Storage is mounted into the Job based on a persistent volume claim of the given size. Example: 10Gi.
                    cpus: Optional CPU limit for the job. Leave empty for cluster defaults (typically no limit). Example: "1000m".
                    gpus: Optional number of GPUs for the job. Example: 2.
                    memory: Optional memory limit for the job. Leave empty for cluster defaults (typically no limit). Example: "1Gi".
                    cluster_configuration_secret: Optional secret name configuring a (remote) Kubernetes cluster to run the job in and the backing MinIO object store. All secret's data values are optional and appropriate defaults are chosen if not present. The secret may provide a suitable kubernetes bearer token, the associated namespace, a host, etc. Example: "remote-power-cluster".
    '''
    from datetime import datetime
    import errno
    import json
    import kfp
    from kubernetes import (
        client,
        config,
        utils,
        watch
    )
    import logging
    import os
    import shutil
    import sys
    import yaml

    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format='%(levelname)s %(asctime)s: %(message)s'
    )
    logger = logging.getLogger()

    ###########################################################################
    # Helper Functions
    ###########################################################################

    def establish_local_cluster_connection():
        """Return an API client for the cluster this component itself runs in.

        The in-cluster configuration is loaded first so that the client
        created afterwards picks it up as its default configuration.
        """
        config.load_incluster_config()
        in_cluster_client = client.ApiClient()
        return in_cluster_client

    def get_cluster_configuration(api_client, cluster_configuration_secret):
        """Assemble the cluster and MinIO configuration for the training job.

        Starts from built-in defaults, then overrides the MinIO credentials
        with the ``mlpipeline-minio-artifact`` secret (if it can be read) and
        finally overrides every known key that is present in the secret named
        by ``cluster_configuration_secret``. Both secrets are read from the
        current namespace. Reading secrets is best effort: any API error is
        logged and the values gathered so far are kept.

        Parameters:
            api_client: Kubernetes API client used to read the secrets.
            cluster_configuration_secret: Name of the optional configuration
                secret; an empty string skips that lookup.

        Returns:
            Dictionary with the keys ``access-mode``, ``minio-accesskey``,
            ``minio-bucket``, ``minio-job-folder``, ``minio-secretkey``,
            ``minio-url``, ``remote-host``, ``remote-namespace`` and
            ``remote-token``.
        """
        import base64
        from kubernetes.client.rest import ApiException

        def decode(secret, key):
            # Secret data is delivered base64 encoded by the Kubernetes API.
            return base64.b64decode(secret.data[key]).decode('utf-8')

        def update_with_secret(secret, dictionary):
            for key in dictionary:
                if key in secret.data:
                    dictionary[key] = decode(secret, key)

        cluster_configuration = {
            "access-mode": "ReadWriteMany",
            "minio-accesskey": "minio",
            "minio-bucket": "mlpipeline",
            "minio-job-folder": "jobs",
            "minio-secretkey": "minio123",
            "minio-url": "http://minio-service.kubeflow:9000",
            "remote-host": "",
            "remote-namespace": "",
            "remote-token": "",
        }
        secrets_api = client.CoreV1Api(api_client)

        try:
            default_minio_secret = secrets_api.read_namespaced_secret(
                "mlpipeline-minio-artifact",
                get_current_namespace()
            )

            if default_minio_secret.data is None:
                logger.info("MinIO secret (mlpipeline-minio-artifact) includes no data - progressing with default values.")
            else:
                logger.info("Found default MinIO secret (mlpipeline-minio-artifact) - updating cluster configuration accordingly.")
                # Only take over keys that actually exist; a partially filled
                # secret must not abort the component with a KeyError.
                for config_key, secret_key in (
                    ("minio-accesskey", "accesskey"),
                    ("minio-secretkey", "secretkey"),
                ):
                    if secret_key in default_minio_secret.data:
                        cluster_configuration[config_key] = decode(default_minio_secret, secret_key)
        except ApiException as e:
            if e.status == 404:
                logger.info("Found no default MinIO secret (mlpipeline-minio-artifact) - progressing with default values.")
            else:
                # e.g. 403 (missing RBAC permissions): keep going, but make
                # the reason visible instead of swallowing it silently.
                logger.warning(f"Could not read default MinIO secret (mlpipeline-minio-artifact): {e.status} {e.reason} - progressing with default values.")

        if cluster_configuration_secret == "":
            logger.info("No cluster configuration secret specified - progressing with default values.")
            return cluster_configuration

        try:
            secret = secrets_api.read_namespaced_secret(
                cluster_configuration_secret,
                get_current_namespace()
            )
            if secret.data is None:
                logger.info(f"Cluster configuration secret ({cluster_configuration_secret}) includes no data - progressing with default values.")
            else:
                logger.info(f"Found cluster configuration secret ({cluster_configuration_secret}) - updating cluster configuration accordingly.")
                update_with_secret(secret, cluster_configuration)
        except ApiException as e:
            if e.status == 404:
                logger.info(f"Found no cluster configuration secret ({cluster_configuration_secret}) - progressing with default values.")
            else:
                logger.warning(f"Could not read cluster configuration secret ({cluster_configuration_secret}): {e.status} {e.reason} - progressing with default values.")

        return cluster_configuration

    def establish_training_cluster_connection(local_api_client, cluster_configuration):
        """Pick the API client of the cluster that will run the training job.

        Parameters:
            local_api_client: Client of the cluster this component runs in.
            cluster_configuration: Configuration dictionary providing the
                ``remote-host`` and ``remote-token`` entries.

        Returns:
            Tuple ``(api_client, is_remote)``. ``is_remote`` is True when a
            remote cluster is used or when the enclosing component's
            ``train_mount`` path does not exist locally, i.e. whenever data
            has to be synchronized through MinIO.
        """
        if (
            cluster_configuration["remote-host"] != "" and
            cluster_configuration["remote-token"] != ""
        ):
            # see: https://github.com/kubernetes-client/python/blob/6d4587e18064288d031ed9bbf5ab5b8245460b3c/examples/remote_cluster.py
            logger.info("Remote host and token found. Using remote cluster configuration...")
            remote_configuration = client.Configuration()
            remote_configuration.host = cluster_configuration["remote-host"]
            # NOTE(review): TLS certificate verification is switched off for
            # the remote cluster connection.
            remote_configuration.verify_ssl = False
            remote_configuration.api_key = {
                "authorization": "Bearer " + cluster_configuration["remote-token"]
            }
            return (client.ApiClient(remote_configuration), True)

        # No (complete) remote configuration: explain how to provide one.
        setup_hints = (
            "Remote cluster not configured. Using in-cluster configuration...",
            "Note: assign the name of a secret to the 'cluster_configuration_secret' pipeline argument and add the secret to your cluster.",
            "Example secret:",
            "---",
            "apiVersion: v1",
            "kind: Secret",
            "metadata:",
            "  name: my-remote-cluster",
            "stringData:",
            "  access-mode: ReadWriteOnce",
            "  minio-accesskey: minio",
            "  minio-bucket: mlpipeline",
            "  minio-job-folder: jobs",
            "  minio-secretkey: minio123",
            "  minio-url: http://minio-service.kubeflow:9000",
            "  remote-host: https://istio-ingressgateway-istio-system.apps.mydomain.ai:6443",
            "  remote-namespace: default",
            "  remote-token: eyJh...",
            "---",
            "Where you get the remote-token from your remote cluster as described here:",
            "https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#without-kubectl-proxy",
        )
        for hint in setup_hints:
            logger.info(hint)

        # Without a local mount the data has to travel through MinIO, which
        # is handled exactly like a remote run.
        needs_minio_sync = not os.path.exists(train_mount)
        if needs_minio_sync:
            logger.warning(f"No local mount to {train_mount} found. Therefore, switching to remote data synchronization mode via MinIO. This will work but is slower compared to local mounts. Consider adding a mount to '{train_mount}' for this component by using a PVC inside your pipeline.")

        return (local_api_client, needs_minio_sync)

    def clone_path(source, target):
        """Copy a file or a directory tree from ``source`` to ``target``.

        A directory is copied recursively. If ``target`` already exists as a
        directory, the source tree is merged into it (existing files with the
        same name are overwritten) instead of failing. If ``source`` turns out
        to be a plain file it is copied with ``shutil.copy``; when ``target``
        is a directory the file is placed inside it.

        Parameters:
            source: Path of the file or directory to copy.
            target: Destination path.

        Raises:
            OSError: For any copy problem other than "source is not a
                directory" (e.g. a missing source).
        """
        def merge_tree(source_dir, target_dir):
            # shutil.copytree refuses to write into an existing directory
            # (dirs_exist_ok requires Python 3.8+), so merge entry by entry.
            # Listing first makes a non-directory source fail with ENOTDIR
            # before anything is created.
            entries = os.listdir(source_dir)
            os.makedirs(target_dir, exist_ok=True)
            for entry in entries:
                source_entry = os.path.join(source_dir, entry)
                target_entry = os.path.join(target_dir, entry)
                if os.path.isdir(source_entry):
                    merge_tree(source_entry, target_entry)
                else:
                    shutil.copy2(source_entry, target_entry)

        try:
            logger.info(f"Cloning source path {source} to {target} of training job...")
            if os.path.isdir(target):
                merge_tree(source, target)
            else:
                shutil.copytree(source, target)
            logger.info("Cloning finished. Target path contents:")
            logger.info(os.listdir(target))
        except OSError as e:
            # Source is a single file rather than a directory.
            if e.errno in (errno.ENOTDIR, errno.EINVAL):
                shutil.copy(source, target)
            else:
                raise

    def sync_with_minio(
        cluster_configuration: dict,
        inputs: dict,
        job_name: str,
        is_upload: bool,
        remove_minio_files: bool = False
    ):
        """Transfer paths between the local file system and MinIO as tar.gz archives.

        This function is called directly and is also converted into a
        container command via ``kfp.components.func_to_component_text``, so it
        must stay self-contained: every import and helper lives inside it.

        Parameters:
            cluster_configuration: Dictionary (or its JSON string) providing
                ``minio-url``, ``minio-bucket`` and ``minio-job-folder`` and,
                optionally, ``minio-accesskey``/``minio-secretkey``. Without
                the latter two the credentials are read from the environment
                variables ``MINIO_USER`` and ``MINIO_PASS``.
            inputs: Sequence of ``(source, target)`` pairs (or its JSON
                string). On upload ``source`` is archived and uploaded; on
                download the archive named after ``source`` is extracted
                into ``target``.
            job_name: Name of the job; part of the object key in MinIO.
            is_upload: True (or "True") to upload, otherwise download.
            remove_minio_files: True (or "True") to delete each object from
                MinIO after it has been downloaded.

        Raises:
            Exception: If no MinIO credentials are available.
        """
        import importlib
        import json
        import logging
        import os
        import subprocess
        import sys
        import tarfile

        def install(package):
            # pip offers no supported in-process API; running it as a module
            # of the current interpreter is the documented way.
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            importlib.invalidate_caches()

        try:
            import boto3
        except ImportError:
            install("boto3")
            import boto3
        import botocore
        from botocore.client import Config

        logging.basicConfig(
            stream=sys.stdout,
            level=logging.INFO,
            format='%(levelname)s %(asctime)s: %(message)s'
        )
        logger = logging.getLogger()

        def as_bool(value):
            # Values arrive as strings when this function runs as a
            # generated container command.
            if isinstance(value, str):
                return value == "True"
            return value

        def establish_minio_connection(cluster_configuration):
            if ("minio-accesskey" in cluster_configuration) and ("minio-secretkey" in cluster_configuration):
                minio_user = cluster_configuration["minio-accesskey"]
                minio_pass = cluster_configuration["minio-secretkey"]
            else:
                minio_user = os.getenv('MINIO_USER')
                minio_pass = os.getenv('MINIO_PASS')

            # os.getenv yields None (not "") for unset variables, so test for
            # any kind of missing value.
            if not minio_user or not minio_pass:
                err = "Environment variables MINIO_USER and MINIO_PASS need externally to be provided to this component using k8s_secret_key_to_env!"
                logger.error(err)
                raise Exception(err)

            return boto3.session.Session().resource(
                service_name="s3",
                endpoint_url=cluster_configuration["minio-url"],
                aws_access_key_id=minio_user,
                aws_secret_access_key=minio_pass,
                config=Config(signature_version="s3v4"),
            )

        def path_to_tarfilename(pathname):
            return f"{pathname.replace(os.sep, '-')}.tar.gz"

        def make_tarfile(output_filename, source_dir):
            with tarfile.open(output_filename, "w:gz") as tar:
                tar.add(source_dir, arcname='.')

        # see: https://stackoverflow.com/a/47565719/2625096
        def bucket_exists(minio_client, bucket):
            try:
                minio_client.meta.client.head_bucket(Bucket=bucket.name)
                return True
            except botocore.exceptions.ClientError as e:
                # Error codes are strings and not necessarily numeric.
                error_code = str(e.response['Error']['Code'])
                if error_code == "403":
                    # Forbidden Access -> Private Bucket
                    return True
                if error_code in ("404", "NoSuchBucket"):
                    return False
                # Anything else is a real problem; do not mistake it for a
                # missing bucket.
                raise

        def upload_to_minio(file, upload_bucket, job_folder, job_name, minio_client):
            bucket = minio_client.Bucket(upload_bucket)

            if not bucket_exists(minio_client, bucket):
                minio_client.create_bucket(Bucket=bucket.name)

            bucket.upload_file(
                file,
                f"{job_folder}/{job_name}/{file}"
            )

        def download_from_minio(file, upload_bucket, job_folder, job_name, minio_client, remove_minio_file):
            bucket = minio_client.Bucket(upload_bucket)
            key = f"{job_folder}/{job_name}/{file}"

            bucket.download_file(key, file)

            if remove_minio_file:
                bucket.Object(key).delete()

        def extract_tarfile(tarfile_name, target):
            # NOTE(review): extractall trusts the archive's member paths; the
            # archives handled here are the ones produced by make_tarfile.
            with tarfile.open(tarfile_name, "r:gz") as tar_gz_ref:
                tar_gz_ref.extractall(target)

        if (isinstance(cluster_configuration, str)):
            cluster_configuration = json.loads(cluster_configuration)

        if (isinstance(inputs, str)):
            inputs = json.loads(inputs)

        is_upload = as_bool(is_upload)
        remove_minio_files = as_bool(remove_minio_files)

        logger.info("Establishing MinIO connection...")
        minio_client = establish_minio_connection(cluster_configuration)

        for (source, target) in inputs:
            tarfilename = path_to_tarfilename(source)

            if (is_upload):
                logger.info(f"Tar.gz input {source} into {tarfilename}...")
                make_tarfile(tarfilename, source)

                logger.info(f'Uploading {tarfilename} to {cluster_configuration["minio-bucket"]}/{cluster_configuration["minio-job-folder"]}/{job_name}/{tarfilename}...')
                upload_to_minio(tarfilename, cluster_configuration["minio-bucket"], cluster_configuration["minio-job-folder"], job_name, minio_client)
            else:
                logger.info(f'Downloading {cluster_configuration["minio-bucket"]}/{cluster_configuration["minio-job-folder"]}/{job_name}/{tarfilename} to {tarfilename}...')
                download_from_minio(tarfilename, cluster_configuration["minio-bucket"], cluster_configuration["minio-job-folder"], job_name, minio_client, remove_minio_files)

                logger.info(f'Extracting {tarfilename} to {target}...')
                extract_tarfile(tarfilename, target)

                logger.info('Result:')
                logger.info(os.listdir(target))

    def generate_unique_job_name(model_name: str):
        """Return ``job-<model_name>-<timestamp>``.

        The timestamp is the current local time formatted as
        ``YYYYmmddHHMMSS`` (second resolution).
        """
        timestamp = f"{datetime.today():%Y%m%d%H%M%S}"
        return f"job-{model_name}-{timestamp}"

    def get_current_namespace():
        """Return the content of the pod's service account namespace file."""
        namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"
        with open(namespace_file) as handle:
            current_namespace = handle.read()
        return current_namespace

    def initialize_namespace(namespace: str):
        """Resolve the namespace to use and its YAML metadata line.

        An empty ``namespace`` falls back to the current namespace.

        Returns:
            Tuple ``(namespace, "namespace: <namespace>")``.
        """
        effective_namespace = get_current_namespace() if namespace == "" else namespace
        return (effective_namespace, f"namespace: {effective_namespace}")

    def initialize_nodeselector(node_selector: str):
        """Wrap a node selector entry into a ``nodeSelector:`` YAML fragment.

        An empty selector is returned unchanged. Otherwise the entry is put
        on its own line, indented by eight spaces, below ``nodeSelector:``.
        """
        if node_selector == "":
            return node_selector
        return f"nodeSelector:\n        {node_selector}"

    def initialize_init_container(
        base_image: str,
        cluster_configuration: Dict[str, str],
        inputs: Dict[str, str],
        is_remote: bool,
        job_name: str,
        minio_secret: str,
        mount_path: str,
    ):
        if(not is_remote):
            return ""

        command_specification = kfp.components.func_to_component_text(
            func=sync_with_minio
        )

        # inner components loose type information as needed by lists/dicts
        # -> cluster_configuration & inputs need to be a string (using json)
        cluster_configuration_json = json.dumps({
            "minio-bucket": cluster_configuration["minio-bucket"],
            "minio-job-folder": cluster_configuration["minio-job-folder"],
            "minio-url": cluster_configuration["minio-url"],
        })
        inputs_json = json.dumps(inputs)
        parameters = {
            "cluster_configuration": cluster_configuration_json,
            "inputs": inputs_json,
            "job_name": job_name,
            "is_upload": "False",
        }

        command, _, _ = initialize_command(
            command_specification,
            parameters
        )

        init_container = f"""initContainers:
          - name: init-inputs
            image: {base_image}
            command: {command}
            volumeMounts:
            - mountPath: {mount_path}
              name: training
            env:
            - name: MINIO_USER
              valueFrom:
                secretKeyRef:
                  name: {minio_secret}
                  key: accesskey
                  optional: false
            - name: MINIO_PASS
              valueFrom:
                secretKeyRef:
                  name: {minio_secret}
                  key: secretkey
                  optional: false
"""
        return init_container

    def initialize_command(
        specification: str,
        parameters: Dict[str, str],
        path_parameters: Dict[str, str] = None,
        mount_path: str = "/tmp"
    ):
        """Turn a component specification into a concrete container command.

        The specification's ``command`` is extended by its ``args`` with all
        placeholders resolved:

        * ``inputValue``: replaced by ``parameters[name]``.
        * ``inputPath``/``outputPath``: ``parameters[name]`` names an entry of
          ``path_parameters``; the argument becomes ``mount_path`` + that
          path, and the pair is recorded in ``inputs``/``outputs``.
        * ``if``/``isPresent``: if the parameter is given, the ``then``
          arguments of the specification are resolved the same way. This
          keeps the exact flag spelling of the specification. Without a
          ``then`` clause, ``--<name> <value>`` is emitted. An ``else``
          clause is not evaluated.
        * plain values are passed through.

        Parameters:
            specification: Component specification as YAML text.
            parameters: Mapping of formal parameter names to actual values.
            path_parameters: Mapping of path parameter names to local paths.
            mount_path: Prefix under which paths are visible inside the job.

        Returns:
            Tuple ``(command_json, inputs, outputs)`` where ``command_json``
            is the JSON encoded argument list, ``inputs`` holds
            ``(local_path, mounted_path)`` pairs and ``outputs`` holds
            ``(mounted_path, local_path)`` pairs.

        Raises:
            Exception: If a required parameter or a referenced path is missing.
        """
        if path_parameters is None:
            path_parameters = {}

        component_yaml = yaml.safe_load(specification)
        container_yaml = component_yaml["implementation"]["container"]
        command = container_yaml["command"]
        args = container_yaml.get("args") or []

        actual_args = []
        inputs = []
        outputs = []

        def fail(message):
            logger.error(message)
            raise Exception(message)

        def require_parameter(key):
            if key not in parameters:
                fail(f"Required parameter '{key}' missing in component input!")
            return parameters[key]

        def mounted_path(key, kind):
            # Returns (local path, path as seen inside the job).
            path_key = require_parameter(key)
            if path_key not in path_parameters:
                fail(f"{kind} '{path_key}' unavailable in training component!")
            local_path = path_parameters[path_key]
            return local_path, f"{mount_path}{local_path}"

        def resolve(arg):
            if not isinstance(arg, dict):
                # required parameter (key)
                actual_args.append(arg)
            elif "inputValue" in arg:
                # required parameter (value)
                actual_args.append(require_parameter(arg["inputValue"]))
            elif "if" in arg:
                # optional parameter
                key = arg["if"]["cond"]["isPresent"]
                if key not in parameters:
                    return
                then_args = arg["if"].get("then")
                if then_args is None:
                    actual_args.append(f"--{key}")
                    actual_args.append(parameters[key])
                    return
                if not isinstance(then_args, list):
                    then_args = [then_args]
                for nested_arg in then_args:
                    resolve(nested_arg)
            elif "inputPath" in arg:
                # required InputPath
                local_path, mount = mounted_path(arg["inputPath"], "InputPath")
                inputs.append((local_path, mount))
                actual_args.append(mount)
            elif "outputPath" in arg:
                # required OutputPath
                local_path, mount = mounted_path(arg["outputPath"], "OutputPath")
                outputs.append((mount, local_path))
                actual_args.append(mount)

        for arg in args:
            resolve(arg)

        command_with_initialized_args = json.dumps(command + actual_args)

        return command_with_initialized_args, inputs, outputs

    def initialize_fetch_command(
        cluster_configuration,
        job_name: str,
        outputs: Dict[str, str],
    ):
        """Build the container command that uploads the job outputs to MinIO.

        The command runs ``sync_with_minio`` in upload mode for ``outputs``
        and is returned as produced by ``initialize_command``.
        """
        sync_specification = kfp.components.func_to_component_text(
            func=sync_with_minio
        )

        # inner components lose type information as needed by lists/dicts
        # -> cluster_configuration & inputs need to be a string (using json)
        minio_settings = {
            setting: cluster_configuration[setting]
            for setting in ("minio-bucket", "minio-job-folder", "minio-url")
        }
        upload_parameters = {
            "cluster_configuration": json.dumps(minio_settings),
            "inputs": json.dumps(outputs),
            "job_name": job_name,
            "is_upload": "True",
        }

        fetch_command, _, _ = initialize_command(
            sync_specification,
            upload_parameters
        )
        return fetch_command

    def create_pvc_spec(pvc_name, namespace_spec, access_mode, pvc_size):
        pvc_spec = f"""apiVersion: batch/v1
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {pvc_name}
  {namespace_spec}
spec:
  accessModes:
  - {access_mode}
  resources:
    requests:
      storage: {pvc_size}
"""
        return yaml.safe_load(pvc_spec)

    def create_minio_secret_spec(cluster_configuration, minio_secret, namespace_spec):
        minio_secret_spec = f"""apiVersion: v1
kind: Secret
metadata:
  name: {minio_secret}
  {namespace_spec}
stringData:
  accesskey: {cluster_configuration["minio-accesskey"]}
  secretkey: {cluster_configuration["minio-secretkey"]}
"""
        return yaml.safe_load(minio_secret_spec)

    def create_train_job_spec(
        job_name,
        namespace_spec,
        node_selector,
        base_image,
        train_command,
        train_mount,
        cpus,
        memory,
        gpus,
        init_container,
        pvc_name
    ):
        if cpus:
            cpus = f"cpu: {cpus}"
        else:
            cpus = ""

        if memory:
            memory = f"memory: {memory}"
        else:
            memory = ""

        if gpus:
            gpus = f"nvidia.com/gpu: {gpus}"
        else:
            gpus = ""

        job_spec = f"""apiVersion: batch/v1
kind: Job
metadata:
  name: {job_name}
  {namespace_spec}
spec:
  template:
    metadata:
      annotations:
        sidecar.istio.io/inject: "false"
    spec:
      {node_selector}
      containers:
        - name: training-container
          image: {base_image}
          command: {train_command}
          volumeMounts:
            - mountPath: {train_mount}
              name: training
          restartPolicy: Never
          resources:
            limits:
              {cpus}
              {memory}
              {gpus}
      {init_container}
      volumes:
        - name: training
          persistentVolumeClaim:
            claimName: {pvc_name}
      restartPolicy: Never
"""
        return yaml.safe_load(job_spec)

    def create_fetch_job_spec(
        job_name,
        namespace_spec,
        base_image,
        fetch_command,
        train_mount,
        minio_secret,
        pvc_name
    ):
        job_spec = f"""apiVersion: batch/v1
kind: Job
metadata:
  name: {job_name}
  {namespace_spec}
spec:
  template:
    metadata:
      annotations:
        sidecar.istio.io/inject: "false"
    spec:
      containers:
        - name: training-container
          image: {base_image}
          command: {fetch_command}
          volumeMounts:
            - mountPath: {train_mount}
              name: training
          restartPolicy: Never
          env:
          - name: MINIO_USER
            valueFrom:
              secretKeyRef:
                name: {minio_secret}
                key: accesskey
                optional: false
          - name: MINIO_PASS
            valueFrom:
              secretKeyRef:
                name: {minio_secret}
                key: secretkey
                optional: false
      volumes:
        - name: training
          persistentVolumeClaim:
            claimName: {pvc_name}
      restartPolicy: Never
"""
        return yaml.safe_load(job_spec)

    def submit_and_monitor_job(api_client, job_spec, namespace, additional_job_resources=[]):
        """Create a job, wait for it to finish and print the logs of its pod.

        Uses ``batch_api`` and ``core_api`` of the enclosing function for
        watching, so both must be assigned before this function is called.

        Parameters:
            api_client: API client used to create the resources.
            job_spec: Job specification (dictionary).
            namespace: Namespace in which jobs and pods are watched.
            additional_job_resources: Further resource specifications. Each
                gets an owner reference to the created job before creation.

        Raises:
            Exception: If the watched pod fails or is deleted, or if the job
                reports failure while no pod is active.
        """
        objects = utils.create_from_yaml(
            api_client,
            yaml_objects=[job_spec]
        )
        # Name and uid of the created job are needed for the owner references
        # and for the label selector below.
        job_name = objects[0][0].metadata.name
        job_uid = objects[0][0].metadata.uid

        logger.info("Creating additional job resource...")
        if additional_job_resources:
            for resource in additional_job_resources:
                # Tie the resource to the job via an owner reference.
                resource['metadata']['ownerReferences'] = [
                    {
                        "apiVersion": "batch/v1",
                        "kind": "Job",
                        "name": job_name,
                        "uid": job_uid,
                    }
                ]
            utils.create_from_yaml(
                api_client,
                yaml_objects=additional_job_resources
            )

        logger.info("Waiting for job to succeed...")
        job_watch = watch.Watch()
        for job_event in job_watch.stream(
            batch_api.list_namespaced_job,
            namespace=namespace,
            label_selector=f"job-name={job_name}",
            timeout_seconds=0
        ):
            job = job_event['object']
            if job.status.active and not job.status.failed:
                logger.info("Monitoring pods of job...")

                # See https://stackoverflow.com/questions/65938572/kubernetes-python-client-equivalent-of-kubectl-wait-for-command
                pod_watch = watch.Watch()
                for pod_event in pod_watch.stream(
                    func=core_api.list_namespaced_pod,
                    namespace=namespace,
                    label_selector=f"controller-uid={job.metadata.labels['controller-uid']}",
                    timeout_seconds=0
                ):
                    pod = pod_event["object"]
                    pod_name = pod.metadata.name

                    logger.info(f"Pod {pod_name} status: {pod.status.phase}")
                    # Logs are only requested once the pod has left its
                    # earlier phases.
                    if pod.status.phase == "Running" or pod.status.phase == "Succeeded" or pod.status.phase == "Failed":
                        logger.info("==============================================================================")
                        logger.info("==============================================================================")
                        logger.info(f"=== Streaming logs of pod {pod_name}...")
                        logger.info("==============================================================================")
                        logger.info("==============================================================================")

                        # follow=True keeps the stream open while the pod
                        # produces output.
                        log_watch = watch.Watch()
                        for log_event in log_watch.stream(
                            core_api.read_namespaced_pod_log,
                            name=pod_name,
                            namespace=namespace,
                            follow=True,
                            _return_http_data_only=True,
                            _preload_content=False
                        ):
                            print(log_event)
                        logger.info("==============================================================================")
                        logger.info("==============================================================================")

                        pod_watch.stop()

                        # The phase checked here is the one reported by the
                        # event that started the log streaming.
                        if pod.status.phase == "Failed":
                            err = "Job failed while executing."
                            logger.error(err)
                            raise Exception(err)
                        break
                    if pod_event["type"] == "DELETED":
                        err = "Pod was deleted while we where waiting for it to start."
                        logger.error(err)
                        raise Exception(err)
            # Evaluate the job status of the current job event.
            if job.status.succeeded:
                job_watch.stop()
                logger.info("Job finished.")
                break

            if not job.status.active and job.status.failed:
                job_watch.stop()
                logger.error("Job failed!")
                raise Exception("Job failed!")

    ###########################################################################
    # Main Workflow
    ###########################################################################

    # Overall flow: connect -> derive names and commands -> stage inputs ->
    # run the training job -> collect outputs -> delete the job(s).
    logger.info("Establishing local cluster connection...")
    local_api_client = establish_local_cluster_connection()

    logger.info("Receiving training cluster configuration...")
    cluster_configuration = get_cluster_configuration(
        local_api_client,
        cluster_configuration_secret
    )

    logger.info("Establishing training cluster connection...")
    api_client, is_remote = establish_training_cluster_connection(
        local_api_client,
        cluster_configuration
    )
    # Both API objects are also used by submit_and_monitor_job.
    batch_api = client.BatchV1Api(api_client)
    core_api = client.CoreV1Api(api_client)

    logger.info("Initializing resources...")
    job_name = generate_unique_job_name(model_name)
    job_minio_secret = f"{job_name}-minio-secret"
    namespace, namespace_spec = initialize_namespace(cluster_configuration["remote-namespace"])
    if is_remote or not pvc_name:
        if not is_remote:
            # Local mode creates no PVC, so the job can only start if an
            # existing claim is referenced.
            logger.warning(f"No pvc_name given although inputs are shared via the local mount {train_mount}. The job will reference a PVC that is not created by this component.")
        # Remote mode always works on a fresh, job-specific PVC.
        pvc_name = f"{job_name}-pvc"
    else:
        # Local mode: mount the existing PVC named by the caller, i.e. the
        # one that backs the local train mount.
        logger.info(f"Using existing PVC '{pvc_name}' for the job...")
    node_selector = initialize_nodeselector(node_selector)

    path_parameters = {
        "dataset_directory": dataset_directory,
        "model_dir": model_dir
    }
    train_command, inputs, outputs = initialize_command(
        train_specification,
        train_parameters,
        path_parameters,
        train_mount
    )

    init_container = initialize_init_container(
        base_image,
        cluster_configuration,
        inputs,
        is_remote,
        job_name,
        job_minio_secret,
        train_mount,
    )

    logger.info("=======================================")
    logger.info("Derived configurations")
    logger.info("=======================================")
    logger.info(f"job_name: {job_name}")
    logger.info(f"namespace: {namespace}")
    logger.info(f"is_remote: {is_remote}")
    logger.info(f"minio_url: {cluster_configuration['minio-url']}")
    logger.info(f"job_minio_secret: {job_minio_secret}")
    logger.info("inputs (input paths send to job):")
    for source, target in inputs:
        logger.info(f"- {source} -> {cluster_configuration['minio-bucket']}/{cluster_configuration['minio-job-folder']}/{job_name}/{target}")
    logger.info("outputs (output paths returning from job):")
    for source, target in outputs:
        logger.info(f"- {target} <- {cluster_configuration['minio-bucket']}/{cluster_configuration['minio-job-folder']}/{job_name}/{source}")
    logger.info(f"train_command: {train_command}")
    logger.info("=======================================")

    additional_job_resources = []

    if (is_remote):
        logger.info("Using MinIO to sync data with a new remote PVC for the job...")
        sync_with_minio(cluster_configuration, inputs, job_name, is_upload=True)
        additional_job_resources.append(
            create_pvc_spec(
                pvc_name,
                namespace_spec,
                cluster_configuration["access-mode"],
                pvc_size
            )
        )
        additional_job_resources.append(
            create_minio_secret_spec(
                cluster_configuration,
                job_minio_secret,
                namespace_spec
            )
        )
    else:
        logger.info(f"Pushing inputs to local {train_mount} mount as shared with job environment...")
        for (source, target) in inputs:
            clone_path(source, target)

    logger.info("Creating train job specification...")
    train_job_spec = create_train_job_spec(
        job_name,
        namespace_spec,
        node_selector,
        base_image,
        train_command,
        train_mount,
        cpus,
        memory,
        gpus,
        init_container,
        pvc_name
    )

    logger.info(f"Starting train job '{namespace}.{job_name}'...")
    submit_and_monitor_job(
        api_client,
        train_job_spec,
        namespace,
        additional_job_resources,
    )

    logger.info("Receiving training outputs...")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    if (is_remote):
        # A second job uploads the outputs from the PVC to MinIO; afterwards
        # they are downloaded from MinIO into this component.
        fetch_command = initialize_fetch_command(
            cluster_configuration,
            job_name,
            outputs
        )
        fetch_job_name = f"{job_name}-fetch"

        logger.info("Creating fetch job specification...")
        fetch_job_spec = create_fetch_job_spec(
            fetch_job_name,
            namespace_spec,
            base_image,
            fetch_command,
            train_mount,
            job_minio_secret,
            pvc_name
        )

        logger.info(f"Starting fetch job '{namespace}.{fetch_job_name}'...")
        submit_and_monitor_job(api_client, fetch_job_spec, namespace)

        logger.info("Fetching output data from MinIO & deleting it afterwards...")
        sync_with_minio(
            cluster_configuration,
            outputs,
            job_name,
            is_upload=False,
            remove_minio_files=True
        )

        logger.info(f"Deleting Job {fetch_job_name}...")
        batch_api.delete_namespaced_job(fetch_job_name, namespace)
    else:
        logger.info(f"Fetching outputs to local {train_mount} mount as shared with job environment...")
        for (source, target) in outputs:
            clone_path(source, target)

    logger.info(f"Deleting Job {job_name}...")
    batch_api.delete_namespaced_job(job_name, namespace)

    logger.info("Finished.")


# Turn train_model_job into a pipeline component that runs on BASE_IMAGE;
# 'component.yaml' is passed as the output component file.
train_model_job_comp = create_component_from_func(
    func=train_model_job,
    output_component_file='component.yaml',
    base_image=BASE_IMAGE
)