In [248]:
# Notebook-wide configuration. Later cells (image builds, component, pipeline)
# read these names, so they are defined exactly once here.
PROJECT_ID = 'jchavezar-demo'

# Container images built by the `gcloud builds submit` cells.
PREPROCESS_IMAGE_URI = f'gcr.io/{PROJECT_ID}/nnunet-pre:v1'
TRAIN_IMAGE_URI = f'gcr.io/{PROJECT_ID}/nnunet-tra:v1'

# Bucket *name* (no gs:// prefix) that receives the preprocessed data; the
# cells that use it add the gs:// scheme themselves where needed.
DATA_OUTPUT_URI = 'vtx-datasets-public-pre'

In [177]:
# Create the output bucket only if it does not exist yet, so re-running the
# notebook does not fail on an already-created bucket.
!gsutil ls -b gs://$DATA_OUTPUT_URI || gsutil mb -l us-central1 gs://$DATA_OUTPUT_URI

Creating gs://vtx-datasets-public-pre/...


In [178]:
# Start from an empty local directory; the following cells write the
# requirements.txt and Dockerfile for the preprocessing image into it.
!rm -fr data_preprocess_images
!mkdir data_preprocess_images

In [179]:
%%writefile data_preprocess_images/requirements.txt
# Python dependencies installed into the preprocessing image by the
# Dockerfile's `pip install -r requirements.txt` step.
# NOTE(review): nibabel is the only unpinned entry - consider pinning it so
# image rebuilds stay reproducible.
nibabel
joblib==0.16.0
scikit-learn==0.23.2
pynvml==8.0.4
fsspec==0.8.0
scikit-image==0.18.2
tqdm==4.62

Writing data_preprocess_images/requirements.txt


In [180]:
%%writefile data_preprocess_images/Dockerfile
# Base image for the KFP `data_preprocessing` component.
FROM python:3.8

# COPY is sufficient for a plain local file (ADD adds URL/archive handling).
COPY requirements.txt .

# Refresh the package index and install git in ONE layer: with separate RUN
# steps a cached, stale index layer can be paired with a newer install step.
RUN apt-get update \
    && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --upgrade pip
# The component imports `google.cloud.storage` (Client, transfer_manager),
# which ships in google-cloud-storage; install it explicitly instead of
# relying on it arriving as a transitive dependency.
RUN pip install --upgrade google-cloud-storage google-cloud-storage-transfer
RUN pip install -r requirements.txt
RUN pip install ipykernel
RUN git clone https://github.com/jchavezar/utils.git
# NOTE(review): the training image uses `utils/nnUNet` as its working
# directory; confirm that `utils` is the right import root for the component.
WORKDIR utils

Writing data_preprocess_images/Dockerfile


In [181]:
# Build the image from the local data_preprocess_images/ directory with
# Cloud Build and tag it as PREPROCESS_IMAGE_URI.
!gcloud builds submit -t $PREPROCESS_IMAGE_URI data_preprocess_images/.

Creating temporary tarball archive of 2 file(s) totalling 402 bytes before compression.
Uploading tarball of [data_preprocess_images/.] to [gs://jchavezar-demo_cloudbuild/source/1680279796.831216-1012404fe13c4c21991d9de28da9da81.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jchavezar-demo/locations/global/builds/526de72f-0092-469c-b59c-e6bac2d6d07a].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/526de72f-0092-469c-b59c-e6bac2d6d07a?project=569083142710 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "526de72f-0092-469c-b59c-e6bac2d6d07a"

FETCHSOURCE
Fetching storage object: gs://jchavezar-demo_cloudbuild/source/1680279796.831216-1012404fe13c4c21991d9de28da9da81.tgz#1680279797109304
Copying gs://jchavezar-demo_cloudbuild/source/1680279796.831216-1012404fe13c4c21991d9de28da9da81.tgz#1680279797109304...
/ [1 files][  474.0 B/  474.0 B]                                                
Operation c

In [None]:
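# jchavezar/vtx-nnunet:cpu  (scratch note, apparently an image reference; kept as a comment because the bare text is not valid Python)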

In [234]:
from kfp.dsl import component, Dataset, Input

@component(base_image=PREPROCESS_IMAGE_URI)
def data_preprocessing(
    data_output_dir: str,
    project_id: str,
    dataset : str,
    task : str,
    dim : int,
    data : str,
    exec_mode : str,
    results_folder :  str = "/data",
    force : bool = True,
    n_jobs : int = -1
) -> str:
    """Download a raw dataset from GCS, preprocess it, and upload the result.

    Args:
        data_output_dir: Name of the GCS bucket (no ``gs://`` prefix) that
            receives the preprocessed files.
        project_id: Google Cloud project used to create the storage client.
        dataset: Source location as ``<bucket>/<prefix>``, for example
            ``vtx-datasets-public/Task01_BrainTumour``.
        task: Passed through to ``Preprocessor``.
        dim: Passed through to ``Preprocessor``.
        data: Local directory the raw dataset is downloaded into; also passed
            to ``Preprocessor`` as a ``Path``.
        exec_mode: Passed through to ``Preprocessor``.
        results_folder: Local results directory passed to ``Preprocessor``.
        force: Passed through to ``Preprocessor``.
        n_jobs: Passed through to ``Preprocessor``.

    Returns:
        The value returned by ``Preprocessor.run()`` converted to ``str``. It
        is a path local to the component's container; the files themselves are
        uploaded to ``data_output_dir``.

    Raises:
        FileNotFoundError: If no objects match ``dataset``.
        RuntimeError: If any individual download or upload fails.
    """
    # Imports stay inside the function: a KFP lightweight component ships only
    # the function body to the container.
    import os
    from pathlib import Path
    from data_preprocessing.preprocessor import Preprocessor
    from google.cloud.storage import Client, transfer_manager

    storage_client = Client(project=project_id)

    # Split on the first "/" only, so nested prefixes ("bucket/a/b") survive.
    source_bucket_name, _, source_prefix = dataset.partition('/')
    source_bucket = storage_client.bucket(source_bucket_name)

    # Skip zero-byte "folder" placeholder objects; they cannot be written as files.
    blob_names = [
        blob.name
        for blob in storage_client.list_blobs(source_bucket, prefix=source_prefix or None)
        if not blob.name.endswith('/')
    ]
    if not blob_names:
        raise FileNotFoundError(f"No objects found under gs://{dataset}")

    # Download from GCS
    download_results = transfer_manager.download_many_to_path(
        source_bucket,
        blob_names = blob_names,
        destination_directory = data,
        threads=4)
    # The transfer manager reports per-item failures as exception objects in
    # the result list instead of raising, so they must be checked explicitly.
    download_errors = [r for r in download_results if isinstance(r, Exception)]
    if download_errors:
        raise RuntimeError(
            f"{len(download_errors)} of {len(blob_names)} downloads failed; "
            f"first error: {download_errors[0]!r}")

    output_dir = Preprocessor(task, dim, Path(data), Path(results_folder), exec_mode, force, n_jobs).run()

    # Collect regular files only, relative to output_dir. os.listdir() would
    # also return directory names and would miss files in sub-directories.
    filenames = sorted(
        os.path.relpath(os.path.join(root, name), output_dir)
        for root, _dirs, files in os.walk(output_dir)
        for name in files
    )

    # Upload to GCS
    output_bucket = storage_client.bucket(data_output_dir)
    upload_results = transfer_manager.upload_many_from_filenames(
        output_bucket,
        filenames = filenames,
        source_directory = output_dir,
        threads=4)
    upload_errors = [r for r in upload_results if isinstance(r, Exception)]
    if upload_errors:
        raise RuntimeError(
            f"{len(upload_errors)} of {len(filenames)} uploads failed; "
            f"first error: {upload_errors[0]!r}")

    return str(output_dir)

In [198]:
# Scratch check: list the objects stored under the raw dataset prefix.
# Imported here so the cell does not depend on a later cell having run first.
from google.cloud.storage import Client

dataset_uri = "vtx-datasets-public/Task01_BrainTumour"
# Split on the first "/" only so a nested prefix would be kept whole.
bucket_name, prefix = dataset_uri.split('/', 1)

storage_client = Client(project="jchavezar-demo")
bucket = storage_client.bucket(bucket_name)

blobs = storage_client.list_blobs(bucket, prefix=prefix)

for blob in blobs:
    print(blob.name)

Task01_BrainTumour/._dataset.json
Task01_BrainTumour/._imagesTr
Task01_BrainTumour/._imagesTs
Task01_BrainTumour/._labelsTr
Task01_BrainTumour/dataset.json
Task01_BrainTumour/imagesTr/._BRATS_001.nii.gz
Task01_BrainTumour/imagesTr/._BRATS_002.nii.gz
Task01_BrainTumour/imagesTr/._BRATS_004.nii.gz
Task01_BrainTumour/imagesTr/._BRATS_006.nii.gz
Task01_BrainTumour/imagesTr/._BRATS_027.nii.gz
Task01_BrainTumour/imagesTr/._BRATS_028.nii.gz
Task01_BrainTumour/imagesTr/._BRATS_115.nii.gz
Task01_BrainTumour/imagesTr/._BRATS_166.nii.gz
Task01_BrainTumour/imagesTr/._BRATS_169.nii.gz
Task01_BrainTumour/imagesTr/._BRATS_274.nii.gz
Task01_BrainTumour/imagesTr/._BRATS_275.nii.gz
Task01_BrainTumour/imagesTr/BRATS_001.nii.gz
Task01_BrainTumour/imagesTr/BRATS_002.nii.gz
Task01_BrainTumour/imagesTr/BRATS_003.nii.gz
Task01_BrainTumour/imagesTr/BRATS_004.nii.gz
Task01_BrainTumour/imagesTr/BRATS_005.nii.gz
Task01_BrainTumour/imagesTr/BRATS_006.nii.gz
Task01_BrainTumour/imagesTr/BRATS_007.nii.gz
Task01_Brain

In [145]:
import os
from google.cloud.storage import Client, transfer_manager

PROJECT_ID = 'jchavezar-demo'
BUCKET = 'vtx-datasets-public-pre'
SOURCE_DIR = '/home/jupyter/demo/'

# Prefer CLOUD_ML_PROJECT_ID when it is set; fall back to PROJECT_ID so the
# cell does not raise KeyError where that variable is absent.
storage_client = Client(project=os.environ.get("CLOUD_ML_PROJECT_ID", PROJECT_ID))
bucket = storage_client.bucket(BUCKET)

# List the directory that is actually uploaded. os.listdir('') raises
# FileNotFoundError, and the names must be relative to source_directory.
# Sub-directories are skipped because only files can be uploaded.
results = transfer_manager.upload_many_from_filenames(
    bucket,
    filenames = [
        name for name in os.listdir(SOURCE_DIR)
        if os.path.isfile(os.path.join(SOURCE_DIR, name))
    ],
    source_directory = SOURCE_DIR,
    threads=4
)

## Training

In [354]:
# Start from an empty local directory; the following cells write train.py,
# requirements.txt and the Dockerfile for the training image into it.
!rm -fr training
!mkdir training

In [363]:
%%writefile training/train.py
"""Container entrypoint: fetch the preprocessed dataset from GCS, then launch main.py."""
import os
import shlex
import sys
from argparse import ArgumentParser
from pathlib import Path
from subprocess import call

parser = ArgumentParser()
parser.add_argument("--datapath", type=str, required=True, help="Name of the GCS bucket holding the preprocessed data")
parser.add_argument("--task", type=str, default="01", help="Task code")
parser.add_argument("--dim", type=int, required=True, choices=[2, 3], help="Dimension of UNet")
parser.add_argument("--gpus", type=int, default=1, help="Number of gpus")
parser.add_argument("--seed", type=int, default=1, help="Random seed")
parser.add_argument("--learning_rate", type=float, default=3e-4)
parser.add_argument("--fold", type=int, required=True, choices=[0, 1, 2, 3, 4], help="Fold number")
parser.add_argument("--amp", action="store_true", help="Enable automatic mixed precision")
parser.add_argument("--tta", action="store_true", help="Enable test time augmentation")
parser.add_argument("--horovod", action="store_true", help="Launch horovod within script")
parser.add_argument("--bind", action="store_true", help="Bind CPUs for each GPU. Improves throughput for multi-GPU.")
parser.add_argument("--results", type=Path, default=Path("/results"), help="Path to results directory")
parser.add_argument("--logname", type=str, default="train_log.json", help="Name of the dlloger output")


def data_dir_for(args):
    """Return the local directory the dataset is downloaded into.

    Yields ``/data/01_2d_tf2/`` for task "01" and dim 2, the path that was
    previously hard-coded.
    NOTE(review): assumes main.py locates data as ``<task>_<dim>d_tf2`` for
    other task/dim combinations as well - confirm against main.py.
    """
    return f"/data/{args.task}_{args.dim}d_tf2/"


def download_dataset(bucket_name, destination):
    """Download every object of ``bucket_name`` into ``destination``.

    Raises:
        RuntimeError: If any individual download fails.
    """
    # Imported lazily so argument parsing (and --help) works without the
    # Cloud Storage client installed.
    from google.cloud.storage import Client, transfer_manager

    # With the variable unset, project=None lets the client infer the project
    # from its environment instead of raising KeyError here.
    storage_client = Client(project=os.environ.get("CLOUD_ML_PROJECT_ID"))
    bucket = storage_client.bucket(bucket_name)

    # Zero-byte "folder" placeholder objects cannot be written as files.
    blob_names = [blob.name for blob in bucket.list_blobs() if not blob.name.endswith("/")]

    results = transfer_manager.download_many_to_path(
        bucket,
        blob_names = blob_names,
        destination_directory = destination,
        threads=4)
    # Per-item failures come back as exception objects in the result list.
    failures = [r for r in results if isinstance(r, Exception)]
    if failures:
        raise RuntimeError(
            f"{len(failures)} of {len(blob_names)} downloads failed; "
            f"first error: {failures[0]!r}")


def build_command(args):
    """Assemble the shell command line that launches main.py in train mode."""
    single_gpu = args.gpus == 1
    skip = 100 if single_gpu else 150
    path_to_main = "main.py"

    parts = []
    if args.horovod:
        parts.append(f"horovodrun -np {args.gpus}")
    if args.bind:
        parts.append("bindpcie --cpu=exclusive,nosmt")
    parts += [
        f"python {path_to_main} --exec-mode train --deep_supervision --xla --skip-eval {skip}",
        # Free-form strings are quoted because the command runs through a shell.
        f"--task {shlex.quote(args.task)}",
        f"--dim {args.dim}",
        f"--epochs {300 if single_gpu else 600}",
        f"--batch-size {2 if args.dim == 3 else 64}",
        f"--learning_rate {args.learning_rate}",
        f"--fold {args.fold}",
        # Booleans are forwarded as explicit True/False values, unchanged from
        # the original command line.
        f"--amp {args.amp}",
        f"--tta {args.tta}",
        f"--results {shlex.quote(str(args.results))}",
        f"--logname {shlex.quote(args.logname)}",
        f"--gpus {args.gpus}",
        f"--seed {args.seed}",
    ]
    return " ".join(parts)


if __name__ == "__main__":
    args = parser.parse_args()
    download_dataset(args.datapath, data_dir_for(args))
    # Propagate the training exit code so a failed run fails the job instead
    # of being reported as a success.
    sys.exit(call(build_command(args), shell=True))

Overwriting training/train.py


%%writefile training/Dockerfile
# NOTE(review): early draft of the training image. A later cell writes
# training/Dockerfile again (TensorFlow base image), so this version is
# overwritten before the image is built.
FROM python:latest

RUN pip install google-cloud-storage-transfer --upgrade
RUN pip install google-cloud-storage --upgrade
COPY train.py train.py
RUN git clone https://github.com/NVIDIA/DeepLearningExamples.git

# Run the training launcher when the container starts.
ENTRYPOINT ["python", "train.py"]

In [364]:
%%writefile training/requirements.txt
# Python dependencies installed into the training image by the Dockerfile's
# `pip install ... -r requirements.txt` step.
# The first two entries are installed straight from their Git repositories.
# NOTE(review): the git+ entries and nibabel are unpinned - consider pinning
# them so image rebuilds stay reproducible.
git+https://github.com/NVIDIA/dllogger
git+https://github.com/NVIDIA/mlperf-common.git
nibabel
joblib==0.16.0
scikit-learn==0.23.2
pynvml==8.0.4
fsspec==0.8.0
scikit-image==0.18.2
tqdm==4.62

Overwriting training/requirements.txt


In [365]:
%%writefile training/Dockerfile
# Training image: TensorFlow base image plus the training code and launcher.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:22.11-tf2-py3
FROM ${FROM_IMAGE_NAME}

RUN pip install nvidia-pyindex
# COPY is sufficient for a plain local file (ADD adds URL/archive handling).
COPY requirements.txt .
RUN pip install --upgrade pip
RUN pip install --disable-pip-version-check -r requirements.txt
RUN pip install tensorflow-addons --upgrade

# train.py imports google.cloud.storage (Client, transfer_manager).
RUN pip install --upgrade google-cloud-storage-transfer google-cloud-storage
# Shallow clones: only the current tree is needed, not the full history.
RUN git clone --depth 1 https://github.com/NVIDIA/DeepLearningExamples.git

# KEY=value form; the space-separated legacy form takes the rest of the line
# as the value.
ENV OMP_NUM_THREADS=2
ENV TF_CPP_MIN_LOG_LEVEL=3
ENV OMPI_MCA_coll_hcoll_enable=0
ENV HCOLL_ENABLE_MCAST=0

RUN git clone --depth 1 https://github.com/jchavezar/utils.git
WORKDIR utils/nnUNet
# Copy relative to WORKDIR so train.py always lands in the directory the
# ENTRYPOINT runs from, whatever the base image's working directory is.
COPY train.py ./train.py

ENTRYPOINT ["python", "train.py"]

Overwriting training/Dockerfile


In [366]:
# Build the image from the local training/ directory with Cloud Build and
# tag it as TRAIN_IMAGE_URI.
!gcloud builds submit -t $TRAIN_IMAGE_URI training/.

Creating temporary tarball archive of 3 file(s) totalling 3.4 KiB before compression.
Uploading tarball of [training/.] to [gs://jchavezar-demo_cloudbuild/source/1680788500.555481-99d43faf6818474ea62e3380b573fdb8.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/jchavezar-demo/locations/global/builds/c94ce5df-016d-4be5-921c-5b7bfeda3c19].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/c94ce5df-016d-4be5-921c-5b7bfeda3c19?project=569083142710 ].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "c94ce5df-016d-4be5-921c-5b7bfeda3c19"

FETCHSOURCE
Fetching storage object: gs://jchavezar-demo_cloudbuild/source/1680788500.555481-99d43faf6818474ea62e3380b573fdb8.tgz#1680788500807476
Copying gs://jchavezar-demo_cloudbuild/source/1680788500.555481-99d43faf6818474ea62e3380b573fdb8.tgz#1680788500807476...
/ [1 files][  1.7 KiB/  1.7 KiB]                                                
Operation completed over 1 

In [253]:
# Remove the local build-context directories once the images are built.
!rm -fr data_preprocess_images
!rm -fr training

In [367]:
from kfp import dsl
from kfp.dsl import importer
from google_cloud_pipeline_components.v1 import custom_job

# Worker pool for the custom training job: one replica with a single A100.
worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "a2-highgpu-1g",
            "accelerator_type": "NVIDIA_TESLA_A100",
            "accelerator_count": 1
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAIN_IMAGE_URI,
            # No "command" override: the training image's ENTRYPOINT already
            # runs `python train.py`; only its arguments are supplied here.
            "args": [
                f"--datapath={DATA_OUTPUT_URI}",
                "--dim=2",
                "--fold=0"
            ]
        }
    }
]

# The decorator is referenced as `dsl.pipeline` so that the function below,
# which is itself named `pipeline`, does not shadow an imported decorator.
@dsl.pipeline(name='test')
def pipeline():
    """Preprocess the dataset, then run the training job on the result."""
    data_preprocessing_task = data_preprocessing(
        data_output_dir=DATA_OUTPUT_URI,
        project_id=PROJECT_ID,
        dataset="vtx-datasets-public/Task01_BrainTumour",
        task="01",
        dim=2,
        data="/data",
        exec_mode="training",
    )
    # Training reads the bucket that preprocessing writes, so it is ordered
    # explicitly after the preprocessing step.
    training_task = custom_job.CustomTrainingJobOp(
        project=PROJECT_ID,
        display_name="testing-nnunet",
        worker_pool_specs=worker_pool_specs,
        base_output_directory="gs://vtx-models"
    ).after(data_preprocessing_task)
    

In [368]:
# The component cell uses the KFP v2 `kfp.dsl` API; in that SDK the compiler
# lives in the top-level package and `kfp.v2` is a deprecated alias.
from kfp import compiler

# Compile the pipeline function into a job spec file for submission.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='test.json',
)

In [369]:
import google.cloud.aiplatform as aip

# Settings for the run: the compiled spec, where pipeline artifacts are
# stored, and (no) runtime parameter overrides.
pipeline_job_config = dict(
    display_name="test",
    template_path="test.json",
    pipeline_root="gs://vtx-tmp",
    parameter_values={},
)

# Create the job object, then hand it to Vertex AI Pipelines.
job = aip.PipelineJob(**pipeline_job_config)
job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/569083142710/locations/us-central1/pipelineJobs/test-20230406135107
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/569083142710/locations/us-central1/pipelineJobs/test-20230406135107')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/test-20230406135107?project=569083142710
