In [6]:
# Recreate an empty `training/` directory; it is later used as the Docker
# build context, so stale files from earlier runs must not remain in it.
!rm -fr training
!mkdir training

In [15]:
%%writefile training/train.py
import time
import argparse
import subprocess
import xgboost as xgb
import dask.array as da
from dask_cuda import LocalCUDACluster
from dask.distributed import Client


def using_quantile_device_dmatrix(client: Client,
                                  model_path: str = '/gcs/vtxdemos-models/model.json'):
    """Train a GPU XGBoost model on synthetic data with Dask and save it.

    Generates a random 100000 x 1000 matrix of 0/1 features and a random
    0/1 label column with NumPy, wraps both in Dask arrays, trains for four
    boosting rounds through ``xgb.dask.train``, prints the elapsed time and
    the evaluation history, and saves the resulting booster.

    NOTE(review): despite its name, this function builds a plain
    ``DaskDMatrix``; no device quantile DMatrix is created here.

    Args:
        client: Dask distributed client connected to the cluster used for
            training.
        model_path: Destination passed to ``Booster.save_model``. Defaults to
            the location that was previously hard-coded.
    """
    import numpy as np

    n_rows, n_cols = 100000, 1000

    X = np.random.choice([0, 1], size=n_rows * n_cols, p=[.3, .7]).reshape(n_rows, n_cols)
    y = np.random.choice([0, 1], size=n_rows, p=[.4, .6]).reshape(n_rows, 1)

    # Split row-wise into blocks of 100 rows so Dask can distribute them.
    X = da.from_array(X, chunks=(100, n_cols))
    y = da.from_array(y, chunks=(100, 1))

    dtrain = xgb.dask.DaskDMatrix(client, X, y)

    # 'objective' used to be listed twice ('reg:squarederror', then
    # 'binary:hinge'); in a dict literal the last entry wins, so only
    # 'binary:hinge' was ever in effect and it is the one kept here.
    # 'verbose_eval' is an argument of train(), not a booster parameter, so
    # it is passed to train() below instead of living in this dict.
    params = {
        'verbosity': 2,
        'learning_rate': 0.1,
        'max_depth': 8,
        'subsample': 0.5,
        'gamma': 0.9,
        'tree_method': 'gpu_hist',
        'objective': 'binary:hinge',
    }

    start_time = time.time()
    output = xgb.dask.train(
        client,
        params,
        dtrain,
        num_boost_round=4,
        evals=[(dtrain, "train")],
        verbose_eval=True,
    )

    print("[INFO]: ------ Training is completed in {} seconds ---".format((time.time() - start_time)))

    history = output['history']
    print('[INFO]: ------ Training evaluation history:', history)

    output['booster'].save_model(model_path)



def get_scheduler_info():
    """Return this host's first IP address and a scheduler address built on it.

    Runs ``hostname --all-ip-addresses`` and takes the first
    whitespace-separated entry of its output.

    Returns:
        A ``(ip, uri)`` tuple where ``uri`` is ``"<ip>:8786"``.
    """
    port = '8786'
    raw_output = subprocess.check_output(['hostname', '--all-ip-addresses'])
    addresses = raw_output.decode('UTF-8').split()
    first_ip = addresses[0]
    return first_ip, ':'.join((first_ip, port))


if __name__ == '__main__':
    # Command-line interface: one string option and two integer options.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model-file',
        type=str,
        default='na',
        help=("GCS or local dir for checkpoints, exports, and summaries.\n"
              "        Use an existing directory to load a trained model, or a new directory\n"
              "        to retrain"),
    )
    parser.add_argument(
        '--num-gpu-per-worker',
        type=int,
        default=1,
        help='num of workers',
    )
    parser.add_argument(
        '--threads-per-worker',
        type=int,
        default=4,
        help='num of threads per worker',
    )

    args = parser.parse_args()

    print("[INFO]: ------ Arguments parsed")
    print(args)

    # Only the IP is needed to bind the cluster; the URI is returned but unused here.
    sched_ip, sched_uri = get_scheduler_info()

    cluster_options = dict(
        ip=sched_ip,
        n_workers=args.num_gpu_per_worker,
        threads_per_worker=args.threads_per_worker,
    )

    # Both context managers are closed in reverse order on exit, exactly as
    # with two nested `with` statements.
    with LocalCUDACluster(**cluster_options) as cluster, Client(cluster) as client:
        print('[INFO]: ------ Calling main function ')
        using_quantile_device_dmatrix(client)

Overwriting training/train.py


In [16]:
%%writefile training/entrypoint.sh
# Container entrypoint: activate the "rapids" conda environment, then run
# train.py with whatever arguments the container was started with.

# The image's Dockerfile sources conda.sh from /opt/conda, so the same
# location is used here (the previous /conda/... path did not match it).
source /opt/conda/etc/profile.d/conda.sh
conda activate rapids

# Report the script that is actually executed.
echo "Running: train.py $*"

# Quote "$@" so each argument is forwarded intact (no word splitting or
# globbing); exec lets python replace the shell and receive signals directly.
exec python train.py "$@"

Overwriting training/entrypoint.sh


In [17]:
%%writefile training/Dockerfile
# Training image: RAPIDS runtime base plus the training script and entrypoint.
FROM rapidsai/rapidsai:cuda11.5-runtime-ubuntu20.04-py3.10

# Install/upgrade gcsfs inside the "rapids" conda environment.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN . /opt/conda/etc/profile.d/conda.sh \
    && conda activate rapids \
    && pip install --no-cache-dir -U gcsfs

WORKDIR /rapids

# COPY (rather than ADD) for plain local files; the trailing slash makes the
# destination unambiguously a directory.
COPY train.py entrypoint.sh /rapids/

ENTRYPOINT ["bash", "entrypoint.sh"]


Overwriting training/Dockerfile


In [27]:
# Build the training image from the `training/` context and push it.
# `docker build` accepts exactly one build-context path, so the stray
# `submit` token has been removed. The tag is v2 to match the image_uri
# that the CustomJob cell runs.
# NOTE(review): this needs a local Docker daemon; where none is available,
# `gcloud builds submit --tag <image> training/` is the Cloud Build equivalent.
!docker build -t gcr.io/vtxdemos/dask-xgboost-train:v2 training/.
!docker push gcr.io/vtxdemos/dask-xgboost-train:v2

/bin/bash: line 1: docker: command not found
/bin/bash: line 1: docker: command not found


In [18]:
from google.cloud import aiplatform as aip

# Select the project and region used by every SDK call that follows.
aip.init(project="vtxdemos", location="us-central1")

# Hardware for each replica: one n1-standard-4 machine with a single T4.
machine_spec = {
    "machine_type": "n1-standard-4",
    "accelerator_type": "NVIDIA_TESLA_T4",
    "accelerator_count": 1,
}

# Container to run; an empty "command" is passed, and the args are handed to
# the container as its command-line arguments.
container_spec = {
    "image_uri": "gcr.io/vtxdemos/dask-xgboost-train:v2",
    "command": [],
    "args": ["--num-gpu-per-worker=1", "--threads-per-worker=4"],
}

# A single worker pool with one replica.
worker_pool_specs = [
    {
        "machine_spec": machine_spec,
        "replica_count": 1,
        "container_spec": container_spec,
    }
]

job = aip.CustomJob(
    display_name="dask-xgboost",
    worker_pool_specs=worker_pool_specs,
    staging_bucket="gs://vtxdemos-staging",
)

# Submit the job; the recorded output shows its state being polled.
job.run()

Creating CustomJob
CustomJob created. Resource name: projects/254356041555/locations/us-central1/customJobs/7206936843307188224
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/254356041555/locations/us-central1/customJobs/7206936843307188224')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7206936843307188224?project=254356041555
CustomJob projects/254356041555/locations/us-central1/customJobs/7206936843307188224 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/254356041555/locations/us-central1/customJobs/7206936843307188224 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/254356041555/locations/us-central1/customJobs/7206936843307188224 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/254356041555/locations/us-central1/customJobs/7206936843307188224 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/254356041555/locations/us-central1/customJobs/7206936