CoverType is a dataset of forest cartographic variables. This notebook trains a multiclass classification model on that tabular data.

## Set Constants

In [4]:
# Google Cloud project and region used by Vertex AI and Artifact Registry.
PROJECT_ID = 'jchavezar-demo'
REGION = 'us-central1'

# Local folder that holds the Docker build context (Dockerfile + trainer/).
DIR = 'xgboost_custom'

# Cloud Storage locations: job output base directory and SDK staging area.
MODEL_URI = 'gs://vtx-models/xgboost/cover_type'
STAGING_URI = 'gs://vtx-staging/xgboost/cover_type/'

# Training image URI, derived from REGION and PROJECT_ID so that changing
# either constant above keeps the image location consistent.
TRAIN_IMAGE_URI = f'{REGION}-docker.pkg.dev/{PROJECT_ID}/trainings/xgboost-dask-gpu:latest'

## Import Libraries

In [5]:
from google.cloud import aiplatform as aip

## Create Folder Structure

```
xgboost_custom
  |  Dockerfile
  └─── trainer
     |  train.py
     |

```

In [6]:
# Start from a clean slate: remove any previous build context, then recreate
# the folder together with its trainer/ subfolder in one step.
!rm -rf $DIR
!mkdir -p $DIR/trainer

In [13]:
%%writefile $DIR/trainer/train.py
import os
import json
import dask
import argparse
import subprocess
import dask_bigquery
import xgboost as xgb
from google.cloud import storage
from xgboost import dask as dxgb
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import warnings
warnings.filterwarnings(action="ignore")

class Training:
    """Train a multi-class XGBoost model on GPUs with Dask and upload the artifacts.

    Intended call order: ``load_data()`` -> ``model_train()`` -> ``storage_artifacts()``.

    Args:
        project: Google Cloud project id used for the BigQuery read and for
            the Cloud Storage client.
        bq_table_dir: BigQuery table reference in the form ``DATASET.TABLE``.
        num_workers: Number of workers started in the local Dask CUDA cluster.
        threads_per_worker: Number of threads given to each Dask worker.
    """

    def __init__(self, project, bq_table_dir, num_workers, threads_per_worker):
        self.project = project
        self.bq_table_dir = bq_table_dir
        self.num_workers = num_workers
        self.threads_per_worker = threads_per_worker

        print(self.threads_per_worker)

    def load_data(self):
        '''Load data from BigQuery to Dask and split it into train/eval sets.

        Rows containing nulls are dropped, then the data is split 80/20 with a
        fixed seed. Features and labels (column ``Cover_Type``) are stored on
        the instance as ``df_train_features``, ``df_train_labels``,
        ``df_eval_features`` and ``df_eval_labels``.

        Raises:
            ValueError: If ``bq_table_dir`` is not in ``DATASET.TABLE`` form.
        '''
        table_ref = self.bq_table_dir.split('.') if self.bq_table_dir else []
        if len(table_ref) != 2 or not all(table_ref):
            raise ValueError(
                f"bq_table_dir must be in the format [DATASET].[TABLE], got {self.bq_table_dir!r}"
            )
        dataset_id, table_id = table_ref

        # Read from the project this instance was configured with instead of a
        # hard-coded project id.
        ddf = dask_bigquery.read_gbq(
            project_id=self.project,
            dataset_id=dataset_id,
            table_id=table_id
        ).dropna()

        print("[INFO] ------ Splitting dataset")
        df_train, df_eval = ddf.random_split([0.8, 0.2], random_state=123)
        self.df_train_features = df_train.drop('Cover_Type', axis=1)
        self.df_eval_features = df_eval.drop('Cover_Type', axis=1)
        # Plain column selection: unlike pop(), this leaves the split frames intact.
        self.df_train_labels = df_train['Cover_Type']
        self.df_eval_labels = df_eval['Cover_Type']

    def model_train(self):
        """Train the booster on a local Dask CUDA cluster and write results to /tmp.

        Requires ``load_data()`` to have been called first. Writes the booster
        truncated at the best early-stopping round to ``/tmp/model.json`` and
        the validation metric history to ``/tmp/metadata.json``.
        """
        print("[INFO] ------ Creating dask cluster")
        # Bind the scheduler to the first address reported by `hostname`.
        scheduler_ip = subprocess.check_output(['hostname','--all-ip-addresses'])
        scheduler_ip = scheduler_ip.decode('UTF-8').split()[0]

        params = {
            "verbosity": 2,
            "tree_method": "gpu_hist",
            "objective": "multi:softprob",
            "eval_metric": ["mlogloss"],
            "learning_rate": 0.1,
            "gamma": 0.9,
            "subsample": 0.5,
            "max_depth": 9,
            "num_class": 8
        }

        with LocalCUDACluster(
            ip=scheduler_ip,
            n_workers=self.num_workers,
            threads_per_worker=self.threads_per_worker
        ) as cluster:
            with Client(cluster) as client:
                print('[INFO]: ------ Calling main function ')

                print("[INFO]: ------ Dataset for dask")
                dtrain = dxgb.DaskDeviceQuantileDMatrix(client, self.df_train_features, self.df_train_labels)
                dvalid = dxgb.DaskDeviceQuantileDMatrix(client, self.df_eval_features, self.df_eval_labels)

                print("[INFO]: ------ Training...")
                output = dxgb.train(
                    client,
                    params,
                    dtrain,
                    num_boost_round=10,
                    evals=[(dvalid, "valid1")],
                    early_stopping_rounds=5
                )
                model = output["booster"]
                # best_iteration is a 0-based round index, so the slice end must
                # be best_iteration + 1 to keep the best round itself.
                best_model = model[: model.best_iteration + 1]
                print(f"[INFO] ------ Best model: {best_model}")
                best_model.save_model("/tmp/model.json")
                model_metrics = output["history"]["valid1"]
                with open("/tmp/metadata.json", "w") as outfile:
                    json.dump(model_metrics, outfile)

    @staticmethod
    def _split_gcs_uri(uri):
        """Split ``gs://bucket/some/prefix/`` into ``('bucket', 'some/prefix')``.

        Leading and trailing slashes are stripped from the prefix so that
        object names built from it never contain an empty path segment.

        Raises:
            ValueError: If ``uri`` is not a ``gs://`` URI with a bucket name.
        """
        parts = uri.split('/', 3)
        if not uri.startswith('gs://') or len(parts) < 3 or not parts[2]:
            raise ValueError(f"Expected a gs://bucket/path URI, got {uri!r}")
        prefix = parts[3].strip('/') if len(parts) > 3 else ''
        return parts[2], prefix

    def storage_artifacts(self):
        """Upload ``/tmp/model.json`` and ``/tmp/metadata.json`` to ``AIP_MODEL_DIR``.

        Bucket and object prefix are both taken from the ``AIP_MODEL_DIR``
        environment variable; the bucket is no longer overridden by a literal.

        Raises:
            KeyError: If ``AIP_MODEL_DIR`` is not set.
            ValueError: If ``AIP_MODEL_DIR`` is not a ``gs://`` URI.
        """
        print('[INFO] ------ Storing Artifacts on Google Cloud Storage')
        bucket_name, prefix = self._split_gcs_uri(os.environ['AIP_MODEL_DIR'])
        storage_client = storage.Client(project=self.project)
        bucket = storage_client.bucket(bucket_name)

        for filename in ["model.json", "metadata.json"]:
            # Skip the prefix when AIP_MODEL_DIR points at the bucket root.
            blob_name = f'{prefix}/{filename}' if prefix else filename
            bucket.blob(blob_name).upload_from_filename(f'/tmp/{filename}')
            
if __name__ == '__main__':
    # Command-line entry point: parse the job arguments, then run the three
    # pipeline stages in order (load -> train -> upload).
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--project',
        type = str,
        # .get() so that an unset CLOUD_ML_PROJECT_ID does not raise KeyError
        # while the parser is being built, even when --project is supplied.
        default = os.environ.get('CLOUD_ML_PROJECT_ID'),
        help = 'This is the tenant or the Google Cloud project id name'
    )
    parser.add_argument(
        "--bq_table_dir",
        type = str,
        required = True,
        help = "BigQuery Dataset URI in the format [DATASET].[TABLE]"
    )
    parser.add_argument(
        '--num_workers', type=int, help='num of workers',
        default=1
    )
    parser.add_argument(
        '--threads_per_worker', type=int, help='num of threads per worker',
        default=1
    )

    args = parser.parse_args()
    if not args.project:
        # Neither the flag nor the environment variable supplied a project.
        parser.error('--project is required when CLOUD_ML_PROJECT_ID is not set')

    training = Training(args.project, args.bq_table_dir, args.num_workers, args.threads_per_worker)
    training.load_data()
    training.model_train()
    training.storage_artifacts()

Overwriting xgboost_custom/trainer/train.py


In [14]:
%%writefile $DIR/Dockerfile
# Base image taken from the rapidsai-nightly repository (tag: 22.12, CUDA 11.2, Ubuntu 20.04, Python 3.8).
FROM rapidsai/rapidsai-nightly:22.12-cuda11.2-base-ubuntu20.04-py3.8

# Extra Python packages installed on top of the base image; train.py imports
# google.cloud.storage and dask_bigquery.
RUN pip install google-cloud-storage && \
    pip install gcsfs && \
    pip install pandas && \
    pip install dask-bigquery

# Copy the training package into the image.
COPY trainer trainer/

# Run the training script; job arguments are appended to this command.
ENTRYPOINT ["python", "trainer/train.py"]

Overwriting xgboost_custom/Dockerfile


## Create Image and Push It to Google Artifact Registry

In [15]:
# Build the training image from the Dockerfile in DIR and tag it with TRAIN_IMAGE_URI.
!docker build --tag {TRAIN_IMAGE_URI} {DIR}

Sending build context to Docker daemon  8.192kB
Step 1/4 : FROM rapidsai/rapidsai-nightly:22.12-cuda11.2-base-ubuntu20.04-py3.8
 ---> fbadf85eb205
Step 2/4 : RUN pip install google-cloud-storage   && pip install gcsfs   && pip install pandas   && pip install dask-bigquery
 ---> Using cache
 ---> c82daf08c0ca
Step 3/4 : COPY trainer trainer/
 ---> 4be0875be0a2
Step 4/4 : ENTRYPOINT ["python", "trainer/train.py"]
 ---> Running in a423f7b02c9e
Removing intermediate container a423f7b02c9e
 ---> b0bdbc428ddf
Successfully built b0bdbc428ddf
Successfully tagged us-central1-docker.pkg.dev/jchavezar-demo/trainings/xgboost-dask-gpu:latest


In [16]:
# Push the tagged training image to the repository named in TRAIN_IMAGE_URI.
!docker push {TRAIN_IMAGE_URI}

The push refers to repository [us-central1-docker.pkg.dev/jchavezar-demo/trainings/xgboost-dask-gpu]

[1B1d2178e3: Preparing 
[1B7871a528: Preparing 
[1B10f8ab46: Preparing 
[1Ba60296d0: Preparing 
[1B04ce2dbe: Preparing 
[1B8d70af49: Preparing 
[1B57cc060a: Preparing 
[1Bf22f7d2b: Preparing 
[1Be8b67dbb: Preparing 
[1B4e28b8f7: Preparing 
[1Bfdd7be17: Preparing 
[1B070c6f18: Preparing 
[13Bd2178e3: Pushed lready exists 6kB[7A[2K[3A[2K[13A[2Klatest: digest: sha256:2a22b9447394758bbea7904034263853e7cb157465743640bd3af3a035ae24ff size: 3064


## Create Vertex Training from Code [CustomJob]

In [None]:
aip.init(project=PROJECT_ID, location=REGION)

# num_gpus is used both as the accelerator count of the machine and as the
# --num_workers argument passed to the training container.
num_gpus = 4

worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "a2-highgpu-4g",
            "accelerator_type": "NVIDIA_TESLA_A100",
            "accelerator_count": num_gpus,
        },
        # Integer replica count (was the string "1").
        "replica_count": 1,
        "container_spec": {
            "image_uri": TRAIN_IMAGE_URI,
            "args": [
                "--bq_table_dir", "vertex_datasets_public.cover_type_4Mrows",
                "--num_workers", str(num_gpus),
                "--threads_per_worker", "4",
            ],
        },
    },
]

job = aip.CustomJob(
    display_name='05cb-bqdask-xgboost-customjob',
    worker_pool_specs=worker_pool_specs,
    base_output_dir=MODEL_URI,
    staging_bucket=STAGING_URI,
)

# CustomJob.run() returns None, so its result is not bound to a name; the
# trained artifacts are uploaded to Cloud Storage by the training script itself.
job.run()

Creating CustomJob
CustomJob created. Resource name: projects/569083142710/locations/us-central1/customJobs/7843736869597609984
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/569083142710/locations/us-central1/customJobs/7843736869597609984')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/7843736869597609984?project=569083142710
