CoverType is a dataset with forest cartographic variables, this is a multiclassification tabular job.

In [None]:
PROJECT_ID = 'jchavezar-demo'
REGION = 'us-central1'
DIR = 'xgboost_custom'
#DATASET_URI = 'gs://vtx-datasets-public/ecommerce/datasets.csv'
MODEL_URI = 'gs://vtx-models/xgboost/cover_type'
STAGING_URI = 'gs://vtx-staging/xgboost/cover_type/'
TRAIN_IMAGE_URI = 'us-central1-docker.pkg.dev/jchavezar-demo/trainings/xgboost-dask-gpu:latest'
#PREDICTION_IMAGE_URI = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-9:latest'

In [49]:
from google.cloud import aiplatform as aip

In [50]:
!rm -fr $DIR
!mkdir $DIR
!mkdir $DIR/trainer

In [167]:
%%writefile $DIR/trainer/train.py
import os
import json
import dask
import argparse
import subprocess
import dask_bigquery
import xgboost as xgb
from google.cloud import storage
from xgboost import dask as dxgb
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import warnings
warnings.filterwarnings(action="ignore")

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--project',
        type = str,
        default = os.environ['CLOUD_ML_PROJECT_ID'],
        help = 'This is the tenant or the Google Cloud project id name'
    )
    parser.add_argument(
        "--bq_table_dir",
        type = str,
        help = "BigQuery Dataset URI in the format [DATASET].[TABLE]"
    )
    parser.add_argument(
        '--num_workers', type=int, help='num of workers',
        default=2
    )
    parser.add_argument(
        '--threads_per_worker', type=int, help='num of threads per worker',
        default=4
    )
    return parser.parse_args()

def load_data(dataset: str):
    '''Load data from BigQuery to Dask'''
    _ = dataset.split('.')
    
    ddf = dask_bigquery.read_gbq(
        project_id='jchavezar-demo',
        dataset_id=_[0],
        table_id=_[1]
    ).dropna()
    
    print(f"[INFO] ------ Splitting dataset")
    df_train, df_eval = ddf.random_split([0.8, 0.2], random_state=123)
    df_train_features = df_train.drop('Cover_Type', axis=1)
    df_eval_features = df_eval.drop('Cover_Type', axis=1)
    df_train_labels = df_train.pop('Cover_Type')
    df_eval_labels = df_eval.pop('Cover_Type')
    
    return df_train_features, df_eval_features, df_train_labels, df_eval_labels
    
def model_train(
    args,
    df_train_features: dask.dataframe, 
    df_eval_features: dask.dataframe, 
    df_train_labels: dask.dataframe, 
    df_eval_labels: dask.dataframe
):
    print("[INFO] ------ Creating dask cluster")
    scheduler_ip =  subprocess.check_output(['hostname','--all-ip-addresses'])
    scheduler_ip = scheduler_ip.decode('UTF-8').split()[0]
    
    with LocalCUDACluster(
        ip=scheduler_ip,
        n_workers=args.num_workers, 
        threads_per_worker=args.threads_per_worker
    ) as cluster:
        with Client(cluster) as client:
            print('[INFO]: ------ Calling main function ')
            
            print("[INFO] ------ Dataset for dask")
            dtrain = dxgb.DaskDeviceQuantileDMatrix(client, df_train_features, df_train_labels)
            dvalid = dxgb.DaskDeviceQuantileDMatrix(client, df_eval_features, df_eval_labels)
            
            output = xgb.dask.train(
                client,
                {
                    "verbosity": 2, 
                    "tree_method": "gpu_hist", 
                    "objective": "multi:softprob",
                    "eval_metric": ["mlogloss"],
                    "learning_rate": 0.1,
                    "gamma": 0.9,
                    "subsample": 0.5,
                    "max_depth": 9,
                    "num_class": 8
                },
                dtrain,
                num_boost_round=10,
                evals=[(dvalid, "valid1")],
                early_stopping_rounds=5
            )

        model = output["booster"]
        best_model = output["booster"][: model.best_iteration]
        best_model.save('/tmp/model.json')
        model_metrics = output["history"]["valid1"]
        with open("/tmp/metadata.json", "w") as outfile:
            json.dump(model_metrics, outfile)
            
def store_artifacts(args, model_file, model_metrics_files):
    print('[INFO] ------ Storing Artifacts on Google Cloud Storage')
    artifacts = [model_file, model_metrics_files]
    bucket = os.environ['AIP_MODEL_DIR'].split('/')[2]
    blob_name = '/'.join(os.environ['AIP_MODEL_DIR'].split('/')[3:])
    print(args.project)
    storage_client = storage.Client(project=args.project)
    bucket = storage_client.bucket(bucket)

    for i in artifacts:
        blob = bucket.blob(f'cover_type/{i}')
        blob.upload_from_filename(f'/tmp/{i}')
        
def main():
    # set constants
    model_file = "model.json"
    model_metrics_file = "metadata.json"
    args = get_args()
    df_train_features, df_eval_features, df_train_labels, df_eval_labels = load_data(args.bq_table_dir)
    model_train(args, df_train_features, df_eval_features, df_train_labels, df_eval_labels)
    store_artifacts(args, model_file, model_metrics_file)

if __name__ == "__main__":
    main()

Overwriting xgboost_custom/trainer/train.py


In [168]:
%%writefile $DIR/Dockerfile
FROM rapidsai/rapidsai-nightly:22.12-cuda11.2-base-ubuntu20.04-py3.8

RUN pip install google-cloud-storage \
  && pip install gcsfs \
  && pip install pandas \
  && pip install dask-bigquery

COPY trainer trainer/

ENTRYPOINT ["python", "trainer/train.py"]

Overwriting xgboost_custom/Dockerfile


## Crete Image and Push it to Google Artifacts Repository

In [169]:
!docker build -t $TRAIN_IMAGE_URI $DIR/.

Sending build context to Docker daemon   7.68kB
Step 1/4 : FROM rapidsai/rapidsai-nightly:22.12-cuda11.2-base-ubuntu20.04-py3.8
 ---> fbadf85eb205
Step 2/4 : RUN pip install google-cloud-storage   && pip install gcsfs   && pip install pandas   && pip install dask-bigquery
 ---> Using cache
 ---> c82daf08c0ca
Step 3/4 : COPY trainer trainer/
 ---> c4031350fda1
Step 4/4 : ENTRYPOINT ["python", "trainer/train.py"]
 ---> Running in 1d828d83a5b2
Removing intermediate container 1d828d83a5b2
 ---> 5828321237af
Successfully built 5828321237af
Successfully tagged us-central1-docker.pkg.dev/jchavezar-demo/trainings/xgboost-dask-gpu:latest


In [170]:
!docker push $TRAIN_IMAGE_URI

The push refers to repository [us-central1-docker.pkg.dev/jchavezar-demo/trainings/xgboost-dask-gpu]

[1B55002f17: Preparing 
[1B7871a528: Preparing 
[1B10f8ab46: Preparing 
[1Ba60296d0: Preparing 
[1B04ce2dbe: Preparing 
[1B8d70af49: Preparing 
[1B57cc060a: Preparing 
[1Bf22f7d2b: Preparing 
[1Be8b67dbb: Preparing 
[1B4e28b8f7: Preparing 
[1Bfdd7be17: Preparing 
[1B070c6f18: Preparing 
[13B5002f17: Pushed lready exists 2kB[10A[2K[5A[2K[3A[2K[1A[2K[13A[2Klatest: digest: sha256:b6b7dc92fb4c5676951d468859843b50e1b1b98f2b1c2c4df413ddb92437e339 size: 3064


## Create Vertex Training from Code [CustomJob]

In [171]:
aip.init(
    project=PROJECT_ID,
    location=REGION)

num_gpus = 4

worker_pool_specs = [
    {
        "machine_spec": {
            "machine_type": "n1-standard-32",
            "accelerator_type": "NVIDIA_TESLA_T4",
            "accelerator_count": 4
        },
        "replica_count": "1",
        "container_spec": {
            "image_uri": TRAIN_IMAGE_URI,
            "args": [
                "--bq_table_dir", "vertex_datasets_public.cover_type_4Mrows",
                "--num_workers", f"{num_gpus}",
                "--threads_per_worker", "4" 
            ]
        }
    },
    ]

job = aip.CustomJob(
    display_name = '05cb-bqdask-xgboost-customjob',
    worker_pool_specs = worker_pool_specs,
    base_output_dir = MODEL_URI,
    staging_bucket = STAGING_URI
)

model = job.run(
)

Creating CustomJob
CustomJob created. Resource name: projects/569083142710/locations/us-central1/customJobs/8907149331610468352
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/569083142710/locations/us-central1/customJobs/8907149331610468352')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/8907149331610468352?project=569083142710
CustomJob projects/569083142710/locations/us-central1/customJobs/8907149331610468352 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/8907149331610468352 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/8907149331610468352 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/8907149331610468352 current state:
JobState.JOB_STATE_PENDING
CustomJob projects/569083142710/locations/us-central1/customJobs/8907149

RuntimeError: Job failed with:
code: 3
message: "The replica workerpool0-0 exited with a non-zero status of 1. To find out more about why your job exited please check the logs: https://console.cloud.google.com/logs/viewer?project=569083142710&resource=ml_job%2Fjob_id%2F8907149331610468352&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%228907149331610468352%22"


In [None]:
            "command": [
                "python",
                "trainer/train.py"
            ],

In [150]:
def test(data: str):
    print(args)
    print(data)
    
def main():
    args = 'x'
    test('testing')

main()
    

1
testing
