# Using built-in XGBoost with AI Platform Training


This notebook demonstrates how to use the AI Platform Training built-in XGBoost algorithm. You will train a multi-class classification model that predicts the type of forest cover from cartographic data. The [dataset](https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-on-gcp/master/datasets/covertype/README.md) used in the lab is based on the **Covertype Data Set** from the UCI Machine Learning Repository.



In [1]:
import json
import os
import numpy as np
import pandas as pd
import pickle
import uuid
import time
import tempfile

from googleapiclient import discovery
from googleapiclient import errors

## Configure environment settings

Set location paths, connection strings, and other environment settings. Make sure to update `REGION` and `ARTIFACT_STORE` with the settings reflecting your lab environment.

- `REGION` - the compute region for AI Platform Training and Prediction
- `ARTIFACT_STORE` - the GCS bucket used for storing data and output from AI Platform Training.

In [2]:
REGION = 'us-central1'
ARTIFACT_STORE = 'gs://mlops-dev-workspace/xgboost-demo'

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
DATA_ROOT='{}/data'.format(ARTIFACT_STORE)
JOB_DIR_ROOT='{}/jobs'.format(ARTIFACT_STORE)

ORIG_DATASET = 'gs://workshop-datasets/covertype/small/dataset.csv'
TRAINING_DATASET = '{}/covertype_preprocessed/dataset.csv'.format(DATA_ROOT)

## Prepare the dataset for the built-in XGBoost

In [3]:
# Read the original dataset straight from its GCS location and display it.
df = pd.read_csv(filepath_or_buffer=ORIG_DATASET)
df

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3142,183,9,648,101,757,223,247,157,1871,Commanche,C7757,1
1,2156,18,28,0,0,1207,187,170,107,960,Cache,C6102,3
2,1967,124,16,60,9,124,245,227,105,451,Cache,C2704,2
3,3237,305,15,663,19,3593,178,231,193,1260,Commanche,C7201,0
4,2981,221,18,150,14,4584,195,254,191,1822,Rawah,C7745,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2710,245,10,175,41,417,200,248,185,335,Rawah,C7746,1
99996,2854,99,9,60,12,2581,235,228,123,2160,Rawah,C7745,1
99997,2936,64,7,285,23,4808,227,226,133,2804,Rawah,C7745,0
99998,2602,98,7,379,74,691,232,230,130,1445,Commanche,C2703,5


### Convert numeric features to floats

In [4]:
# The first ten columns are the ones cast to float64; the remaining columns
# keep their original dtypes.
numeric_feature_indexes = slice(0, 10)
num_features_type_map = dict.fromkeys(df.columns[numeric_feature_indexes], 'float64')

# astype returns a new frame, so `df` itself is left untouched.
df_training = df.astype(num_features_type_map)
df_training

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3142.0,183.0,9.0,648.0,101.0,757.0,223.0,247.0,157.0,1871.0,Commanche,C7757,1
1,2156.0,18.0,28.0,0.0,0.0,1207.0,187.0,170.0,107.0,960.0,Cache,C6102,3
2,1967.0,124.0,16.0,60.0,9.0,124.0,245.0,227.0,105.0,451.0,Cache,C2704,2
3,3237.0,305.0,15.0,663.0,19.0,3593.0,178.0,231.0,193.0,1260.0,Commanche,C7201,0
4,2981.0,221.0,18.0,150.0,14.0,4584.0,195.0,254.0,191.0,1822.0,Rawah,C7745,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2710.0,245.0,10.0,175.0,41.0,417.0,200.0,248.0,185.0,335.0,Rawah,C7746,1
99996,2854.0,99.0,9.0,60.0,12.0,2581.0,235.0,228.0,123.0,2160.0,Rawah,C7745,1
99997,2936.0,64.0,7.0,285.0,23.0,4808.0,227.0,226.0,133.0,2804.0,Rawah,C7745,0
99998,2602.0,98.0,7.0,379.0,74.0,691.0,232.0,230.0,130.0,1445.0,Commanche,C2703,5


### Move the target column to the first position

In [5]:
# Build the new column order: take 'Cover_Type' out of its current position
# and put it at the front, keeping the other columns in their original order.
columns = df.columns.tolist()
columns.remove('Cover_Type')
columns.insert(0, 'Cover_Type')

# Reorder the training frame to follow the new column order.
df_training = df_training.reindex(columns=columns)
df_training

Unnamed: 0,Cover_Type,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type
0,1,3142.0,183.0,9.0,648.0,101.0,757.0,223.0,247.0,157.0,1871.0,Commanche,C7757
1,3,2156.0,18.0,28.0,0.0,0.0,1207.0,187.0,170.0,107.0,960.0,Cache,C6102
2,2,1967.0,124.0,16.0,60.0,9.0,124.0,245.0,227.0,105.0,451.0,Cache,C2704
3,0,3237.0,305.0,15.0,663.0,19.0,3593.0,178.0,231.0,193.0,1260.0,Commanche,C7201
4,0,2981.0,221.0,18.0,150.0,14.0,4584.0,195.0,254.0,191.0,1822.0,Rawah,C7745
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1,2710.0,245.0,10.0,175.0,41.0,417.0,200.0,248.0,185.0,335.0,Rawah,C7746
99996,1,2854.0,99.0,9.0,60.0,12.0,2581.0,235.0,228.0,123.0,2160.0,Rawah,C7745
99997,0,2936.0,64.0,7.0,285.0,23.0,4808.0,227.0,226.0,133.0,2804.0,Rawah,C7745
99998,5,2602.0,98.0,7.0,379.0,74.0,691.0,232.0,230.0,130.0,1445.0,Commanche,C2703


In [6]:
# Write the preprocessed frame without the header row and without the index
# column, so the file contains data rows only.
df_training.to_csv(path_or_buf=TRAINING_DATASET, index=False, header=False)

In [7]:
# Print bytes 0-297 of the written file as a quick check of its layout
# (target first, no header, no index column).
!gsutil cat -r 0-297 {TRAINING_DATASET}

1,3142.0,183.0,9.0,648.0,101.0,757.0,223.0,247.0,157.0,1871.0,Commanche,C7757
3,2156.0,18.0,28.0,0.0,0.0,1207.0,187.0,170.0,107.0,960.0,Cache,C6102
2,1967.0,124.0,16.0,60.0,9.0,124.0,245.0,227.0,105.0,451.0,Cache,C2704
0,3237.0,305.0,15.0,663.0,19.0,3593.0,178.0,231.0,193.0,1260.0,Commanche,C7201


## Configure and submit the hyperparameter tuning job

### Create the hyperparameter configuration file. 


The file below configures AI Platform hyperparameter tuning to run up to 12 trials, with up to three trials running in parallel, and to choose from three discrete values of `max_depth` and from the linear range between 0.2 and 0.4 for `eta`.

In [8]:
# Local file name for the hyperparameter tuning configuration.
HPTUNING_CONFIG = "hptuning_config.yaml"

In [9]:
%%writefile {HPTUNING_CONFIG}

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Hyperparameter tuning settings; this file is passed to the training job
# submission through the --config flag.
trainingInput:
  hyperparameters:
    # Tuning objective: minimize the metric reported under the `merror` tag.
    goal: MINIMIZE
    # Run at most 12 trials in total, with at most 3 running at the same time.
    maxTrials: 12
    maxParallelTrials: 3
    hyperparameterMetricTag: merror
    enableTrialEarlyStopping: TRUE 
    params:
    # max_depth is chosen from the three listed values.
    - parameterName: max_depth
      type: DISCRETE
      discreteValues: [
          8,
          10,
          12
          ]
    # eta is searched over the range 0.2 to 0.4 on a linear scale.
    - parameterName: eta
      type: DOUBLE
      minValue:  0.2
      maxValue:  0.4
      scaleType: UNIT_LINEAR_SCALE

Overwriting hptuning_config.yaml


### Start the hyperparameter tuning job.

Use the `gcloud` command to start the hyperparameter tuning job.

In [10]:
# Container image of the built-in algorithm used for the training job.
IMAGE_URI = 'gcr.io/cloud-ml-algos/boosted_trees:latest'

# A timestamp suffix gives every submission its own job name and job directory.
JOB_NAME = 'job_' + time.strftime("%Y%m%d_%H%M%S")
JOB_DIR = '/'.join([JOB_DIR_ROOT, JOB_NAME])

# Scale tier and machine type passed to the job submission command.
SCALE_TIER = 'CUSTOM'
MASTER_TYPE = 'n1-standard-16'

In [11]:
# Submit the training job with the built-in algorithm image, the machine
# settings defined above and the hyperparameter tuning config file.
# The flags listed after the bare `--` separator are the arguments for the
# training application itself.
!gcloud ai-platform jobs submit training {JOB_NAME} \
--master-image-uri={IMAGE_URI} \
--scale-tier={SCALE_TIER} \
--master-machine-type={MASTER_TYPE} \
--job-dir={JOB_DIR} \
--region={REGION} \
--config {HPTUNING_CONFIG} \
-- \
--preprocess \
--objective=multi:softmax \
--training_data_path={TRAINING_DATASET}

Job [job_20200626_152842] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe job_20200626_152842

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs job_20200626_152842
jobId: job_20200626_152842
state: QUEUED


In [None]:
# Show the current status of the submitted job.
!gcloud ai-platform jobs describe $JOB_NAME

In [None]:
# Stream the logs of the submitted job into the notebook.
!gcloud ai-platform jobs stream-logs $JOB_NAME

### Retrieve HP-tuning results.

After the job completes, you can review the results using the GCP Console or programmatically by calling the AI Platform Training REST endpoint.

In [None]:
# Build a client for the 'ml' v1 API.
ml = discovery.build('ml', 'v1')

# Fully qualified resource name of the training job submitted earlier.
job_id = 'projects/{}/jobs/{}'.format(PROJECT_ID, JOB_NAME)
request = ml.projects().jobs().get(name=job_id)

try:
    response = request.execute()
except errors.HttpError as err:
    # Show the API error, then re-raise. If the error were swallowed,
    # `response` would be undefined (or left over from an earlier run) and the
    # following cells would fail or silently use stale results.
    print(err)
    raise

# Any other exception propagates with its full traceback instead of being
# hidden behind a generic "Unexpected error" message.
response

The returned run results are sorted by the value of the optimization metric. The best run is the first item in the returned list.

In [None]:
# The trials are returned sorted by the optimization metric, so the first
# entry is the best run.
best_trial = response['trainingOutput']['trials'][0]
best_trial

## Deploy the model to AI Platform Prediction

### Set the deployment config

In [None]:
# Model path reported by the first (best) trial of the tuning job.
training_output = response['trainingOutput']['trials'][0]['builtInAlgorithmOutput']['modelPath']

# Copy the deployment config stored under the model path into the current
# directory and print it.
!gsutil cp  {training_output}/deployment_config.yaml .
!cat deployment_config.yaml

### Create a model resource

In [None]:
# The model name is assembled from the dataset, the algorithm and the model
# type, joined with underscores.
DATASET_NAME = 'covertype'
ALGORITHM = 'xgboost'
MODEL_TYPE = 'classification'
MODEL_NAME = '_'.join((DATASET_NAME, ALGORITHM, MODEL_TYPE))

In [None]:
# Create the model resource named MODEL_NAME in the configured region.
!gcloud ai-platform models create  $MODEL_NAME \
    --regions={REGION}

### Create a model version

In [None]:
# Name of the version created under the model.
MODEL_VERSION = 'v1'

# Create the version from the deployment_config.yaml file in the current
# directory.
!gcloud ai-platform versions create {MODEL_VERSION} \
  --model {MODEL_NAME} \
  --config deployment_config.yaml

### Serve predictions
#### Download training artifacts.

In [None]:
# Strip the last two '/'-separated components from the model path; the result
# is expected to be the directory that contains `artifacts/`.
# NOTE(review): the result depends on whether the model path ends with a
# trailing slash - confirm against an actual job output.
training_artifacts = training_output.rsplit('/', 2)[0]
!gsutil ls {training_artifacts}/artifacts

In [None]:
# Copy every file from the artifacts folder into the current directory.
!gsutil cp {training_artifacts}/artifacts/* .

#### Prepare the input file with JSON formatted instances.

In [None]:
# File that receives the generated prediction instance.
INSTANCE_FILE = 'serving_instance.json'
# One raw data point: the feature values of the first row of the training
# data, in training column order, without the Cover_Type target.
RAW_DATA_POINT = '3142.0, 183.0, 9.0, 648.0, 101.0, 757.0, 223.0, 247.0, 157.0, 1871.0, Commanche, C7757'

# Run the generator script on the raw data point and redirect its output to
# the instance file.
# NOTE(review): instance_generator.py is presumably one of the training
# artifacts copied into the current directory earlier - confirm.
!python instance_generator.py --raw_data_string="{RAW_DATA_POINT}" > {INSTANCE_FILE}

In [None]:
# Print the generated instance file for inspection.
!cat {INSTANCE_FILE}

#### Invoke the model

In [None]:
# Request a prediction from the deployed model version, using the instances
# in the JSON instance file.
!gcloud ai-platform predict \
--model {MODEL_NAME} \
--version {MODEL_VERSION} \
--json-instances {INSTANCE_FILE}

<font size=-1>Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at [https://www.apache.org/licenses/LICENSE-2.0](https://www.apache.org/licenses/LICENSE-2.0)

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the specific language governing permissions and limitations under the License.</font>