# Using built-in xgboost with AI Platform Training


This notebook demonstrates how to use the AI Platform Training built-in XGBoost algorithm. You will train a multi-class classification model that predicts the type of forest cover from cartographic data. The [dataset](../../../datasets/covertype/README.md) used in the lab is based on the **Covertype Data Set** from the UCI Machine Learning Repository.



In [72]:
import json
import os
import numpy as np
import pandas as pd
import pickle
import uuid
import time
import tempfile

from googleapiclient import discovery
from googleapiclient import errors

from google.cloud import bigquery
from jinja2 import Template
from kfp.components import func_to_container_op
from typing import NamedTuple

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

## Configure environment settings

Set location paths, connection strings, and other environment settings. Make sure to update `REGION` and `ARTIFACT_STORE` with the settings reflecting your lab environment.

- `REGION` - the compute region for AI Platform Training and Prediction
- `ARTIFACT_STORE` - the GCS bucket used for storing data and output from AI Platform Training.

In [73]:
# List the Cloud Storage buckets available to pick a location for ARTIFACT_STORE.
!gsutil ls

gs://artifacts.mlops-dev-env.appspot.com/
gs://dataflow-staging-us-central1-881178567352/
gs://dataprep-staging-8484f61e-42ce-4613-908f-3de78ec85e67/
gs://hostedkfp-default-qyrlksyfx2/
gs://jk-mlops-dev-sandbox/
gs://mlops-datasets/
gs://mlops-dev-env-kubeflowpipelines-default/
gs://mlops-dev-env-staging/
gs://mlops-dev-env-us-central1-ml-metadata-kfp/
gs://mlops-dev-env.appspot.com/
gs://mlops-dev-env_cloudbuild/
gs://mlops-dev-workspace/
gs://staging.mlops-dev-env.appspot.com/


In [89]:
REGION = 'us-central1'
ARTIFACT_STORE = 'gs://mlops-dev-workspace/xgboos-demo'

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
DATA_ROOT='{}/data'.format(ARTIFACT_STORE)
JOB_DIR_ROOT='{}/jobs'.format(ARTIFACT_STORE)

ORIG_DATASET = 'gs://workshop-datasets/covertype/small/dataset.csv'
TRAINING_DATASET = '{}/covertype_preprocessed/dataset.csv'.format(DATA_ROOT)

## Prepare the dataset for the built-in XGBoost

In [105]:
# Load the original covertype sample from the location set in the configuration
# cell (`ORIG_DATASET`); no `DATASET` variable is defined in this notebook.
df = pd.read_csv(ORIG_DATASET)
df

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3142,183,9,648,101,757,223,247,157,1871,Commanche,C7757,1
1,2156,18,28,0,0,1207,187,170,107,960,Cache,C6102,3
2,1967,124,16,60,9,124,245,227,105,451,Cache,C2704,2
3,3237,305,15,663,19,3593,178,231,193,1260,Commanche,C7201,0
4,2981,221,18,150,14,4584,195,254,191,1822,Rawah,C7745,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2710,245,10,175,41,417,200,248,185,335,Rawah,C7746,1
99996,2854,99,9,60,12,2581,235,228,123,2160,Rawah,C7745,1
99997,2936,64,7,285,23,4808,227,226,133,2804,Rawah,C7745,0
99998,2602,98,7,379,74,691,232,230,130,1445,Commanche,C2703,5


### Convert numeric features to floats

In [106]:
# Columns 0-9 are cast to float64; the remaining columns keep their dtypes.
numeric_feature_indexes = slice(0, 10)
num_features_type_map = dict.fromkeys(df.columns[numeric_feature_indexes], 'float64')

# `astype` returns a new frame, so the originally loaded `df` is left untouched.
df_training = df.astype(num_features_type_map)
df_training

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3142.0,183.0,9.0,648.0,101.0,757.0,223.0,247.0,157.0,1871.0,Commanche,C7757,1
1,2156.0,18.0,28.0,0.0,0.0,1207.0,187.0,170.0,107.0,960.0,Cache,C6102,3
2,1967.0,124.0,16.0,60.0,9.0,124.0,245.0,227.0,105.0,451.0,Cache,C2704,2
3,3237.0,305.0,15.0,663.0,19.0,3593.0,178.0,231.0,193.0,1260.0,Commanche,C7201,0
4,2981.0,221.0,18.0,150.0,14.0,4584.0,195.0,254.0,191.0,1822.0,Rawah,C7745,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2710.0,245.0,10.0,175.0,41.0,417.0,200.0,248.0,185.0,335.0,Rawah,C7746,1
99996,2854.0,99.0,9.0,60.0,12.0,2581.0,235.0,228.0,123.0,2160.0,Rawah,C7745,1
99997,2936.0,64.0,7.0,285.0,23.0,4808.0,227.0,226.0,133.0,2804.0,Rawah,C7745,0
99998,2602.0,98.0,7.0,379.0,74.0,691.0,232.0,230.0,130.0,1445.0,Commanche,C2703,5


### Move the target column to the first position

In [107]:
# Build the new column order: 'Cover_Type' first, every other column in its
# original relative order.
columns = df.columns.tolist()
target_position = columns.index('Cover_Type')
columns = (
    [columns[target_position]]
    + columns[:target_position]
    + columns[target_position + 1:]
)

# Reorder the columns of the float-converted frame.
df_training = df_training.reindex(columns=columns)
df_training

Unnamed: 0,Cover_Type,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type
0,1,3142.0,183.0,9.0,648.0,101.0,757.0,223.0,247.0,157.0,1871.0,Commanche,C7757
1,3,2156.0,18.0,28.0,0.0,0.0,1207.0,187.0,170.0,107.0,960.0,Cache,C6102
2,2,1967.0,124.0,16.0,60.0,9.0,124.0,245.0,227.0,105.0,451.0,Cache,C2704
3,0,3237.0,305.0,15.0,663.0,19.0,3593.0,178.0,231.0,193.0,1260.0,Commanche,C7201
4,0,2981.0,221.0,18.0,150.0,14.0,4584.0,195.0,254.0,191.0,1822.0,Rawah,C7745
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1,2710.0,245.0,10.0,175.0,41.0,417.0,200.0,248.0,185.0,335.0,Rawah,C7746
99996,1,2854.0,99.0,9.0,60.0,12.0,2581.0,235.0,228.0,123.0,2160.0,Rawah,C7745
99997,0,2936.0,64.0,7.0,285.0,23.0,4808.0,227.0,226.0,133.0,2804.0,Rawah,C7745
99998,5,2602.0,98.0,7.0,379.0,74.0,691.0,232.0,230.0,130.0,1445.0,Commanche,C2703


In [108]:
# Write the prepared frame to TRAINING_DATASET without a header row and without
# the index column, so each line starts with the target value.
df_training.to_csv(TRAINING_DATASET, header=False, index=False)

In [109]:
# Print only the byte range 0-297 of the uploaded file as a quick sanity check.
!gsutil cat -r 0-297 {TRAINING_DATASET}

1,3142.0,183.0,9.0,648.0,101.0,757.0,223.0,247.0,157.0,1871.0,Commanche,C7757
3,2156.0,18.0,28.0,0.0,0.0,1207.0,187.0,170.0,107.0,960.0,Cache,C6102
2,1967.0,124.0,16.0,60.0,9.0,124.0,245.0,227.0,105.0,451.0,Cache,C2704
0,3237.0,305.0,15.0,663.0,19.0,3593.0,178.0,231.0,193.0,1260.0,Commanche,C7201


## Configure and submit the training job

In [114]:
# Container image of the built-in algorithm and the pieces of the model name.
IMAGE_URI = 'gcr.io/cloud-ml-algos/boosted_trees:latest'
DATASET_NAME = 'covertype'
ALGORITHM = 'xgboost'
MODEL_TYPE = 'classification'
MODEL_NAME = '_'.join([DATASET_NAME, ALGORITHM, MODEL_TYPE])

# Timestamped job name; the job writes its output to a folder of the same name.
JOB_NAME = 'JOB_' + time.strftime("%Y%m%d_%H%M%S")
JOB_DIR = JOB_DIR_ROOT + '/' + JOB_NAME
SCALE_TIER = 'CUSTOM'
MASTER_TYPE = 'n1-standard-16'

# Fractions of the data passed to the job as validation and test splits.
VALIDATION_SPLIT = 0.10
TEST_SPLIT = 0.10

In [115]:
# Submit the training job. Flags before the bare `--` configure the AI Platform
# job itself; flags after it are passed to the algorithm container.
!gcloud ai-platform jobs submit training {JOB_NAME} \
--master-image-uri={IMAGE_URI} \
--scale-tier={SCALE_TIER} \
--master-machine-type={MASTER_TYPE} \
--job-dir={JOB_DIR} \
--region={REGION} \
-- \
--preprocess \
--objective=multi:softmax \
--training_data_path={TRAINING_DATASET} \
--validation_split={VALIDATION_SPLIT} \
--test_split={TEST_SPLIT}

Job [JOB_20200623_043251] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20200623_043251

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20200623_043251
jobId: JOB_20200623_043251
state: QUEUED


## Monitor the job

In [116]:
# Show the current state and configuration of the submitted job.
!gcloud ai-platform jobs describe $JOB_NAME

createTime: '2020-06-23T04:32:54Z'
etag: N1FiTpmzwyU=
jobId: JOB_20200623_043251
state: PREPARING
trainingInput:
  args:
  - --preprocess
  - --objective=multi:softmax
  - --training_data_path=gs://mlops-dev-workspace/xgboos-demo/data/covertype_preprocessed/dataset.csv
  - --validation_split=0.1
  - --test_split=0.1
  jobDir: gs://mlops-dev-workspace/xgboos-demo/jobs/JOB_20200623_043251
  masterConfig:
    imageUri: gcr.io/cloud-ml-algos/boosted_trees:latest
  masterType: n1-standard-16
  region: us-central1
  scaleTier: CUSTOM
trainingOutput:
  builtInAlgorithmOutput:
    framework: XGBOOST
    pythonVersion: '2.7'
    runtimeVersion: '1.14'
  isBuiltInAlgorithmJob: true

View job in the Cloud Console at:
https://console.cloud.google.com/mlengine/jobs/JOB_20200623_043251?project=mlops-dev-env

View logs at:
https://console.cloud.google.com/logs?resource=ml.googleapis.com%2Fjob_id%2FJOB_20200623_043251&project=mlops-dev-env


In [None]:
# Stream the job's log entries into the cell output.
!gcloud ai-platform jobs stream-logs $JOB_NAME

INFO	2020-06-23 04:32:54 +0000	service		Validating job requirements...
INFO	2020-06-23 04:32:54 +0000	service		Job creation request has been successfully validated.
INFO	2020-06-23 04:32:55 +0000	service		Waiting for job to be provisioned.
INFO	2020-06-23 04:32:55 +0000	service		Job JOB_20200623_043251 is queued.
INFO	2020-06-23 04:32:56 +0000	service		Waiting for training program to start.
INFO	2020-06-23 04:34:44 +0000	master-replica-0		Downloading data.
INFO	2020-06-23 04:34:44 +0000	master-replica-0		File gs://mlops-dev-workspace/xgboos-demo/data/covertype_preprocessed/dataset.csv was downloaded
INFO	2020-06-23 04:34:44 +0000	master-replica-0		Processing data
INFO	2020-06-23 04:34:44 +0000	master-replica-0		Reading raw data file: /root/temp/training.csv
INFO	2020-06-23 04:34:44 +0000	master-replica-0		Splitting data...
INFO	2020-06-23 04:34:45 +0000	master-replica-0		Removing rows with many missing values...
INFO	2020-06-23 04:34:45 +0000	master-replica-0		Analyzing data...
INFO	20

### Inspect the job's output

In [62]:
# List the folders the job wrote under its job directory.
!gsutil ls {JOB_DIR}

gs://mlops-dev-workspace/xgboos-demo/jobs/JOB_20200623_025457/artifacts/
gs://mlops-dev-workspace/xgboos-demo/jobs/JOB_20200623_025457/model/
gs://mlops-dev-workspace/xgboos-demo/jobs/JOB_20200623_025457/processed_data/


In [64]:
# Print the preprocessing script the job exported to the `artifacts` folder.
!gsutil cat {JOB_DIR}/artifacts/instance_generator.py

# Lint as: python2, python3
"""Preprocess code to be exported.

Usage:
This code snippet can be used in two ways:
1. Command line preprocessing.
  You can call instance_generator.py from the command line with three arguments:
    1) raw_data_string: this argument is mandatory. It will be a line of data
                        represented as a comma-separated string.
    2) metadata: this argument is optional. It will be the full file path of
                  the 'metadata.json' file. If not specified, the program will
                  look for a 'metadata.json' file in the same directory as this
                  code snippet.
  The exact command will be:

    python instance_generator.py --raw_data_string <RAW_DATA_STRING>
                                 --metadata <META_DATA>

2. Use the CMLEPreProcessor module in your own code.
  This approach gives you the control of processing as many data points as you
  like instead of just one line of data. To use the CMLEPreProcessor module

## Configure and submit the hyperparameter tuning job

### Create the hyperparameter configuration file. 


The file below configures AI Platform hyperparameter tuning to run up to 12 trials on up to three nodes, choosing from three discrete values of `max_depth` and from the linear range between 0.2 and 0.4 for `eta`.

In [66]:
# Local file name for the hyperparameter tuning configuration written in the next cell.
HPTUNING_CONFIG = 'hptuning_config.yaml'

In [67]:
%%writefile {HPTUNING_CONFIG}

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Hyperparameter tuning specification passed to the job with `--config`.
trainingInput:
  hyperparameters:
    # Minimize the metric reported under the `merror` tag.
    goal: MINIMIZE
    # At most 12 trials in total, at most 3 of them running at the same time.
    maxTrials: 12
    maxParallelTrials: 3
    hyperparameterMetricTag: merror
    enableTrialEarlyStopping: TRUE 
    params:
    # max_depth is picked from the listed values only.
    - parameterName: max_depth
      type: DISCRETE
      discreteValues: [
          8,
          10,
          12
          ]
    # eta is searched over the continuous range [0.2, 0.4] on a linear scale.
    - parameterName: eta
      type: DOUBLE
      minValue:  0.2
      maxValue:  0.4
      scaleType: UNIT_LINEAR_SCALE

Overwriting hptuning_config.yaml


### Start the hyperparameter tuning job.

Use the `gcloud` command to start the hyperparameter tuning job.

In [68]:
# New timestamped job name and output folder for the hyperparameter tuning job.
JOB_NAME = 'JOB_' + time.strftime("%Y%m%d_%H%M%S")
JOB_DIR = JOB_DIR_ROOT + '/' + JOB_NAME
SCALE_TIER = 'CUSTOM'
MASTER_TYPE = 'n1-standard-16'

In [69]:
# Submit the tuning job: same invocation shape as the training job, plus
# `--config` pointing at the hyperparameter specification file.
!gcloud ai-platform jobs submit training {JOB_NAME} \
--master-image-uri={IMAGE_URI} \
--scale-tier={SCALE_TIER} \
--master-machine-type={MASTER_TYPE} \
--job-dir={JOB_DIR} \
--region={REGION} \
--config {HPTUNING_CONFIG} \
-- \
--preprocess \
--objective=multi:softmax \
--training_data_path={TRAINING_DATASET}

Job [JOB_20200623_031400] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20200623_031400

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20200623_031400
jobId: JOB_20200623_031400
state: QUEUED


In [70]:
# Show the state of the tuning job, including the hyperparameter spec it received.
!gcloud ai-platform jobs describe $JOB_NAME

createTime: '2020-06-23T03:14:18Z'
etag: _7Epk14ICaI=
jobId: JOB_20200623_031400
startTime: '2020-06-23T03:14:22Z'
state: RUNNING
trainingInput:
  args:
  - --preprocess
  - --objective=multi:softmax
  - --training_data_path=gs://mlops-dev-workspace/xgboos-demo/data/covertype_preprocessed/dataset.csv
  hyperparameters:
    enableTrialEarlyStopping: true
    goal: MINIMIZE
    hyperparameterMetricTag: merror
    maxParallelTrials: 3
    maxTrials: 12
    params:
    - discreteValues:
      - 8.0
      - 10.0
      - 12.0
      parameterName: max_depth
      type: DISCRETE
    - maxValue: 0.4
      minValue: 0.2
      parameterName: eta
      scaleType: UNIT_LINEAR_SCALE
      type: DOUBLE
  jobDir: gs://mlops-dev-workspace/xgboos-demo/jobs/JOB_20200623_031400
  masterConfig:
    imageUri: gcr.io/cloud-ml-algos/boosted_trees:latest
  masterType: n1-standard-16
  region: us-central1
  scaleTier: CUSTOM
trainingOutput:
  builtInAlgorithmOutput:
    framework: XGBOOST
    pythonVersion: '2.

In [71]:
# Stream the tuning job's log entries into the cell output.
!gcloud ai-platform jobs stream-logs $JOB_NAME

INFO	2020-06-23 03:14:18 +0000	service		Validating job requirements...
INFO	2020-06-23 03:14:18 +0000	service		Job creation request has been successfully validated.
INFO	2020-06-23 03:14:19 +0000	service		Job JOB_20200623_031400 is queued.
INFO	2020-06-23 03:14:28 +0000	service	2	Waiting for job to be provisioned.
INFO	2020-06-23 03:14:28 +0000	service	3	Waiting for job to be provisioned.
INFO	2020-06-23 03:14:28 +0000	service	1	Waiting for job to be provisioned.
INFO	2020-06-23 03:14:30 +0000	service	3	Waiting for training program to start.
INFO	2020-06-23 03:14:30 +0000	service	1	Waiting for training program to start.
INFO	2020-06-23 03:14:30 +0000	service	2	Waiting for training program to start.
INFO	2020-06-23 03:16:16 +0000	master-replica-0	1	Trial id 1
INFO	2020-06-23 03:16:16 +0000	master-replica-0	1	Job Dir gs://mlops-dev-workspace/xgboos-demo/jobs/JOB_20200623_031400
INFO	2020-06-23 03:16:16 +0000	master-replica-0	1	Downloading data.
INFO	2020-06-23 03:16:17 +0000	master-repli

### Retrieve HP-tuning results.

After the job completes, you can review the results using the GCP Console or programmatically by calling the AI Platform Training REST endpoint.

In [None]:
# Client for the AI Platform Training REST API.
ml = discovery.build('ml', 'v1')

# Fully qualified resource name of the tuning job.
job_id = 'projects/{}/jobs/{}'.format(PROJECT_ID, JOB_NAME)
request = ml.projects().jobs().get(name=job_id)

try:
    response = request.execute()
except errors.HttpError as err:
    # Show the API error, then re-raise: `response` would otherwise be unbound
    # and the following cells would fail with a misleading NameError.
    print(err)
    raise

response

The returned run results are sorted by a value of the optimization metric. The best run is the first item on the returned list.

In [None]:
# Display the first trial in the job's returned list of trials.
response['trainingOutput']['trials'][0]

## Retrain the model with the best hyperparameters

You can now retrain the model using the best hyperparameters and using combined training and validation splits as a training dataset.

### Configure and run the training job

In [None]:
alpha = response['trainingOutput']['trials'][0]['hyperparameters']['alpha']
max_iter = response['trainingOutput']['trials'][0]['hyperparameters']['max_iter']

In [None]:
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
JOB_DIR = "{}/{}".format(JOB_DIR_ROOT, JOB_NAME)
SCALE_TIER = "BASIC"

!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$JOB_DIR \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--validation_dataset_path=$VALIDATION_FILE_PATH \
--alpha=$alpha \
--max_iter=$max_iter \
--nohptune

In [None]:
# Stream the retraining job's log entries into the cell output.
!gcloud ai-platform jobs stream-logs $JOB_NAME

### Examine the training output

The training job saves its output in the `JOB_DIR` folder on GCS; the trained model (`model.pkl`) is stored in the `model` subfolder.

In [None]:
# List what the retraining job wrote under its job directory.
!gsutil ls $JOB_DIR

## Deploy the model to AI Platform Prediction

### Create a model resource

In [None]:
model_name = 'forest_cover_classifier'
labels = "task=classifier,domain=forestry"
filter = 'name:{}'.format(model_name)
models = !(gcloud ai-platform models list --filter={filter} --format='value(name)')

if not models:
    !gcloud ai-platform models create  $model_name \
    --regions=$REGION \
    --labels=$labels
else:
    print("Model: {} already exists.".format(models[0]))

### Create a model version

In [None]:
model_version = 'v01'
filter = 'name:{}'.format(model_version)
versions = !(gcloud ai-platform versions list --model={model_name} --format='value(name)' --filter={filter})

if not versions:
    !gcloud ai-platform versions create {model_version} \
    --model={model_name} \
    --origin=$JOB_DIR \
    --runtime-version=1.15 \
    --framework=scikit-learn \
    --python-version=3.7
else:
    print("Model version: {} already exists.".format(versions[0]))

### Serve predictions
#### Prepare the input file with JSON-formatted instances.

In [None]:
input_file = 'serving_instances.json'

# `X_validation` is never defined in this notebook. Use the first rows of the
# prepared training frame, without the target column, as sample instances.
# NOTE(review): the job ran with --preprocess, so the deployed model may expect
# instances transformed with the exported instance_generator.py - confirm.
serving_instances = df_training.drop(columns=['Cover_Type']).head()

# One JSON array per line, as read by `gcloud ... predict --json-instances`.
with open(input_file, 'w') as f:
    for _, row in serving_instances.iterrows():
        f.write(json.dumps(row.tolist()))
        f.write('\n')

In [None]:
# Print the generated instances file for a visual check.
!cat $input_file

#### Invoke the model

In [None]:
# Send the instances in `input_file` to the deployed model version for prediction.
!gcloud ai-platform predict \
--model $model_name \
--version $model_version \
--json-instances $input_file

<font size=-1>Licensed under the Apache License, Version 2.0 (the \"License\");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at [https://www.apache.org/licenses/LICENSE-2.0](https://www.apache.org/licenses/LICENSE-2.0)

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the specific language governing permissions and limitations under the License.</font>