In [22]:
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import shutil
import urllib.request

import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from xgboost import XGBRegressor

import fairing
from fairing import builders
from fairing.training import native

# --- Training data ----------------------------------------------------------
# CSV fetched by read_input() and the local file name it is saved under.
TRAINING_URL = "https://raw.githubusercontent.com/kubeflow/examples/master/xgboost_ames_housing/ames_dataset/train.csv"
TRAINING_FILE = "train.csv"

# --- Hyperparameters --------------------------------------------------------
ESTIMATORS = 1000            # forwarded to XGBRegressor as n_estimators
LEARNING_RATE = 0.1          # forwarded to XGBRegressor as learning_rate
TEST_FRACTION_SIZE = 0.25    # default test_size for the train/test split
EARLY_STOPPING_ROUNDS = 50   # early_stopping_rounds used when fitting

# --- Fairing image build ----------------------------------------------------
# NOTE(review): the repository and notebook path are specific to one
# environment; adjust them before running elsewhere.
DOCKER_REPOSITORY_NAME = 'gcr.io/mrick-gcp'
NOTEBOOK_FILE = '/home/jovyan/work/xgboost-kubeflow.ipynb'
BASE_IMAGE = 'gcr.io/kubeflow-images-public/xgboost-fairing-example:v1'

fairing.config.set_builder(
    builders.AppendBuilder(
        notebook_file=NOTEBOOK_FILE,
        base_image=BASE_IMAGE,
        repository=DOCKER_REPOSITORY_NAME,
    )
)

@native.Training()
class XgBoostModel(object):
    """Training entry point registered with fairing through ``native.Training()``.

    ``train`` chains the helpers defined alongside this class: read and split
    the data, fit the regressor, then log its error on the held-out split.

    NOTE(review): what the decorator does to ``train`` is defined by fairing
    and is not visible here -- confirm against the fairing documentation.
    """

    def train(self):
        """Run the read -> fit -> evaluate pipeline. Returns nothing."""
        # train_model() and eval_model() report their results only through
        # logging.info(). The stdlib root logger defaults to WARNING and no
        # logging setup is visible in this cell, so make sure INFO records
        # are actually emitted instead of being silently dropped.
        logging.basicConfig(format="%(message)s")  # no-op if handlers exist
        root_logger = logging.getLogger()
        if root_logger.getEffectiveLevel() > logging.INFO:
            root_logger.setLevel(logging.INFO)

        (train_X, train_y), (test_X, test_y) = read_input()
        model = train_model(train_X,
                            train_y,
                            test_X,
                            test_y,
                            ESTIMATORS,
                            LEARNING_RATE)

        eval_model(model, test_X, test_y)

def download(url, file_name):
    """Fetch ``url`` and save the response body to ``file_name``.

    The body is streamed to disk in chunks instead of being read into memory
    in one piece, so memory use does not grow with the size of the download.

    Args:
        url: Location to fetch; passed unchanged to ``urllib.request.urlopen``.
        file_name: Destination path. Opened with mode ``"wb"``, so an existing
            file at that path is overwritten.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or ``open`` is
        propagated to the caller.
    """
    with urllib.request.urlopen(url) as response, open(file_name, "wb") as out_file:
        shutil.copyfileobj(response, out_file)

def read_input(test_size=TEST_FRACTION_SIZE):
    """Download the training CSV and return imputed train/test arrays.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        ``((train_X, train_y), (test_X, test_y))``. The ``X`` arrays hold the
        non-object-dtype columns other than ``SalePrice`` after imputation;
        the ``y`` arrays hold the ``SalePrice`` values.
    """
    download(TRAINING_URL, TRAINING_FILE)
    frame = pd.read_csv(TRAINING_FILE)

    # Rows without a target value cannot be used for fitting or scoring.
    frame = frame.dropna(axis=0, subset=['SalePrice'])

    target = frame['SalePrice']
    features = frame.drop(['SalePrice'], axis=1)
    features = features.select_dtypes(exclude=['object'])

    # shuffle=False: the split follows file order, with no shuffling.
    split = train_test_split(features.values,
                             target.values,
                             test_size=test_size,
                             shuffle=False)
    train_X, test_X, train_y, test_y = split

    # Fit the imputer on the training rows only, then reuse it for the test
    # rows so no test-set statistics feed into the fill values.
    imputer = Imputer()
    train_X = imputer.fit_transform(train_X)
    test_X = imputer.transform(test_X)

    return (train_X, train_y), (test_X, test_y)

def train_model(train_X,
                train_y,
                test_X,
                test_y,
                n_estimators,
                learning_rate,
                early_stopping_rounds=None):
    """Fit an ``XGBRegressor`` with early stopping and log its best score.

    Args:
        train_X: Training features passed to ``fit``.
        train_y: Training targets passed to ``fit``.
        test_X: Features of the evaluation set monitored for early stopping.
        test_y: Targets of the evaluation set monitored for early stopping.
        n_estimators: Forwarded to ``XGBRegressor(n_estimators=...)``.
        learning_rate: Forwarded to ``XGBRegressor(learning_rate=...)``.
        early_stopping_rounds: Forwarded to ``fit``. When omitted (``None``)
            the module-level ``EARLY_STOPPING_ROUNDS`` is used, which matches
            the previous hard-coded behaviour.

    Returns:
        The fitted ``XGBRegressor``.
    """
    if early_stopping_rounds is None:
        # Resolved at call time so later edits to the constant are picked up.
        early_stopping_rounds = EARLY_STOPPING_ROUNDS

    model = XGBRegressor(n_estimators=n_estimators,
                         learning_rate=learning_rate)

    # NOTE(review): XgBoostModel.train passes the same split here that it
    # later scores with eval_model(), so the reported error is measured on
    # data that also drove early stopping -- consider a separate validation
    # split.
    model.fit(train_X,
              train_y,
              early_stopping_rounds=early_stopping_rounds,
              eval_set=[(test_X, test_y)])

    # best_iteration is reported +1 in the message as a round count.
    logging.info("Best RMSE on eval: %.2f with %d rounds",
                 model.best_score,
                 model.best_iteration + 1)
    return model

def eval_model(model, test_X, test_y):
    """Log the model's mean absolute error on the given data.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        test_X: Features to predict on.
        test_y: True target values for ``test_X``.

    Returns:
        None. The score is reported through ``logging.info``.
    """
    predictions = model.predict(test_X)
    # scikit-learn metrics take (y_true, y_pred). The arguments used to be
    # passed the other way round; MAE is symmetric so the logged value is
    # unchanged, but the correct order stays right if the metric is swapped.
    logging.info("mean_absolute_error=%.2f",
                 mean_absolute_error(test_y, predictions))




In [23]:
# Instantiate the fairing-decorated class and start training. The output
# captured below this cell shows the call uploading an image to the Docker
# repository, launching a training job and then streaming the job's log lines.
model = XgBoostModel()
model.train()

Running...
Uploading gcr.io/mrick-gcp/fairing-job:feaf6a2f7f1e06de9649dc2791b33ac303c76e1365ccd71fb0a1a1d14befdbc0
Pushed image gcr.io/mrick-gcp/fairing-job:feaf6a2f7f1e06de9649dc2791b33ac303c76e1365ccd71fb0a1a1d14befdbc0
Training(s) launched.
Waiting for job to start...


b'[0]\tvalidation_0-rmse:177514'
b"Will train until validation_0-rmse hasn't improved in 50 rounds."
b'[1]\tvalidation_0-rmse:161858'
b'[2]\tvalidation_0-rmse:147237'
b'[3]\tvalidation_0-rmse:134132'
b'[4]\tvalidation_0-rmse:122224'
b'[5]\tvalidation_0-rmse:111538'
b'[6]\tvalidation_0-rmse:102142'
b'[7]\tvalidation_0-rmse:93392.2'
b'[8]\tvalidation_0-rmse:85824.6'
b'[9]\tvalidation_0-rmse:79667.6'
b'[10]\tvalidation_0-rmse:73463.4'
b'[11]\tvalidation_0-rmse:68059.4'
b'[12]\tvalidation_0-rmse:63350.5'
b'[13]\tvalidation_0-rmse:59732.1'
b'[14]\tvalidation_0-rmse:56260.7'
b'[15]\tvalidation_0-rmse:53392.6'
b'[16]\tvalidation_0-rmse:50770.8'
b'[17]\tvalidation_0-rmse:48107.8'
b'[18]\tvalidation_0-rmse:45923.9'
b'[19]\tvalidation_0-rmse:44154.2'
b'[20]\tvalidation_0-rmse:42488.1'
b'[21]\tvalidation_0-rmse:41263.3'
b'[22]\tvalidation_0-rmse:40212.8'
b'[23]\tvalidation_0-rmse:39089.1'
b'[24]\tvalidation_0-rmse:37691.1'
b'[25]\tvalidation_0-rmse:36875.2'
b'[26]\tvalidation_0-rmse:36276.2'
b'[2