In [1]:
import argparse
import logging
import joblib
import sys
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

In [2]:
# Log bare messages (no level or logger-name prefix) and let INFO records through.
logging.basicConfig(format="%(message)s")
logging.root.setLevel(logging.INFO)

In [3]:
def read_input(file_name, test_size=0.25):
    """Load the housing CSV and return imputed train/test arrays.

    Rows without a ``SalePrice`` are discarded, ``SalePrice`` becomes the
    target, and object-dtype columns are excluded from the features. The
    split is not shuffled, so the test set is taken from the end of the file.

    Args:
        file_name: Path of the CSV file to read.
        test_size: Passed through to ``train_test_split``.

    Returns:
        ``((train_X, train_y), (test_X, test_y))``.
    """
    frame = pd.read_csv(file_name)
    frame = frame.dropna(axis=0, subset=['SalePrice'])

    target = frame['SalePrice']
    features = frame.drop(columns=['SalePrice']).select_dtypes(exclude=['object'])

    splits = train_test_split(features.values,
                              target.values,
                              test_size=test_size,
                              shuffle=False)
    train_X, test_X, train_y, test_y = splits

    # Fit the imputer on the training rows only, then reuse it for the test rows.
    imputer = SimpleImputer()
    train_X = imputer.fit_transform(train_X)
    test_X = imputer.transform(test_X)

    return (train_X, train_y), (test_X, test_y)

In [4]:
def train_model(train_X,
                train_y,
                test_X,
                test_y,
                n_estimators,
                learning_rate):
    """Train an XGBRegressor with early stopping and report the best round.

    Args:
        train_X: Training features.
        train_y: Training targets.
        test_X: Features of the evaluation set used for early stopping.
        test_y: Targets of the evaluation set used for early stopping.
        n_estimators: Forwarded to ``XGBRegressor``.
        learning_rate: Forwarded to ``XGBRegressor``.

    Returns:
        The fitted ``XGBRegressor``.
    """
    model = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate)

    # Training stops once the eval-set metric has not improved for 40 rounds.
    model.fit(train_X,
              train_y,
              early_stopping_rounds=40,
              eval_set=[(test_X, test_y)])

    # Interpolate explicitly: print() does not apply %-formatting to extra
    # arguments, so passing them separately emits the raw "%.2f"/"%d" text.
    print("Best RMSE on eval: %.2f with %d rounds"
          % (model.best_score, model.best_iteration + 1))
    return model

def eval_model(model, test_X, test_y):
    """Log the mean absolute error of ``model``'s predictions for ``test_X``."""
    predicted = model.predict(test_X)
    mae = mean_absolute_error(predicted, test_y)
    logging.info("mean_absolute_error=%.2f", mae)

def save_model(model, model_file):
    """Dump ``model`` to ``model_file`` with joblib and log the destination."""
    joblib.dump(value=model, filename=model_file)
    logging.info("Model export success: %s", model_file)

In [5]:
def gcs_copy(src_path, dst_path):
    """Copy ``src_path`` to ``dst_path`` with ``gsutil cp`` and print its stdout.

    Args:
        src_path: Source path handed to ``gsutil cp``.
        dst_path: Destination path handed to ``gsutil cp``.

    Raises:
        subprocess.CalledProcessError: If ``gsutil`` exits with a non-zero
            status, so a failed copy is not silently reported as a success.
    """
    import subprocess

    completed = subprocess.run(['gsutil', 'cp', src_path, dst_path],
                               stdout=subprocess.PIPE,
                               check=True)
    # Strip only trailing newlines rather than blindly dropping the last byte.
    print(completed.stdout.decode('utf-8').rstrip('\n'))

In [6]:
### Create the GCS bucket that stores the trained model file (gsutil reports a 409 error if the bucket already exists)
GCS_BUCKET = "gs://fairing-demo"
!gsutil mb {GCS_BUCKET}

Creating gs://fairing-demo/...
ServiceException: 409 Bucket fairing-demo already exists.


In [14]:
class HousingServe(object):
    """Train and serve an XGBoost regressor for the Ames housing data.

    ``train`` fits, evaluates and exports the model; ``predict`` lazily loads
    the exported model file and answers a prediction request.
    """

    def __init__(self):
        # Local path of the training CSV.
        self.train_input = "ames_dataset/train.csv"
        # Hyper-parameters forwarded to train_model.
        self.n_estimators = 50
        self.learning_rate = 0.1
        # Local file the trained model is dumped to and loaded from.
        self.model_file = "trained_ames_model.dat"
        # Destination of the model copy inside GCS_BUCKET.
        self.gcs_model_file = "{}/housing_serve/{}".format(GCS_BUCKET, self.model_file)
        # Populated on the first predict() call.
        self.model = None

    def train(self):
        """Train on ``train_input``, log the MAE, save the model and copy it to GCS."""
        (train_X, train_y), (test_X, test_y) = read_input(self.train_input)
        model = train_model(train_X,
                            train_y,
                            test_X,
                            test_y,
                            self.n_estimators,
                            self.learning_rate)

        eval_model(model, test_X, test_y)
        save_model(model, self.model_file)
        gcs_copy(self.model_file, self.gcs_model_file)
        print("Model saved to {}".format(self.gcs_model_file))

    def predict(self, X, feature_names):
        """Predict using the model for given ndarray.

        Args:
            X: Feature rows handed to the model's ``predict``.
            feature_names: Accepted for the caller's benefit; not used here.

        Returns:
            ``[[p, p]]`` where ``p`` is the first element of the prediction.
        """
        if self.model is None:
            # joblib.load unpickles the file, so model_file must be trusted.
            self.model = joblib.load(self.model_file)
        # Do any preprocessing
        # Pass the features positionally so the call does not depend on the
        # name of the estimator's first predict() parameter.
        prediction = self.model.predict(X)
        # Do any postprocessing
        # NOTE(review): only the first prediction is returned, repeated twice —
        # presumably the shape the serving layer expects; confirm.
        return [[prediction.item(0), prediction.item(0)]]

## Training Locally

In [8]:
# Run the full local pipeline: read the CSV, train, evaluate, save the model and copy it to GCS.
HousingServe().train()

[0]	validation_0-rmse:177514
Will train until validation_0-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:161858
[2]	validation_0-rmse:147237
[3]	validation_0-rmse:134132
[4]	validation_0-rmse:122224
[5]	validation_0-rmse:111538
[6]	validation_0-rmse:102142
[7]	validation_0-rmse:93392.3
[8]	validation_0-rmse:85824.6
[9]	validation_0-rmse:79667.6
[10]	validation_0-rmse:73463.4
[11]	validation_0-rmse:68059.4
[12]	validation_0-rmse:63350.5
[13]	validation_0-rmse:59732.1
[14]	validation_0-rmse:56260.7
[15]	validation_0-rmse:53392.6
[16]	validation_0-rmse:50770.8
[17]	validation_0-rmse:48107.8
[18]	validation_0-rmse:45923.9
[19]	validation_0-rmse:44154.2
[20]	validation_0-rmse:42488.1
[21]	validation_0-rmse:41263.3
[22]	validation_0-rmse:40212.8
[23]	validation_0-rmse:39089.1
[24]	validation_0-rmse:37691.1
[25]	validation_0-rmse:36875.2
[26]	validation_0-rmse:36276.2
[27]	validation_0-rmse:35444.1
[28]	validation_0-rmse:34831.5
[29]	validation_0-rmse:34205.4
[30]	validation_0-rmse

mean_absolute_error=18173.15
Model export success: trained_ames_model.dat


Best RMSE on eval: %.2f with %d rounds 28787.720703 50

Model saved to gs://fairing-demo/housing_serve/trained_ames_model.dat


## Fairing

In [9]:
import os
import fairing

# Set up a Google Container Registry (GCR) path for storing output containers.
# Any Docker container registry can be used instead of GCR.
GCP_PROJECT = fairing.cloud.gcp.guess_project_name()
DOCKER_REGISTRY = f'gcr.io/{GCP_PROJECT}/fairing-job'

## Build base image for remote training and prediction

In [10]:
# Running interpreter's version as "major.minor.micro"; passed to docker build as PY_VERSION.
py_version = ".".join(map(str, sys.version_info[:3]))
# Base image built below and passed to the training job and the prediction endpoint.
base_image = f'gcr.io/{GCP_PROJECT}/fairing-predict-example:latest'

In [15]:
!cat requirements.txt

pandas
joblib
numpy
xgboost
sklearn
seldon-core

In [None]:
!docker build --build-arg PY_VERSION={py_version} . -t {base_image}
!docker push {base_image}

## Training in KF

In [11]:
from fairing.ml_tasks import TrainJob
# Submit HousingServe as a training job on top of base_image, listing the training CSV as an input file.
# NOTE(review): `cpus_per_nmpde` and `gpus_pernode` look like misspellings
# (cpus_per_node / gpus_per_node?) — confirm against the TrainJob signature.
train_job = TrainJob(HousingServe, base_image, input_files=['ames_dataset/train.csv'], nodes=2, cpus_per_nmpde=16, gpus_pernode=1)
train_job.submit()

Using preprocessor: <class 'fairing.preprocessors.function.FunctionPreProcessor'>
Using docker registry: gcr.io/caip-dexter-bugbash
Using builder: DockerBuilder
Docker command: ['python', '/app/function_shim.py', '--serialized_fn_file', '/app/pickled_fn.p']
/Users/cartick/Documents/workspace/fairing/venv/lib/python3.7/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Building docker image gcr.io/caip-dexter-bugbash/fairing-job:A457822C...
Build output: Step 1/6 : FROM gcr.io/caip-dexter-bugbash/fairing-predict-example:latest
Build output: 
Build output: ---> 129284ce93f5
Build output: Step 2/6 : WORKDIR /app/
Build output: 
Build output: ---> Using cache
Build output: ---> f84480a8f8d5
Build output: Step 3/6 : ENV FAIRING_RUNTIME 1
Build output: 
Build output: ---> Using cache
Build output: ---> 474924c16985
Build output: Step 4/6 : COPY /app/ /app/
Build output: 
Build output: ---> c2857869ab15
Build output: Step 5/6 : RUN if [ -e requirements.txt ];then

Build output: ---> 78b3320ed337
Build output: Step 6/6 : CMD python /app/function_shim.py --serialized_fn_file /app/pickled_fn.p
Build output: 
Build output: ---> Running in 7c95522fb220
Build output: ---> 903337404cca
Push finished: {'ID': 'sha256:903337404cca2948cd51b2c4e686ab020a90fdabf46bbd0cd79fb3cdf59004b8'}
Build output: Successfully built 903337404cca
Build output: Successfully tagged gcr.io/caip-dexter-bugbash/fairing-job:A457822C
Publishing image gcr.io/caip-dexter-bugbash/fairing-job:A457822C...
Push output: The push refers to repository [gcr.io/caip-dexter-bugbash/fairing-job] None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push ou

[0]	validation_0-rmse:177514
Will train until validation_0-rmse hasn't improved in 40 rounds.
[1]	validation_0-rmse:161858
[2]	validation_0-rmse:147237
[3]	validation_0-rmse:134132
[4]	validation_0-rmse:122224
[5]	validation_0-rmse:111538
[6]	validation_0-rmse:102142
[7]	validation_0-rmse:93392.3
[8]	validation_0-rmse:85824.6
[9]	validation_0-rmse:79667.6
[10]	validation_0-rmse:73463.4
[11]	validation_0-rmse:68059.4
[12]	validation_0-rmse:63350.5
[13]	validation_0-rmse:59732.1
[14]	validation_0-rmse:56260.7
[15]	validation_0-rmse:53392.6
[16]	validation_0-rmse:50770.8
[17]	validation_0-rmse:48107.8
[18]	validation_0-rmse:45923.9
[19]	validation_0-rmse:44154.2
[20]	validation_0-rmse:42488.1
[21]	validation_0-rmse:41263.3
[22]	validation_0-rmse:40212.8
[23]	validation_0-rmse:39089.1
[24]	validation_0-rmse:37691.1
[25]	validation_0-rmse:36875.2
[26]	validation_0-rmse:36276.2
[27]	validation_0-rmse:35444.1
[28]	validation_0-rmse:34831.5
[29]	validation_0-rmse:34205.4
[30]	validation_0-rmse

Cleaning up job fairing-job-s5x9t...


## Deploying model and creating an endpoint in KF

In [11]:
from fairing.ml_tasks import PredictionEndpoint

# Serve HousingServe from base_image; the locally saved model file is listed as an input file.
endpoint = PredictionEndpoint(
    HousingServe,
    base_image,
    input_files=['trained_ames_model.dat'],
)
endpoint.create()

Using preprocessor: <class 'fairing.preprocessors.function.FunctionPreProcessor'>
Using docker registry: gcr.io/caip-dexter-bugbash
Using builder: DockerBuilder
Docker command: ['python', '/app/function_shim.py', '--serialized_fn_file', '/app/pickled_fn.p']
/Users/cartick/Documents/workspace/fairing/venv/lib/python3.7/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Building docker image gcr.io/caip-dexter-bugbash/fairing-job:913FA984...
Build output: Step 1/6 : FROM gcr.io/caip-dexter-bugbash/fairing-predict-example:latest
Build output: 
Build output: ---> 129284ce93f5
Build output: Step 2/6 : WORKDIR /app/
Build output: 
Build output: ---> Using cache
Build output: ---> f84480a8f8d5
Build output: Step 3/6 : ENV FAIRING_RUNTIME 1
Build output: 
Build output: ---> Using cache
Build output: ---> 474924c16985
Build output: Step 4/6 : COPY /app/ /app/
Build output: 
Build output: ---> fd62bfcacd28
Build output: Step 5/6 : RUN if [ -e requirements.txt ];then

Build output: ---> 91838369e266
Build output: Step 6/6 : CMD python /app/function_shim.py --serialized_fn_file /app/pickled_fn.p
Build output: 
Build output: ---> Running in be9e593df1c9
Build output: ---> 34d25711c3a3
Push finished: {'ID': 'sha256:34d25711c3a3678083058c59c2e379af7b712f094e5cbed999cf18d16dc00fb0'}
Build output: Successfully built 34d25711c3a3
Build output: Successfully tagged gcr.io/caip-dexter-bugbash/fairing-job:913FA984
Publishing image gcr.io/caip-dexter-bugbash/fairing-job:913FA984...
Push output: The push refers to repository [gcr.io/caip-dexter-bugbash/fairing-job] None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push output: Preparing None
Push ou

Waiting for prediction endpoint to come up...
Prediction endpoint: http://35.222.2.191:5000/predict


## Making prediction calls against the endpoint

In [12]:
# Replace the URL below with the prediction endpoint printed by the previous step
!curl http://35.222.2.191:5000/predict -H "Content-Type: application/x-www-form-urlencoded" -d 'json={"data":{"tensor":{"shape":[1,37],"values":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37]}}}'
        

{"data":{"names":["t:0","t:1"],"tensor":{"shape":[1,2],"values":[108354.9609375,108354.9609375]}},"meta":{}}
