# Training on AI Platform with a custom container

In [1]:
import os
import time

## Configure environment
### Set a GCS bucket

In [12]:
# Ask gcloud for the configured default project. The IPython `!` capture
# returns the command's output as a list of lines.
PROJECT_ID = !(gcloud config get-value core/project)
# Keep only the first output line.
# NOTE(review): assumes that line is the project id (i.e. gcloud has a default
# project configured) - confirm.
PROJECT_ID = PROJECT_ID[0]
# Bucket URI built from the project id. Nothing in this notebook creates the
# bucket, so it is presumably provisioned beforehand - confirm.
ARTIFACT_STORE = 'gs://{}-artifact-store'.format(PROJECT_ID)

In [13]:
# Locations under the artifact store used by the rest of the notebook.
TRAINING_DATA_PATH = ARTIFACT_STORE + '/datasets/training.csv'
TESTING_DATA_PATH = ARTIFACT_STORE + '/datasets/testing.csv'
JOBDIR_BUCKET = ARTIFACT_STORE + '/jobs'

# Region the training job is submitted to.
REGION = "us-central1"

## Create a training container image

### Create a Dockerfile

In [15]:
# Local folder that receives the Dockerfile and train.py (the build context).
TRAINING_IMAGE_FOLDER = '../training_image'

# Create the folder if needed; an already existing folder is not an error.
os.makedirs(name=TRAINING_IMAGE_FOLDER, exist_ok=True)

In [19]:
%%writefile $TRAINING_IMAGE_FOLDER/Dockerfile

# Image definition for the custom training container.
# NOTE(review): neither the base image tag nor the pip packages are pinned, so
# rebuilding later may pull different library versions.
# NOTE(review): train.py imports pandas, joblib and scikit-learn, none of which
# are installed here - presumably the base image provides them; confirm.
FROM gcr.io/deeplearning-platform-release/base-cpu
# Extra packages: `fire` is imported by train.py; gcsfs is presumably what lets
# pandas read gs:// paths - confirm.
RUN python -m pip install -U fire gcsfs
WORKDIR /app
# train.py sits next to this Dockerfile in the build context.
COPY train.py .

# Arguments given to the container are appended to this command.
ENTRYPOINT ["python", "train.py"]

Overwriting ../training_image/Dockerfile


### Create a training script

In [20]:
%%writefile $TRAINING_IMAGE_FOLDER/train.py

import logging
import os
import subprocess
import sys
import joblib
import fire
import numpy as np
import pandas as pd

from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import Ridge
from sklearn.manifold import TSNE 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def train(job_dir, data_path, n_features_options, l2_reg_options):
  """Tune, fit and export a scale -> PCA -> Ridge pipeline that predicts `octane`.

  A 10-fold cross-validated grid search scored with negative mean squared
  error picks the best combination of PCA components and Ridge alpha. The
  winning pipeline, refit on all rows, is written to `model.joblib` in the
  working directory and copied to `<job_dir>/model.joblib` with `gsutil cp`.

  Args:
    job_dir: Destination prefix for the exported model.
    data_path: Path of the training CSV, read with `pandas.read_csv`. The
      `octane` column is the target; every other column is a feature.
    n_features_options: Candidate values for PCA `n_components`
      (the notebook passes e.g. `[2,4,6]` on the command line).
    l2_reg_options: Candidate values for Ridge `alpha`
      (the notebook passes e.g. `[0.1,0.2,0.3,0.5]`).

  Raises:
    subprocess.CalledProcessError: If `gsutil cp` exits with a non-zero status.
  """
  import inspect  # local import: only needed for the version guard below

  # Load the training data
  df_train = pd.read_csv(data_path)

  y = df_train.octane
  X = df_train.drop('octane', axis=1)

  # Configure a training pipeline
  pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regress', Ridge())
  ])

  # Configure a parameter grid
  param_grid = [
    {
      'reduce_dim__n_components': n_features_options,
      'regress__alpha': l2_reg_options
    }
  ]

  # Tune hyperparameters
  search_kwargs = {
    'cv': 10,
    'n_jobs': None,
    'param_grid': param_grid,
    'scoring': 'neg_mean_squared_error',
    # Refit the best configuration on the full dataset once the search is done.
    'refit': True,
  }
  # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24; passing it to
  # a newer release raises TypeError. Only pass it where it is still accepted,
  # which keeps the original behaviour on older releases.
  if 'iid' in inspect.signature(GridSearchCV).parameters:
    search_kwargs['iid'] = False

  grid = GridSearchCV(pipeline, **search_kwargs)
  grid.fit(X, y)

  logging.info("Best estimator: {}".format(grid.best_params_))
  logging.info("Best score: {}".format(grid.best_score_))

  # With refit=True the search has already retrained the winning pipeline on
  # the full dataset, so a second fit() call is unnecessary.
  trained_pipeline = grid.best_estimator_

  # Save the model locally, then copy it to the job directory
  model_filename = 'model.joblib'
  joblib.dump(value=trained_pipeline, filename=model_filename)
  gcs_model_path = "{}/{}".format(job_dir, model_filename)
  # gsutil's stderr is redirected to stdout so its messages land in one stream.
  subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path], stderr=sys.stdout)
  logging.info("Saved model in: {}".format(gcs_model_path))
    
if __name__ == "__main__":
  # Configure root logging at INFO so the logging.info() calls in train() are emitted.
  logging.basicConfig(level=logging.INFO)
  # Hand train() to Fire, which exposes it as the script's command-line interface.
  fire.Fire(train)

Overwriting ../training_image/train.py


### Build the image

In [21]:
# Name and tag of the training image; the URI places it under gcr.io in the
# project resolved earlier.
IMAGE_NAME="octane-regression-training"
IMAGE_TAG="latest"
IMAGE_URI="gcr.io/{}/{}:{}".format(PROJECT_ID, IMAGE_NAME, IMAGE_TAG)

# Build the image remotely from the training image folder and tag it with IMAGE_URI.
!gcloud builds submit --tag $IMAGE_URI $TRAINING_IMAGE_FOLDER

Creating temporary tarball archive of 2 file(s) totalling 2.0 KiB before compression.
Uploading tarball of [../training_image] to [gs://mlops-dev-100_cloudbuild/source/1583977371.65-0179be7af5384d5386aa88bd56c6e0a6.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/mlops-dev-100/builds/24919490-a8fd-471b-8917-b51eb4cedf8f].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/24919490-a8fd-471b-8917-b51eb4cedf8f?project=286479790129].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "24919490-a8fd-471b-8917-b51eb4cedf8f"

FETCHSOURCE
Fetching storage object: gs://mlops-dev-100_cloudbuild/source/1583977371.65-0179be7af5384d5386aa88bd56c6e0a6.tgz#1583977371875128
Copying gs://mlops-dev-100_cloudbuild/source/1583977371.65-0179be7af5384d5386aa88bd56c6e0a6.tgz#1583977371875128...
/ [1 files][  1.1 KiB/  1.1 KiB]                                                
Operation completed over 1 objects/1.1 KiB.            

### Submit a training job

In [None]:
# Job name made unique by a local timestamp, e.g. JOB_20200312_014251.
JOB_NAME = "JOB_" + time.strftime("%Y%m%d_%H%M%S")
SCALE_TIER = "BASIC"

# Hyperparameter candidates, written as list literals for the command line.
N_FEATURES_OPTIONS = "[2,4,6]"
L2_REG_OPTIONS = "[0.1,0.2,0.3,0.5]"

In [None]:
# Submit the custom-container training job. The flags after the bare `--`
# match the parameter names of train() in train.py.
# NOTE(review): train() also needs job_dir; presumably AI Platform forwards
# --job-dir to the container - confirm.
!gcloud ai-platform jobs submit training $JOB_NAME \
--region $REGION \
--job-dir $JOBDIR_BUCKET/$JOB_NAME \
--master-image-uri $IMAGE_URI \
--scale-tier $SCALE_TIER \
-- \
--data_path $TRAINING_DATA_PATH  \
--n_features_options $N_FEATURES_OPTIONS \
--l2_reg_options $L2_REG_OPTIONS

In [None]:
# Show the submitted job's details, then stream its logs into the notebook.
# NOTE(review): stream-logs presumably blocks until the job finishes - confirm.
!gcloud ai-platform jobs describe $JOB_NAME
!gcloud ai-platform jobs stream-logs $JOB_NAME

### Copy the trained model to model repo

In [None]:
MODEL_NAME = "octane-regressor-container"

!gsutil cp $JOBDIR_BUCKET/$JOB_NAME/model.joblib $ARTIFACT_BUCKET/models/$MODEL_NAME/model.joblib

## Download and test the model

In [None]:
LOCAL_PATH = '/tmp/model.joblib'

!gsutil cp $ARTIFACT_BUCKET/models/$MODEL_NAME/model.joblib $LOCAL_PATH 

In [None]:
import joblib
import pandas as pd

# Read the held-out rows, then load the pipeline downloaded to LOCAL_PATH.
df_test = pd.read_csv(TESTING_DATA_PATH)
predictor = joblib.load(LOCAL_PATH)

# Split the target column from the feature columns.
X = df_test.drop(columns='octane')
y = df_test['octane']

# Print (actual, predicted) pairs.
y_hat = predictor.predict(X)
print([pair for pair in zip(y, y_hat)])