# AI Platform Training with scikit-learn

In [40]:
import sklearn

sklearn.__version__

'0.21.2'

## Create a training application
### Create a canonical folder structure

In [27]:
import os

app_folder = '../training_app/trainer'
os.makedirs(app_folder, exist_ok=True)

!touch $app_folder/__init__.py

### Create a training script

In [41]:
%%writefile $app_folder/train.py

import fire
import joblib
import numpy as np
import pandas as pd


from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import Ridge
from sklearn.manifold import TSNE 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def train(job_dir, data_path, n_features_options, l2_reg_options):
    
  import sklearn
  print(sklearn.__version__)
  return
    
  # Load data from GCS
  df_train = pd.read_csv(data_path, index_col=0)
    
  # Set up grid search
  pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regress', Ridge())
  ])

  param_grid = [
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': n_features_options,
        'regress': [Ridge()],
        'regress__alpha': l2_reg_options
    },
    {
        'reduce_dim': ['passthrough'],
        'regress': [PLSRegression(scale=False)],
        'regress__n_components': n_features_options
    }
  ]

  grid = GridSearchCV(pipeline, cv=10, n_jobs=None, param_grid=param_grid, scoring='neg_mean_squared_error')

  y = df_train.octane
  X = df_train.drop('octane', axis=1)

  grid.fit(X, y)

  print("Best estimator: {}".format(grid.best_params_))
  print("Best score: {}".format(grid.best_score_))
    
if __name__ == "__main__":
  fire.Fire(train)

Overwriting ../training_app/trainer/train.py


### Create a setup.py to install dependencies

In [46]:
%%writefile $app_folder/../setup.py

from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = ['fire', 'gcsfs', 'sklearn>=0.21.0']

setup(
    name='trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='My training application package.'
)

Overwriting ../training_app/trainer/../setup.py


## Run training locally

In [43]:
%%bash

TRAINING_PACKAGE_PATH="../training_app/trainer"
MAIN_TRAINER_MODULE="trainer.train"
JOB_DIR="../job_dir"
DATA_PATH="gs://jk-demo-datasets/gasdata/gasdata.csv"
N_FEATURES_OPTIONS=[2,4,6]
L2_REG_OPTIONS=[0.1,0.2,0.3,0.5]

gcloud ai-platform local train \
  --job-dir $JOB_DIR \
  --package-path $TRAINING_PACKAGE_PATH \
  --module-name $MAIN_TRAINER_MODULE \
  -- \
  --data_path $DATA_PATH \
  --n_features_options $N_FEATURES_OPTIONS \
  --l2_reg_options $L2_REG_OPTIONS



0.21.2


## Run training on AI Platform Training

In [47]:
%%bash

JOB_NAME="JOB_$(date +"%Y%m%d_%H%M%S")"
REGION=us-west1
RUNTIME_VERSION=1.14
PYTHON_VERSION=3.5
SCALE_TIER=BASIC

TRAINING_PACKAGE_PATH="../training_app/trainer"
MAIN_TRAINER_MODULE="trainer.train"
JOB_DIR="gs://jk-demo-jobdir/${JOB_NAME}"
DATA_PATH="gs://jk-demo-datasets/gasdata/gasdata.csv"
N_FEATURES_OPTIONS=[2,4,6]
L2_REG_OPTIONS=[0.1,0.2,0.3,0.5]

gcloud ai-platform jobs submit training $JOB_NAME \
  --job-dir $JOB_DIR \
  --package-path $TRAINING_PACKAGE_PATH \
  --module-name $MAIN_TRAINER_MODULE \
  --region $REGION \
  --runtime-version=$RUNTIME_VERSION \
  --python-version=$PYTHON_VERSION \
  --scale-tier $SCALE_TIER \
  -- \
  --data_path $DATA_PATH \
  --n_features_options $N_FEATURES_OPTIONS \
  --l2_reg_options $L2_REG_OPTIONS

jobId: JOB_20190817_145004
state: QUEUED


Job [JOB_20190817_145004] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20190817_145004

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20190817_145004


In [35]:
!gcloud ai-platform jobs describe JOB_20190817_143416

createTime: '2019-08-17T14:34:18Z'
etag: xDlfxwSWxFc=
jobId: JOB_20190817_143416
state: PREPARING
trainingInput:
  args:
  - --data_path
  - gs://jk-demo-datasets/gasdata/gasdata.csv
  - --n_features_options
  - '[2,4,6]'
  - --l2_reg_options
  - '[0.1,0.2,0.3,0.5]'
  jobDir: gs://jk-demo-jobdir/JOB_20190817_143416
  packageUris:
  - gs://jk-demo-jobdir/JOB_20190817_143416/packages/612269193884345b5ec428f358455e4c447025dedee485fecc40bbd2cf41bf01/trainer-0.1.tar.gz
  pythonModule: trainer.train
  pythonVersion: '3.5'
  region: us-west1
  runtimeVersion: '1.14'
trainingOutput: {}

View job in the Cloud Console at:
https://console.cloud.google.com/mlengine/jobs/JOB_20190817_143416?project=jk-sandbox12

View logs at:
https://console.cloud.google.com/logs?resource=ml.googleapis.com%2Fjob_id%2FJOB_20190817_143416&project=jk-sandbox12


In [48]:
!gcloud ai-platform jobs stream-logs JOB_20190817_145004

INFO	2019-08-17 14:50:06 +0000	service		Validating job requirements...
INFO	2019-08-17 14:50:06 +0000	service		Job creation request has been successfully validated.
INFO	2019-08-17 14:50:06 +0000	service		Job JOB_20190817_145004 is queued.
INFO	2019-08-17 14:50:06 +0000	service		Waiting for job to be provisioned.
INFO	2019-08-17 14:50:09 +0000	service		Waiting for training program to start.
INFO	2019-08-17 14:50:48 +0000	master-replica-0		Running task with arguments: --cluster={"master": ["127.0.0.1:2222"]} --task={"type": "master", "index": 0} --job={  "package_uris": ["gs://jk-demo-jobdir/JOB_20190817_145004/packages/28e38208e29925f04b968dc8ee6701083aaf6fe82bfbaf0b19f0c43009a8d43d/trainer-0.1.tar.gz"],  "python_module": "trainer.train",  "args": ["--data_path", "gs://jk-demo-datasets/gasdata/gasdata.csv", "--n_features_options", "[2,4,6]", "--l2_reg_options", "[0.1,0.2,0.3,0.5]"],  "region": "us-west1",  "runtime_version": "1.14",  "job_dir": "gs://jk-demo-jobdir/JOB_20190817_145004"