In [34]:
import kfp
import os

In [35]:
# Show the installed KFP SDK version; the compile cell further down pins its
# component URL prefix to the 0.1.36 release, so the two should agree.
print(kfp.__version__)

0.1.36


### Upload the dataset

In [3]:
%%sh

# Target project, BigQuery dataset/table, and the CSV file to load.
PROJECT_ID=mlops-demo
DATASET_LOCATION=US
DATASET_ID=lab_11
TABLE_ID=covertype
DATA_SOURCE=gs://workshop-datasets/covertype/full/covertype.csv
# Explicit table schema as a single comma-separated name:type list
# (the backslash-newlines join these lines into one value).
SCHEMA=Elevation:INTEGER,\
Aspect:INTEGER,\
Slope:INTEGER,\
Horizontal_Distance_To_Hydrology:INTEGER,\
Vertical_Distance_To_Hydrology:INTEGER,\
Horizontal_Distance_To_Roadways:INTEGER,\
Hillshade_9am:INTEGER,\
Hillshade_Noon:INTEGER,\
Hillshade_3pm:INTEGER,\
Horizontal_Distance_To_Fire_Points:INTEGER,\
Wilderness_Area:STRING,\
Soil_Type:INTEGER,\
Cover_Type:INTEGER

# Create the dataset in the chosen location.
bq --location=$DATASET_LOCATION --project_id=$PROJECT_ID mk --dataset $DATASET_ID

# Load the CSV into the table: skip the header row and overwrite any
# existing table contents (--replace).
bq --project_id=$PROJECT_ID --dataset_id=$DATASET_ID load \
--source_format=CSV \
--skip_leading_rows=1 \
--replace \
$TABLE_ID \
$DATA_SOURCE \
$SCHEMA

Dataset 'mlops-demo:lab_11' successfully created.



Waiting on bqjob_r6448a45daee24138_0000016f8c2a07b1_1 ... (10s) Current status: DONE   

### Create the staging GCS bucket

In [None]:
%%sh

# The bucket name is derived from the project ID (gs://mlops-demo-lab-11);
# the run-submission cell passes the same bucket as gcs_root.
PROJECT_ID=mlops-demo
BUCKET_NAME=gs://${PROJECT_ID}-lab-11
# Create the bucket under the project.
gsutil mb -p $PROJECT_ID $BUCKET_NAME

### Build a trainer image

In [11]:
# Local directory used as the trainer image build context; train.py and the
# Dockerfile are written into it by the following cells.
TRAINING_APP_FOLDER = 'training_app'
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(TRAINING_APP_FOLDER, exist_ok=True)

In [12]:
%%writefile {TRAINING_APP_FOLDER}/train.py

import os
import subprocess
import sys

import fire
import pickle
import numpy as np
import pandas as pd

import hypertune

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


def train_evaluate(job_dir, training_dataset_path, validation_dataset_path, alpha, max_iter, hptune):
  """Fit the Covertype classifier, then report accuracy (tuning) or export the model.

  A scikit-learn pipeline (standard-scaled numeric columns, one-hot encoded
  categorical columns, SGD classifier with log loss) is fitted on the
  training CSV. What happens next depends on `hptune`:

  * truthy -- the pipeline is scored on the validation CSV and the accuracy
    is reported through `hypertune`; no model is saved.
  * falsy  -- the validation rows are appended to the training rows before
    fitting, and the fitted pipeline is pickled to `model.pkl` in the
    working directory and copied to `job_dir` with `gsutil cp`.

  Args:
    job_dir: Destination directory for `model.pkl`. Trailing '/' characters
      are ignored. Only used when `hptune` is falsy.
    training_dataset_path: Location of the training CSV (read with
      `pandas.read_csv`); must contain a `Cover_Type` label column.
    validation_dataset_path: Location of the validation CSV, same layout.
    alpha: Value assigned to the classifier's `alpha` parameter.
    max_iter: Value assigned to the classifier's `max_iter` parameter.
    hptune: Whether this invocation is a hyperparameter-tuning trial.

  Raises:
    subprocess.CalledProcessError: If `gsutil cp` exits with a non-zero status.
  """

  df_train = pd.read_csv(training_dataset_path)
  df_validation = pd.read_csv(validation_dataset_path)
  if not hptune:
    # Final training run: nothing is scored afterwards, so fit on every row.
    df_train = pd.concat([df_train, df_validation])

  numeric_features = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points']

  categorical_features = ['Wilderness_Area', 'Soil_Type']

  preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

  pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log'))
  ])

  print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
  X_train = df_train.drop('Cover_Type', axis=1)
  y_train = df_train['Cover_Type']

  pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
  pipeline.fit(X_train, y_train)

  if hptune:
    # Tuning trial: score on the held-out validation set and report the metric.
    X_validation = df_validation.drop('Cover_Type', axis=1)
    y_validation = df_validation['Cover_Type']
    accuracy = pipeline.score(X_validation, y_validation)
    print('Model accuracy: {}'.format(accuracy))
    # Log it with hypertune
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
      hyperparameter_metric_tag='accuracy',
      metric_value=accuracy
    )
  else:
    # Final run: save the model locally, then copy it to job_dir.
    model_filename = 'model.pkl'
    with open(model_filename, 'wb') as model_file:
      pickle.dump(pipeline, model_file)
    # Strip trailing slashes so the join cannot produce '<dir>//model.pkl',
    # which object storage treats as a different object name.
    gcs_model_path = "{}/{}".format(str(job_dir).rstrip('/'), model_filename)
    # subprocess.STDOUT merges gsutil's stderr into stdout without requiring
    # sys.stdout to be backed by a real file descriptor.
    subprocess.check_call(['gsutil', 'cp', model_filename, gcs_model_path], stderr=subprocess.STDOUT)
    print("Saved model in: {}".format(gcs_model_path))
    
# When run as a script, hand train_evaluate to Python Fire so that its
# parameters are supplied from the command line.
if __name__ == "__main__":
  fire.Fire(train_evaluate)

Overwriting training_app/train.py


In [13]:
%%writefile {TRAINING_APP_FOLDER}/Dockerfile

# Build on the deeplearning-platform-release CPU base image.
# NOTE(review): presumably this image already ships pandas and scikit-learn,
# which train.py imports but this file does not install; confirm.
FROM gcr.io/deeplearning-platform-release/base-cpu
# Extra packages imported by train.py: fire and cloudml-hypertune (the hypertune module).
RUN pip install -U fire cloudml-hypertune
WORKDIR /app
COPY train.py .

# Exec-form entrypoint: arguments given to the container are appended to this command.
ENTRYPOINT ["python", "train.py"]

Overwriting training_app/Dockerfile


In [16]:
# Image URI in the form gcr.io/<project>/<name>:<tag>.
PROJECT_ID='mlops-demo'
IMAGE_NAME='trainer_image'
IMAGE_TAG='latest'
IMAGE_URI='gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, IMAGE_TAG)

# Submit the training_app folder as the build source and tag the result with IMAGE_URI.
# TRAINING_APP_FOLDER is defined in the earlier cell that creates the folder.
!gcloud builds submit --tag $IMAGE_URI $TRAINING_APP_FOLDER

Creating temporary tarball archive of 2 file(s) totalling 2.4 KiB before compression.
Uploading tarball of [training_app] to [gs://mlops-demo_cloudbuild/source/1578605826.62-1a80ecd6629a41ffb767abb9901eeb69.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/mlops-demo/builds/5be349a9-5e33-4700-974d-1a3c2e54845c].
Logs are available at [https://console.cloud.google.com/gcr/builds/5be349a9-5e33-4700-974d-1a3c2e54845c?project=609934025272].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "5be349a9-5e33-4700-974d-1a3c2e54845c"

FETCHSOURCE
Fetching storage object: gs://mlops-demo_cloudbuild/source/1578605826.62-1a80ecd6629a41ffb767abb9901eeb69.tgz#1578605827100723
Copying gs://mlops-demo_cloudbuild/source/1578605826.62-1a80ecd6629a41ffb767abb9901eeb69.tgz#1578605827100723...
/ [1 files][  1.2 KiB/  1.2 KiB]                                                
Operation completed over 1 objects/1.2 KiB.                                     

### Compile the pipeline

In [23]:
%%sh

# Settings exported to the environment of the compile step.
# NOTE(review): presumably covertype_training_pipeline.py reads these variables
# at compile time; that file is not shown here, so confirm.
# The component prefix is pinned to the 0.1.36 release, and TRAINER_IMAGE matches
# the image URI built in the previous section.
export PROJECT_ID=mlops-demo
export COMPONENT_URL_SEARCH_PREFIX=https://raw.githubusercontent.com/kubeflow/pipelines/0.1.36/components/gcp/
export BASE_IMAGE=gcr.io/deeplearning-platform-release/base-cpu
export TRAINER_IMAGE=gcr.io/$PROJECT_ID/trainer_image:latest
export RUNTIME_VERSION=1.15
export PYTHON_VERSION=3.7

# Compile the pipeline definition into the YAML package uploaded in the next section.
dsl-compile --py covertype_training_pipeline.py --output covertype_training_pipeline.yaml

### Upload the pipeline to the KFP environment

In [25]:
# Display name for the pipeline and the compiled package produced by dsl-compile.
PIPELINE_NAME='covertype_classifier_training'
PIPELINE_PACKAGE='covertype_training_pipeline.yaml'
# Host passed to the kfp CLI as --endpoint; specific to one deployment, so replace with your own.
INVERSE_PROXY_HOST='1ff32660e53f5d89-dot-us-central1.notebooks.googleusercontent.com'

# Upload the package; the output includes the new pipeline's ID and its parameters.
!kfp --endpoint $INVERSE_PROXY_HOST pipeline upload -p $PIPELINE_NAME $PIPELINE_PACKAGE 

Pipeline f8b7f150-e92f-4e3c-a46e-133a5832267a has been submitted

Pipeline Details
------------------
ID           f8b7f150-e92f-4e3c-a46e-133a5832267a
Name         covertype_classifier_training
Description
Uploaded at  2020-01-10T18:29:55+00:00
+-----------------------------+--------------------------------------------------+
| Parameter Name              | Default Value                                    |
| project_id                  |                                                  |
+-----------------------------+--------------------------------------------------+
| region                      |                                                  |
+-----------------------------+--------------------------------------------------+
| source_table_name           |                                                  |
+-----------------------------+--------------------------------------------------+
| gcs_root                    |                                                  |
+------

### List pipelines

In [26]:
# Same endpoint host as in the upload cell; redefined so this cell can run on its own.
INVERSE_PROXY_HOST='1ff32660e53f5d89-dot-us-central1.notebooks.googleusercontent.com'

# List the pipelines on the endpoint; the ID column is what `run submit -p` expects.
!kfp --endpoint $INVERSE_PROXY_HOST pipeline list

+--------------------------------------+----------------------------------------------------------+---------------------------+
| Pipeline ID                          | Name                                                     | Uploaded at               |
| f8b7f150-e92f-4e3c-a46e-133a5832267a | covertype_classifier_training                            | 2020-01-10T18:29:55+00:00 |
+--------------------------------------+----------------------------------------------------------+---------------------------+
| 48b0c023-b881-4ee1-9119-4b479998f142 | popularity_predictor_training                            | 2020-01-10T03:44:39+00:00 |
+--------------------------------------+----------------------------------------------------------+---------------------------+
| be12500c-2f01-4efe-bf2d-0f24479f217c | [Sample] Basic - Exit Handler                            | 2020-01-09T18:56:04+00:00 |
+--------------------------------------+----------------------------------------------------------+-----

In [22]:
# NOTE(review): this ID is not among the pipeline IDs visible in the (truncated)
# listing output above, and this cell's execution count (22) predates the upload
# cell (25). Replace it with the ID of the pipeline you intend to run.
PIPELINE_ID='bef76c5a-2bb4-4190-ab52-20834c78a846'

PROJECT_ID='mlops-demo'
INVERSE_PROXY_HOST='1ff32660e53f5d89-dot-us-central1.notebooks.googleusercontent.com'
# Bucket created in the "Create the staging GCS bucket" section; passed as gcs_root.
STAGING_GCS_BUCKET='gs://mlops-demo-lab-11'
RUN_NAME="Training_Run_001"
EXPERIMENT_NAME='Covertype_Classifier_Training'

# Submit a run of the pipeline; the trailing name=value pairs are pipeline parameters.
# source_table_name points at the table loaded in the "Upload the dataset" section.
!kfp --endpoint $INVERSE_PROXY_HOST run submit \
-e $EXPERIMENT_NAME \
-r $RUN_NAME \
-p $PIPELINE_ID \
project_id=$PROJECT_ID \
gcs_root=$STAGING_GCS_BUCKET \
region=us-central1 \
source_table_name=lab_11.covertype \
dataset_id=splits \
evaluation_metric_name=accuracy \
evaluation_metric_threshold=0.69 \
model_id=covertype_classifier \
version_id=v0.1 \
replace_existing_version=True

Run 2aac3a95-9515-475e-8cbd-b7b56c088208 is submitted
+--------------------------------------+------------------+----------+---------------------------+
| run id                               | name             | status   | created at                |
| 2aac3a95-9515-475e-8cbd-b7b56c088208 | Training_Run_001 |          | 2020-01-10T17:57:41+00:00 |
+--------------------------------------+------------------+----------+---------------------------+
