# Chicago Crime Prediction Pipeline

An example notebook that demonstrates how to:
* Download data from BigQuery
* Create a Kubeflow pipeline
* Include Google Cloud AI Platform components to train and deploy the model in the pipeline
* Submit a job for execution

The model forecasts how many crimes are expected to be reported the next day, based on how many were reported over the previous `n` days.

## Imports

In [1]:
%%capture

# Install or upgrade pip, the Kubeflow Pipelines SDK (kfp) and pandas.
# Output is silenced by %%capture and pip's -q flag. Skip this cell if the
# packages are already installed in the kernel's environment.
!pip3 install --upgrade pip -q
!pip3 install kfp --upgrade -q
!pip3 install pandas --upgrade -q

In [2]:
import json

import kfp
import kfp.compiler as compiler
import kfp.components as comp
import kfp.dsl as dsl
import kfp.gcp as gcp

import pandas as pd

import time

## Pipeline

### Constants

In [22]:
# Required Parameters
# GCP project ID handed to the download, train and deploy steps.
PROJECT_ID = 'ml-pipeline-test'
# Base GCS location; the data file and trainer output paths are built from it.
output = 'gs://guideline_example_bucket' # No ending slash: sub-paths are appended with a leading '/'

# Optional Parameters
REGION = 'us-central1'  # passed to the training step as `region`
RUNTIME_VERSION = '1.13'  # passed to both the training and deploy steps
PYTHON_MODULE = 'trainer.task'  # passed to the training step as `python_module`
EXPERIMENT_NAME = 'Chicago Crime Prediction'  # experiment looked up or created at submission
PIPELINE_NAME = 'Chicago Crime Prediction'  # `name` given to @dsl.pipeline
PIPELINE_FILENAME_PREFIX = 'chicago'  # prefix of the compiled pipeline package filename
PIPELINE_DESCRIPTION = ''  # `description` given to @dsl.pipeline

In [None]:
# Take a single timestamp so the model name, model version and trainer output
# path all carry the same suffix. Calling time.time() separately for each value
# could straddle a second boundary and yield mismatched suffixes.
_RUN_TIMESTAMP = str(int(time.time()))

# Timestamped identifiers for the model and version that the deploy step creates.
MODEL_NAME = 'chicago_pipeline_model' + _RUN_TIMESTAMP
MODEL_VERSION = 'chicago_pipeline_model_v1' + _RUN_TIMESTAMP

# JSON-encoded list of trainer package locations, passed to the training step.
PACKAGE_URIS = json.dumps(['gs://chicago-crime/chicago_crime_trainer-0.0.tar.gz'])

# Per-run job directory for the training step, and the CSV written by the download step.
TRAINER_OUTPUT_GCS_PATH = output + '/train/output/' + _RUN_TIMESTAMP + '/'
DATA_GCS_PATH = output + '/reports.csv'

# JSON-encoded command-line arguments forwarded to the trainer module.
TRAINER_ARGS = json.dumps([
    '--data-file-url', DATA_GCS_PATH,
    '--job-dir', output
])

# Echo the resolved settings so they are visible in the notebook output.
print('project name: ' + PROJECT_ID)
print('output name: ' + output)
print('model name: ' + MODEL_NAME)
print('model version: ' + MODEL_VERSION)
print('trainer output: ' + TRAINER_OUTPUT_GCS_PATH)
print('trainer args: ' + TRAINER_ARGS)

### Download data

Define a download function that uses the BigQuery component to run the query and write the result to Cloud Storage.

In [12]:
# Load the BigQuery query component definition from a pinned commit of the
# kubeflow/pipelines repository.
bigquery_query_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/e598176c02f45371336ccaa819409e8ec83743df/components/gcp/bigquery/query/component.yaml')

# Number of rows per day in the public Chicago crime table, ordered by day.
QUERY = """
    SELECT count(*) as count, TIMESTAMP_TRUNC(date, DAY) as day
    FROM `bigquery-public-data.chicago_crime.crime`
    GROUP BY day
    ORDER BY day
"""

def download(project_id, data_gcs_path):
    """Build the pipeline step that runs ``QUERY`` through the BigQuery component.

    Args:
        project_id: Forwarded to the component as ``project_id``.
        data_gcs_path: Forwarded to the component as ``output_gcs_path``.

    Returns:
        The result of applying ``gcp.use_gcp_secret('user-gcp-sa')`` to the
        task created by ``bigquery_query_op``.
    """
    query_task = bigquery_query_op(
        query=QUERY,
        project_id=project_id,
        output_gcs_path=data_gcs_path,
    )
    # Attach the 'user-gcp-sa' secret to the step.
    return query_task.apply(gcp.use_gcp_secret('user-gcp-sa'))

### Train the model

Run training code that will pre-process the data and then submit a training job to the AI Platform.

In [13]:
# Load the ML Engine training component definition from a pinned commit of the
# kubeflow/pipelines repository.
mlengine_train_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/e598176c02f45371336ccaa819409e8ec83743df/components/gcp/ml_engine/train/component.yaml')

def train(project_id,
          trainer_args,
          package_uris,
          trainer_output_gcs_path,
          gcs_working_dir,
          region,
          python_module,
          runtime_version):
    """Build the pipeline step that submits the training job.

    Args:
        project_id: Forwarded to the component as ``project_id``.
        trainer_args: Forwarded to the component as ``args``.
        package_uris: Forwarded to the component as ``package_uris``.
        trainer_output_gcs_path: Forwarded to the component as ``job_dir``.
        gcs_working_dir: Accepted for the caller's benefit but not used here.
        region: Forwarded to the component as ``region``.
        python_module: Forwarded to the component as ``python_module``.
        runtime_version: Forwarded to the component as ``runtime_version``.

    Returns:
        The result of applying ``gcp.use_gcp_secret('user-gcp-sa')`` to the
        task created by ``mlengine_train_op``.
    """
    training_task = mlengine_train_op(
        project_id=project_id,
        python_module=python_module,
        package_uris=package_uris,
        region=region,
        args=trainer_args,
        job_dir=trainer_output_gcs_path,
        runtime_version=runtime_version,
    )
    # Attach the 'user-gcp-sa' secret to the step.
    return training_task.apply(gcp.use_gcp_secret('user-gcp-sa'))

### Deploy model

Deploy the trained model, using the output directory produced by the training step as the model location.

In [14]:
# Load the ML Engine deploy component definition from a pinned commit of the
# kubeflow/pipelines repository.
mlengine_deploy_op = comp.load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/e598176c02f45371336ccaa819409e8ec83743df/components/gcp/ml_engine/deploy/component.yaml')

def deploy(
    project_id,
    model_uri,
    model_id,
    model_version,
    runtime_version):
    """Build the pipeline step that deploys a trained model.

    Args:
        project_id: Forwarded to the component as ``project_id``.
        model_uri: Forwarded to the component as ``model_uri``.
        model_id: Forwarded to the component as ``model_id``.
        model_version: Forwarded to the component as ``version_id``.
        runtime_version: Forwarded to the component as ``runtime_version``.

    Returns:
        The result of applying ``gcp.use_gcp_secret('user-gcp-sa')`` to the
        task created by ``mlengine_deploy_op``.
    """
    deployment_task = mlengine_deploy_op(
        model_uri=model_uri,
        project_id=project_id,
        model_id=model_id,
        version_id=model_version,
        runtime_version=runtime_version,
        # Always replace an existing version and make the new one the default.
        replace_existing_version=True,
        set_default=True,
    )
    # Attach the 'user-gcp-sa' secret to the step.
    return deployment_task.apply(gcp.use_gcp_secret('user-gcp-sa'))

### Define pipeline

In [17]:
# Pipeline definition: download the data, train on it, then deploy the model.
# Every parameter defaults to the matching notebook constant. The model name
# and version are taken directly from MODEL_NAME / MODEL_VERSION rather than
# from pipeline parameters.
@dsl.pipeline(name=PIPELINE_NAME, description=PIPELINE_DESCRIPTION)
def pipeline(
    data_gcs_path=DATA_GCS_PATH,
    gcs_working_dir=output,
    project_id=PROJECT_ID,
    python_module=PYTHON_MODULE,
    region=REGION,
    runtime_version=RUNTIME_VERSION,
    package_uris=PACKAGE_URIS,
    trainer_output_gcs_path=TRAINER_OUTPUT_GCS_PATH,
    trainer_args=TRAINER_ARGS,
):
    # Step 1: run the BigQuery query and write the result to data_gcs_path.
    download_step = download(project_id=project_id,
                             data_gcs_path=data_gcs_path)

    # Step 2: train. There is no data dependency on the download step, so the
    # ordering is declared explicitly with .after().
    train_step = train(
        project_id=project_id,
        trainer_args=trainer_args,
        package_uris=package_uris,
        trainer_output_gcs_path=trainer_output_gcs_path,
        gcs_working_dir=gcs_working_dir,
        region=region,
        python_module=python_module,
        runtime_version=runtime_version,
    ).after(download_step)

    # Step 3: deploy from the training step's 'job_dir' output; consuming that
    # output is what orders this step after training.
    deploy_step = deploy(
        project_id=project_id,
        model_uri=train_step.outputs['job_dir'],
        model_id=MODEL_NAME,
        model_version=MODEL_VERSION,
        runtime_version=runtime_version,
    )
    return True

# Alias used by the compile and submit cells below.
pipeline_func = pipeline

### Compile pipeline

In [18]:
# Name of the compiled pipeline package: '<prefix>.pipeline.zip'.
pipeline_filename = ''.join((PIPELINE_FILENAME_PREFIX, '.pipeline.zip'))

# Compile the pipeline function into that package file.
compiler.Compiler().compile(
    pipeline_func,
    pipeline_filename,
)

### Submit the pipeline for execution

In [19]:
# Specify pipeline argument values. No overrides are supplied here.
arguments = {}

# Get or create an experiment and submit a pipeline run
client = kfp.Client()
try:
    experiment = client.get_experiment(experiment_name=EXPERIMENT_NAME)
except Exception:
    # The lookup failed (for example, no experiment with this name exists yet),
    # so create it. Catching Exception instead of using a bare `except:` lets
    # KeyboardInterrupt and SystemExit propagate, so the cell can be interrupted.
    experiment = client.create_experiment(EXPERIMENT_NAME)

# Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)

# Block until the run completes, waiting at most 20 minutes (1200 seconds);
# model deployment should be ready by then.
result = client.wait_for_run_completion(run.id, timeout=1200)

### Clean up
If run within a Kubeflow cluster deployment, we can clean up the deployed model after the sample.

In [None]:
# The authentication step below is only needed if you are using an in-cluster JupyterHub instance.
!gcloud auth activate-service-account --key-file ${GOOGLE_APPLICATION_CREDENTIALS}
# Delete the model version first, then the model itself, using the notebook's
# MODEL_VERSION and MODEL_NAME values.
!gcloud ai-platform versions delete $MODEL_VERSION --model $MODEL_NAME 
!gcloud ai-platform models delete $MODEL_NAME