# Training and deploying a tabular model using Vertex AutoML - Part 2.

![Training pipeline](../images/automl.png)

## Import the required packages

In [11]:
import os
import pprint
import pandas as pd
import time

import matplotlib.pyplot as plt

from google.cloud import aiplatform as vertex_ai
from google.cloud.aiplatform_v1beta1 import types
from google.cloud import bigquery
from google.cloud import exceptions

## Configure GCP settings

*Before running the notebook, follow the repo's README to install the prerequisites and configure GCP authentication.*

In [13]:
PREFIX = 'jk1'

shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT = shell_output[0]
print("Project ID: ", PROJECT)

Project ID:  jk-vertex-workshop


In [14]:
# BigQuery source settings: the dataset name is derived from PREFIX,
# the table name and location are fixed.
BQ_LOCATION = 'US'
BQ_TABLE_NAME = 'features'
BQ_DATASET_NAME = f'{PREFIX}_dataset'

# Vertex AI settings: region, plus a service account e-mail and a staging
# bucket URI derived from PROJECT and PREFIX respectively.
REGION = 'us-central1'
VERTEX_SA = f'training-sa@{PROJECT}.iam.gserviceaccount.com'
STAGING_BUCKET = f'gs://{PREFIX}-bucket'

INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/910094146258/locations/us-central1/trainingPipelines/3144466915997515776 current state:
PipelineState.PIPELINE_STATE_RUNNING


## Creating a tabular dataset in Vertex

### Initialize Vertex AI SDK

In [8]:
# Initialize the SDK once with the project, region and staging bucket
# configured in the cells above.
vertex_ai.init(project=PROJECT, location=REGION, staging_bucket=STAGING_BUCKET)

### Create a dataset and import data

In [9]:
display_name = f'{PREFIX} Chicago taxi trips'
bq_source_uri = f'bq://{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}'

# Named `dataset_filter` rather than `filter` so the Python builtin is not
# shadowed for the rest of the kernel session.
dataset_filter = f'display_name="{display_name}"'

# Look the dataset up by display name first, so that re-running this cell
# reuses a matching dataset instead of creating another one.
existing_datasets = vertex_ai.TabularDataset.list(filter=dataset_filter)
if existing_datasets:
    # If several datasets share the display name, the first one listed is used.
    print("Using existing dataset: ", existing_datasets[0].resource_name)
    dataset = vertex_ai.TabularDataset(dataset_name=existing_datasets[0].resource_name)
else:
    print("Creating a new dataset.")
    dataset = vertex_ai.TabularDataset.create(
        display_name=display_name, bq_source=bq_source_uri,
    )

    dataset.wait()

Creating a new dataset.
INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/910094146258/locations/us-central1/datasets/4162953332899446784/operations/157533902859141120
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/910094146258/locations/us-central1/datasets/4162953332899446784
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/910094146258/locations/us-central1/datasets/4162953332899446784')


## Launching an AutoML training job

In [10]:
# Display names for the training pipeline and for the resulting model.
display_name = f'{PREFIX} Chicago Taxi classifier training'
model_display_name = 'Chicago Taxi classifier'

# Label column and the column passed as the predefined split column.
target_column = 'tip_bin'
split_column = 'data_split'

# Optimization settings passed to the AutoML job.
optimization_prediction_type = 'classification'
optimization_objective = 'maximize-recall-at-precision'
optimization_objective_precision_value = 0.7

# Training budget passed to job.run(), expressed in milli node hours.
budget_milli_node_hours = 1000

# Feature columns grouped by the transformation applied to them.
categorical_features = [
    'trip_month',
    'trip_day',
    'trip_day_of_week',
    'trip_hour',
    'payment_type',
    'pickup_grid',
    'dropoff_grid',
]
numeric_features = ['trip_seconds', 'euclidean', 'trip_miles']

# One {transformation: {'column_name': ...}} entry per feature,
# categorical columns first, then numeric ones.
column_transformations = (
    [{'categorical': {'column_name': name}} for name in categorical_features]
    + [{'numeric': {'column_name': name}} for name in numeric_features]
)

job = vertex_ai.AutoMLTabularTrainingJob(
    display_name=display_name,
    optimization_prediction_type=optimization_prediction_type,
    optimization_objective=optimization_objective,
    optimization_objective_precision_value=optimization_objective_precision_value,
    column_transformations=column_transformations,
)

# The job is launched with sync=False, i.e. in the SDK's non-blocking mode.
model = job.run(
    dataset=dataset,
    target_column=target_column,
    predefined_split_column_name=split_column,
    budget_milli_node_hours=budget_milli_node_hours,
    model_display_name=model_display_name,
    sync=False,
)

INFO:google.cloud.aiplatform.training_jobs:View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/3144466915997515776?project=910094146258
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/910094146258/locations/us-central1/trainingPipelines/3144466915997515776 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/910094146258/locations/us-central1/trainingPipelines/3144466915997515776 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.training_jobs:AutoMLTabularTrainingJob projects/910094146258/locations/us-central1/trainingPipelines/3144466915997515776 current state:
PipelineState.PIPELINE_STATE_RUNNING
