# 02.1 - ML Experimentation with AutoML

The purpose of this notebook is to use [AutoML Tables](https://cloud.google.com/automl-tables) to train a classifier 
to predict whether a given trip will result in a tip > 20%. The notebook covers the following tasks:
1. Retrieve the managed Dataset URI to be used for training.
2. Prepare and submit an AutoML Tables training job.
3. Retrieve the model uploaded by AutoML Tables.
4. Retrieve the evaluation results of the AutoML Tables model.


## Setup

In [None]:
import os
import pandas as pd
from datetime import datetime
import tensorflow_data_validation as tfdv
from tensorflow_transform.tf_metadata import schema_utils
from google.cloud.aiplatform import gapic as aip

In [None]:
# Google Cloud project and region that host the resources used in this notebook.
PROJECT = 'ksalama-cloudml'  # Change to your project Id.
REGION = 'us-central1'

# Display names used to look up the managed dataset and to name the trained model.
DATASET_DISPLAYNAME = 'chicago_taxi_tips'
AUTOML_MODEL_DISPLAYNAME = DATASET_DISPLAYNAME + '_classifier_automl'

# Regional AI Platform API endpoint and the resource parent path passed to the clients.
API_ENDPOINT = REGION + "-aiplatform.googleapis.com"
PARENT = "projects/{}/locations/{}".format(PROJECT, REGION)
client_options = dict(api_endpoint=API_ENDPOINT)

# Path of the raw schema file (a .pbtxt file, despite the *_DIR name).
RAW_SCHEMA_DIR = 'model_src/raw_schema/schema.pbtxt'

## 1. Get Managed Dataset 

In [None]:
dataset_client = aip.DatasetServiceClient(client_options=client_options)

# Reset before searching so that a value left over from a previous run of this
# cell can never be mistaken for a match.
dataset_uri = None
for candidate in dataset_client.list_datasets(parent=PARENT):
    if candidate.display_name == DATASET_DISPLAYNAME:
        dataset_uri = candidate.name
        break

if dataset_uri is None:
    # Fail with an actionable message instead of a NameError further down.
    raise ValueError(
        f"No dataset with display name '{DATASET_DISPLAYNAME}' was found in {PARENT}."
    )

dataset = dataset_client.get_dataset(name=dataset_uri)
print("Dataset uri:", dataset.name)
# The dataset id is the last path segment of the resource name.
dataset_id = dataset.name.split('/')[-1]
print("Dataset id:", dataset_id)

## 2. Train a classifier using AutoML Tables

### Load raw schema

In [None]:
target_column = 'tip_bin'
data_split_column = 'data_split'
# NOTE: the misspelled name is kept as-is because it is a notebook-level variable.
exclude_cloumns = ['trip_start_timestamp']

# Columns that must not be offered to AutoML as input features: the explicitly
# excluded ones, the target (the AutoML Tables task definition states that the
# target column should have no transformation defined on it), and the split
# marker, which only says which data split a row belongs to.
non_feature_columns = {*exclude_cloumns, target_column, data_split_column}

source_raw_schema = tfdv.load_schema_text(RAW_SCHEMA_DIR)
raw_feature_spec = schema_utils.schema_as_feature_spec(source_raw_schema).feature_spec
input_columns = [key for key in raw_feature_spec if key not in non_feature_columns]
input_columns

### Prepare AutoML Tables training job

In [None]:
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value


def train_automl_table(
    automl_pipeline_client,
    parent,
    dataset_id,
    model_display_name,
    input_columns,
    target_column,
    split_column=None,
    train_budget_milli_node_hours=1000,
):
    """Create an AutoML Tables classification training pipeline.

    Args:
        automl_pipeline_client: Client exposing ``create_training_pipeline``
            (the notebook passes an ``aip.PipelineServiceClient``).
        parent: Resource parent path the pipeline is created under.
        dataset_id: Id of the managed dataset used as training input.
        model_display_name: Display name for the uploaded model; also used,
            together with a timestamp, to name the training pipeline.
        input_columns: Names of the columns to use as input features. Each one
            gets an ``auto`` transformation.
        target_column: Name of the column to predict.
        split_column: Name of the column holding the predefined data split.
            When omitted, the notebook-level ``data_split_column`` variable is
            used, which is what the function always did before this parameter
            existed.
        train_budget_milli_node_hours: Training budget in milli node hours
            (1000 == 1 node hour). The AutoML Tables task definition documents
            a valid range of 1,000 to 72,000; the previously hard-coded value
            of 1 is outside that range.

    Returns:
        The response returned by ``create_training_pipeline``.
    """
    if split_column is None:
        # Backward compatibility with callers that rely on the notebook global.
        split_column = data_split_column

    # The target must not carry a transformation, and the split marker is not a
    # predictive feature, so both are skipped even if the caller passed them.
    transformations = [
        {"auto": {"column_name": column}}
        for column in input_columns
        if column not in (target_column, split_column)
    ]

    training_task_inputs_dict = {
        "targetColumn": target_column,
        "predictionType": "classification",
        "transformations": transformations,
        "trainBudgetMilliNodeHours": train_budget_milli_node_hours,
        "disableEarlyStopping": False,
        "optimizationObjective": "minimize-log-loss",
    }
    training_task_inputs = json_format.ParseDict(training_task_inputs_dict, Value())

    # The timestamp suffix keeps pipeline display names distinct across submissions.
    submitted_at = datetime.now().strftime('%Y%m%d%H%M%S')
    training_pipeline = {
        "display_name": f"train_{model_display_name}_{submitted_at}",
        "training_task_definition": "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_tabular_1.0.0.yaml",
        "training_task_inputs": training_task_inputs,
        "input_data_config": {
            "dataset_id": dataset_id,
            # Rows are assigned to training/validation/test by the value of this column.
            "predefined_split": {"key": split_column},
        },
        "model_to_upload": {"display_name": model_display_name},
    }

    response = automl_pipeline_client.create_training_pipeline(
        parent=parent, training_pipeline=training_pipeline
    )

    print("response:", response)
    # Returned so that the caller can keep a handle on the created pipeline.
    return response

### Submit AutoML Tables training job.

In [None]:
# Pipeline service client bound to the regional endpoint from the configuration cell.
automl_pipeline_client = aip.PipelineServiceClient(client_options=client_options)

In [None]:
# Gather the arguments first so they can be inspected before the job is submitted.
training_job_args = dict(
    automl_pipeline_client=automl_pipeline_client,
    parent=PARENT,
    dataset_id=dataset_id,
    model_display_name=AUTOML_MODEL_DISPLAYNAME,
    input_columns=input_columns,
    target_column=target_column,
)
train_automl_table(**training_job_args)

In [None]:
# NOTE: the training pipeline is submitted by the train_automl_table(...) call
# above. The bare create_training_pipeline() call that used to live in this cell
# passed neither a parent nor a pipeline definition, so it could only send an
# empty request; it was removed so the notebook runs top to bottom.

### List training jobs

In [None]:
# List the training pipelines under PARENT; the result is shown as the cell output.
output = automl_pipeline_client.list_training_pipelines(parent=PARENT)
output

## 3. Retrieve the Uploaded Model

In [None]:
# Model service client, configured with the same client options as the other clients.
model_client = aip.ModelServiceClient(client_options=client_options)

In [None]:
model_list = model_client.list_models(parent=PARENT)

# Resource name of the first listed model whose display name matches, or None
# when nothing matches. Assigning unconditionally also prevents a stale
# model_uri from a previous run being reused silently.
model_uri = next(
    (
        entry.name
        for entry in model_list
        if entry.display_name == AUTOML_MODEL_DISPLAYNAME
    ),
    None,
)

if model_uri is None:
    # Fail with an actionable message instead of a NameError at the print below.
    raise ValueError(
        f"No model with display name '{AUTOML_MODEL_DISPLAYNAME}' was found in {PARENT}."
    )

print(model_uri)

## 4. Get Evaluation Metrics

In [None]:
# Materialise the paged response once. Later cells call list(evaluation_results)
# more than once, and a plain list can be re-read any number of times without
# depending on how the pager behaves on repeated iteration.
evaluation_results = list(model_client.list_model_evaluations(parent=model_uri))
evaluation_results

In [None]:
# Read the evaluations once and fail with a clear message when there are none,
# rather than with a bare IndexError.
model_evaluations = list(evaluation_results)
if not model_evaluations:
    raise ValueError(f"No evaluations were found for model {model_uri}.")

# Only the first listed evaluation is used by the rest of the notebook.
metrics = model_evaluations[0].metrics

In [None]:
# Names of the metrics available in the evaluation.
[metric_name for metric_name in metrics.keys()]

In [None]:
# (label, metric key) pairs, printed in this order.
summary_metrics = (
    ("Log loss:", 'logLoss'),
    ("AUC - PRC:", 'auPrc'),
    ("AUC - ROC:", 'auRoc'),
)
for label, metric_key in summary_metrics:
    print(label, metrics[metric_key])

In [None]:
entries = metrics['confusionMatrix']['rows']

# NOTE(review): the TN/FP/FN/TP labels assume the first row/column of the
# matrix is the negative class -- confirm against the evaluation's class order.
cell_labels = (
    ("TN:", 0, 0),
    ("FP:", 0, 1),
    ("FN:", 1, 0),
    ("TP:", 1, 1),
)
for label, row, col in cell_labels:
    print(label, entries[row][col])

# Accuracy = diagonal cells over the sum of the first two rows.
total = sum(entries[0]) + sum(entries[1])
correct = entries[0][0] + entries[1][1]
accuracy = correct / total

print(f"Accuracy: {round(accuracy * 100, 2)}%")

In [None]:
# Reuse the metrics of the first evaluation extracted earlier instead of
# listing the evaluation results a second time.
confidence_metrics = metrics['confidenceMetrics']
# Keys available on the last confidence-metrics entry.
list(confidence_metrics[-1].keys())

In [None]:
# Collect one (threshold, F1 score) pair per confidence-metrics entry.
thresholds = []
scores = []

for raw_metric in confidence_metrics:
    metric = dict(raw_metric)
    scores.append(metric['f1Score'])
    # Entries without an explicit 'confidenceThreshold' are recorded as threshold 0.
    thresholds.append(metric.get('confidenceThreshold', 0))

In [None]:
# Line plot of the collected scores against their confidence thresholds.
f1_by_threshold = pd.DataFrame({'threshold': thresholds, 'score': scores})
f1_by_threshold.plot(kind='line', x='threshold', y='score')