# 02.1 - ML Experimentation with AutoML

The purpose of this notebook is to use [AutoML Tables](https://cloud.google.com/automl-tables) to train a classifier 
to predict whether a given trip will result in a tip > 20%. The notebook covers the following tasks:

1. Prepare and submit an AutoML Tables training job.
2. Retrieve the model uploaded by AutoML Tables.
3. Retrieve the evaluation results of the AutoML Tables model.


## Setup

In [None]:
%load_ext autoreload
%autoreload 2?

In [None]:
import os
import time
import pandas as pd
from datetime import datetime
import tensorflow_data_validation as tfdv
from tensorflow_transform.tf_metadata import schema_utils

In [None]:
# Google Cloud project and region; both are passed to AIPUtils in the next cell.
PROJECT = 'ksalama-cloudml'  # Change to your project Id.
REGION = 'us-central1'

# Display name of the dataset to train on; the model display name is derived from it.
DATASET_DISPLAY_NAME = 'chicago_taxi_tips'
AUTOML_MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}_classifier_automl'

# NOTE: despite the "_DIR" suffix this is the path of the schema *file*; it is
# handed to tfdv.load_schema_text when the raw schema is loaded.
RAW_SCHEMA_DIR = 'model_src/raw_schema/schema.pbtxt'

In [None]:
# Project-local helper; every training, model and evaluation call in this
# notebook goes through this single instance.
from utils.ucaip_utils import AIPUtils
aip_utils = AIPUtils(PROJECT, REGION)

## 1. Train a classifier using AutoML Tables

### Load raw schema

In [None]:
# Columns with a special role in training; they are not model inputs.
target_column = 'tip_bin'
data_split_column = 'data_split'

# Columns that must not receive a feature transformation: the label (the target
# column must have no transformation defined on it), the split marker, and the
# raw timestamp that was already excluded.
excluded_columns = {target_column, data_split_column, 'trip_start_timestamp'}

# Derive the input columns from the raw schema, minus the excluded ones.
source_raw_schema = tfdv.load_schema_text(RAW_SCHEMA_DIR)
raw_feature_spec = schema_utils.schema_as_feature_spec(source_raw_schema).feature_spec
input_columns = [key for key in raw_feature_spec if key not in excluded_columns]
input_columns

### Create training task inputs spec

In [None]:
# One "auto" transformation per input column: the training pipeline infers the
# concrete transformation for each column itself.
transformations = [
    {"auto": {"column_name": column}}
    for column in input_columns
]

training_task_inputs_spec = {
    "targetColumn": target_column,
    "predictionType": "classification",
    "transformations": transformations,
    # Expressed in milli node hours: 1000 == 1 node hour, which is the smallest
    # budget the service accepts (valid range is 1,000 to 72,000).
    "trainBudgetMilliNodeHours": 1000,
    "disableEarlyStopping": False,
    "optimizationObjective": "minimize-log-loss",
}

# Split train/validation/test according to the values of the split column.
predefined_split = {
    "key": data_split_column
}

### Submit AutoML Tables training job

In [None]:
# Submit the training job. The model must be registered under exactly
# AUTOML_MODEL_DISPLAY_NAME, because the retrieval and evaluation cells later in
# this notebook look the model up by that display name.
training_job = aip_utils.train_automl_table(
    dataset_display_name=DATASET_DISPLAY_NAME,
    model_display_name=AUTOML_MODEL_DISPLAY_NAME,
    training_task_inputs_spec=training_task_inputs_spec,
    predefined_split=predefined_split
)

### Monitor job state

In [None]:
# Poll the training job once a minute until it reaches a terminal state.
while True:
    response = aip_utils.get_automl_training_job_by_uri(training_job.name)
    state = response.state.name
    if state == 'PIPELINE_STATE_SUCCEEDED':
        print("Training job completed. - Training Time:", response.update_time - response.create_time)
        break
    elif state == 'PIPELINE_STATE_FAILED':
        print("Training job failed!")
        break
    elif state == 'PIPELINE_STATE_CANCELLED':
        # Also terminal: without this branch a cancelled job would be polled forever.
        print("Training job was cancelled.")
        break
    else:
        print(f"Training job state is: {state}.")
    time.sleep(60)

In [None]:
# Show the state reported by the most recent poll of the training job.
response.state.name

## 2. Retrieve the Uploaded Model

In [None]:
# Look the model up by the display name defined in the setup section, then display it.
model = aip_utils.get_model_by_display_name(AUTOML_MODEL_DISPLAY_NAME)
model

## 3. Get Evaluation Metrics

In [None]:
# Fetch the evaluation results for the same model display name, then display them.
evaluation_results = aip_utils.get_evaluation_results_by_model_display_name(
    AUTOML_MODEL_DISPLAY_NAME)
evaluation_results

In [None]:
# Keep only the metrics of the first evaluation result.
metrics = list(evaluation_results)[0].metrics

In [None]:
# List the names of the available metrics.
list(metrics.keys())

In [None]:
# Report the headline classification metrics, one per line.
headline_metrics = (
    ("Log loss:", 'logLoss'),
    ("AUC - PRC:", 'auPrc'),
    ("AUC - ROC:", 'auRoc'),
)
for label, metric_key in headline_metrics:
    print(label, metrics[metric_key])

In [None]:
# Confusion-matrix cells. The labels assume that row/column 0 is the negative
# class and row/column 1 the positive class.
entries = metrics['confusionMatrix']['rows']

for label, row, col in (("TN:", 0, 0), ("FP:", 0, 1), ("FN:", 1, 0), ("TP:", 1, 1)):
    print(label, entries[row][col])

# Accuracy = diagonal cells over the sum of the first two rows.
total = sum(entries[0]) + sum(entries[1])
correct = entries[0][0] + entries[1][1]
accuracy = correct / total

print(f"Accuracy: {round(accuracy * 100, 2)}%")

In [None]:
# Reuse the `metrics` mapping extracted earlier instead of materialising
# `evaluation_results` a second time: a second pass is redundant and would fail
# if the results object can only be iterated once.
confidence_metrics = metrics['confidenceMetrics']

# Field names available on a single confidence-metrics entry (the last one).
list(confidence_metrics[-1].keys())

In [None]:
# Collect one (threshold, F1 score) pair per confidence-metrics entry.
thresholds = []
scores = []

for point in map(dict, confidence_metrics):
    # Read the score first so that an entry without it fails before anything is appended.
    f1_score = point['f1Score']
    # Entries that carry no 'confidenceThreshold' key are recorded with threshold 0.
    thresholds.append(point.get('confidenceThreshold', 0))
    scores.append(f1_score)

In [None]:
# Plot the F1 score against the confidence threshold as a line chart.
f1_by_threshold = pd.DataFrame({'threshold': thresholds, 'score': scores})
f1_by_threshold.plot(kind='line', x='threshold', y='score')