# AI Platform ML Metadata -- Sample Notebook

# Prerequisites

Before running this notebook, complete the "Getting Started" section of the [User Guide](https://docs.google.com/document/d/1WNEFvjZbki0GTG2oB5JS7D2JVaQjH8PUlVSTt9xPSjQ/edit?usp=sharing) so that you are able to use the service.

# Installation

You will need to be authenticated with gcloud as a user with valid permissions in a properly prepared project.

In [None]:
# Run gcloud's interactive login to set up Application Default Credentials
# for this environment.
!gcloud auth application-default login

Download the required packages and install them. You may need to restart the kernel after this step.

In [None]:
!gsutil cp gs://ai-platform-metadata/python/google-cloud-aiplatform-metadata-0.0.1.tar.gz .
!pip3 install google-cloud-aiplatform-metadata-0.0.1.tar.gz
!pip3 install tensorflow
!pip3 install networkx
!sudo apt-get install graphviz libgraphviz-dev pkg-config
!pip3 install pygraphviz

Reference: this notebook follows the TensorFlow Keras regression tutorial, https://www.tensorflow.org/tutorials/keras/regression

## Steps
1. Read data
2. Clean data
3. Split data into train and test sets
4. Normalize data
5. Train
6. Evaluate

In [None]:
import pathlib
import numpy as np
import pandas as pd
from tensorflow.python.keras import layers, Sequential
from tensorflow.python.keras.utils import data_utils
from google.cloud import aiplatform

# Replace the placeholder project name with your own project before running.
aiplatform.connect(project='YOUR-PROJECT-NAME-HERE', location='us-central1')
# Presumably selects the experiment that the executions below are recorded
# under -- confirm against the SDK documentation.
aiplatform.set_experiment('automobile-fuel-economy-prediction')

In [None]:
@aiplatform.execution(name="Data Reader")
def read_data(uri):
    """Fetch the Auto MPG file at ``uri`` and parse it into a DataFrame.

    The URI is logged as the 'data uri' parameter, and the parsed frame is
    logged as the "Raw Auto MPG Data" dataset before it is returned.
    """
    aiplatform.log_parameter('data uri', uri)

    # Column names are supplied explicitly through ``names=``.
    columns = [
        'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
        'Acceleration', 'Model Year', 'Origin',
    ]
    local_path = data_utils.get_file("auto-mpg.data", uri)
    frame = pd.read_csv(
        local_path,
        names=columns,
        na_values="?",       # "?" entries are read as NaN
        comment='\t',        # the rest of a line after a tab is ignored
        sep=" ",
        skipinitialspace=True,
    )

    aiplatform.log_dataset(frame, "Raw Auto MPG Data")
    return frame


dataset = read_data("http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")

In [None]:
# Show the experiment as recorded after the "Data Reader" step (presumably a
# graph of the logged executions and datasets -- confirm against the SDK docs).
aiplatform.graph_experiment()

In [None]:
@aiplatform.execution(name="Data Cleaner")
def clean_data(dataset):
    """Drop incomplete rows and one-hot encode the ``Origin`` column.

    Args:
        dataset: DataFrame with an ``Origin`` column coded as 1, 2 or 3.

    Returns:
        A new DataFrame without NaN rows, in which ``Origin`` is replaced by
        float indicator columns named after the mapped values ('USA',
        'Europe', 'Japan'). The frame is also logged as
        "Cleaned Auto MPG Data", along with a 'num_records' metric holding
        its row count.
    """
    # Work on an explicit copy so the column assignment below never writes to
    # (or warns about writing to) data shared with the caller's frame.
    dataset = dataset.dropna().copy()
    dataset['Origin'] = dataset['Origin'].map(
        {1: 'USA', 2: 'Europe', 3: 'Japan'})
    # dtype=float: pandas >= 2.0 produces bool indicator columns by default,
    # and DataFrame.describe() leaves bool columns out of its numeric summary,
    # so a mean/std normalisation based on describe() would turn them into NaN.
    dataset = pd.get_dummies(dataset, prefix='', prefix_sep='', dtype=float)
    aiplatform.log_dataset(dataset, "Cleaned Auto MPG Data")
    aiplatform.log_metric('num_records', dataset.shape[0])
    return dataset


clean_dataset = clean_data(dataset)

In [None]:
# Show the experiment again now that the "Data Cleaner" step has been logged
# (presumably renders the updated graph -- confirm against the SDK docs).
aiplatform.graph_experiment()

In [None]:
@aiplatform.execution(name="Data Splitter")
def train_test_split(dataset, split_frac=0.8, random_state=0):
    """Randomly split ``dataset`` into train/test features and 'MPG' labels.

    A ``split_frac`` fraction of the rows is sampled (seeded with
    ``random_state``) for training; the rows that were not sampled form the
    test set. Both parameters and all four resulting pieces are logged.

    Returns:
        ``(train_features, test_features, train_labels, test_labels)``.
    """
    aiplatform.log_parameters(split_fraction=split_frac,
                              random_state=random_state)

    train_part = dataset.sample(frac=split_frac, random_state=random_state)
    # Every row whose index was not sampled for training goes to the test set.
    test_part = dataset.drop(train_part.index)

    # pop() removes the target column from the features and returns it.
    train_target = train_part.pop('MPG')
    test_target = test_part.pop('MPG')

    for artifact, label in (
        (train_part, "Train Data"),
        (test_part, "Test Data"),
        (train_target, "Train Labels"),
        (test_target, "Test Labels"),
    ):
        aiplatform.log_dataset(artifact, label)

    return train_part, test_part, train_target, test_target


train_dataset, test_dataset, train_labels, test_labels = train_test_split(clean_dataset)

In [None]:
# Show the experiment again now that the "Data Splitter" step has been logged
# (presumably renders the updated graph -- confirm against the SDK docs).
aiplatform.graph_experiment()

In [None]:
@aiplatform.execution(name="Data Normalizer")
def normalize_dataset(train_dataset, test_dataset):
    """Standardise both frames using the training set's mean and std.

    The statistics come from ``train_dataset.describe()`` and are applied to
    the training and the test frame alike. Both results are logged and then
    returned as ``(normalized_train, normalized_test)``.
    """
    # Transposed so that each statistic ('mean', 'std', ...) becomes a column
    # indexed by feature name.
    summary = train_dataset.describe().transpose()
    center = summary['mean']
    scale = summary['std']

    scaled_train = (train_dataset - center) / scale
    scaled_test = (test_dataset - center) / scale

    aiplatform.log_dataset(scaled_train, "Normalized Train Data")
    aiplatform.log_dataset(scaled_test, "Normalized Test Data")

    return scaled_train, scaled_test


normed_train_data, normed_test_data = normalize_dataset(
    train_dataset, test_dataset)

In [None]:
# Show the experiment again now that the "Data Normalizer" step has been
# logged (presumably renders the updated graph -- confirm against the SDK docs).
aiplatform.graph_experiment()

In [None]:
@aiplatform.execution(name="Trainer")
def train(train_data, train_labels, num_units=64, activation='relu',
          dropout_rate=0.0, validation_split=0.2, epochs=1000):
    """Build, compile and fit a small dense regression network.

    Args:
        train_data: 2-D feature table (rows are samples, columns are
            features); its second dimension sizes the input layer.
        train_labels: Target values aligned with ``train_data``.
        num_units: Width of each of the two hidden Dense layers.
        activation: Activation used by the hidden layers.
        dropout_rate: Rate of the Dropout layer after the first hidden layer.
        validation_split: Fraction of the training data passed to ``fit`` as
            its validation split.
        epochs: Number of training epochs.

    Returns:
        ``(model, history)``: the fitted model and the object returned by
        ``model.fit``. All hyper-parameters, the model, and the final-epoch
        value of every entry in ``history.history`` are logged as well.
    """
    aiplatform.log_parameters(num_units=num_units, activation=activation,
                              dropout_rate=dropout_rate,
                              validation_split=validation_split, epochs=epochs)

    model = Sequential([
        # Size the input from the data actually passed in rather than from a
        # notebook-level global, so the function works for any feature table.
        layers.Dense(num_units, activation=activation,
                     input_shape=[train_data.shape[1]]),
        layers.Dropout(rate=dropout_rate),
        layers.Dense(num_units, activation=activation),
        layers.Dense(1),
    ])

    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=['mae', 'mse'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would only add a stray "None" line.
    model.summary()

    history = model.fit(
        train_data, train_labels,
        epochs=epochs, validation_split=validation_split)

    aiplatform.log_model(model, 'MPG DNN Model')
    # history.history maps each metric name to its per-epoch values; only the
    # last epoch's value is recorded.
    aiplatform.log_metrics(**{metric: values[-1]
                              for metric, values in history.history.items()})

    return model, history


model, history = train(normed_train_data, train_labels, num_units=16,
                       activation='relu', epochs=3, dropout_rate=0.1)

In [None]:
# Show the experiment again now that the "Trainer" step has been logged
# (presumably renders the updated graph -- confirm against the SDK docs).
aiplatform.graph_experiment()

In [None]:
@aiplatform.execution(name="Evaluator")
def evaluate(model, test_data, test_labels):
    """Evaluate ``model`` on the test set and log the resulting metrics.

    ``model.evaluate`` must yield exactly three values, which are unpacked as
    loss, MAE and MSE, logged as ``test_loss`` / ``test_mae`` / ``test_mse``
    and returned as a tuple in that order.
    """
    loss_value, mae_value, mse_value = model.evaluate(
        test_data, test_labels, verbose=2)
    aiplatform.log_metrics(test_loss=loss_value,
                           test_mae=mae_value,
                           test_mse=mse_value)
    return loss_value, mae_value, mse_value


test_loss, test_mae, test_mse = evaluate(model, normed_test_data, test_labels)

In [None]:
# Show the experiment again now that the "Evaluator" step has been logged
# (presumably renders the complete graph -- confirm against the SDK docs).
aiplatform.graph_experiment()

In [None]:
# Re-run training and evaluation under three more experiment names, varying
# the hyper-parameters each time. Arguments that are not passed fall back to
# the defaults declared by train().

# 32 hidden units, 6 epochs, dropout 0.3.
with aiplatform.experiment('second-experiment'):
    model, history = train(
        normed_train_data,
        train_labels,
        num_units=32,
        epochs=6,
        dropout_rate=0.3,
    )
    evaluate(model, normed_test_data, test_labels)

# Default layer width and dropout, 12 epochs.
with aiplatform.experiment('third-experiment'):
    model, history = train(
        normed_train_data,
        train_labels,
        epochs=12,
    )
    evaluate(model, normed_test_data, test_labels)

# 8 hidden units, default dropout, 12 epochs.
with aiplatform.experiment('fourth-experiment'):
    model, history = train(
        normed_train_data,
        train_labels,
        num_units=8,
        epochs=12,
    )
    evaluate(model, normed_test_data, test_labels)

In [None]:
# Fetch the logged runs as a DataFrame. The cells below rely on columns such
# as 'experiment_name', 'execution_name', 'param.*' and 'metric.*'.
experiments_df = aiplatform.get_experiments_dataframe()

In [None]:
# Bar chart of validation loss per experiment; rows that carry no
# 'metric.val_loss' value are dropped first.
(
    experiments_df
    .dropna(subset=['metric.val_loss'])
    .plot.bar(x='experiment_name', y='metric.val_loss')
)

In [None]:
# Selected hyper-parameters and validation MAE of every "Trainer" execution.
col_names = [
    'experiment_name',
    'param.num_units',
    'param.epochs',
    'metric.val_mae',
]
experiments_df.loc[experiments_df['execution_name'] == 'Trainer', col_names]
                                                           

## Pivot the DataFrame to View One Row per Experiment

In [None]:
def select_first(col):
    """Return the first non-null value of ``col``, or None if it has none."""
    present = col.dropna()
    return present.iloc[0] if not present.empty else None

# Collapse the per-execution rows into one row per experiment, keeping the
# first non-null value of each remaining column.
experiments_pivot = (
    experiments_df
    .drop(columns=['execution_resource_name', 'execution_name'])
    .pivot_table(index='experiment_name', aggfunc=select_first)
)
# Transposed for display, so that experiments appear as columns.
experiments_pivot.T

## Parallel Coordinates Example

In [None]:
import matplotlib.pyplot as plt

# Default figure size used for the plot below.
plt.rcParams["figure.figsize"] = [15, 5]

# One line per experiment across the selected parameter and metric columns.
ax = pd.plotting.parallel_coordinates(
    experiments_pivot.reset_index(level=0),  # experiment_name back to a column
    'experiment_name',
    cols=[
        'param.num_units', 'param.dropout_rate', 'param.epochs',
        'metric.loss', 'metric.val_loss', 'metric.test_loss',
    ],
    color=['blue', 'green', 'pink', 'red'],
)
# Symmetric-log y axis; the legend is anchored at the right edge of the axes.
ax.set_yscale('symlog')
ax.legend(bbox_to_anchor=(1.0, 0.5))