# TF Model training script for Kubeflow pipeline and Katib
Uses TFRecords created using Spark

Adapted from https://gitlab.corp.zulily.com/personalization/user-guides/kubeflow-tensorflow/-/blob/master/model_train_tf_records.ipynb

## Get project-ID and collect Python module package components

In [43]:
# Read the active GCP project id from the local gcloud configuration.
# This uses IPython's "!" shell capture; stderr is discarded.
shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
# The first captured output line is taken as the project id.
PROJECT_ID = shell_output[0]
print("Project ID:", PROJECT_ID)
# The namespace is hard-coded: the author noted that
# tf_job_utils.get_default_target_namespace() was not working.
NAMESPACE =  'rmenon'
print("Namespace:", NAMESPACE)

Project ID: zulilymodeltraining
Namespace: rmenon


In [44]:
# GCS prefix under which the packaged trainer module (custom.tar.gz) is uploaded
# later in this notebook.
MODULE_PATH = f"gs://{PROJECT_ID}/{NAMESPACE}/p13n-tf-trainer_module"
print('Module path is: ', MODULE_PATH)

Module path is:  gs://zulilymodeltraining/rmenon/p13n-tf-trainer_module


In [45]:
# Start from a clean folder for the Python training package
! rm -rf custom
! mkdir custom

# Add an (empty) README so the package layout is complete
! touch custom/README.md

# Pinned runtime dependencies, written to requirements.txt and read by setup.py
# NOTE(review): task.py also imports pandas and yaml, which are not listed here;
# presumably they come from the base image or transitively - confirm.
requirements = "tensorflow==2.5.0\ntensorflow_io==0.19.1\nverta==0.17.6\ngoogle-cloud-storage==1.38.0"
! echo "$requirements" > custom/requirements.txt

setup_cfg = "[egg_info]\n\ntag_build =\n\ntag_date = 0"
! echo "$setup_cfg" > custom/setup.cfg

# setup.py installs the package as "trainer" and ships any *.yaml files with it
setup_py = "import setuptools\n\nwith open('requirements.txt') as f:\n    REQUIRES = f.readlines()\n\nsetuptools.setup(\n\n    name='trainer',\n\n    install_requires=REQUIRES,\n\n    packages=setuptools.find_packages(),\n\n    package_data = {'': ['*.yaml']})"
! echo "$setup_py" > custom/setup.py

# NOTE(review): the PKG-INFO values below (name, author, e-mail) look like they
# were carried over from a sample project rather than describing this trainer - confirm.
pkg_info = "Metadata-Version: 1.0\n\nName: Flowers image classification\n\nVersion: 0.0.0\n\nSummary: Demostration training script\n\nHome-page: www.google.com\n\nAuthor: Google\n\nAuthor-email: aferlitsch@google.com\n\nLicense: Public\n\nDescription: Demo\n\nPlatform: Vertex"
! echo "$pkg_info" > custom/PKG-INFO

# Make the trainer subfolder and copy the utility modules and the relevant
# model config file into the trainer directory
! mkdir custom/trainer
! touch custom/trainer/__init__.py
! cp ../utilities/common_utilities.py custom/trainer/
! cp ../utilities/modeldb_tf_utilities.py custom/trainer/
! cp ../utilities/model_utilities.py custom/trainer/
! cp ../utilities/tf_records_utils.py custom/trainer/
! cp ../model_configs/text_based_tfrecord_config.yaml custom/trainer/

## Imports

In [46]:
%%writefile custom/trainer/task.py

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import sys
import os
from pathlib import Path
import pandas as pd
import time
import argparse
import logging
import yaml
import shutil
from google.cloud import storage
from tensorflow.io import FixedLenFeature


# Add this file's directory to sys.path so the helper modules copied next to it
# (modeldb_tf_utilities, model_utilities, tf_records_utils) can be imported when the script runs.
file_dir = os.path.dirname(__file__)
sys.path.append(file_dir)
print(file_dir)

import modeldb_tf_utilities
import model_utilities
import tf_records_utils

AUTOTUNE = tf.data.experimental.AUTOTUNE


Writing custom/trainer/task.py


## Configuration Parameters

### Model and Data Related

In [47]:
%%writefile -a custom/trainer/task.py

# YAML file containing the model data configuration (feature names, which columns
# are categorical, etc.). It is looked up next to this script.
model_data_config_file_name = os.path.join(file_dir, "text_based_tfrecord_config.yaml")
print(model_data_config_file_name)

# GCS locations of the TFRecord datasets for each split
data_config = {
    "training":'gs://zulilymodeltraining/rmenon/data/tfrecords/version_8/train/',
    "validation":'gs://zulilymodeltraining/rmenon/data/tfrecords/version_8/valid/',
    "test":'gs://zulilymodeltraining/rmenon/data/tfrecords/version_8/test/',
}

# Base path for the trained model and other model-related artifacts.
# The ModelDB run name is appended to it later in the script.
model_data_path_prefix= f"gs://personalization-tensorflow/models/text_features/"

# Model training parameters. Only batch_size and initial_lr can be overridden
# from the command line (--batch-size / --lr); the others are fixed here.
model_fit_config = {
    "batch_size": 2048,
    "initial_lr": 1e-3,
    "epochs": 6,
    "shuffle_buffer_size": 16384,
}

# Evaluating LTR performance metrics
# (not referenced in the visible part of this script)
max_rank = 15

# Use distributed training across GPUs (only set to True if using >1GPU). Also only efficient for large models.
use_distributed_training = False

# Use checkpointed model (usually based on lowest validation loss) to generate validation metrics
use_checkpointed_model = True

# Run locally for testing. When True, the normalizer is fitted on a single batch
# and each dataset split is truncated to its first 10 batches.
# NOTE(review): this is True in the packaged script - confirm before submitting a full training job.
run_local = True


Appending to custom/trainer/task.py


### Feature Normalizer Related

In [48]:
%%writefile -a custom/trainer/task.py

# Number of batches (of 2048 records each, see the normalizer step) used to fit
# the feature normalizer. If set to None, the entire "training" set is used.
num_samples_to_train_normalizer = 500
# Local test runs fit the normalizer on a single batch to keep them fast.
if run_local:
    num_samples_to_train_normalizer = 1

Appending to custom/trainer/task.py


### Model DB Related

In [49]:
%%writefile -a custom/trainer/task.py

# Model DB configuration parameters.
# username, experiment_name and experiment_run_name are added later from the
# command-line arguments.
modeldb_config = {
    ## Required configs
    # These are required configs for a ModelDB run.
    # Please refer to the notes here: https://confluence.zulily.com/display/tech/Notes+about+using+ModelDB if you are updating the default
    # project and experiment name.
    "client_url": "https://modeldb.mlp.ml.gce.z8s.io/",
    "project_name": 'P13N_Event_Sort_Models_2021',

    # This parameter is true by default and is required if you are going to run multiple runs with the same experiment_run_name.
    # It prevents overwriting an experiment run's data by creating a new run every time a pipeline runs.
    # NOTE(review): the value is the string 'true', not a bool - confirm that
    # modeldb_tf_utilities expects a string here.
    "add_random_hash_to_run_name": 'true'
}

Appending to custom/trainer/task.py


### Augment configuration parameters with command line arguments - if provided

In [50]:
%%writefile -a custom/trainer/task.py

# Command-line interface: values supplied here override the defaults defined above.
parser = argparse.ArgumentParser(description="TF.Keras Daily Email Model")
# NOTE(review): --module-path is parsed but not read anywhere in the visible script.
parser.add_argument("--module-path", dest="module_path", default="", type=str, help="GCS location of the training module.")
# --namespace has no default, so the ModelDB username is None when it is omitted.
parser.add_argument("--namespace", dest="namespace", type=str, help="User namespace.")

# Training hyperparameters; defaults come from model_fit_config
parser.add_argument("--lr", dest="initial_lr", default=model_fit_config['initial_lr'], type=float, help="Learning rate.")
parser.add_argument("--batch-size", dest="batch_size", default=model_fit_config['batch_size'], type=int, help="mini-batch size")

parser.add_argument(
    "--modeldb-experiment-name", dest="modeldb_experiment_name", type=str, help="ModelDB Experiment Name")
parser.add_argument(
    "--modeldb-experiment-run-name", dest="modeldb_experiment_run_name", type=str,
    help="ModelDB Experiment Run Name. If an experiment run name is not specified, then ModelDB will randomly assign a run_name.")    
args = parser.parse_args()

# Fold the parsed arguments back into the configuration dictionaries
modeldb_config['username'] = args.namespace
modeldb_config['experiment_name'] = args.modeldb_experiment_name
modeldb_config['experiment_run_name'] = args.modeldb_experiment_run_name
model_fit_config['batch_size']= args.batch_size
model_fit_config['initial_lr']= args.initial_lr

Appending to custom/trainer/task.py


### Model Internal initializations based on YAML configuration

In [51]:
%%writefile -a custom/trainer/task.py

# Load the model/data configuration from the YAML file (safe loader only).
with open(model_data_config_file_name) as config_stream:
    model_data_config = yaml.safe_load(config_stream)

# Short aliases for the column groups declared in the configuration.
feature_names = model_data_config['feature_names']
categorical_columns = model_data_config['categorical_columns']
categorical_columns_vocabulary_list = model_data_config['categorical_columns_vocabulary_list']
numeric_columns_to_norm = model_data_config['numeric_columns_to_norm']
vector_features = model_data_config['vector_columns']
identifier_columns = model_data_config['identifier_columns']

# Every feature that is neither categorical, nor normalized, nor a vector is
# treated as a remaining numeric column.
numeric_columns_remaining = [
    name
    for name in feature_names
    if not (
        name in categorical_columns
        or name in numeric_columns_to_norm
        or name in vector_features
    )
]

vector_column_lengths = model_data_config['vector_column_lengths']
target_name = model_data_config['target_name']

# The target is listed among the features but must not be fed to the model as an input.
numeric_columns_remaining.remove(target_name)

Appending to custom/trainer/task.py


In [52]:
%%writefile -a custom/trainer/task.py

# Build the feature spec used to parse each serialized record.
tf_feature_descriptions = {}

# Identifier columns are registered first (they are assumed to come first)
# and are parsed as scalar strings.
for column in identifier_columns:
    tf_feature_descriptions[column] = tf.io.FixedLenFeature([], tf.string)

for column in feature_names:
    if column == target_name:
        # The target is parsed as a scalar int64.
        tf_feature_descriptions[column] = tf.io.FixedLenFeature([], tf.int64)
        continue

    # Categorical columns are strings, everything else is float32.
    column_dtype = tf.string if column in categorical_columns else tf.float32
    # Vector columns have a fixed configured length, all others are scalars.
    column_shape = [vector_column_lengths[column]] if column in vector_features else []
    tf_feature_descriptions[column] = tf.io.FixedLenFeature(column_shape, column_dtype)

Appending to custom/trainer/task.py


## Create/ Load a Feature Normalizer

In [53]:
%%writefile -a custom/trainer/task.py

def parse_tf_records_norm(example_proto):
    """Parse one serialized record into the features used to fit the normalizer.

    Args:
        example_proto: A single serialized record, as handed over by ``Dataset.map``.

    Returns:
        The result of ``tf.io.parse_single_example`` for the spec held in the
        module-level ``normalizer_tf_feature_descriptions`` (resolved at call time).
    """
    feature_spec = normalizer_tf_feature_descriptions
    return tf.io.parse_single_example(example_proto, feature_spec)

Appending to custom/trainer/task.py


In [54]:
%%writefile -a custom/trainer/task.py

# Start timing the normalizer fit
st = time.time()

# Feature spec for the normalizer pass: each column to normalize is parsed as a scalar float32.
normalizer_tf_feature_descriptions = {}
for column in numeric_columns_to_norm:    
    normalizer_tf_feature_descriptions[column] = tf.io.FixedLenFeature([], tf.float32)
    
# Create a dataset that runs through the training data for fitting the normalizer:
# records are parsed one by one and then grouped into batches of 2048.
data_batches_for_norm = tf_records_utils.get_tf_record_ds(data_config['training'])\
                        .map(parse_tf_records_norm, num_parallel_calls=AUTOTUNE) \
                        .batch(2048)

# Limit the fit to the first N batches if a sample size is specified.
# Note that take() keeps the leading batches; it does not draw a random sample.
if num_samples_to_train_normalizer is not None:
    data_batches_for_norm = data_batches_for_norm.take(int(num_samples_to_train_normalizer))

# Stack features: change from dictionary format to a stacked tensor
def stack_features(features):
    """Stack the values of a parsed feature dict along axis 1.

    NOTE(review): the column order is whatever ``features.values()`` yields;
    confirm it matches the order in which the model feeds
    ``numeric_columns_to_norm`` into the normalizer.
    """
    return tf.stack(list(features.values()), axis=1)
data_batches_for_norm_stacked = data_batches_for_norm.map(stack_features)


# Fit the normalizer. In the distributed case it is created and adapted inside
# the MirroredStrategy scope; mirrored_strategy is reused later to build the model.
if use_distributed_training:
    mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():
        feature_normalizer = preprocessing.Normalization()
        feature_normalizer.adapt(data_batches_for_norm_stacked)
else:
    feature_normalizer = preprocessing.Normalization()
    feature_normalizer.adapt(data_batches_for_norm_stacked)
print('Normalizer training took {}secs'.format(time.time() - st))

Appending to custom/trainer/task.py


## Model Helper functions

In [55]:
%%writefile -a custom/trainer/task.py
# Model-DB logging functions
def log_model_attributes(modeldb_expt_run):
    """Record the run configuration in ModelDB before training starts.

    Logs the module-level ``model_fit_config`` as hyperparameters, then
    ``data_config`` and ``model_data_config`` as attributes.

    Args:
        modeldb_expt_run: The ModelDB experiment run to log to.
    """
    modeldb_expt_run.log_hyperparameters(model_fit_config)
    for attribute_group in (data_config, model_data_config):
        modeldb_expt_run.log_attributes(attribute_group)

    
def log_model_metrics(modeldb_expt_run, model, model_save_path, model_checkpoint_path, test_ds=None):
    """Record artifact locations and, optionally, evaluation metrics in ModelDB.

    Args:
        modeldb_expt_run: The ModelDB experiment run to log to.
        model: Model whose ``evaluate`` result unpacks into loss, accuracy,
            precision and recall.
        model_save_path: Location the trained model was saved to.
        model_checkpoint_path: Location of the training checkpoints.
        test_ds: Dataset to evaluate on. When ``None`` only the paths are logged.
    """
    # Where the model and its related data were written
    artifact_paths = (
        ('other_model_related_data_path', model_data_path_prefix),
        ('model_save_path', model_save_path),
        ('model_checkpoint_path', model_checkpoint_path),
    )
    for artifact_key, artifact_path in artifact_paths:
        modeldb_expt_run.log_artifact_path(artifact_key, artifact_path)

    # Nothing to evaluate without a dataset
    if test_ds is None:
        return

    loss, accuracy, precision, recall = model.evaluate(test_ds)
    metric_values = (
        ('loss_', loss),
        ('accuracy', accuracy),
        ('precision', precision),
        ('recall', recall),
    )
    for metric_key, metric_value in metric_values:
        modeldb_expt_run.log_metric(metric_key, metric_value)

    # Keep these prints: Katib parses this output to report the metrics
    print(f"\naccuracy={accuracy}")
    print(f"\nloss={loss}")
        

def log_model_summary(modeldb_expt_run, model):
    """Write the summary of the model's last layer to a file and log it to ModelDB.

    The text is written to ``/tmp/model/model.txt`` (the directory is recreated
    from scratch on every call) and attached to the run as ``Model_Summary``.

    Args:
        modeldb_expt_run: The ModelDB experiment run to attach the artifact to.
        model: Model providing ``get_layer``; only the layer at index -1 is summarized.
    """
    summary_lines = []
    # Only store the last (sequential) layer; each summary line is collected as-is
    model.get_layer(index=-1).summary(print_fn=summary_lines.append)
    summary_text = "\n".join(summary_lines)

    # Recreate the scratch directory so no stale files are left behind
    if os.path.exists('/tmp/model/'):
        shutil.rmtree('/tmp/model')
    os.mkdir('/tmp/model')

    with open('/tmp/model/model.txt', 'w') as summary_file:
        summary_file.write(summary_text)
    modeldb_expt_run.log_artifact('Model_Summary', '/tmp/model/model.txt')
    
def parse_tf_records(example_proto):
    """Parse one serialized record into the full feature dictionary.

    Args:
        example_proto: A single serialized record, as handed over by ``Dataset.map``.

    Returns:
        The result of ``tf.io.parse_single_example`` for the module-level
        ``tf_feature_descriptions`` spec.
    """
    feature_spec = tf_feature_descriptions
    return tf.io.parse_single_example(example_proto, feature_spec)

Appending to custom/trainer/task.py


In [56]:
%%writefile -a custom/trainer/task.py
# Target variable mapping function
def parse_label_from_data(data):
    """Split a parsed record into (features, binary label).

    The target column is removed from ``data`` (the dict is modified in place)
    and mapped to 0 where it equals 0 and to 1 otherwise.

    Args:
        data: Parsed feature dictionary that still contains ``target_name``.

    Returns:
        A ``(data, labels)`` tuple, where ``labels`` is an int64 tensor of 0/1 values.
    """
    raw_labels = tf.reshape(data.pop(target_name), [-1, 1])

    # True wherever the raw label is 0
    zero_label = tf.constant([0], dtype=tf.dtypes.int64)
    is_zero = tf.reduce_any(tf.equal(raw_labels, zero_label), axis=1)

    negative = tf.constant(0, dtype=tf.dtypes.int64)
    positive = tf.constant(1, dtype=tf.dtypes.int64)
    binary_labels = tf.where(is_zero, negative, positive)
    return data, binary_labels

Appending to custom/trainer/task.py


## Model Setup and training

In [57]:
%%writefile -a custom/trainer/task.py

# Build the training, validation and test datasets. Each pipeline parses the
# records, splits off the label, batches and prefetches. Only the training data
# is shuffled (per record, before batching).
training_data = tf_records_utils.get_tf_record_ds(data_config['training'])\
                .map(parse_tf_records, num_parallel_calls=AUTOTUNE) \
                .map(parse_label_from_data, num_parallel_calls=AUTOTUNE) \
                .shuffle(model_fit_config['shuffle_buffer_size']).batch(model_fit_config['batch_size'])
training_data = training_data.prefetch(buffer_size=AUTOTUNE)

validation_data = tf_records_utils.get_tf_record_ds(data_config['validation']) \
                .map(parse_tf_records, num_parallel_calls=AUTOTUNE) \
                .map(parse_label_from_data, num_parallel_calls=AUTOTUNE) \
                .batch(model_fit_config['batch_size'])
validation_data = validation_data.prefetch(buffer_size=AUTOTUNE)

test_data = tf_records_utils.get_tf_record_ds(data_config['test']) \
            .map(parse_tf_records, num_parallel_calls=AUTOTUNE) \
            .map(parse_label_from_data, num_parallel_calls=AUTOTUNE) \
            .batch(model_fit_config['batch_size'])
test_data = test_data.prefetch(buffer_size=AUTOTUNE)

if run_local:
    # For testing purposes: keep only the first 10 batches of the training,
    # validation and test data
    training_data = training_data.take(10)
    validation_data = validation_data.take(10)
    test_data = test_data.take(10)

Appending to custom/trainer/task.py


In [58]:
%%writefile -a custom/trainer/task.py

num_epochs = model_fit_config['epochs']

# Create the ModelDB experiment run that this script logs to
modeldb_expt_run = modeldb_tf_utilities.create_modeldb_experiment_run(modeldb_config)

# Make the save location specific to this run, then get the training callbacks for it
model_data_path_prefix = os.path.join(model_data_path_prefix, modeldb_expt_run.name)
callbacks = modeldb_tf_utilities.get_tf_callbacks(modeldb_expt_run, model_data_path_prefix)

# Save some attributes before training starts
log_model_attributes(modeldb_expt_run)

# Define loss and optimizer.
# Mean squared error is used; the binary cross-entropy alternative is kept below for reference.
#loss=tf.keras.losses.BinaryCrossentropy(from_logits=False)
loss=tf.keras.losses.MeanSquaredError()
optimizer=tf.optimizers.Adam(learning_rate=model_fit_config['initial_lr'])

# Build and compile the model. For multi-GPU runs this happens inside the
# distribution strategy scope; mirrored_strategy is created in the normalizer
# step and only exists when use_distributed_training is True.
if use_distributed_training:
    with mirrored_strategy.scope():
        sort_model = model_utilities.get_tfrecord_sort_model(feature_normalizer, 
                                                       numeric_columns_to_norm, 
                                                       numeric_columns_remaining, 
                                                       categorical_columns,
                                                       categorical_columns_vocabulary_list,
                                                       vector_features,
                                                        vector_column_lengths)
        sort_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
else:
    sort_model = model_utilities.get_tfrecord_sort_model(feature_normalizer,
                                                   numeric_columns_to_norm, 
                                                   numeric_columns_remaining, 
                                                   categorical_columns,
                                                   categorical_columns_vocabulary_list,
                                                    vector_features,
                                                    vector_column_lengths)
    sort_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Attach a text summary of the model's last layer to the ModelDB run
log_model_summary(modeldb_expt_run, sort_model)

Appending to custom/trainer/task.py


In [59]:
%%writefile -a custom/trainer/task.py
# Start the training process
try:
    # start_time is also read after this block to log the overall run duration
    start_time = time.time()  
    # Dataset caching is currently disabled:
    #cached_ds = training_data.cache()
    #cached_validation_ds = validation_data.cache()
    # Fit the model
    model_history = sort_model.fit(training_data, validation_data=validation_data, epochs=num_epochs, callbacks=callbacks)
    # Log time taken to fit model
    modeldb_expt_run.log_metric('model_fit_run_duration_in_secs', (time.time() - start_time))            
    # Save the model as it is after the final epoch
    model_save_path = os.path.join(model_data_path_prefix, 'saved_model/')
    sort_model.save(model_save_path)
    model_checkpoint_path = os.path.join(model_data_path_prefix, 'checkpoints/')
    if use_checkpointed_model:
        # Reload the weights from the checkpoint (lowest validation loss) to generate validation performance.
        # NOTE(review): assumes the callbacks write their checkpoint to this path - confirm in modeldb_tf_utilities.
        sort_model.load_weights(model_checkpoint_path)    
    # Log the save paths and the model's performance on the validation data
    log_model_metrics(modeldb_expt_run, sort_model, model_save_path, model_checkpoint_path, validation_data)
    modeldb_expt_run.log_tag('success')
except:
    # Catch everything (including interrupts) so the run is tagged as failed,
    # then re-raise so the job itself still fails.
    modeldb_expt_run.log_tag('failed_run')
    raise

Appending to custom/trainer/task.py


In [60]:
%%writefile -a custom/trainer/task.py
# Log the run duration measured from start_time. start_time is set just before
# model.fit in the training step, so time spent earlier in the script
# (normalizer fit, dataset and model setup) is not included.
modeldb_expt_run.log_metric('experiment_run_duration_in_secs', (time.time() - start_time))            

Appending to custom/trainer/task.py


## Build the final Package

In [61]:
# Remove any archive left over from a previous build
! rm -f custom.tar custom.tar.gz
# Archive the package folder, leaving out Jupyter checkpoint files
! tar cvf custom.tar --exclude="*.ipynb_checkpoints*" custom 
# Compress the archive to custom.tar.gz
! gzip custom.tar

custom/
custom/requirements.txt
custom/setup.py
custom/README.md
custom/setup.cfg
custom/PKG-INFO
custom/trainer/
custom/trainer/tf_records_utils.py
custom/trainer/text_based_tfrecord_config.yaml
custom/trainer/modeldb_tf_utilities.py
custom/trainer/model_utilities.py
custom/trainer/task.py
custom/trainer/__init__.py
custom/trainer/common_utilities.py


In [62]:
# Upload the packaged trainer module to GCS under MODULE_PATH
! gsutil cp custom.tar.gz $MODULE_PATH/custom.tar.gz
# Print the module location (without the .tar.gz extension)
! echo $MODULE_PATH/custom

Copying file://custom.tar.gz [Content-Type=application/x-tar]...
/ [1 files][  9.0 KiB/  9.0 KiB]                                                
Operation completed over 1 objects/9.0 KiB.                                      
gs://zulilymodeltraining/rmenon/p13n-tf-trainer_module/custom


In [63]:
errout. Below does not need to be done always

SyntaxError: invalid syntax (<ipython-input-63-af52cae17cef>, line 1)

## Execute this file locally

In [None]:
 # Install the built package (the "trainer" module) into the notebook environment to test the code locally
!pip install custom.tar.gz

In [None]:
# Fetch the notebook image to capture the base image of the notebook in ModelDB
import subprocess
# The notebook name is read from /etc/hostname (trailing newline removed)
notebook_name = subprocess.run(['cat', '/etc/hostname'], stdout=subprocess.PIPE)
notebook_name = notebook_name.stdout.decode('utf-8').strip("\n")
# Ask kubectl for the image of the first container of that pod. The jsonpath
# expression is wrapped in double quotes, which are stripped from the output below.
# NOTE(review): runner_docker_image is not used in the visible cells - confirm where it is consumed.
runner_docker_image = subprocess.run(['kubectl', 'get', 'po',  notebook_name,  '-o=jsonpath="{$.spec.containers[:1].image}"'], stdout=subprocess.PIPE)
runner_docker_image = runner_docker_image.stdout.decode("utf-8").strip('"')

In [None]:
# Run the installed trainer module locally, overriding the learning rate and batch size
!python -m trainer.task --namespace $NAMESPACE \
--modeldb-experiment-name $NAMESPACE-keras-model-local-run-sampled-data \
--modeldb-experiment-run-name keras-with-sampled-data-on-cpu --lr 0.3 --batch-size 512

In [None]:
# Once tested, uninstall the custom package ("trainer") without prompting for confirmation
! pip uninstall trainer --yes