# TF Model training script

## Imports and other initializations

In [20]:
# fairing:include-cell
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import sys
import os
from pathlib import Path
import pandas as pd
from kubeflow import fairing 
import time
import json
import logging
import imp
import yaml
import seaborn as sns
import shutil
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve
from sklearn.preprocessing import normalize
from google.cloud import storage

from matplotlib import pyplot as plt

sys.path.append('../utilities/')
import modeldb_tf_utilities
import evaluation_utilities
imp.reload(modeldb_tf_utilities)
imp.reload(evaluation_utilities)

AUTOTUNE = tf.data.experimental.AUTOTUNE

In [21]:
# fairing:include-cell
# Global configs that would be used at all the steps of the notebook.
GCP_PROJECT = fairing.backends.gcp.guess_project_name()
NAMESPACE = fairing.backends.utils.get_current_k8s_namespace()
PROJECT_ID = GCP_PROJECT
MODELDB_CLIENT_URL = "https://modeldb.mlp.ml.gce.z8s.io/"
print(NAMESPACE)
print(GCP_PROJECT)

rmenon
zulilymodeltraining


## Configuration Parameters

### Model and Data Related

In [22]:
# fairing:include-cell
# YAML file containing model data configuration: i.e. Feature names, identification of categorical names etc
model_data_config_file_name = "./../model_configs/text_based_config.yaml"

# Dataset paths
data_config = {
    #"training": 'gs://personalization-tensorflow/data/train/all_features_tf_classification_v07_train_mobile_imp_04_58*.csv', 
    #"training": 'gs://personalization-tensorflow/data/train/all_features_tf_classification_v07_train_mobile_imp_0_458*.csv', 
    #"validation": 'gs://personalization-tensorflow/data/valid/all_features_tf_classification_v07_valid_diff*.csv',
    #"test": 'gs://personalization-tensorflow/data/test/all_features_tf_classification_v07_test_diff*.csv',
    "training":'gs://zulilymodeltraining/rmenon/data/train/all_features_text_tf_classification_v07_train_mobile_imp_0_458*.csv',
    "validation":'gs://zulilymodeltraining/rmenon/data/valid/all_features_text_tf_classification_v07_valid_diff*.csv',
    #"test":'gs://zulilymodeltraining/rmenon/data/test/all_features_text_tf_classification_v07_test_diff*.csv',    
    "test":'gs://zulilymodeltraining/rmenon/data/test/all_features_text_tf_classification_v07_test_test_v2*.csv',
}

# Path to save trained model and other model-related specs
model_data_path_prefix= f"gs://personalization-tensorflow/models/text_features/"

# Model training parameters
model_fit_config = {
    "batch_size": 2048,
    "initial_lr": 1e-3,
    "epochs": 50
}

# Evaluating LTR performance metrics
max_rank = 15
file_path_to_bs_results = "bs_results_on_test_test_v2.csv"

### Feature Normalizer Related

In [23]:
# fairing:include-cell
# Parameters related to feature normalizer for the model
num_samples_to_train_normalizer = None # Set a sample size (in terms of number of batches). If set to None, the entire "training" set will be used to train the normalizer.

### Model DB Related

In [24]:
# fairing:include-cell
# Model DB configuration parameters
modeldb_config = {
    ## Required configs
    # These are required configs for a modeldb run. 
    # Please refer to notes here: https://confluence.zulily.com/display/tech/Notes+about+using+ModelDB if you are updaing the default
    # project and experiment name.
    "client_url": MODELDB_CLIENT_URL,
    "project_name": 'P13N_Event_Sort_Models_2021',
    "experiment_name": f"text-features",
    # Username is mapped into as a ModelDB tag which will help to identiy a run by an user.
    "username": NAMESPACE,
    
    ## Optional configs
    # If an experiment run name is not specified, then ModelDB will randomly assign a run_name.
    "experiment_run_name": 'text_features_4layer_1024_target_0_458_regression',
    # This parameter is by default true and is required if you are going to run multiple runs with same experiment_run_name.
    # This will prevent you from overwritng an experiment_run data and create a new run everytime a pipeline runs.
    "add_random_hash_to_run_name": 'true',
}

### Model Internal initializations based on YAML configuration

In [25]:
# fairing:include-cell
# Retrieve model configuration from YAML file.
with open(model_data_config_file_name) as file:
    model_data_config = yaml.load(file, Loader=yaml.SafeLoader)
# Do some name mappings to make code cleaner
feature_names = model_data_config['feature_names']
categorical_columns = model_data_config['categorical_columns']
categorical_columns_vocabulary_list = model_data_config['categorical_columns_vocabulary_list']
numeric_columns_to_norm = model_data_config['numeric_columns_to_norm']
numeric_columns_remaining = [xx for xx in feature_names if ((xx not in categorical_columns) \
                                                            and (xx not in numeric_columns_to_norm))]
target_name = model_data_config['target_name']
numeric_columns_remaining.remove(target_name)

## Create/ Load a Feature Normalizer

In [26]:
# fairing:include-cell
def google_file_path_exists(path_name):
    """
    Checks if a file path exists in google storage
    path_name should be something like 'gs://zulilymodeltraining/rmenon/tf-models-data/normalizer_models/saved_model.pb'
    """    

    storage_client = storage.Client()
    interim = path_name.split('//') # Extract gs:
    interim = interim[1].split('/') # Extract stuff after gs, 'zulilymodeltraining/rmenon/tf-models-data/normalizer_models/saved_model.pb'
    bucket_name = interim[0] # Extract bucket name, 'zulilymodeltraining'
    name = ('/').join(interim[1:]) #Create file path excluding bucket name, 'rmenon/tf-models-data/normalizer_models/saved_model.pb'
    bucket = storage_client.bucket(bucket_name)
    stats = storage.Blob(bucket=bucket, name=name).exists(storage_client)
    return(stats)

In [27]:
# fairing:include-cell
st = time.time()

# Create a data generator to run thru the training data
column_defaults = ['float32' for column in numeric_columns_to_norm]
data_batches_for_norm = tf.data.experimental.make_csv_dataset(
    file_pattern = data_config["training"], 
    select_columns = numeric_columns_to_norm,
    column_defaults = column_defaults,
    num_epochs=1, # Only want to go thru this data once in 1 training epoch    
    ignore_errors = True,
    batch_size = 2048    
    )

# Stack features: Change from dictionary format to a a stacked tensor array
def stack_features(features):
    return tf.stack(list(features.values()), axis=1)
data_batches_for_norm_stacked = data_batches_for_norm.map(stack_features)

# Pick a random sample if specified
if num_samples_to_train_normalizer is not None:
    data_batches_for_norm_stacked = data_batches_for_norm_stacked.take(int(num_samples_to_train_normalizer))

# Train the normalizer 
feature_normalizer = preprocessing.Normalization()
feature_normalizer.adapt(data_batches_for_norm_stacked)
print('Normalizer training took {}secs'.format(time.time() - st))

Normalizer training took 104.08817338943481secs


## Model Helper functions

In [28]:
# fairing:include-cell
# Model-DB logging functions
def log_model_attributes(modeldb_expt_run):
    """
    Capturing Model attributes before starting training in ModelDB.
    """
    modeldb_expt_run.log_hyperparameters(model_fit_config)
    modeldb_expt_run.log_attributes(data_config)
    modeldb_expt_run.log_attributes(model_data_config)

    
def log_model_metrics(modeldb_expt_run, model, model_save_path, test_ds = None):
    """
    Capturing Model metrics at the end of training in ModelDB.
    """
    
    # Log the paths where the model and related data were saved
    modeldb_expt_run.log_artifact_path('other_model_related_data_path', model_data_path_prefix)
    modeldb_expt_run.log_artifact_path('model_save_path', model_save_path)
    
    # Log accuracy of the supplied data set (if supplied)
    if test_ds is not None:
#         loss, accuracy, precision, recall = model.evaluate(test_ds)        
#         modeldb_expt_run.log_metric('loss', loss)
#         modeldb_expt_run.log_metric('accuracy', accuracy)
#         modeldb_expt_run.log_metric('precision', precision)
#         modeldb_expt_run.log_metric('recall', recall)
        loss = model.evaluate(test_ds)        
        modeldb_expt_run.log_metric('loss', loss)
        

def log_model_summary(modeldb_expt_run, model):
    """
    Log the structure of the Model
    """
    stringlist = []
    # Only store the last sequential layer
    model.get_layer(index=-1).summary(print_fn=lambda x: stringlist.append(x))
    short_model_summary = "\n".join(stringlist)    
    
    if os.path.exists('/tmp/model/'):        
        shutil.rmtree('/tmp/model')
    os.mkdir('/tmp/model')

    with open('/tmp/model/model.txt', 'w') as f:
        f.write(short_model_summary)
    f.close()
    modeldb_expt_run.log_artifact('Model_Summary', '/tmp/model/model.txt')    

In [29]:
# fairing:include-cell
# Target variable mapping function
def parse_label_from_data(features, labels):
    """
    Function to map the data parsed in order to generate the labels
    """
   
    label_0_values = tf.constant([0], dtype=tf.dtypes.int32)    
    labels = tf.reshape(labels, [-1, 1])
    labels_converted = tf.where(tf.reduce_any(tf.equal(labels, label_0_values), axis=1), 
                              tf.constant(0, dtype=tf.dtypes.int64), 
                              tf.constant(1, dtype=tf.dtypes.int64)) 
    return features, labels_converted

# CSV Data generator
def get_dataset_generator(file_path, target_name, feature_names, shuffle_dataset = True):
    """
    Dataset does not need to be shuffled for validation and testing
    """
    data_batches = tf.data.experimental.make_csv_dataset(
        file_pattern = file_path, 
        select_columns = feature_names,
        num_epochs=1, # Only want to go thru this data once in 1 training epoch
        label_name=target_name,
        ignore_errors = True,
        shuffle = shuffle_dataset,
        batch_size = model_fit_config['batch_size'],
        sloppy = True, # Better reading performance since data reads will not be deterministic
        prefetch_buffer_size = 1,
        num_parallel_reads = 3 # Set this to >1 only if using multiple CPUs
        )
    #data_batches = data_batches.map(parse_label_from_data, num_parallel_calls=AUTOTUNE)
    
    return data_batches

## Model Setup and training

In [30]:
# fairing:include-cell
def get_sort_model(numeric_preprocessor):
    """
    preprocessor: Any tensorflow preprocessing modules
    inputs: Inputs to the tensorflow model - will determine the size of the input layer
    """
    
    # Create input definitions for the model
    inputs = {}
    numeric_norm_inputs = {}
    for header in numeric_columns_to_norm:            
        numeric_norm_inputs[header] = tf.keras.Input(shape=(1,), name=header, dtype=tf.float32)         
        inputs[header] = numeric_norm_inputs[header]
    
    remaining_inputs = {}
    for header in numeric_columns_remaining:            
        remaining_inputs[header] = tf.keras.Input(shape=(1,), name=header, dtype=tf.float32)         
        inputs[header] = remaining_inputs[header]
        
    for header in categorical_columns:            
        remaining_inputs[header] = tf.keras.Input(shape=(1,), name=header, dtype=tf.int64)         
        inputs[header] = remaining_inputs[header]
    
    # Use the normalizer for features to be normalized
    numeric_norm_inputs = layers.Concatenate()(list(numeric_norm_inputs.values()))
    numeric_norm_preprocessed_inputs = numeric_preprocessor(numeric_norm_inputs)
    
    #Set up feature columns for other features
    feature_columns = []
    # numeric cols
    for column in numeric_columns_remaining:
        feature_columns.append(tf.feature_column.numeric_column(column))
        
    # Create categorical feature preprocessor    
    for column in categorical_columns:
        categorical_feature = tf.feature_column.categorical_column_with_vocabulary_list(column, \
                                                                                        categorical_columns_vocabulary_list[column],\
                                                                                        default_value = -1,\
                                                                                       dtype=tf.dtypes.int64)
        categorical_feature_one_hot = tf.feature_column.indicator_column(categorical_feature)
        feature_columns.append(categorical_feature_one_hot) 
    
    # Define preprocessing layer
    pre_processing_layer = tf.keras.layers.DenseFeatures(feature_columns=feature_columns)
    preprocessed_inputs = pre_processing_layer(remaining_inputs)
    
    # Put together categorical and numerical features
    preprocessed_inputs = layers.Concatenate()([numeric_norm_preprocessed_inputs, preprocessed_inputs])
    
    # Define the inner trainable layers of the sort model
    sort_model_body = tf.keras.Sequential([
            layers.Dense(1024, activation='relu'),    
            layers.Dense(512, activation='relu'),
            layers.Dense(128, activation='relu'),
            layers.Dropout(0.5),
            layers.Dense(1), # activation = 'sigmoid'
        ])
    
    # Define flow thru inputs to the results stage. All done with stand-in for inputs 
    result = sort_model_body(preprocessed_inputs)
    
    # Put together the model
    sort_model = tf.keras.Model(inputs, result)
    
    return sort_model
    

In [31]:
# fairing:include-cell
# Get test and validation data generators
training_data = get_dataset_generator(data_config['training'], target_name, feature_names)
validation_data = get_dataset_generator(data_config['validation'], target_name, feature_names, shuffle_dataset=False)
test_features = list(feature_names)
test_features.append('customer_id') # Need this information to evaluate against test dataset for LTR metrics
test_data = get_dataset_generator(data_config['test'], target_name, feature_names, shuffle_dataset=False)
# training_data = training_data.take(1)
# validation_data = validation_data.take(1)
# test_data = test_data.take(1)

In [32]:
# fairing:include-cell
initial_lr = model_fit_config['initial_lr']
num_epochs = model_fit_config['epochs']

# Create Model-DB Instance
modeldb_expt_run = modeldb_tf_utilities.create_modeldb_experiment_run(modeldb_config)

# Get callbacks and save paths
model_data_path_prefix = os.path.join(model_data_path_prefix, modeldb_expt_run.name)
callbacks = modeldb_tf_utilities.get_tf_callbacks(modeldb_expt_run, model_data_path_prefix)

# Save some attributes before training starts
log_model_attributes(modeldb_expt_run)

# Define model 
#loss=tf.keras.losses.BinaryCrossentropy(from_logits=True)
loss=tf.keras.losses.MeanSquaredError()
optimizer=tf.optimizers.Adam(learning_rate=initial_lr)
sort_model = get_sort_model(numeric_preprocessor=feature_normalizer)
#sort_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
sort_model.compile(loss=loss, optimizer=optimizer)# Log the model
log_model_summary(modeldb_expt_run, sort_model)

connection successfully established
got existing Project: P13N_Event_Sort_Models_2021
got existing Experiment: text-features
created new ExperimentRun: text_features_4layer_1024_target_0_458_regression_bhcb1jke
upload complete (Model_Summary)


In [33]:
# fairing:include-cell
# Start the training process
try:
    start_time = time.time()  
    cached_ds = training_data.cache()
    cached_validation_ds = validation_data.cache()
    # Fit the model
    model_history = sort_model.fit(cached_ds, validation_data=cached_validation_ds, epochs=num_epochs, callbacks=callbacks)
    # Log time taken to fit model
    modeldb_expt_run.log_metric('model_fit_run_duration_in_secs', (time.time() - start_time))            
    # Save Model
    model_save_path = os.path.join(model_data_path_prefix, 'saved_model/')
    sort_model.save(model_save_path)
    # Log other metrics from model including validation data performance
    log_model_metrics(modeldb_expt_run, sort_model, model_save_path, validation_data)
    modeldb_expt_run.log_tag('success')
except:
    modeldb_expt_run.log_tag('failed_run')
    raise

Epoch 1/10

Epoch 00001: val_loss improved from inf to 4.22352, saving model to gs://personalization-tensorflow/models/text_features/text_features_4layer_1024_target_0_458_regression_bhcb1jke/checkpoints/
Epoch 2/10

Epoch 00002: val_loss did not improve from 4.22352
Epoch 3/10

Epoch 00003: val_loss did not improve from 4.22352
Epoch 4/10

Epoch 00004: val_loss did not improve from 4.22352
Epoch 5/10

Epoch 00005: val_loss did not improve from 4.22352
Epoch 6/10

Epoch 00006: val_loss did not improve from 4.22352
Epoch 7/10

Epoch 00007: val_loss did not improve from 4.22352
Epoch 8/10

Epoch 00008: val_loss did not improve from 4.22352
Epoch 9/10

Epoch 00009: val_loss did not improve from 4.22352
Epoch 10/10

Epoch 00010: val_loss did not improve from 4.22352
INFO:tensorflow:Assets written to: gs://personalization-tensorflow/models/text_features/text_features_4layer_1024_target_0_458_regression_bhcb1jke/saved_model/assets


[I 211015 05:51:48 builder_impl:774] Assets written to: gs://personalization-tensorflow/models/text_features/text_features_4layer_1024_target_0_458_regression_bhcb1jke/saved_model/assets




## Evaluate Performance of Model

In [34]:
# fairing:include-cell
st = time.time()

# Get predicted labels for validation data
pred_indices_raw = sort_model.predict(validation_data)
pred_indices = (pred_indices_raw > 0.5)

# Get true labels for test data
iterator = test_data.as_numpy_iterator()
true_labels = np.array([])
for x in iterator:
    true_labels = np.append(true_labels, x[-1])
print("Time taken for generating labels is {}secs".format(time.time() - st))

Time taken for generating labels is 15.988433837890625secs


In [35]:
# fairing:include-cell
# Making sure directory is removed if already exists
if os.path.exists('/tmp/plots'):        
    shutil.rmtree('/tmp/plots')
os.mkdir('/tmp/plots')

target_names = ['class 0', 'class 1']

# Print Some Performance Metrics
print(classification_report(true_labels, pred_indices, target_names = target_names, zero_division = 0))
cr = classification_report(true_labels, pred_indices, target_names = target_names, zero_division=0, output_dict = True)
class_0_recall = np.around(cr['class 0']['recall'], decimals=5)
class_1_recall = np.around(cr['class 1']['recall'], decimals=5)
modeldb_expt_run.log_metrics({'Recall_Class_0': class_0_recall, 'Recall_Class_1': class_1_recall, })


# Create ROC curve
figure = plt.figure(figsize=(5, 5))
fpr, tpr, thresholds = roc_curve(true_labels, pred_indices_raw)
plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.savefig('/tmp/plots/roc.png')
modeldb_expt_run.log_artifact('ROC', '/tmp/plots/roc.png')

# Create PR curve
figure = plt.figure(figsize=(5, 5))
precision, recall, thresholds = precision_recall_curve(true_labels, pred_indices_raw)
plt.plot(recall, precision)
plt.title('PR curve')
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.savefig('/tmp/plots/pr.png')
modeldb_expt_run.log_artifact('PR', '/tmp/plots/pr.png')

# Create confusion matrix
cm = confusion_matrix(true_labels, pred_indices)
cm_df = pd.DataFrame(cm, index = target_names, columns = target_names)
cm_normalize_df = pd.DataFrame(normalize(cm, 'l1', axis = 1), index = target_names, columns = target_names)
figure = plt.figure(figsize=(5, 5))
svm = sns.heatmap(cm_df, annot=True,cmap=plt.cm.Blues)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
figure = svm.get_figure()    
figure.savefig('/tmp/plots/cm.png')

figure = plt.figure(figsize=(5, 5))
svm = sns.heatmap(cm_normalize_df, annot=True,cmap=plt.cm.Blues)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
figure = svm.get_figure()    
figure.savefig('/tmp/plots/cm_norm.png')

# Saving confusion_matrix
modeldb_expt_run.log_artifact('confusion_matrix', '/tmp/plots/cm.png')
modeldb_expt_run.log_artifact('confusion_matrix_normalized', '/tmp/plots/cm_norm.png')

ValueError: Number of classes, 6, does not match size of target_names, 2. Try specifying the labels parameter

In [None]:
# fairing:include-cell
# Get LTR Metrics on a test-data set
if os.path.exists('/tmp/data'):        
    shutil.rmtree('/tmp/data')
os.mkdir('/tmp/data')

# Get predictions from the model
st = time.time()
model_predictions = sort_model.predict(test_dataset)
print('Process took {}secs'.format(time.time() - st))

# Construct a pandas dataframe with scores and target
st = time.time()
model_results_df = pd.DataFrame()
for f, t in test_dataset:
    temp = pd.DataFrame()
    temp[target_column] = t
    temp['customer_id'] = f['customer_id']
    model_results_df = model_results_df.append(temp)
print('Process took {}secs'.format(time.time() - st))
model_results_df['predicted'] = model_predictions
prediction_column = 'predicted'

# Call the function to evaluate LTR metrics
model_metrics = pd.DataFrame()
model_hit_rate, model_ndcg = evaluation_utilities.get_ltr_metrics(model_results_df, 
                                                                         max_rank, 
                                                                         target_column, 
                                                                         prediction_column)
model_metrics['hit_rate'] = model_hit_rate
model_metrics['ndcg'] = model_ndcg

modeldb_expt_run.log_metrics({'HR@10': model_hit_rate[9], 'NDCG@10': model_ndcg[9]})
model_ndcg.to_csv(f'/tmp/data/{modeldb_expt_run.name}.csv')
modeldb_expt_run.log_artifact('LTRMetrics', f'/tmp/data/{modeldb_expt_run.name}.csv')

In [None]:
# fairing:include-cell
modeldb_expt_run.log_metric('experiment_run_duration_in_secs', (time.time() - start_time))            

### Run below cells to get information about the model

In [None]:
#a_sort_model = get_sort_model(feature_normalizer)

In [None]:
#a_sort_model.summary()

In [None]:
#sort_model.get_layer(index=-1).summary()