# Babyweight Estimation with Transformed Data

In [128]:
import os

import tensorflow as tf
from tensorflow import data


from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema

from tensorflow_transform.tf_metadata import metadata_io
from tensorflow_transform.beam.tft_beam_io import transform_fn_io

In [3]:
!pip list | grep 'tensorflow'
!pip list | grep 'beam'
!pip list | grep 'cloud-dataflow'

tensorflow                         1.10.0     
tensorflow-hub                     0.1.0      
tensorflow-model-analysis          0.6.0      
tensorflow-transform               0.8.0      
apache-beam                        2.5.0      
google-cloud-dataflow              2.5.0      


In [73]:
BUCKET = 'ksalama-gcs-cloudml'
PROJECT = 'ksalama-gcp-playground'
REGION = 'europe-west1'
ROOT_DIR =  'babyweight_tft'
RUN_LOCAL = True

OUTPUT_DIR = ROOT_DIR if RUN_LOCAL==True else "gs://{}/{}".format(BUCKET,ROOT_DIR)
TRANSFORM_ARTEFACTS_DIR = os.path.join(OUTPUT_DIR,'transform')
TRANSFORMED_DATA_DIR = os.path.join(OUTPUT_DIR,'transformed')
TEMP_DIR = os.path.join(OUTPUT_DIR, 'tmp')
MODELS_DIR = os.path.join(OUTPUT_DIR,'models')

## Transform Metadata

In [81]:
transformed_metadata = transformed_metadata = metadata_io.read_metadata(
        os.path.join(TRANSFORM_ARTEFACTS_DIR,"transformed_metadata"))

TARGET_FEATURE_NAME = 'weight_pounds'

print transformed_metadata.schema

Schema({'_column_schemas': {u'mother_race': ColumnSchema({'_domain': IntDomain({'_min_value': -1, '_max_value': 10, '_vocabulary_file': u'', '_is_categorical': True, '_dtype': tf.int64}), '_axes': [], '_representation': FixedColumnRepresentation(None)}), u'weight_pounds': ColumnSchema({'_domain': FloatDomain({'_dtype': tf.float32}), '_axes': [], '_representation': FixedColumnRepresentation(None)}), u'is_male': ColumnSchema({'_domain': IntDomain({'_min_value': -1, '_max_value': 1, '_vocabulary_file': u'', '_is_categorical': True, '_dtype': tf.int64}), '_axes': [], '_representation': FixedColumnRepresentation(None)}), u'gestation_weeks_scaled': ColumnSchema({'_domain': FloatDomain({'_dtype': tf.float32}), '_axes': [], '_representation': FixedColumnRepresentation(None)}), u'mother_age_normalized': ColumnSchema({'_domain': FloatDomain({'_dtype': tf.float32}), '_axes': [], '_representation': FixedColumnRepresentation(None)}), u'mother_age_log': ColumnSchema({'_domain': FloatDomain({'_dtype'

## Input Function

In [118]:
def tfrecords_input_fn(files_name_pattern, transformed_metadata,
                       mode=tf.estimator.ModeKeys.EVAL,  
                       num_epochs=1, 
                       batch_size=500):
    
    dataset = tf.contrib.data.make_batched_features_dataset(
        file_pattern=files_name_pattern,
        batch_size=batch_size,
        features=transformed_metadata.schema.as_feature_spec(),
        reader=tf.data.TFRecordDataset,
        num_epochs=num_epochs,
        shuffle=True if mode == tf.estimator.ModeKeys.TRAIN else False,
        shuffle_buffer_size=1+(batch_size*2),
        prefetch_buffer_size=1
    )
    
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    target = features.pop(TARGET_FEATURE_NAME)
    return features, target

## Feature columns

In [119]:
def create_wide_and_deep_feature_columns(transformed_metadata, extend=False):
    
    deep_feature_columns = []
    wide_feature_columns = []
    
    column_schemas = transformed_metadata.schema.column_schemas
    for feature_name in column_schemas:
        if feature_name == TARGET_FEATURE_NAME:
            continue
        column_schema = column_schemas[feature_name]
        if isinstance(column_schema._domain, dataset_schema.FloatDomain):
            deep_feature_columns.append(tf.feature_column.numeric_column(feature_name))
        elif isinstance(column_schema._domain, dataset_schema.IntDomain):
            if column_schema._domain._is_categorical:
                wide_feature_columns.append(
                    tf.feature_column.categorical_column_with_identity(
                        feature_name, 
                        num_buckets=column_schema._domain._max_value+1)
                )
                
     
    if extend==True:
        mother_race_X_mother_age_bucketized = tf.feature_column.crossed_column(
            ['mother_age_bucketized', 'mother_race_index'],  55)
        
        wide_feature_columns.append(mother_race_X_mother_age_bucketized)
        
        mother_race_X_mother_age_bucketized_embedded = tf.feature_column.embedding_column(mother_race_X_mother_age_bucketized, 5)
        deep_feature_columns.append(mother_race_X_mother_age_bucketized_embedded)
        
    return wide_feature_columns, deep_feature_columns

wide,deep = create_wide_and_deep_feature_columns(transformed_metadata, True)
print wide
print ""
print deep

[_IdentityCategoricalColumn(key=u'mother_race', num_buckets=11, default_value=None), _IdentityCategoricalColumn(key=u'is_male', num_buckets=2, default_value=None), _IdentityCategoricalColumn(key=u'mother_age_bucketized', num_buckets=5, default_value=None), _CrossedColumn(keys=('mother_age_bucketized', 'mother_race_index'), hash_bucket_size=55, hash_key=None)]

[_NumericColumn(key=u'gestation_weeks_scaled', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key=u'mother_age_normalized', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key=u'mother_age_log', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _EmbeddingColumn(categorical_column=_CrossedColumn(keys=('mother_age_bucketized', 'mother_race_index'), hash_bucket_size=55, hash_key=None), dimension=5, combiner='mean', layer_creator=<function _creator at 0x122985488>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainabl

## Estimator

In [120]:
def create_estimator(run_config, hparams):
  
    wide_feature_columns, deep_feature_columns = create_wide_and_deep_feature_columns(transformed_metadata, 
                                                                                      hparams.extend_feature_columns)
    estimator = tf.estimator.DNNLinearCombinedRegressor(
                linear_feature_columns = wide_feature_columns,
                dnn_feature_columns = deep_feature_columns,
                dnn_hidden_units=hparams.hidden_units,
                config = run_config
                )
    
    return estimator

## Experiment

In [121]:
hparams  = tf.contrib.training.HParams(
    num_epochs=10,
    batch_size=500,
    hidden_units=[32, 16],
    max_steps=100,
    extend_feature_columns=False,
    evaluate_after_sec=10
)

model_dir = os.path.join(MODELS_DIR,"dnn_estimator")
run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    model_dir=model_dir
)

In [122]:
train_data_files = os.path.join(TRANSFORMED_DATA_DIR, "train-*.tfrecords")
eval_data_files = os.path.join(TRANSFORMED_DATA_DIR, "eval-*.tfrecords")

# TrainSpec
train_spec = tf.estimator.TrainSpec(
  input_fn = lambda: tfrecords_input_fn(train_data_files,transformed_metadata,
    mode=tf.estimator.ModeKeys.TRAIN,
    num_epochs= hparams.num_epochs,
    batch_size = hparams.batch_size
  ),
  max_steps=hparams.max_steps,
)

# EvalSpec
eval_spec = tf.estimator.EvalSpec(
  input_fn =lambda: tfrecords_input_fn(eval_data_files,transformed_metadata),
  steps = None,
  throttle_secs = hparams.evaluate_after_sec # evalute after each 10 training seconds!
)

In [123]:
from datetime import datetime

if tf.gfile.Exists(model_dir):
    tf.gfile.DeleteRecursively(model_dir)

estimator = create_estimator(run_config, hparams)

tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.utcnow() 
print("")
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 


tf.estimator.train_and_evaluate(
  estimator,
  train_spec,
  eval_spec
)


time_end = datetime.utcnow() 
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_global_id_in_cluster': 0, '_session_config': None, '_keep_checkpoint_max': 5, '_tf_random_seed': 19830610, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x122d0c910>, '_model_dir': 'babyweight_tft/models/dnn_estimator', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_device_fn': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}

Experiment started at 14:34:13
.......................................
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or

## Raw data metadata

In [129]:
CATEGORICAL_FEATURE_NAMES = ['is_male', 'mother_race']
NUMERIC_FEATURE_NAMES = ['mother_age', 'plurality', 'gestation_weeks']
TARGET_FEATURE_NAME = 'weight_pounds'
KEY_COLUMN = 'key'

def create_raw_metadata():  
    
    raw_data_schema = {}
    
    # key feature scehma
    raw_data_schema[KEY_COLUMN]= dataset_schema.ColumnSchema(
        tf.float32, [], dataset_schema.FixedColumnRepresentation())
    
    # target feature scehma
    raw_data_schema[TARGET_FEATURE_NAME]= dataset_schema.ColumnSchema(
        tf.float32, [], dataset_schema.FixedColumnRepresentation())
    
    # categorical features scehma
    raw_data_schema.update({ column_name : dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation())
                            for column_name in CATEGORICAL_FEATURE_NAMES})
    
    # numerical features scehma
    raw_data_schema.update({ column_name : dataset_schema.ColumnSchema(
        tf.float32, [], dataset_schema.FixedColumnRepresentation())
                            for column_name in NUMERIC_FEATURE_NAMES})
    
      # create dataset_metadata given raw_schema
    raw_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.Schema(raw_data_schema))
    
    return raw_metadata

import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(create_raw_metadata().schema.as_feature_spec())

{   'gestation_weeks': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'is_male': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
    'key': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'mother_age': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'mother_race': FixedLenFeature(shape=[], dtype=tf.string, default_value=None),
    'plurality': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'weight_pounds': FixedLenFeature(shape=[], dtype=tf.float32, default_value=None)}


## Export Estimator to SavedModel

In [153]:
def serving_input_receiver_fn():
    
    from tensorflow_transform.saved import saved_transform_io
    
    # get the feature_spec of raw data
    raw_metadata = create_raw_metadata()
    raw_input_features = raw_metadata.schema.as_batched_placeholders()
    raw_input_features.pop(TARGET_FEATURE_NAME)
    raw_input_features.pop(KEY_COLUMN)

    # apply tranform_fn on raw features
    _, transformed_features = (
        saved_transform_io.partially_apply_saved_transform(
            os.path.join(TRANSFORM_ARTEFACTS_DIR,transform_fn_io.TRANSFORM_FN_DIR),
        raw_input_features)
    )
    
    return tf.estimator.export.ServingInputReceiver(
        transformed_features, raw_input_features)



export_dir = os.path.join(model_dir, 'export')

if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)
        
estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_receiver_fn=serving_input_receiver_fn
)

value: "\n\013\n\tConst_2:0\022\013mother_race"

value: "\n\013\n\tConst_6:0\022\007is_male"

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures EXCLUDED from export because they cannot be be served via TensorFlow Serving APIs:
INFO:tensorflow:'serving_default' : Regression input must be a single string Tensor; got {'gestation_weeks': <tf.Tensor 'gestation_weeks:0' shape=(?,) dtype=float32>, 'is_male': <tf.Tensor 'is_male:0' shape=(?,) dtype=string>, 'mother_race': <tf.Tensor 'mother_race:0' shape=(?,) dtype=string>, 'plurality': <tf

'babyweight_tft/models/dnn_estimator/export/1535468534'

In [150]:
saved_model_dir=os.path.join(export_dir, os.listdir(export_dir)[0])

print saved_model_dir

def estimate_local(instance):
 
    predictor_fn = tf.contrib.predictor.from_saved_model(
        export_dir=saved_model_dir,
        signature_def_key="predict"
    )
    
    instance = dict((k, [v]) for k, v in instance.items())
    value = predictor_fn(instance)['predictions'][0][0]
    return value

instance = {
        'is_male': 'True',
        'mother_age': 26.0,
        'mother_race': 'Asian Indian',
        'plurality': 1.0,
        'gestation_weeks': 39
}

prediction = estimate_local(instance)
print(prediction)

babyweight_tft/models/dnn_estimator/export/1535468014
INFO:tensorflow:Restoring parameters from babyweight_tft/models/dnn_estimator/export/1535468014/variables/variables
1.7166116


In [151]:
%%bash

saved_models_base=babyweight_tft/models/dnn_estimator/export/
saved_model_dir=${saved_models_base}$(ls ${saved_models_base} | tail -n 1)
echo ${saved_model_dir}
ls ${saved_model_dir}
saved_model_cli show --dir=${saved_model_dir} --all

babyweight_tft/models/dnn_estimator/export/1535468014
assets
saved_model.pb
variables

MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:

signature_def['predict']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['gestation_weeks'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1)
        name: gestation_weeks:0
    inputs['is_male'] tensor_info:
        dtype: DT_STRING
        shape: (-1)
        name: is_male:0
    inputs['mother_age'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1)
        name: mother_age:0
    inputs['mother_race'] tensor_info:
        dtype: DT_STRING
        shape: (-1)
        name: mother_race:0
    inputs['plurality'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1)
        name: plurality:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['predictions'] tensor_info:
        dtype: DT_FLOAT
        shape: (-1, 1)
        name: add:0
  Method name is