In [1]:
import itertools
import math
import multiprocessing
import shutil

import tensorflow as tf
import tensorflow.contrib.data as data
from tensorflow.contrib.learn import learn_runner
from tensorflow.contrib.learn import make_export_strategy
from tensorflow.python.feature_column import feature_column

print(tf.__version__)

1.3.0


## Steps to use the TF Estimator APIs
1. Define dataset **metadata**
2. Define a **data input function** to read the data from the source + **apply pre-processing**
3. Create TF **feature columns** based on metadata + **extended feature columns**
4. Define a **model function** with the required **feature columns, ops, & parameters**
5. Define a **serving function**
6. Run **Experiment** by supplying training and validation data, as well as required parameters
7. **Evaluate** the model using test data
8. Perform **predictions**

In [2]:
# CSV files used for training, validation (during the experiment) and final testing.
train_data_files = ['data/train-data.csv']
valid_data_files = ['data/valid-data.csv']
test_data_files = ['data/test-data.csv']

# Name of the model; used to build the model directory under trained_models/.
model_name = 'reg-model-04'

# Run switches:
#   resume     - when False, the existing model directory is removed before training.
#   train      - when True, the experiment is run via learn_runner.
#   preprocess - when True, the constructed numeric features (x_2, y_2, xy, dist_xy)
#                are added by process_features and included in the feature columns.
resume = False
train = True
preprocess = True

## 1. Define Dataset Metadata

In [3]:
# Column names of the CSV files, in file order, and one default value per column.
# The defaults are passed as record_defaults when rows are decoded.
HEADER = ['key', 'x', 'y', 'alpha', 'beta', 'target']
HEADER_DEFAULTS = [[0], [0.0], [0.0], ['NA'], ['NA'], [0.0]]

NUMERIC_FEATURE_NAMES = ['x', 'y']

# Categorical feature name -> list of the values in its vocabulary.
CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {
    'alpha': ['ax01', 'ax02'],
    'beta': ['bx01', 'bx02'],
}
CATEGORICAL_FEATURE_NAMES = [name for name in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY]

FEATURE_NAMES = [*NUMERIC_FEATURE_NAMES, *CATEGORICAL_FEATURE_NAMES]

TARGET_NAME = 'target'

# Header columns that are neither input features nor the target.
_non_feature_columns = set(HEADER) - set(FEATURE_NAMES)
UNUSED_FEATURE_NAMES = list(_non_feature_columns - {TARGET_NAME})

print("Header: {}".format(HEADER))
print("Numeric Features: {}".format(NUMERIC_FEATURE_NAMES))
print("Categorical Features: {}".format(CATEGORICAL_FEATURE_NAMES))
print("Target: {}".format(TARGET_NAME))
print("Unused Features: {}".format(UNUSED_FEATURE_NAMES))

Header: ['key', 'x', 'y', 'alpha', 'beta', 'target']
Numeric Features: ['x', 'y']
Categorical Features: ['alpha', 'beta']
Target: target
Unused Features: ['key']


## 2. Define Data Input Function

### a. parsing and preprocessing logic

In [4]:
def parse_csv_row(csv_row):
    """Decode CSV text into a (features, target) pair.

    Args:
        csv_row: string tensor holding CSV line(s) whose columns follow HEADER.

    Returns:
        A tuple (features, target). `features` is a dict keyed by column name
        with the columns in UNUSED_FEATURE_NAMES and the target removed;
        `target` is the tensor of the TARGET_NAME column.
    """
    decoded_columns = tf.decode_csv(csv_row, record_defaults=HEADER_DEFAULTS)
    features = dict(zip(HEADER, decoded_columns))

    # Discard the columns that are not model inputs.
    for unused_name in UNUSED_FEATURE_NAMES:
        del features[unused_name]

    target = features.pop(TARGET_NAME)
    return features, target

def process_features(features):
    """Add the constructed numeric features when the `preprocess` flag is set.

    Args:
        features: dict of feature tensors containing at least 'x' and 'y'.

    Returns:
        The same dict object. When `preprocess` is true it has been extended
        in place with 'x_2', 'y_2', 'xy' and 'dist_xy'; otherwise it is
        returned unchanged.
    """
    if not preprocess:
        return features

    x_value = features['x']
    y_value = features['y']

    features["x_2"] = tf.square(x_value)
    features["y_2"] = tf.square(y_value)
    features["xy"] = tf.multiply(x_value, y_value)
    # Square root of the squared difference between x and y.
    features['dist_xy'] = tf.sqrt(tf.squared_difference(x_value, y_value))

    return features

### b. data pipeline input function

In [5]:
def csv_input_fn(file_names, mode=tf.estimator.ModeKeys.EVAL, 
                 skip_header_lines=0, 
                 num_epochs=None, 
                 batch_size=200):
    """Build the CSV input pipeline and return its next (features, target) batch.

    Args:
        file_names: CSV file name(s) passed to TextLineDataset.
        mode: a ModeKeys value; lines are shuffled only when it equals
            tf.estimator.ModeKeys.TRAIN.
        skip_header_lines: number of leading lines to skip.
            NOTE(review): the skip is applied once to the dataset built from
            all file names, not once per file - confirm before passing several
            files that each carry a header.
        num_epochs: passed to Dataset.repeat; the default None repeats the
            data indefinitely.
        batch_size: number of lines per batch; also sizes the shuffle buffer.

    Returns:
        A tuple (features, target) of tensors obtained from a one-shot iterator.
    """
    dataset = data.TextLineDataset(filenames=file_names).skip(skip_header_lines)

    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)

    # Lines are batched first, so parsing and feature construction run per batch.
    dataset = (
        dataset
        .batch(batch_size)
        .map(lambda csv_row: parse_csv_row(csv_row))
        .map(lambda features, target: (process_features(features), target))
        .repeat(num_epochs)
    )

    features, target = dataset.make_one_shot_iterator().get_next()
    return features, target

In [6]:
# Sanity check: build the input pipeline once (default EVAL mode) and show the
# feature names and the target tensor it yields.
features, target = csv_input_fn(file_names=train_data_files)
print("Feature read from CSV: {}".format(list(features.keys())))
print("Target read from CSV: {}".format(target))

Feature read from CSV: ['alpha', 'beta', 'dist_xy', 'x', 'x_2', 'xy', 'y', 'y_2']
Target read from CSV: Tensor("IteratorGetNext:8", shape=(?,), dtype=float32)


## 3. Define Feature Columns

In [7]:
def get_feature_columns():
    """Create the TF feature columns, keyed by feature name.

    Returns:
        A dict holding, in this order: a numeric column for every name in
        NUMERIC_FEATURE_NAMES (plus the constructed features when `preprocess`
        is set), a vocabulary-list categorical column for every entry of
        CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY, and the crossed column
        'alpha_X_beta'.
    """
    constructed_numeric_feature_names = ['x_2', 'y_2', 'xy', 'dist_xy']

    numeric_feature_names = list(NUMERIC_FEATURE_NAMES)
    if preprocess:
        numeric_feature_names.extend(constructed_numeric_feature_names)

    feature_columns = {}

    for feature_name in numeric_feature_names:
        feature_columns[feature_name] = tf.feature_column.numeric_column(feature_name)

    for feature_name, vocabulary in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items():
        feature_columns[feature_name] = \
            tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary)

    # apply feature extensions: cross alpha with beta into 4 hash buckets
    feature_columns['alpha_X_beta'] = tf.feature_column.crossed_column(
        [feature_columns['alpha'], feature_columns['beta']], 4)

    return feature_columns

# Build the feature columns once at notebook level to inspect them.
feature_columns = get_feature_columns()
print("Feature Columns: {}".format(feature_columns))

Feature Columns: {'x': _NumericColumn(key='x', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'y': _NumericColumn(key='y', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'x_2': _NumericColumn(key='x_2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'y_2': _NumericColumn(key='y_2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'xy': _NumericColumn(key='xy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'dist_xy': _NumericColumn(key='dist_xy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'alpha': _VocabularyListCategoricalColumn(key='alpha', vocabulary_list=('ax01', 'ax02'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'beta': _VocabularyListCategoricalColumn(key='beta', vocabulary_list=('bx01', 'bx02'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'alpha_X_beta': _CrossedColumn(keys=(_VocabularyListCategoricalColumn(key=

## 4. Define Model Function

In [8]:
def regression_model_fn(features, labels, mode, params):
    """Model function of a fully-connected regression network.

    Args:
        features: dict of feature tensors from the input or serving function.
        labels: target tensor; only read in the TRAIN and EVAL modes.
        mode: a tf.estimator.ModeKeys value.
        params: hyper-parameters object; only `hidden_units` is read here.

    Returns:
        A tf.estimator.EstimatorSpec. In PREDICT mode it carries the
        predictions under 'scores' plus the export outputs; otherwise it
        carries the loss, the training op and the 'rmse' evaluation metric.
    """
    hidden_units = params.hidden_units
    output_layer_size = 1

    feature_columns = list(get_feature_columns().values())

    dense_columns = [
        column for column in feature_columns
        if isinstance(column, feature_column._NumericColumn)
    ]

    # Crossed columns match neither filter, so they are not fed to the network.
    categorical_columns = [
        column for column in feature_columns
        if isinstance(column, (feature_column._VocabularyListCategoricalColumn,
                               feature_column._BucketizedColumn))
    ]

    # Categorical columns are one-hot encoded so they can enter a dense input layer.
    indicator_columns = [
        tf.feature_column.indicator_column(column)
        for column in categorical_columns
    ]

    # Create the input layers from the features
    input_layer = tf.feature_column.input_layer(features=features,
                                                feature_columns=dense_columns + indicator_columns)

    # Create a fully-connected layer-stack based on the hidden_units in the params
    hidden_layers = tf.contrib.layers.stack(inputs=input_layer,
                                            layer=tf.contrib.layers.fully_connected,
                                            stack_args=hidden_units)

    # Connect the output layer (logits) to the hidden layer (no activation fn)
    logits = tf.layers.dense(inputs=hidden_layers,
                             units=output_layer_size)

    # Drop only the unit dimension. Squeezing every size-1 dimension would also
    # remove the batch dimension when a batch holds a single example.
    output = tf.squeeze(logits, axis=1)

    # Provide an estimator spec for `ModeKeys.PREDICT`.
    if mode == tf.estimator.ModeKeys.PREDICT:

        predictions = {
            'scores': output
        }

        export_outputs = {
            'predictions': tf.estimator.export.PredictOutput(predictions)
        }

        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs=export_outputs)

    # Calculate loss using mean squared error
    loss = tf.losses.mean_squared_error(labels, output)

    # Create Optimiser
    optimizer = tf.train.AdamOptimizer()

    # Create training operation
    train_op = optimizer.minimize(
        loss=loss, global_step=tf.train.get_global_step())

    # Calculate root mean squared error as additional eval metric
    eval_metric_ops = {
        "rmse": tf.metrics.root_mean_squared_error(
            labels, output)
    }

    # Provide an estimator spec for `ModeKeys.EVAL` and `ModeKeys.TRAIN` modes.
    estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                loss=loss,
                                                train_op=train_op,
                                                eval_metric_ops=eval_metric_ops)
    return estimator_spec


def create_estimator(run_config, hparams):
    """Create an Estimator around regression_model_fn.

    Args:
        run_config: run configuration handed to the Estimator as `config`.
        hparams: hyper-parameters handed to the model function as `params`.

    Returns:
        The constructed tf.estimator.Estimator.
    """
    estimator = tf.estimator.Estimator(
        model_fn=regression_model_fn,
        config=run_config,
        params=hparams,
    )
    return estimator

## 5. Define Serving Function

In [9]:
def csv_serving_input_fn():
    """Serving input function that accepts raw CSV rows.

    Each row is expected to hold the columns x, y, alpha, beta (no key and no
    target). The rows are decoded and passed through process_features, as the
    training rows are.

    Returns:
        A tf.estimator.export.ServingInputReceiver whose receiver tensor is
        the string placeholder registered under the key 'rows'.
    """
    serving_header = ['x','y','alpha','beta']
    serving_header_defaults = [[0.0], [0.0], ['NA'], ['NA']]

    rows_string_tensor = tf.placeholder(
        dtype=tf.string, shape=[None], name='rows_string_tensor')

    # Add a trailing dimension before decoding the rows.
    row_columns = tf.expand_dims(rows_string_tensor, -1)
    decoded_columns = tf.decode_csv(row_columns, record_defaults=serving_header_defaults)
    serving_features = process_features(dict(zip(serving_header, decoded_columns)))

    return tf.estimator.export.ServingInputReceiver(
        serving_features, {'rows': rows_string_tensor})

## 6. Run Experiment

### a. Define Experiment Function

In [10]:
def generate_experiment_fn(**experiment_args):
    """Return an experiment function suitable for learn_runner.run.

    Args:
        **experiment_args: extra keyword arguments forwarded unchanged to
            tf.contrib.learn.Experiment (e.g. export_strategies).

    Returns:
        A function (run_config, hparams) -> tf.contrib.learn.Experiment.
    """

    def _experiment_fn(run_config, hparams):

        def train_input_fn():
            # Training data: uses the epochs and batch size from hparams.
            return csv_input_fn(
                train_data_files,
                mode=tf.contrib.learn.ModeKeys.TRAIN,
                num_epochs=hparams.num_epochs,
                batch_size=hparams.batch_size
            )

        def eval_input_fn():
            # Validation data: num_epochs is left at csv_input_fn's default.
            return csv_input_fn(
                valid_data_files,
                mode=tf.contrib.learn.ModeKeys.EVAL,
                batch_size=hparams.batch_size
            )

        estimator = create_estimator(run_config, hparams)

        experiment = tf.contrib.learn.Experiment(
            estimator,
            train_input_fn=train_input_fn,
            eval_input_fn=eval_input_fn,
            **experiment_args
        )
        return experiment

    return _experiment_fn


### b. Set HParam and RunConfig

In [11]:
# Hyper-parameters shared by the input functions and the model function.
# num_epochs and batch_size are read when the experiment's input functions are built;
# hidden_units is read by regression_model_fn.
# NOTE(review): dropout_prob is not read by regression_model_fn - confirm whether
# dropout was meant to be applied.
hparams  = tf.contrib.training.HParams(
    num_epochs = 1000,
    batch_size = 500,
    hidden_units=[8, 4], 
    dropout_prob = 0.1)

# Directory that holds the checkpoints and exports of this model.
model_dir = 'trained_models/{}'.format(model_name)

run_config = tf.contrib.learn.RunConfig(
            model_dir=model_dir
)

print(run_config.model_dir)

trained_models/reg-model-04


### c. Run Experiment via learn_runner

In [12]:
# Start from a clean model directory unless training is being resumed.
if not resume:
    print("Removing previous artifacts...")
    # ignore_errors=True: a missing directory is not an error.
    shutil.rmtree(model_dir, ignore_errors=True)
else:
    print("Resuming training...") 

if train:
    tf.logging.set_verbosity(tf.logging.INFO)
    
    print("")
    print("Starting experiment...") 
    print("") 
    
    # Run the train_and_evaluate schedule. The export strategy writes a SavedModel
    # that uses csv_serving_input_fn.
    learn_runner.run(
        experiment_fn=generate_experiment_fn(

            export_strategies=[make_export_strategy(
                csv_serving_input_fn,
                exports_to_keep=1
            )]
        ),
        run_config=run_config,
        schedule="train_and_evaluate",
        hparams=hparams
    )
    
    print("") 
    print("experiment finished...") 

Removing previous artifacts...

Starting experiment...

INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11d1919b0>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'trained_models/reg-model-04'}
Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into trained_models/reg-model-04/model.ckpt.
INFO:tensorflow:Starting evaluation at 2017-10-29-19:57:34
INFO:tensorflow:Rest

INFO:tensorflow:loss = 97.7146, step = 2801 (0.779 sec)
INFO:tensorflow:global_step/sec: 125.089
INFO:tensorflow:loss = 115.12, step = 2901 (0.799 sec)
INFO:tensorflow:global_step/sec: 131.757
INFO:tensorflow:loss = 99.0083, step = 3001 (0.759 sec)
INFO:tensorflow:global_step/sec: 118.013
INFO:tensorflow:loss = 90.8786, step = 3101 (0.847 sec)
INFO:tensorflow:global_step/sec: 145.937
INFO:tensorflow:loss = 112.087, step = 3201 (0.685 sec)
INFO:tensorflow:global_step/sec: 148.126
INFO:tensorflow:loss = 100.422, step = 3301 (0.675 sec)
INFO:tensorflow:global_step/sec: 151.065
INFO:tensorflow:loss = 105.145, step = 3401 (0.662 sec)
INFO:tensorflow:global_step/sec: 150.198
INFO:tensorflow:loss = 100.598, step = 3501 (0.665 sec)
INFO:tensorflow:global_step/sec: 145.582
INFO:tensorflow:loss = 110.852, step = 3601 (0.687 sec)
INFO:tensorflow:global_step/sec: 146.474
INFO:tensorflow:loss = 93.2669, step = 3701 (0.683 sec)
INFO:tensorflow:global_step/sec: 141.12
INFO:tensorflow:loss = 85.7701, 

INFO:tensorflow:global_step/sec: 151.31
INFO:tensorflow:loss = 91.617, step = 11301 (0.661 sec)
INFO:tensorflow:global_step/sec: 156.91
INFO:tensorflow:loss = 106.425, step = 11401 (0.637 sec)
INFO:tensorflow:global_step/sec: 154.961
INFO:tensorflow:loss = 98.8104, step = 11501 (0.645 sec)
INFO:tensorflow:global_step/sec: 156.563
INFO:tensorflow:loss = 90.868, step = 11601 (0.639 sec)
INFO:tensorflow:global_step/sec: 143.684
INFO:tensorflow:loss = 100.791, step = 11701 (0.696 sec)
INFO:tensorflow:global_step/sec: 145.266
INFO:tensorflow:loss = 99.4279, step = 11801 (0.688 sec)
INFO:tensorflow:global_step/sec: 150.992
INFO:tensorflow:loss = 101.837, step = 11901 (0.662 sec)
INFO:tensorflow:global_step/sec: 94.5234
INFO:tensorflow:loss = 110.169, step = 12001 (1.059 sec)
INFO:tensorflow:global_step/sec: 134.259
INFO:tensorflow:loss = 111.614, step = 12101 (0.744 sec)
INFO:tensorflow:global_step/sec: 151.113
INFO:tensorflow:loss = 99.4724, step = 12201 (0.662 sec)
INFO:tensorflow:global_s

INFO:tensorflow:global_step/sec: 154.636
INFO:tensorflow:loss = 96.0326, step = 19701 (0.647 sec)
INFO:tensorflow:global_step/sec: 141.28
INFO:tensorflow:loss = 101.932, step = 19801 (0.708 sec)
INFO:tensorflow:global_step/sec: 143.682
INFO:tensorflow:loss = 75.4871, step = 19901 (0.695 sec)
INFO:tensorflow:global_step/sec: 143.136
INFO:tensorflow:loss = 66.9889, step = 20001 (0.699 sec)
INFO:tensorflow:global_step/sec: 146.708
INFO:tensorflow:loss = 93.4751, step = 20101 (0.682 sec)
INFO:tensorflow:global_step/sec: 137.781
INFO:tensorflow:loss = 97.3343, step = 20201 (0.726 sec)
INFO:tensorflow:global_step/sec: 143.854
INFO:tensorflow:loss = 91.9645, step = 20301 (0.695 sec)
INFO:tensorflow:global_step/sec: 137.603
INFO:tensorflow:loss = 94.275, step = 20401 (0.728 sec)
INFO:tensorflow:global_step/sec: 135.634
INFO:tensorflow:loss = 87.0706, step = 20501 (0.736 sec)
INFO:tensorflow:global_step/sec: 138.93
INFO:tensorflow:loss = 100.39, step = 20601 (0.720 sec)
INFO:tensorflow:global_s

INFO:tensorflow:Restoring parameters from trained_models/reg-model-04/model.ckpt-24000
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b'trained_models/reg-model-04/export/Servo/1509307227/saved_model.pb'

experiment finished...


## 7. Evaluate the Model

In [13]:
# Evaluate on the test file using one batch of up to test_size rows (steps=1).
# NOTE(review): assumes the test file holds test_size rows - confirm.
test_size = 5000

test_input_fn = lambda: csv_input_fn(file_names= test_data_files, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= test_size)

# A new estimator over the same run_config, i.e. the same model directory.
estimator = create_estimator(run_config, hparams)
results = estimator.evaluate(input_fn=test_input_fn, steps=1)
print("")
print(results)
# Report the "rmse" metric rounded to 5 decimal places.
rmse = round(results["rmse"],5)
print("")
print("RMSE: {}".format(rmse))

INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11d1919b0>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'trained_models/reg-model-04'}
INFO:tensorflow:Starting evaluation at 2017-10-29-20:00:29
INFO:tensorflow:Restoring parameters from trained_models/reg-model-04/model.ckpt-24000
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2017-10-29-20:00:29
INFO:tensorflow:Saving dict for global step 24000: global_step = 24000, loss = 99.2471, rmse = 9.96228

{'loss': 99

## 8. Prediction

In [14]:
# Predict on the test file in batches of 5 rows. num_epochs=1 makes the input
# end after one pass over the file, so the prediction generator is finite;
# with the input function's default (None) the data would repeat forever.
predict_input_fn = lambda: csv_input_fn(file_names= test_data_files, 
                                      mode= tf.estimator.ModeKeys.PREDICT,
                                      num_epochs= 1,
                                      batch_size= 5)

predictions = estimator.predict(input_fn=predict_input_fn)
print("")
# Show only the first five predictions.
print(list(itertools.islice(predictions, 5)))


INFO:tensorflow:Restoring parameters from trained_models/reg-model-04/model.ckpt-24000
[{'scores': 47.889103}, {'scores': -4.0050755}, {'scores': 17.920849}, {'scores': 2.5622969}, {'scores': 2.7062178}]
