In [1]:
import tensorflow as tf
import shutil
import math
import multiprocessing
from tensorflow.python.feature_column import feature_column
# Show the installed TensorFlow version so readers can match it to the APIs used below.
print(tf.__version__)

1.3.0


## Steps to use the TF Estimator APIs
1. Define dataset **metadata**
2. Define **data input function** to read the data from the source + **apply pre-processing**
3. Create TF **feature columns** based on metadata + **extended feature columns**
4. Instantiate an **estimator** with the required **feature columns & parameters**
5. **Train** estimator using training data
6. **Evaluate** estimator using test data
7. Perform **predictions**
8. **Save & Load** the estimator

In [2]:
# Glob patterns locating the CSV files of each data split.
# NOTE: valid_data_files is not referenced by any cell shown in this notebook.
train_data_files = 'data/train-*.csv'
valid_data_files = 'data/valid-*.csv'
test_data_files = 'data/test-*.csv'

# Names the sub-directory of trained_models/ that holds this model's checkpoints.
model_name = 'reg-model-01'

# resume=False deletes the model directory before the estimator is used;
# train=False skips the estimator.train() call in the training cell.
resume = True
train = False

## 1. Define Dataset Metadata

In [3]:
# Column names, in the order they appear in every CSV row.
HEADER = ['key', 'x', 'y', 'alpha', 'beta', 'target']
# One default per HEADER column, passed to tf.decode_csv as record_defaults.
HEADER_DEFAULTS = [[0], [0.0], [0.0], ['NA'], ['NA'], [0.0]]

NUMERIC_FEATURE_NAMES = ['x', 'y']

# Categorical feature name -> list of its known vocabulary values.
CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {'alpha': ['ax01', 'ax02'], 'beta': ['bx01', 'bx02']}
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY)

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

TARGET_NAME = 'target'

# Columns that are neither a feature nor the target. Filtering HEADER (instead of
# subtracting sets) keeps the result in a stable, reproducible column order.
UNUSED_FEATURE_NAMES = [
    name for name in HEADER
    if name not in FEATURE_NAMES and name != TARGET_NAME
]

print("Header: {}".format(HEADER))
print("Numeric Features: {}".format(NUMERIC_FEATURE_NAMES))
print("Categorical Features: {}".format(CATEGORICAL_FEATURE_NAMES))
print("Target: {}".format(TARGET_NAME))
print("Unused Features: {}".format(UNUSED_FEATURE_NAMES))

Header: ['key', 'x', 'y', 'alpha', 'beta', 'target']
Numeric Features: ['x', 'y']
Categorical Features: ['alpha', 'beta']
Target: target
Unused Features: ['key']


## 2. Define Data Input Function

In [4]:
def process_features(features):
    """Add the engineered numeric features to a feature dict.

    The dict is updated in place with 'x_2', 'y_2', 'xy' and 'dist_xy', all
    derived from its 'x' and 'y' entries, and the same dict is returned.
    """
    x, y = features['x'], features['y']

    features.update({
        "x_2": tf.square(x),
        "y_2": tf.square(y),
        "xy": tf.multiply(x, y),
        # Square root of the squared difference of x and y.
        'dist_xy': tf.sqrt(tf.squared_difference(x, y)),
    })

    return features

In [5]:
def csv_input_fn(file_names, mode=tf.estimator.ModeKeys.EVAL,
                 skip_header_lines=0,
                 num_epochs=None,
                 batch_size=200):
    """Build the queue-based CSV input pipeline and return (features, target).

    Args:
        file_names: file pattern handed to tf.train.match_filenames_once.
        mode: a tf.estimator.ModeKeys value; the input is shuffled only for TRAIN.
        skip_header_lines: number of lines the reader skips at the top of each file.
        num_epochs: forwarded to tf.train.string_input_producer.
        batch_size: number of rows requested per read and emitted per batch.

    Returns:
        A tuple (features, target): the feature dict after process_features,
        and the tensor stored under TARGET_NAME.
    """
    shuffle = mode == tf.estimator.ModeKeys.TRAIN

    input_file_names = tf.train.match_filenames_once(file_names)

    # Echo the matched-filenames variable; it shows up in the cell output.
    print(input_file_names)

    filename_queue = tf.train.string_input_producer(
        input_file_names, num_epochs=num_epochs, shuffle=shuffle)

    line_reader = tf.TextLineReader(skip_header_lines=skip_header_lines)
    _, csv_lines = line_reader.read_up_to(filename_queue, num_records=batch_size)

    # Parse each line into one tensor per HEADER column.
    parsed_columns = tf.decode_csv(
        tf.expand_dims(csv_lines, -1), record_defaults=HEADER_DEFAULTS)
    features = dict(zip(HEADER, parsed_columns))

    # Drop the columns that are neither features nor target.
    for unused_name in UNUSED_FEATURE_NAMES:
        del features[unused_name]

    # Settings shared by the shuffling and the non-shuffling batcher.
    batching_options = dict(
        capacity=batch_size * 10,
        num_threads=multiprocessing.cpu_count(),
        enqueue_many=True,
        allow_smaller_final_batch=True,
    )

    if shuffle:
        features = tf.train.shuffle_batch(
            features,
            batch_size,
            min_after_dequeue=2 * batch_size + 1,
            **batching_options
        )
    else:
        features = tf.train.batch(features, batch_size, **batching_options)

    # Separate the label from the inputs before the engineered features are added.
    target = features.pop(TARGET_NAME)

    return process_features(features), target

In [6]:
# Build the input graph once (default EVAL mode) to inspect what the input function yields.
features, target = csv_input_fn(file_names=train_data_files)
print("Feature read from CSV: {}".format(list(features)))
print("Target read from CSV: {}".format(target))

<tf.Variable 'matching_filenames:0' shape=<unknown> dtype=string_ref>
Feature read from CSV: ['alpha', 'beta', 'x', 'y', 'x_2', 'y_2', 'xy', 'dist_xy']
Target read from CSV: Tensor("batch:2", shape=(?, 1), dtype=float32)


## 3. Define Feature Columns

In [7]:
def get_feature_columns():
    """Create the feature columns for the raw, engineered and crossed features.

    Returns:
        dict mapping a feature name to its feature column:
        numeric columns for NUMERIC_FEATURE_NAMES and for the features added
        by process_features, vocabulary-list categorical columns for every
        entry of CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY, and the crossed
        column 'alpha_X_beta'.
    """
    # Names of the numeric features that process_features adds to the input.
    CONSTRUCTED_NUMERIC_FEATURES_NAMES = ['x_2', 'y_2', 'xy', 'dist_xy']
    all_numeric_feature_names = NUMERIC_FEATURE_NAMES + CONSTRUCTED_NUMERIC_FEATURES_NAMES

    numeric_columns = {
        feature_name: tf.feature_column.numeric_column(feature_name)
        for feature_name in all_numeric_feature_names
    }

    categorical_column_with_vocabulary = {
        feature_name: tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary)
        for feature_name, vocabulary in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items()
    }

    # Both dicts come from comprehensions and are never None, so they are
    # merged unconditionally.
    feature_columns = {}
    feature_columns.update(numeric_columns)
    feature_columns.update(categorical_column_with_vocabulary)

    # apply feature extensions
    feature_columns['alpha_X_beta'] = tf.feature_column.crossed_column(
        [feature_columns['alpha'], feature_columns['beta']], hash_bucket_size=4)

    return feature_columns

# Build the feature columns once and print them for inspection.
feature_columns = get_feature_columns()
print("Feature Columns: {}".format(feature_columns))

Feature Columns: {'x': _NumericColumn(key='x', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'y': _NumericColumn(key='y', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'x_2': _NumericColumn(key='x_2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'y_2': _NumericColumn(key='y_2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'xy': _NumericColumn(key='xy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'dist_xy': _NumericColumn(key='dist_xy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 'alpha': _VocabularyListCategoricalColumn(key='alpha', vocabulary_list=('ax01', 'ax02'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'beta': _VocabularyListCategoricalColumn(key='beta', vocabulary_list=('bx01', 'bx02'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'alpha_X_beta': _CrossedColumn(keys=(_VocabularyListCategoricalColumn(key=

## 4. Instantiate an Estimator

In [8]:
def create_estimator(run_config, hparams):
    """Build a DNNRegressor over the numeric and indicator-encoded categorical columns.

    Args:
        run_config: run configuration passed to the estimator as `config`.
        hparams: object exposing `hidden_units` and `dropout_prob`.

    Returns:
        The configured tf.estimator.DNNRegressor.
    """
    feature_columns = list(get_feature_columns().values())

    dense_columns = [
        column for column in feature_columns
        if isinstance(column, feature_column._NumericColumn)
    ]

    # A tuple of types is the idiomatic "either type" check; the previous code
    # combined two isinstance() calls with the bitwise `|` operator.
    # NOTE(review): the crossed column 'alpha_X_beta' matches neither type, so it
    # is not passed to the model -- confirm this is intended before changing it,
    # since adding it would alter the network's inputs.
    categorical_columns = [
        column for column in feature_columns
        if isinstance(column, (feature_column._VocabularyListCategoricalColumn,
                               feature_column._BucketizedColumn))
    ]

    # Wrap each categorical column as an indicator column for the DNN.
    indicator_columns = [
        tf.feature_column.indicator_column(column)
        for column in categorical_columns
    ]

    estimator = tf.estimator.DNNRegressor(
        feature_columns=dense_columns + indicator_columns,
        hidden_units=hparams.hidden_units,

        optimizer=tf.train.AdamOptimizer(),
        activation_fn=tf.nn.elu,
        dropout=hparams.dropout_prob,

        config=run_config
    )

    return estimator

## 5. Train the Estimator

In [9]:
# Hyper-parameters: hidden_units and dropout_prob are read by create_estimator,
# num_epochs and batch_size by the training input function.
hparams  = tf.contrib.training.HParams(
    num_epochs = 1000,
    batch_size = 500,
    hidden_units=[8, 4], 
    dropout_prob = 0.1)


# Directory for this model's checkpoints; also the parent of the export directory.
model_dir = 'trained_models/{}'.format(model_name)

run_config = tf.estimator.RunConfig().replace(model_dir=model_dir)
print(run_config.model_dir)

trained_models/reg-model-01


In [10]:
def train_input_fn():
    """Input function for training: TRAIN-mode batches from the training CSV files."""
    return csv_input_fn(file_names=train_data_files,
                        mode=tf.estimator.ModeKeys.TRAIN,
                        num_epochs=hparams.num_epochs,
                        batch_size=hparams.batch_size)


estimator = create_estimator(run_config, hparams)

# Unless resuming, clear the model directory so no earlier checkpoint is reused.
if not resume:
    shutil.rmtree(model_dir, ignore_errors=True)

tf.logging.set_verbosity(tf.logging.INFO)

# Training is optional so the notebook can be re-run against existing checkpoints.
if train:
    estimator.train(input_fn=train_input_fn)

INFO:tensorflow:Using config: {'_model_dir': 'trained_models/reg-model-01', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


## 6. Evaluate the Model

In [11]:
test_size = 5000


def test_input_fn():
    """Input function for evaluation: EVAL-mode batches of test_size rows from the test CSV files."""
    return csv_input_fn(file_names=test_data_files,
                        mode=tf.estimator.ModeKeys.EVAL,
                        batch_size=test_size)


# Evaluate on a single step, i.e. one batch of at most test_size rows.
results = estimator.evaluate(input_fn=test_input_fn, steps=1)
print("")
print(results)

# RMSE is the square root of the reported average loss, rounded to 5 decimals.
rmse = round(math.sqrt(results["average_loss"]), 5)
print("")
print("RMSE: {}".format(rmse))

<tf.Variable 'matching_filenames:0' shape=<unknown> dtype=string_ref>
INFO:tensorflow:Starting evaluation at 2017-10-27-16:40:34
INFO:tensorflow:Restoring parameters from trained_models/reg-model-01/model.ckpt-24000
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2017-10-27-16:40:35
INFO:tensorflow:Saving dict for global step 24000: average_loss = 119.773, global_step = 24000, loss = 598865.0

{'average_loss': 119.77299, 'loss': 598864.94, 'global_step': 24000}

RMSE: 10.94408


## 7. Prediction

In [12]:
# import itertools

# predict_input_fn = lambda: csv_input_fn(file_names= test_data_files, 
#                                       mode= tf.estimator.ModeKeys.PREDICT,
#                                       batch_size= 5)

# predictions = estimator.predict(input_fn=predict_input_fn)
# print("")
# print(list(itertools.islice(predictions, 5)))

## 8. Save & Load the Model

In [17]:
# Base directory for SavedModel exports; each export lands in its own numbered
# sub-directory (e.g. export/1509126103 in the output of this cell).
export_dir = model_dir + "/export"

def csv_serving_input_fn():
    """Serving input receiver that parses raw CSV rows into model features.

    The receiver accepts a batch of strings under the key 'rows'. Each string is
    one CSV row holding only the feature columns (no key, no target), in the
    order they appear in HEADER.

    Returns:
        tf.estimator.export.ServingInputReceiver whose features are the parsed
        columns after process_features.
    """
    # Derive the serving schema from the training metadata instead of repeating
    # it by hand, so the two cannot drift apart. With the current metadata this
    # yields ['x', 'y', 'alpha', 'beta'] and [[0.0], [0.0], ['NA'], ['NA']].
    serving_columns = [
        (name, default)
        for name, default in zip(HEADER, HEADER_DEFAULTS)
        if name in FEATURE_NAMES
    ]
    SERVING_HEADER = [name for name, _ in serving_columns]
    SERVING_HEADER_DEFAULTS = [default for _, default in serving_columns]

    rows_string_tensor = tf.placeholder(dtype=tf.string,
                                        shape=[None],
                                        name='rows_string_tensor')

    receiver_tensor = {'rows': rows_string_tensor}

    # Parse the rows the same way csv_input_fn does: add a trailing dimension, then decode.
    row_columns = tf.expand_dims(rows_string_tensor, -1)
    columns = tf.decode_csv(row_columns, record_defaults=SERVING_HEADER_DEFAULTS)
    features = dict(zip(SERVING_HEADER, columns))

    # Apply the same feature engineering used by the training input function.
    return tf.estimator.export.ServingInputReceiver(
        process_features(features), receiver_tensor)

# Export the estimator as a SavedModel under export_dir, using the CSV serving
# input function; as_text=False requests the non-text SavedModel format.
estimator.export_savedmodel(
    export_dir_base = export_dir,
    serving_input_receiver_fn = csv_serving_input_fn,
    as_text=False
)


INFO:tensorflow:Restoring parameters from trained_models/reg-model-01/model.ckpt-24000
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b'trained_models/reg-model-01/export/1509126103/saved_model.pb'


b'trained_models/reg-model-01/export/1509126103'

In [20]:
%%bash

MODEL_NAME='reg-model-01'
# Take the last export directory listed. Export names look like Unix timestamps
# (e.g. 1509126103), so the last entry should be the most recent -- confirm.
LAST=$(ls "trained_models/${MODEL_NAME}/export" | tail -n 1)
SAVE_MODEL_DIR="trained_models/${MODEL_NAME}/export/${LAST}"
ls "${SAVE_MODEL_DIR}"

# Run a local prediction against the exported model with the CSV rows in data/new-data.csv.
gcloud ml-engine local predict --model-dir="${SAVE_MODEL_DIR}" --text-instances='data/new-data.csv'

saved_model.pb
variables


ERROR: (gcloud.ml-engine.local.predict) RuntimeError: Bad magic number in .pyc file

