<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#import-iris_data" data-toc-modified-id="import-iris_data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>import iris_data</a></span></li><li><span><a href="#Main" data-toc-modified-id="Main-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Main</a></span></li><li><span><a href="#Embedding-Column" data-toc-modified-id="Embedding-Column-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Embedding Column</a></span></li></ul></div>

# Estimators 

- https://www.tensorflow.org/get_started/premade_estimators
- https://github.com/tensorflow/models/tree/master/samples/core/get_started
- https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py

In [1]:
import tensorflow as tf 
import numpy as np 
import pandas as pd 

  from ._conv import register_converters as _register_converters


## import iris_data

In [2]:
# Remote locations of the iris training and test CSV files.
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"

# Column names applied when the CSVs are read; 'Species' is the label column
# that load_data() and _parse_line() split off from the features.
CSV_COLUMN_NAMES = [
    'SepalLength',
    'SepalWidth',
    'PetalLength',
    'PetalWidth',
    'Species',
]

# Human-readable class names, looked up by predicted class id.
SPECIES = [
    'Setosa',
    'Versicolor',
    'Virginica',
]

def maybe_download():
    """Fetch the iris training and test CSVs via `tf.keras.utils.get_file`.

    Each file is requested under the last path component of its URL
    (e.g. ``iris_training.csv``).

    Returns:
        A ``(train_path, test_path)`` tuple holding the values returned by
        `get_file` for TRAIN_URL and TEST_URL, in that order.
    """
    fetched = []
    for url in (TRAIN_URL, TEST_URL):
        # Name the download after the trailing segment of the URL.
        file_name = url.split('/')[-1]
        fetched.append(tf.keras.utils.get_file(file_name, url))

    train_path, test_path = fetched
    return train_path, test_path

def load_data(y_name='Species'):
    """Returns the iris dataset as (train_x, train_y), (test_x, test_y).

    Args:
        y_name: Name of the column to split off as the label series.

    Returns:
        Two ``(features, labels)`` pairs, training first and test second.
        ``features`` is the DataFrame with the ``y_name`` column removed and
        ``labels`` is that removed column.
    """
    def _read_and_split(csv_path):
        # header=0 makes pandas treat the file's first row as a header and
        # replace it with CSV_COLUMN_NAMES.
        frame = pd.read_csv(csv_path, names=CSV_COLUMN_NAMES, header=0)
        # pop() removes the label column from `frame` in place.
        labels = frame.pop(y_name)
        return frame, labels

    train_path, test_path = maybe_download()
    return _read_and_split(train_path), _read_and_split(test_path)

def train_input_fn(features, labels, batch_size):
    """Build the training dataset: shuffled, repeated, and batched.

    Args:
        features: Mapping-like object (e.g. a DataFrame) convertible with
            ``dict(features)`` into a column-name -> values dictionary.
        labels: Labels aligned with the rows of ``features``.
        batch_size: Number of examples per batch.

    Returns:
        A `tf.data.Dataset` of ``(feature_dict, label)`` batches.
    """
    # Slice the (feature dict, labels) pair into a dataset.
    examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle with a buffer of 1000, repeat with no count given, then batch.
    shuffled = examples.shuffle(1000)
    repeated = shuffled.repeat()
    return repeated.batch(batch_size)

def eval_input_fn(features, labels, batch_size):
    """Build the batched dataset used for evaluation or prediction.

    Args:
        features: Mapping-like object convertible with ``dict(features)``.
        labels: Labels aligned with ``features``, or ``None`` when predicting.
        batch_size: Number of examples per batch; must not be ``None``.

    Returns:
        A `tf.data.Dataset` yielding feature batches when ``labels`` is
        ``None``, otherwise ``(features, labels)`` batches.

    Raises:
        AssertionError: If ``batch_size`` is ``None``.
    """
    feature_dict = dict(features)

    # With no labels (prediction), the dataset carries the features alone.
    inputs = feature_dict if labels is None else (feature_dict, labels)

    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    assert batch_size is not None, "batch_size must not be None"
    return dataset.batch(batch_size)


# The remainder of this cell contains a simple example of a csv parser,
#     implemented using the `Dataset` class.

# `tf.decode_csv` sets the types of the outputs to match the examples given in
#     the `record_defaults` argument.
# One default per entry of CSV_COLUMN_NAMES: four floats for the measurements
#     and one int for the 'Species' label.
CSV_TYPES = [[0.0], [0.0], [0.0], [0.0], [0]]

def _parse_line(line):
    """Split one CSV text line into a feature dictionary and its label.

    Args:
        line: A single CSV record, decoded with CSV_TYPES as the
            per-field defaults.

    Returns:
        ``(features, label)`` where ``features`` maps each remaining column
        name to its decoded field and ``label`` is the 'Species' field.
    """
    # One decoded field per entry in CSV_TYPES.
    columns = tf.decode_csv(line, record_defaults=CSV_TYPES)

    # Pair every column name with its decoded field.
    features = {name: value for name, value in zip(CSV_COLUMN_NAMES, columns)}

    # The label travels separately from the features.
    label = features.pop('Species')
    return features, label


def csv_input_fn(csv_path, batch_size):
    """Build a training dataset that reads and parses a CSV file directly.

    Args:
        csv_path: Path of the CSV file to read line by line.
        batch_size: Number of examples per batch.

    Returns:
        A `tf.data.Dataset` of ``(features, label)`` batches produced by
        `_parse_line`, shuffled and repeated.
    """
    text_lines = tf.data.TextLineDataset(csv_path)

    # Skip the first line of the file (presumably the header row).
    records = text_lines.skip(1)

    # Decode every remaining line into (features, label).
    parsed = records.map(_parse_line)

    # Shuffle with a buffer of 1000, repeat with no count given, then batch.
    return parsed.shuffle(1000).repeat().batch(batch_size)

## Main

In [3]:
# Fetch the data: each split is a (features, labels) pair.
train_split, test_split = load_data()
train_x, train_y = train_split
test_x, test_y = test_split

# Report the (rows, columns) shape of each feature frame.
print('train set size:', train_x.shape)
print('test set size:', test_x.shape)

# Preview the first rows of the training features.
train_x.head()

train set size: (120, 4)
test set size: (30, 4)


Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
0,6.4,2.8,5.6,2.2
1,5.0,2.3,3.3,1.0
2,4.9,2.5,4.5,1.7
3,4.9,3.1,1.5,0.1
4,5.7,3.8,1.7,0.3


In [8]:
# Show which container types load_data() produced for features and labels.
print(type(train_x), type(train_y))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [14]:
# Feature columns describe how to use the input.
print(train_x.keys())

# One numeric column per feature name in the training frame.
my_feature_columns = [
    tf.feature_column.numeric_column(key=column_name)
    for column_name in train_x.keys()
]
my_feature_columns

Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'], dtype='object')


[_NumericColumn(key='SepalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='SepalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='PetalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='PetalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [15]:
# Build 2 hidden layer DNN with 10, 10 units respectively.
classifier = tf.estimator.DNNClassifier(
    hidden_units=[10, 10],               # two hidden layers of 10 nodes each
    n_classes=3,                         # the model chooses between 3 classes
    feature_columns=my_feature_columns,  # built from the training frame's keys
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/b9/_mrg0y217l161kz3qt4sgs8xd5_l1w/T/tmpfq_rgi3v', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb17859ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [24]:
# Train the Model.
batch_size = 100     # examples per batch; reused by the evaluate/predict cells
train_steps = 1000   # number of training steps to run

# The estimator calls input_fn itself, so the arguments are bound in a lambda.
classifier.train(
    steps=train_steps,
    input_fn=lambda: train_input_fn(train_x, train_y, batch_size),
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/b9/_mrg0y217l161kz3qt4sgs8xd5_l1w/T/tmpfq_rgi3v/model.ckpt.
INFO:tensorflow:loss = 505.33563, step = 1
INFO:tensorflow:global_step/sec: 682.602
INFO:tensorflow:loss = 18.938951, step = 101 (0.148 sec)
INFO:tensorflow:global_step/sec: 1083.35
INFO:tensorflow:loss = 7.317295, step = 201 (0.091 sec)
INFO:tensorflow:global_step/sec: 1089.38
INFO:tensorflow:loss = 9.6496315, step = 301 (0.092 sec)
INFO:tensorflow:global_step/sec: 1068.79
INFO:tensorflow:loss = 11.800392, step = 401 (0.094 sec)
INFO:tensorflow:global_step/sec: 1019.21
INFO:tensorflow:loss = 6.266605, step = 501 (0.098 sec)
INFO:tensorflow:global_step/sec: 1068.9
INFO:tensorflow:loss = 5.829021, step = 601 (0.093 sec)
INFO:tensorflow:g

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0xb178597f0>

In [27]:
# Evaluate the model on the held-out test split.
eval_result = classifier.evaluate(
    input_fn=lambda: eval_input_fn(test_x, test_y, batch_size),
)

# eval_result is a dict of metrics; only 'accuracy' is reported here.
accuracy_report = '\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result)
print(accuracy_report)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-06-19-21:31:49
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/b9/_mrg0y217l161kz3qt4sgs8xd5_l1w/T/tmpfq_rgi3v/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-06-19-21:31:49
INFO:tensorflow:Saving dict for global step 1000: accuracy = 1.0, average_loss = 0.050819945, global_step = 1000, loss = 1.5245984

Test set accuracy: 1.000



In [28]:
# Generate predictions from the model
# Three hand-written samples; position i in every list belongs to sample i,
# and expected[i] is the species that sample should be classified as.
expected = ['Setosa', 'Versicolor', 'Virginica']
predict_x = dict(
    SepalLength=[5.1, 5.9, 6.9],
    SepalWidth=[3.3, 3.0, 3.1],
    PetalLength=[1.7, 4.2, 5.4],
    PetalWidth=[0.5, 1.5, 2.1],
)

In [32]:
# Run the hand-written samples through the trained classifier.
predictions = classifier.predict(
    input_fn=lambda: eval_input_fn(predict_x,
                                   labels=None,
                                   batch_size=batch_size))
template = '\nPrediction is "{}" ({:.1f}%), expected "{}"'

for pred_dict, expec in zip(predictions, expected):
    # 'class_ids' is a one-element array holding the predicted class index.
    class_id = pred_dict['class_ids'][0]
    # Probability the model assigned to that predicted class.
    probability = pred_dict['probabilities'][class_id]
    species_name = SPECIES[class_id]

    print(template.format(species_name, 100 * probability, expec))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/b9/_mrg0y217l161kz3qt4sgs8xd5_l1w/T/tmpfq_rgi3v/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

Prediction is "Setosa" (99.7%), expected "Setosa"

Prediction is "Versicolor" (99.3%), expected "Versicolor"

Prediction is "Virginica" (96.1%), expected "Virginica"


In [35]:
# Predict again and dump each raw prediction dictionary
# (logits, probabilities, class_ids, classes).
predictions = classifier.predict(
    input_fn=lambda: eval_input_fn(predict_x,
                                   labels=None,
                                   batch_size=batch_size))

# zip() pairs each prediction with an expected label; the label itself is
# not printed in this cell.
for pred_dict, _expected_label in zip(predictions, expected):
    print(pred_dict)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/b9/_mrg0y217l161kz3qt4sgs8xd5_l1w/T/tmpfq_rgi3v/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
{'logits': array([ 11.602174 ,   5.9285774, -20.740744 ], dtype=float32), 'probabilities': array([9.9657625e-01, 3.4237255e-03, 8.9569372e-15], dtype=float32), 'class_ids': array([0]), 'classes': array([b'0'], dtype=object)}
{'logits': array([-5.1370087,  2.4395337, -2.6359398], dtype=float32), 'probabilities': array([5.0888932e-04, 9.9328494e-01, 6.2061688e-03], dtype=float32), 'class_ids': array([1]), 'classes': array([b'1'], dtype=object)}
{'logits': array([-10.939845 ,   0.5340964,   3.7451155], dtype=float32), 'probabilities': array([4.0293733e-07, 3.8753141e-02, 9.6124643e-01], dtype=float32), 'class_ids': array([2]), 'classes': array([b'2'], dtype=object)}


## Embedding Column

A bit off topic, but the following is a general guideline on how to use the `tf.feature_column.embedding_column` function.

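In the cells below, `df` and `your_column` are placeholders: substitute your own DataFrame and the name of the categorical column of interest.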

In [None]:
# convert your_column to a categorical_column whose vocabulary is the set of
# distinct values found in that column
categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(
    key="your_column",
    vocabulary_list=df.your_column.unique(),
)

In [None]:
# calculate embedding dimension: the fourth root of the number of distinct
# values in the column, rounded up to a whole number
import math
number_of_categories = len(df.your_column.unique())
print(number_of_categories)
embedding_dimensions = math.ceil(number_of_categories ** 0.25)
print(embedding_dimensions)

In [None]:
# convert to embedding
embedded_col = tf.feature_column.embedding_column(
    categorical_column,
    dimension=embedding_dimensions,
)
# add to feature columns
# NOTE: append() mutates the list, so re-running this cell adds another copy.
my_feature_columns.append(embedded_col)

# feed my_feature_columns into the DNNClassifier or other estimators.