In [1]:
import tensorflow as tf

In [2]:
# Display the installed TensorFlow version as this cell's output.
tf.__version__

'1.3.0'

In [4]:
tf_version = tf.__version__
print("TensorFlow version: {}".format(tf_version))
# Compare (major, minor) numerically: as plain strings, "1.10.0" sorts
# before "1.3" and would wrongly fail the check.
tf_major_minor = tuple(int(part) for part in tf_version.split(".")[:2])
assert tf_major_minor >= (1, 3), "TensorFlow r1.3 or later is needed"

TensorFlow version: 1.3.0


## Introducing The Datasets

Datasets is a new way to create input pipelines to TensorFlow models. This API is much more performant than using feed_dict or the queue-based pipelines, and it's cleaner and easier to use. 

![](datasets.jpg)

Where:
  - `Dataset`: Base class containing methods to create and transform datasets.   
    It also allows you to initialize a dataset from data in memory, or from a Python generator.
  - `TextLineDataset`: Reads lines from text files.
  - `TFRecordDataset`: Reads records from `TFRecord` files.
  - `FixedLengthRecordDataset`: Reads fixed size records from binary files.
  - `Iterator`: Provides a way to access one dataset element at a time.

In [9]:
import os
from urllib.request import urlopen

In [17]:
# Local directories: downloaded CSV files and the estimator's model_dir.
PATH_DATASET = "./dataset"
PATH_MODEL = './model'

# Remote locations of the Iris training and test CSV files.
URL_TRAIN = "http://download.tensorflow.org/data/iris_training.csv"
URL_TEST = "http://download.tensorflow.org/data/iris_test.csv"

# Local copies of those files, stored inside PATH_DATASET.
FILE_TRAIN = os.sep.join([PATH_DATASET, "iris_training.csv"])
FILE_TEST = os.sep.join([PATH_DATASET, "iris_test.csv"])

In [7]:
def downloadDataset(url, file):
    """Download `url` to the local path `file` unless that file already exists.

    url: Address of the resource to fetch.
    file: Destination path. Missing parent directories are created.
    """
    if os.path.exists(file):
        return  # already downloaded; nothing to do

    # Create the directory the file actually lives in; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    target_dir = os.path.dirname(file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        data = response.read()

    # The with-block closes the file; no explicit close() is needed.
    with open(file, "wb") as f:
        f.write(data)

In [10]:
# Fetch both CSV files.
for source_url, target_path in ((URL_TRAIN, FILE_TRAIN), (URL_TEST, FILE_TEST)):
    downloadDataset(source_url, target_path)

In [11]:
# Set TensorFlow's logging threshold to INFO.
tf.logging.set_verbosity(tf.logging.INFO)

## Representing our dataset

In [3]:
# Names given to the four feature columns, in the order they are parsed.
feature_names = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']

In [14]:
# create an input reading a file using the Dataset API
# Then provide the results to Estimator API

def my_input_fn(file_path, perform_shuffle=False, repeat_count=1,
                batch_size=32, shuffle_buffer_size=256):
    """
    Build a Dataset-based input pipeline over a CSV file.

    file_path: The data file to read.
    perform_shuffle: Whether the record order should be randomized.
    repeat_count: The number of times to iterate over the records in the dataset.
                For example, if we specify 1, then each record is read once.
                If we specify None, iteration will continue forever.
    batch_size: Number of records per batch (default 32).
    shuffle_buffer_size: Size of the window of elements (read into memory)
                used for randomizing when perform_shuffle is True (default 256).

    Returns a (batch_features, batch_labels) tuple, where batch_features is a
    dict keyed by the names in `feature_names`.
    """

    def decode_csv(line):
        # Four float columns followed by one integer column.
        parsed_line = tf.decode_csv(line, [[0.], [0.], [0.], [0.], [0]])
        # Slice instead of deleting from the parsed list in place.
        features = parsed_line[:-1]
        label = parsed_line[-1:]  # last column, kept as a one-element list
        return dict(zip(feature_names, features)), label

    dataset = (tf.contrib.data.TextLineDataset(file_path)
               .skip(1)  # skip header row
               .map(decode_csv))

    if perform_shuffle:
        # Randomizes input using a window of shuffle_buffer_size elements
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.repeat(repeat_count)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()

    return batch_features, batch_labels

The return value of `my_input_fn` must be a two-element tuple organized as follows:
  - `batch_features` must be a dict in which each key is an input feature name and each value is the list of that feature's values for the training batch.
  - `batch_labels` is a 1-d tensor of labels for the training batch.

In [15]:
# Build the input pipeline with shuffling enabled; next_batch is the
# (batch_features, batch_labels) pair returned by my_input_fn.
next_batch = my_input_fn(FILE_TRAIN, True)

In [16]:
# Describe the inputs to our model: every feature is numeric, so each
# name in feature_names is turned into one numeric_column.
feature_columns = list(map(tf.feature_column.numeric_column, feature_names))

### Introducing Estimators

Estimators is a high-level API that reduces much of the boilerplate code you previously needed to write when training a TensorFlow model. Estimators are also very flexible, allowing you to override the default behavior if you have specific requirements for your model.  

There are two possible ways you can build your model using Estimators:
  - `Pre-made Estimator`: These are predefined estimators, created to generate a specific type of model.
  - `Estimator (base class)`: Gives you complete control of how your model should be created by using a `model_fn` function.
  
![](estimator.jpg)

In [18]:
# Create a deep neural network classifier (3 output classes)
# Use the DNNClassifier pre-made estimator
classifier = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[10, 10], # Two layers, each with 10 neurons
    n_classes=3,
    model_dir=PATH_MODEL)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './model', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


## Training the model

In [19]:
# Train the model using the my_input_fn function defined earlier.
# The training examples are read from FILE_TRAIN with shuffling enabled,
# and training stops after 8 passes over the data (epochs).

classifier.train(
    input_fn=lambda: my_input_fn(FILE_TRAIN, perform_shuffle=True, repeat_count=8))

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into ./model/model.ckpt.
INFO:tensorflow:loss = 52.255, step = 1
INFO:tensorflow:Saving checkpoints for 30 into ./model/model.ckpt.
INFO:tensorflow:Loss for final step: 12.1572.


<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1a17c52e80>

The Estimator requires an `input_fn` that takes no arguments, so we wrap the call to `my_input_fn` in a `lambda`.

## Evaluating trained model

In [22]:
# Evaluate the model on the examples contained in FILE_TEST.
# The returned dict holds evaluation metrics such as loss and average_loss.

evaluate_result = classifier.evaluate(
    input_fn=lambda: my_input_fn(FILE_TEST, perform_shuffle=False, repeat_count=4))
print("Evaluation results")
for key, value in evaluate_result.items():
    print("   {}, was: {}".format(key, value))

INFO:tensorflow:Starting evaluation at 2017-10-14-08:02:02
INFO:tensorflow:Restoring parameters from ./model/model.ckpt-30
INFO:tensorflow:Finished evaluation at 2017-10-14-08:02:02
INFO:tensorflow:Saving dict for global step 30: accuracy = 0.7, average_loss = 0.448881, global_step = 30, loss = 13.4664
Evaluation results
   accuracy, was: 0.699999988079071
   average_loss, was: 0.44888123869895935
   loss, was: 13.466437339782715
   global_step, was: 30


## Predicting with the trained model

In [23]:
# Predict the type of the Iris flowers listed in the test file.
predict_results = classifier.predict(
    input_fn=lambda: my_input_fn(FILE_TEST, perform_shuffle=False, repeat_count=1))
print("Predictions on test file")
for prediction in predict_results:
    # Prints the predicted class id: 0, 1, or 2, meaning Iris Setosa,
    # Versicolor, or Virginica, respectively.
    predicted_class = prediction["class_ids"][0]
    print(predicted_class)

Predictions on test file
INFO:tensorflow:Restoring parameters from ./model/model.ckpt-30
2
2
0
2
2
1
0
2
2
2
2
0
2
2
1
0
1
0
0
2
0
2
2
2
1
2
0
2
2
1


In [24]:
# Let's create a dataset for prediction
# We've taken the first 3 examples in FILE_TEST
prediction_input = [[5.9, 3.0, 4.2, 1.5],  # -> 1, Iris Versicolor
                    [6.9, 3.1, 5.4, 2.1],  # -> 2, Iris Virginica
                    [5.1, 3.3, 1.7, 0.5]]  # -> 0, Iris Setosa

In [26]:
def new_input_fn():
    """Input function that serves the in-memory `prediction_input` rows.

    Returns a (features, None) pair: features is a dict keyed by the names in
    `feature_names`; the second element is None because there are no labels
    at prediction time.
    """
    def to_feature_dict(row):
        # Split each row into 4 pieces, one per feature name.
        pieces = tf.split(row, 4)
        return dict(zip(feature_names, pieces))

    rows = tf.contrib.data.Dataset.from_tensor_slices(prediction_input)
    one_shot = rows.map(to_feature_dict).make_one_shot_iterator()
    return one_shot.get_next(), None

In [27]:
# Predict all our prediction_input
predict_results = classifier.predict(input_fn=new_input_fn)

# Class id -> species name. The species is spelled "Setosa" (the original
# message said "Sentosa").
species_by_class = {0: "Iris Setosa", 1: "Iris Versicolor"}

print("Predictions on memory")
for idx, prediction in enumerate(predict_results):
    _type = prediction["class_ids"][0]  # the predicted class
    # Any class id other than 0 or 1 is reported as Virginica, as before.
    species = species_by_class.get(int(_type), "Iris Virginica")
    print("I think: {}, is {}".format(prediction_input[idx], species))

Predictions on memory
INFO:tensorflow:Restoring parameters from ./model/model.ckpt-30
I think: [5.9, 3.0, 4.2, 1.5], is Iris Virginica
I think: [6.9, 3.1, 5.4, 2.1], is Iris Virginica
I think: [5.1, 3.3, 1.7, 0.5], is Iris Sentosa


In [30]:
import numpy as np
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Copy graph_def, replacing large constant payloads with a placeholder.

    Every node is copied into a fresh GraphDef. For 'Const' nodes whose
    tensor_content is longer than max_const_size bytes, the content is
    replaced by a short "<stripped N bytes>" marker.
    """
    stripped = tf.GraphDef()
    for source_node in graph_def.node:
        node_copy = stripped.node.add()
        node_copy.MergeFrom(source_node)
        if node_copy.op != 'Const':
            continue
        tensor = node_copy.attr['value'].tensor
        n_bytes = len(tensor.tensor_content)
        if n_bytes > max_const_size:
            tensor.tensor_content = b"<stripped %d bytes>" % n_bytes
    return stripped

def show_graph(graph_def, max_const_size=32):
    """Visualize a TensorFlow graph inline in the notebook.

    graph_def: A GraphDef, or any object exposing `as_graph_def()`.
    max_const_size: Passed through to strip_consts; constants whose content
                    is larger than this are stripped before rendering.

    The graph is embedded in an HTML iframe and shown with IPython's display().
    """
    # Accept anything that can produce a GraphDef.
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    # HTML/JS snippet that hands the text form of the stripped graph to a
    # <tf-graph-basic> element; the element id gets a random suffix.
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    # Double quotes are escaped so the snippet can sit inside the srcdoc attribute.
    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [31]:
# Render the current default TensorFlow graph.
show_graph(tf.get_default_graph())