# tf.data performance sandbox

In [1]:
import os
import urllib

import IPython.display as display
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers

## Define the model

In [2]:
# Small dense classifier: flatten each 28x28x1 input, apply one 128-unit ReLU
# layer, then a 10-way softmax output.
model = keras.Sequential()
model.add(layers.Flatten(input_shape=(28, 28, 1)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               100480    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________


## Create input pipelines

In [29]:
def get_dataset(folder, repeat_count=20, batch_size=2):
    """Build a batched tf.data pipeline over every TFRecord file in `folder`.

    Each record is parsed as an example holding an int64 `label` and a
    PNG-encoded `image`; the image is decoded before batching.

    Args:
        folder: Directory (any path `tf.io.gfile` can list) containing the
            TFRecord files. Every entry returned by the listing is read.
        repeat_count: Number of times the parsed records are repeated.
            Defaults to 20, the value that used to be hard-coded.
        batch_size: Number of consecutive examples per batch. Defaults to 2,
            the value that used to be hard-coded.

    Returns:
        A `tf.data.Dataset` yielding `(images, labels)` batches.
    """
    feature_description = {
        'label': tf.io.FixedLenFeature([], tf.int64, default_value=0),
        'image': tf.io.FixedLenFeature([], tf.string, default_value=''),
    }

    def _parse_function(example_proto):
        """Parse one serialized example into a (decoded image, label) pair."""
        parsed_example = tf.io.parse_single_example(example_proto, feature_description)
        return tf.io.decode_png(parsed_example['image']), parsed_example['label']

    # Sort so the record order does not depend on whatever order the
    # filesystem listing happens to return.
    filenames = sorted(
        os.path.join(folder, file) for file in tf.io.gfile.listdir(folder)
    )

    ds = tf.data.TFRecordDataset(filenames)
    ds = ds.map(_parse_function)
    # Cache the parsed examples once, *before* repeating. Caching after
    # repeat/batch would store every repetition and would only pay off on a
    # second full pass; here repetitions 2..N are served from the cache.
    # The emitted batches are the same as with the old ordering.
    ds = ds.cache()
    ds = ds.repeat(repeat_count)
    ds = ds.batch(batch_size)

    return ds


In [30]:
# Training currently reads the small subset; point `train_files` at
# '/tmp/mnist/train' to use the full training directory instead.
train_files = '/tmp/mnist/train_small'
valid_files = '/tmp/mnist/test'

# Build both pipelines with the same helper (train first, then validation).
train_ds, valid_ds = (
    get_dataset(data_dir) for data_dir in (train_files, valid_files)
)

In [31]:
# TensorBoard log location. Despite the `local_dir` name this is a GCS path,
# so all filesystem calls go through tf.io.gfile.
local_dir = 'gs://jk-mlops-workspace/logs'

# Start from an empty log directory: remove any previous contents, then
# recreate it.
if tf.io.gfile.exists(local_dir):
    tf.io.gfile.rmtree(local_dir)
tf.io.gfile.makedirs(local_dir)

# profile_batch selects which batches the profiler samples.
# histogram_freq is intentionally left at its default.
tboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=local_dir,
    profile_batch='1,40',
)
callbacks = [tboard_callback]

# epochs, validation_data, steps_per_epoch and validation_steps are all left
# at their defaults for this profiling run.
model.fit(x=train_ds, callbacks=callbacks)



<tensorflow.python.keras.callbacks.History at 0x7f99d47cdad0>

## Writing tfrecords

In [8]:
# Read the source records from the full training directory rather than from
# `train_files`. When `train_files` is '/tmp/mnist/train_small' (the directory
# written to below), the first file listed there can be the output file
# itself, which would then be opened for writing while it is being read.
# NOTE(review): assumes the full training set lives in '/tmp/mnist/train' - confirm.
source_dir = '/tmp/mnist/train'
# Sort the listing so "first file" is reproducible between runs.
input_filename = os.path.join(source_dir, sorted(tf.io.gfile.listdir(source_dir))[0])
output_filename = '/tmp/mnist/train_small/mnist-train-small.tfrecord'

# Make sure the destination directory exists before a writer opens the file.
tf.io.gfile.makedirs(os.path.dirname(output_filename))

In [12]:
# Stream the raw serialized records of the chosen source file.
dataset = tf.data.TFRecordDataset(filenames=[input_filename])

In [15]:
# Copy the first 8 serialized records into the small TFRecord file, echoing
# each record's raw bytes as it is written.
with tf.io.TFRecordWriter(output_filename) as writer:
    for record in dataset.take(8):
        serialized = record.numpy()
        print(serialized)
        writer.write(serialized)

b'\n\xa8\x02\n\x0e\n\x05label\x12\x05\x1a\x03\n\x01\x04\n\x95\x02\n\x05image\x12\x8b\x02\n\x88\x02\n\x85\x02\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x1c\x00\x00\x00\x1c\x08\x00\x00\x00\x00Wf\x80H\x00\x00\x00\xccIDAT(\x91c`\x18\xf4 \xe4_*\x8c\xc9\x84!\x19\xf5_\x08\xa7F\xf9\xef\xa7d1u2220000\xe4\xb1\xdd{\x8c)\xe9x\xde\x82\x81\x81\x81A\x97\xe1\x02\x16\xf3,\xff\x96300\xc8\xfc\xfc \t\x17B\xe8|\xc9\xc0\xc0\xc0\xc0\x10\xc8z\xf59\x16Ia\x06\x06\x06\x06\x06)\x86\x03\x0cX$\x03\x18\x19\x18\x18\xa43\x19\xe7!$\x19a\x0c\xf6\'B\x97\x8f\x0b\xe9\xa9]4\xf9\x87\xe9\x9e\xc4\xbf\x7f\xff\xfe\xfb\xfb\xf7o\x18\x92\x18\x0b\x8ca\xfa}\xee\xb3woW3\xec\xc0\xe2\x11\x08\x08\xf9\xb7\x06\x99\x8b\x1a\xb6Q\xffO\xe3\xd4\xc8\xf0\xf6\x8f\x05N9\xa3\x8f\xdbpk\xdc\xfd\xefg&N;\xff\xff\xbf\xba\x86\x01\x17x\xfcI\x01\xb7\xb1on\xe3\x96\xa3"\x00\x00\xf0q7\xda\xb4\x7f\xabC\x00\x00\x00\x00IEND\xaeB`\x82'
b'\n\xf6\x01\n\x0e\n\x05label\x12\x05\x1a\x03\n\x01\x01\n\xe3\x01\n\x05image\x12\xd9\x01\n\xd6\x01\n\xd3\x01\x89PNG\r\n\x1a\n

In [16]:
# Re-open the file that was just written so its contents can be checked.
dataset = tf.data.TFRecordDataset(filenames=[output_filename])

In [19]:
# Print every record and report how many were read; `count` stays 0 when the
# dataset is empty.
count = 0
for count, record in enumerate(dataset, start=1):
    print(record)

print('**********')
print(count)

tf.Tensor(b'\n\xa8\x02\n\x0e\n\x05label\x12\x05\x1a\x03\n\x01\x04\n\x95\x02\n\x05image\x12\x8b\x02\n\x88\x02\n\x85\x02\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x1c\x00\x00\x00\x1c\x08\x00\x00\x00\x00Wf\x80H\x00\x00\x00\xccIDAT(\x91c`\x18\xf4 \xe4_*\x8c\xc9\x84!\x19\xf5_\x08\xa7F\xf9\xef\xa7d1u2220000\xe4\xb1\xdd{\x8c)\xe9x\xde\x82\x81\x81\x81A\x97\xe1\x02\x16\xf3,\xff\x96300\xc8\xfc\xfc \t\x17B\xe8|\xc9\xc0\xc0\xc0\xc0\x10\xc8z\xf59\x16Ia\x06\x06\x06\x06\x06)\x86\x03\x0cX$\x03\x18\x19\x18\x18\xa43\x19\xe7!$\x19a\x0c\xf6\'B\x97\x8f\x0b\xe9\xa9]4\xf9\x87\xe9\x9e\xc4\xbf\x7f\xff\xfe\xfb\xfb\xf7o\x18\x92\x18\x0b\x8ca\xfa}\xee\xb3woW3\xec\xc0\xe2\x11\x08\x08\xf9\xb7\x06\x99\x8b\x1a\xb6Q\xffO\xe3\xd4\xc8\xf0\xf6\x8f\x05N9\xa3\x8f\xdbpk\xdc\xfd\xefg&N;\xff\xff\xbf\xba\x86\x01\x17x\xfcI\x01\xb7\xb1on\xe3\x96\xa3"\x00\x00\xf0q7\xda\xb4\x7f\xabC\x00\x00\x00\x00IEND\xaeB`\x82', shape=(), dtype=string)
tf.Tensor(b'\n\xf6\x01\n\x0e\n\x05label\x12\x05\x1a\x03\n\x01\x01\n\xe3\x01\n\x05image\x12