In [0]:
import numpy as np
import skimage
print(np.__version__)
print(skimage.__version__)

import time, math
from tqdm import tqdm_notebook as tqdm

import tensorflow as tf
import tensorflow.contrib.eager as tfe
import gc
print(tf.__version__)

import matplotlib.pyplot as plt
% matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


from PIL import Image
import random

[mnist_tfrecord](https://keras.io/examples/mnist_tfrecord/)

[tensorflows-tfrecord-to-train-keras-model](https://www.dlology.com/blog/how-to-leverage-tensorflows-tfrecord-to-train-keras-model/)

# Cifar10 dataset

In [0]:
# Load the CIFAR-10 train/test splits.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
classes = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]
len_train, len_test = len(x_train), len(x_test)

# Flatten the label arrays to 1-D int64 vectors (one entry per image).
y_train = y_train.astype('int64').reshape(len_train)
y_test = y_test.astype('int64').reshape(len_test)

# Mean / std reduced over axes 0-2, i.e. one value per entry of the last axis.
# Both are computed on the training images before any padding is applied.
train_mean = np.mean(x_train, axis=(0, 1, 2))
train_std = np.std(x_train, axis=(0, 1, 2))


def normalize(x):
    """Standardize `x` with the training mean/std and return it as float32."""
    # TODO: check here
    standardized = (x - train_mean) / train_std
    return standardized.astype('float32')


def pad4(x):
    """Reflect-pad axes 1 and 2 of a 4-D array by 4 elements on each side."""
    pad_widths = [(0, 0), (4, 4), (4, 4), (0, 0)]
    return np.pad(x, pad_widths, mode='reflect')


# Training images are padded first and then normalized; test images are only
# normalized.
x_train = normalize(pad4(x_train))
x_test = normalize(x_test)

In [0]:
# Display the shape of the normalized test images.
x_test.shape

# Feature Description

In [0]:
def _bytes_feature(value):
    """Wrap one string / bytes value in a ``tf.train.Feature`` (bytes_list)."""
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
    """Wrap one float / double value in a ``tf.train.Feature`` (float_list)."""
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
    """Wrap one bool / enum / int / uint value in a ``tf.train.Feature`` (int64_list)."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

def serialize_function(x, y):
    """Serialize one (image, label) pair into a ``tf.train.Example`` byte string.

    Args:
        x: NumPy array whose raw buffer is stored under the ``'image'`` key.
            Neither the dtype nor the shape is recorded, so whoever reads the
            record must decode with the same dtype and reshape on its own.
        y: Scalar label convertible with ``int()``; stored under ``'label'``.

    Returns:
        The serialized ``tf.train.Example`` as ``bytes``.
    """
    # ndarray.tostring() is a deprecated alias that was removed in NumPy 2.0;
    # tobytes() returns exactly the same raw buffer.
    image_bytes = tf.compat.as_bytes(x.tobytes())
    example = tf.train.Example(features=tf.train.Features(
        feature={
            'image': _bytes_feature(image_bytes),
            'label': _int64_feature(int(y)),
        }))
    return example.SerializeToString()



# Mount Google drive

In [0]:
# from google.colab import drive
# drive.mount('/content/gdrive/My\ Drive')

In [0]:
# !mkdir ./gdrive/My\ Drive/tf_data

In [0]:
!ls -lh

In [0]:
# !ls gdrive/My\ Drive/tf_data

In [0]:
def convert_to_tfrecord(data_set, file_name):
    """Write every (x, y) pair of `data_set` to a TFRecord file.

    Args:
        data_set: Iterable of (x, y) pairs. A pair whose members are not both
            NumPy values is converted with ``.numpy()`` before serialization.
        file_name: Path of the TFRecord file to create.
    """
    numpy_types = (np.ndarray, np.generic)
    with tf.python_io.TFRecordWriter(file_name) as record_writer:
        for x, y in data_set:
            # No per-record printing here: a debug print inside this loop
            # emits one line per example and floods the notebook output.
            if not (isinstance(x, numpy_types) and isinstance(y, numpy_types)):
                x, y = x.numpy(), y.numpy()
            record_writer.write(serialize_function(x, y))

In [0]:
path = './test.tfrecords'
# zip() returns a one-shot iterator: once a writer has consumed it, re-running
# the write cell would silently produce an empty TFRecord file. Materializing
# the pairs in a list makes the write cell safe to re-run.
data_to_write = list(zip(x_test, y_test))

In [0]:
# Serialize every pair in `data_to_write` into the TFRecord file at `path`.
convert_to_tfrecord(data_to_write, path)


In [0]:
!ls -lh

# Extract data

# Method 1

In [0]:
def parser(record):
    """Parse one serialized ``tf.train.Example`` into ``({'image': image}, label)``.

    Args:
        record: Scalar string tensor holding a serialized ``tf.train.Example``
            with a bytes ``'image'`` feature and an int64 ``'label'`` feature.

    Returns:
        A tuple ``({'image': image}, label)`` where ``image`` is a flat
        float32 tensor and ``label`` is an int32 scalar.
    """
    keys_to_features = {
        "image": tf.FixedLenFeature([], tf.string),
        "label": tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(record, keys_to_features)
    # The records in this notebook are written from float32 arrays, so the raw
    # bytes must be decoded as float32. Decoding them as uint8 would yield four
    # times as many values, none of them meaningful.
    image = tf.decode_raw(parsed["image"], tf.float32)
    # NOTE: the image stays flat; the written test images are 32x32x3, so use
    # tf.reshape(image, [32, 32, 3]) where the spatial layout is needed.
    label = tf.cast(parsed["label"], tf.int32)

    return {'image': image}, label


def input_foo(filenames):
    """Build the input pipeline for the given TFRecord file(s).

    The records are read with 40 parallel readers, shuffled (buffer 1024,
    seed 1) and repeated, parsed with `parser` and grouped into batches of 32,
    then prefetched to '/GPU:0'.

    Args:
        filenames: File name(s) handed to ``tf.data.TFRecordDataset``.

    Returns:
        The resulting ``tf.data`` dataset.
    """
    dataset = tf.data.TFRecordDataset(filenames=filenames, num_parallel_reads=40)

    shuffle_then_repeat = tf.contrib.data.shuffle_and_repeat(buffer_size=1024, seed=1)
    dataset = dataset.apply(shuffle_then_repeat)

    parse_then_batch = tf.contrib.data.map_and_batch(
        map_func=parser,
        batch_size=32,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    dataset = dataset.apply(parse_then_batch)

    copy_to_gpu = tf.contrib.data.prefetch_to_device('/GPU:0', buffer_size=None)
    return dataset.apply(copy_to_gpu)

In [0]:
??tf.contrib.data.prefetch_to_device

In [0]:
# Build the input pipeline over the TFRecord file at `path`.
kk = input_foo(path)

In [0]:
# Display the dataset object returned by input_foo.
kk

In [0]:
  # Feature schema passed to tf.parse_single_example: a scalar string feature
  # 'image' (default '') and a scalar int64 feature 'label' (default 0).
  feature_description = {
    'image': tf.FixedLenFeature([], tf.string, default_value=''),
    'label': tf.FixedLenFeature([], tf.int64, default_value=0)
    # 'feature2': tf.FixedLenFeature([], tf.string, default_value=''),
    # 'feature3': tf.FixedLenFeature([], tf.float32, default_value=0.0),
        }

def _parse_function(example_proto):
    """Parse one serialized ``tf.train.Example`` into ``(image, label)``.

    Args:
        example_proto: Scalar string tensor holding a serialized example that
            follows the module-level `feature_description` schema.

    Returns:
        A tuple ``(image, label)``: ``image`` is a flat float32 tensor decoded
        from the raw ``'image'`` bytes, ``label`` is the int64 ``'label'``.
    """
    parsed_features = tf.parse_single_example(example_proto, feature_description)
    # The records in this notebook are written from float32 arrays; decoding
    # the bytes as uint8 would produce four times as many meaningless values.
    image = tf.decode_raw(parsed_features['image'], tf.float32)
    return image, parsed_features["label"]

In [0]:
# Glob the TFRecord files in the working directory. The '*' wildcard is
# required: a bare '.tfrecords' pattern only matches a file with exactly that
# name and would therefore never pick up 'test.tfrecords'.
file = tf.data.Dataset.list_files(file_pattern='*.tfrecords')
dataset = tf.data.TFRecordDataset(file)

# Shuffle with a 1000-element buffer, run two passes over the data, parse each
# record against `feature_description` (the image stays a raw byte string
# here), and group the parsed records into batches of 16.
dataset = dataset.shuffle(1000)
dataset = dataset.repeat(2)
dataset = dataset.map(lambda x: tf.parse_single_example(x, feature_description))
dataset = dataset.batch(16)
"""
<DatasetV1Adapter shapes: {image: (?,), label: (?,)}, types: {image: tf.string, label: tf.int64}>
"""

# Other Method

[tf_records_DOC](https://www.tensorflow.org/tutorials/load_data/tf_records)

In [0]:
raw_dataset = tf.data.TFRecordDataset('test.tfrecords')
print(raw_dataset)

# Preview only the first serialized record.
for raw_record in raw_dataset.take(1):
    print(repr(raw_record))

parsed_dataset = raw_dataset.map(_parse_function)
print(parsed_dataset)

# Preview only the first parsed (image, label) pair.
for parsed_record in parsed_dataset.take(1):
    print(repr(parsed_record))

In [0]:
# Create a one-shot iterator over `dataset`; `feature` is its next element.
# The string below records the value that was observed for `feature`.
iterator = dataset.make_one_shot_iterator()
feature = iterator.get_next()
"""
{'image': <tf.Tensor 'IteratorGetNext_5:0' shape=(?,) dtype=string>,
 'label': <tf.Tensor 'IteratorGetNext_5:1' shape=(?,) dtype=int64>}
"""

# Look at a single record (decoding)

In [0]:
# Iterate over the raw serialized records stored in the file at `path`.
record_iterator = tf.python_io.tf_record_iterator(path)

for string_record in record_iterator:
    # Decode the serialized bytes into a tf.train.Example message.
    example = tf.train.Example()
    example.ParseFromString(string_record)

    print(example)

    # Exit after 1 iteration as this is purely demonstrative.
    # `string_record` and `example` stay bound to the first record afterwards.
    break

In [0]:
# Print the decoded example's feature map converted to a plain dict.
print(dict(example.features.feature))

In [0]:
# Display the 'image' entry of the decoded example's feature map.
example.features.feature['image']

In [0]:
# Parse the single serialized record kept in `string_record`.
keys_to_features = {
    "image": tf.FixedLenFeature([], tf.string),
    "label": tf.FixedLenFeature([], tf.int64),
}
parsed = tf.parse_single_example(string_record, keys_to_features)
# The image bytes were written from a float32 array, so decode them as
# float32. Decoding as uint8 yields 4 * 32 * 32 * 3 values, which cannot be
# reshaped to [32, 32, 3].
image = tf.decode_raw(parsed["image"], tf.float32)
image = tf.reshape(image, shape=[32, 32, 3])
label = tf.cast(parsed["label"], tf.int32)


In [0]:
# Print the decoded image and label tensors.
print(image, label)

In [0]:
# ??tf.data.Dataset.list_files

In [0]:
# Display the raw TFRecord dataset object.
raw_dataset

In [0]:
# Map _parse_function over the raw records, passing AUTOTUNE as the number of
# parallel calls, then display the resulting dataset object.
parsed_dataset = raw_dataset.map(_parse_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
parsed_dataset

In [0]:
import pandas as pd
import tensorflow as tf

def _bytes_feature(value):
    """Build a ``tf.train.Feature`` whose bytes_list holds the given string / bytes."""
    bytes_list = tf.train.BytesList(value=[value])
    feature = tf.train.Feature(bytes_list=bytes_list)
    return feature

def _float_feature(value):
    """Build a ``tf.train.Feature`` whose float_list holds the given float / double."""
    float_list = tf.train.FloatList(value=[value])
    feature = tf.train.Feature(float_list=float_list)
    return feature

def _int64_feature(value):
    """Build a ``tf.train.Feature`` whose int64_list holds the given bool / enum / int / uint."""
    int64_list = tf.train.Int64List(value=[value])
    feature = tf.train.Feature(int64_list=int64_list)
    return feature

def serialize_function(x, y):
    """Serialize one (image, label) pair into a ``tf.train.Example`` byte string.

    Args:
        x: NumPy array whose raw buffer is stored under the ``'image'`` key.
            Dtype and shape are not recorded; the reader has to know them.
        y: Scalar label convertible with ``int()``; stored under ``'label'``.

    Returns:
        The serialized ``tf.train.Example`` as ``bytes``.
    """
    # tobytes() replaces the deprecated ndarray.tostring() alias, which was
    # removed in NumPy 2.0; both return the same raw buffer.
    raw_image = tf.compat.as_bytes(x.tobytes())
    features = tf.train.Features(feature={
        'image': _bytes_feature(raw_image),
        'label': _int64_feature(int(y)),
    })
    example = tf.train.Example(features=features)
    return example.SerializeToString()


def convert_to_tfrecord(data_set, file_name):
    """Write every (x, y) pair of `data_set` to a TFRecord file and report it.

    Args:
        data_set: Iterable of (x, y) pairs. A pair whose members are not both
            NumPy values is converted with ``.numpy()`` before serialization.
        file_name: Path of the TFRecord file to create.
    """
    numpy_types = (np.ndarray, np.generic)
    with tf.python_io.TFRecordWriter(file_name) as record_writer:
        for x, y in data_set:
            if isinstance(x, numpy_types) and isinstance(y, numpy_types):
                record = serialize_function(x, y)
            else:
                record = serialize_function(x.numpy(), y.numpy())
            record_writer.write(record)
    # Report completion only after the `with` block has exited, i.e. once the
    # writer's context manager has finished with the file.
    print(f"TFRecord is created at path : '{file_name}', Done")


def parser(record):
    """Parse one serialized ``tf.train.Example`` into ``({'image': image}, label)``.

    Args:
        record: Scalar string tensor holding a serialized ``tf.train.Example``
            with a bytes ``'image'`` feature and an int64 ``'label'`` feature.

    Returns:
        A tuple ``({'image': image}, label)`` where ``image`` is a flat
        float32 tensor and ``label`` is an int32 scalar.
    """
    feature_description = {
        'image': tf.FixedLenFeature([], tf.string, default_value=''),
        'label': tf.FixedLenFeature([], tf.int64, default_value=0),
    }

    parsed = tf.parse_single_example(record, feature_description)
    # The records in this notebook are written from float32 arrays, so the raw
    # bytes must be decoded as float32; a uint8 decode would return four times
    # as many values, none of them meaningful.
    image = tf.decode_raw(parsed["image"], tf.float32)
    # NOTE: the image stays flat; the written test images are 32x32x3, so use
    # tf.reshape(image, [32, 32, 3]) where the spatial layout is needed.
    label = tf.cast(parsed["label"], tf.int32)
    return {'image': image}, label


def input_foo(filenames, batch_size=32, shuffle_buffer_size=1024, seed=1,
              num_parallel_reads=40, device='/GPU:0'):
    """Build a shuffled, repeated, parsed and batched TFRecord input pipeline.

    Called with only `filenames`, the pipeline is configured exactly as
    before: 40 parallel readers, shuffle buffer 1024 with seed 1, batches of
    32, prefetch to '/GPU:0'.

    Args:
        filenames: File name(s) handed to ``tf.data.TFRecordDataset``.
        batch_size: Number of parsed records per batch.
        shuffle_buffer_size: Buffer size used by ``shuffle_and_repeat``.
        seed: Random seed used by ``shuffle_and_repeat``.
        num_parallel_reads: Number of files read in parallel.
        device: Device to prefetch batches to. Pass ``None`` to skip the
            device prefetch, e.g. on a machine without a GPU.

    Returns:
        The resulting ``tf.data`` dataset.
    """
    dataset = tf.data.TFRecordDataset(filenames=filenames,
                                      num_parallel_reads=num_parallel_reads)

    dataset = dataset.apply(
        tf.contrib.data.shuffle_and_repeat(buffer_size=shuffle_buffer_size,
                                           seed=seed)
    )
    dataset = dataset.apply(
        tf.contrib.data.map_and_batch(
            map_func=parser,
            batch_size=batch_size,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    )
    if device is not None:
        # Kept as the last transformation of the pipeline.
        dataset = dataset.apply(
            tf.contrib.data.prefetch_to_device(device, buffer_size=None))
    return dataset

class CreateTFRecord:
    """Write a TFRecord file as a side effect of construction.

    Instantiating the class stores both arguments on the instance and
    immediately calls `convert_to_tfrecord` with them.

    Attributes:
        data_to_write: The iterable of pairs that was handed to the writer.
        file_name: Path of the TFRecord file that was written.
    """

    def __init__(self, data_to_write, file_name):
        super().__init__()
        self.data_to_write, self.file_name = data_to_write, file_name
        convert_to_tfrecord(data_to_write, file_name)


In [0]:
# Pair each normalized test image with its label and write the pairs to a
# TFRecord file; constructing CreateTFRecord performs the write.
file_name = './test.tfrecords'
data_to_write = zip(x_test, y_test)

CreateTFRecord(data_to_write, file_name)