## TFRecord binary TF data format

In [5]:
import tensorflow as tf
import numpy as np
import os

## Binary format containing sequence of binary records of varying sizes with records of the form
### [length, CRC of the length, data, CRC of the data]

## Creating a TFRecord file with compression:

In [7]:
# GZIP-compress the records as they are written.
options = tf.io.TFRecordOptions(compression_type='GZIP')

# Each write() call stores one binary record; the writer is closed on exit.
with tf.io.TFRecordWriter('my_data.tfrecord', options=options) as writer:
    for payload in (b'First record', b'Second record'):
        writer.write(payload)

In [8]:
# Dump the raw file: it is binary (and GZIP-compressed), so it is not human-readable.
!cat ./my_data.tfrecord

      �a��/'�q�e�(�&���������1�a�K�J^�z� ��A9   

## Reading a compressed TFRecord file

### Speeding up reading multiple files:
* ### Pass num_parallel_reads = 2 or more to TFRecordDataset
### Or
* ### Create dataset of filenames with list_files() and use interleave() to mix them

In [9]:
filepaths = ['my_data.tfrecord']

# The compression type must match the one used when the file was written.
dataset = tf.data.TFRecordDataset(
    filenames=filepaths,
    compression_type='GZIP',
)

# Each dataset element is one record, yielded as a scalar string tensor.
for item in dataset:
    print(item)

tf.Tensor(b'First record', shape=(), dtype=string)
tf.Tensor(b'Second record', shape=(), dtype=string)


## TFRecord uses serialized protobuf format by default

### Numbers are field identifiers used in the record's binary representation
### `repeated` indicates that a field (here the string field number 3) may occur multiple times

## The definition of a particular protobuf is stored in a .proto file and compiled using protoc. This gives access to the generated classes in Python.

## Protobuf objects are meant to be serialized and transmitted, so they are called messages

In [None]:
// The syntax statement must be terminated with a semicolon.
syntax = "proto3";

// A contact record. The numbers are field identifiers (tags) used in the
// binary encoding, not default values.
message Person {
    string name = 1;
    int32 id = 2;
    // `repeated`: the field may occur zero or more times (a list of strings).
    repeated string email = 3;
}

## Upon compiling with protoc, the generated module can be imported

In [12]:
from person_pb2 import Person

## Basic operations on a defined protobuf object

In [13]:
# Build a Person message; the repeated `email` field is initialised from a list.
person = Person(
    name="AI",
    id=13,
    email=["a@b.c"],
)

In [14]:
# Printing a message shows its fields as text, one `name: value` pair per line.
print(person)

name: "AI"
id: 13
email: "a@b.c"



In [15]:
# Message fields are read like ordinary attributes.
person.name

'AI'

In [17]:
# A repeated field supports list-style indexing.
person.email[0]

'a@b.c'

In [18]:
# A repeated field can be extended in place with append().
person.email.append('c@dd.ll')

In [19]:
# Show the repeated field after the append: it now holds two addresses.
person.email

['a@b.c', 'c@dd.ll']

In [20]:
# Serialize the message to its binary representation (a bytes object).
serialized = person.SerializeToString()

In [21]:
# Inspect the raw serialized bytes.
serialized

b'\n\x02AI\x10\r\x1a\x05a@b.c\x1a\x07c@dd.ll'

In [22]:
# Create an empty Person to parse the serialized bytes back into.
person2 = Person()

In [23]:
# Populate person2 from the serialized bytes. The displayed return value (22)
# equals the length of `serialized`.
person2.ParseFromString(serialized)

22

In [24]:
# The round-tripped copy compares equal to the original message.
person == person2

True

## In most cases TF built-in protobufs are used instead of custom types like the one above

## Dedicated parsers are provided for the built-in types

## TF Protobufs

## The main protobuf typically used in TFRecord is the Example protobuf

## It represents one instance in a dataset

In [None]:
syntax = "proto3";

// Typed value lists. `packed = true` selects a more compact encoding for
// repeated numeric fields.
message BytesList { repeated bytes value = 1; }
message FloatList { repeated float value = 1 [packed = true]; }
message Int64List { repeated int64 value = 1 [packed = true]; }

// A Feature holds exactly one of the three list kinds. Every member of the
// oneof needs its own field name and its own field number.
message Feature {
    oneof kind {
        BytesList bytes_list = 1;
        FloatList float_list = 2;
        Int64List int64_list = 3;
    }
};

// Maps feature names to their values.
message Features { map<string, Feature> feature = 1; };

// One instance of a dataset.
message Example { Features features = 1; };

In [26]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

In [29]:
# Each raw value list is wrapped in its typed list message, which is in turn
# wrapped in a Feature and stored under the feature's name.
person_features = {
    "name": Feature(bytes_list=BytesList(value=[b'Alice'])),
    "id": Feature(int64_list=Int64List(value=[123])),
    "emails": Feature(bytes_list=BytesList(value=[b'a@b.c', b'f@d.p'])),
}

# An Example carries a single Features message holding the name -> Feature map.
person_example = Example(features=Features(feature=person_features))

## Such a protobuf can be serialized and stored in TFR file

In [30]:
with tf.io.TFRecordWriter('my_contacts.tfrecord') as contacts_writer:
    # The same Example is written several times purely to obtain a
    # multi-record file for the parsing demonstrations.
    for _ in range(5):
        record_bytes = person_example.SerializeToString()
        contacts_writer.write(record_bytes)

## Serialized protobufs stored in TFRecord files need to be parsed once loaded into a dataset to be used by a model

## Parsing happens with tf.io.parse_single_example, which requires a string with serialized data and a description of each feature

In [31]:
# Parsing schema: one entry per feature name stored in the serialized Examples.
# "name" and "id" are scalars with a fallback default; "emails" is
# variable-length.
feature_description = dict(
    name=tf.io.FixedLenFeature([], tf.string, default_value=""),
    id=tf.io.FixedLenFeature([], tf.int64, default_value=0),
    emails=tf.io.VarLenFeature(tf.string),
)

In [32]:
def parse(serialized_example):
    """Parse one serialized Example using the notebook-level `feature_description`.

    Args:
        serialized_example: a single serialized Example record.

    Returns:
        Whatever `tf.io.parse_single_example` produces for that record and
        schema (a dict keyed by feature name).
    """
    parsed = tf.io.parse_single_example(
        serialized=serialized_example,
        features=feature_description,
    )
    return parsed

In [33]:
# Read the raw records, then parse each one individually.
dataset = (
    tf.data.TFRecordDataset(['my_contacts.tfrecord'])
    .map(parse)
)

In [34]:
# Each element is a dict mapping feature names to tensors; the
# variable-length "emails" feature comes back as a SparseTensor.
for parsed_example in dataset:
    print(parsed_example)

{'emails': SparseTensor(indices=tf.Tensor(
[[0]
 [1]], shape=(2, 1), dtype=int64), values=tf.Tensor([b'a@b.c' b'f@d.p'], shape=(2,), dtype=string), dense_shape=tf.Tensor([2], shape=(1,), dtype=int64)), 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0]
 [1]], shape=(2, 1), dtype=int64), values=tf.Tensor([b'a@b.c' b'f@d.p'], shape=(2,), dtype=string), dense_shape=tf.Tensor([2], shape=(1,), dtype=int64)), 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0]
 [1]], shape=(2, 1), dtype=int64), values=tf.Tensor([b'a@b.c' b'f@d.p'], shape=(2,), dtype=string), dense_shape=tf.Tensor([2], shape=(1,), dtype=int64)), 'id': <tf.Tensor: shape=(), dtype=int64, numpy=123>, 'name': <tf.Tensor: shape=(), dtype=string, numpy=b'Alice'>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0]
 [1]

## Sparse tensor representing variable-length feature can be converted to a dense tensor using tf.sparse.to_dense()

In [35]:
# `parsed_example` is the last element left bound by the preceding loop.
# Densify its sparse "emails" feature, using b"" for any missing entries.
tf.sparse.to_dense(parsed_example['emails'], default_value=b"")

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.c', b'f@d.p'], dtype=object)>

In [36]:
# Alternatively, read the values stored in the SparseTensor directly.
parsed_example['emails'].values

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.c', b'f@d.p'], dtype=object)>

## Parsing can be applied to whole batches

In [37]:
def parse(serialized_examples):
    """Parse a batch of serialized Examples using `feature_description`.

    Note: this rebinds the name `parse`, replacing the single-example parser
    defined earlier in the notebook.

    Args:
        serialized_examples: a batch of serialized Example records.

    Returns:
        Whatever `tf.io.parse_example` produces for that batch and schema
        (a dict keyed by feature name).
    """
    parsed_batch = tf.io.parse_example(
        serialized=serialized_examples,
        features=feature_description,
    )
    return parsed_batch

In [38]:
# Batch the raw records first so that parsing runs once per batch of two.
dataset = (
    tf.data.TFRecordDataset(['my_contacts.tfrecord'])
    .batch(2)
    .map(parse)
)

In [43]:
# Each element is now a whole batch, so every tensor gains a leading batch
# dimension. Five records in batches of two leave a final batch of one.
for parsed_examples in dataset:
    print(parsed_examples)

{'emails': SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [1 0]
 [1 1]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a@b.c' b'f@d.p' b'a@b.c' b'f@d.p'], shape=(4,), dtype=string), dense_shape=tf.Tensor([2 2], shape=(2,), dtype=int64)), 'id': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([123, 123])>, 'name': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Alice', b'Alice'], dtype=object)>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [1 0]
 [1 1]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a@b.c' b'f@d.p' b'a@b.c' b'f@d.p'], shape=(4,), dtype=string), dense_shape=tf.Tensor([2 2], shape=(2,), dtype=int64)), 'id': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([123, 123])>, 'name': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Alice', b'Alice'], dtype=object)>}
{'emails': SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]], shape=(2, 2), dtype=int64), values=tf.Tensor([b'a@b.c' b'f@d.p'], shape=(2,), dtype=string), dense_shape=tf.Tensor([1 2], shape=(2,

In [44]:
# take(1) restricts the iteration to the first batch only.
for parsed_examples in dataset.take(1):
    print(parsed_examples)

{'emails': SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [1 0]
 [1 1]], shape=(4, 2), dtype=int64), values=tf.Tensor([b'a@b.c' b'f@d.p' b'a@b.c' b'f@d.p'], shape=(4,), dtype=string), dense_shape=tf.Tensor([2 2], shape=(2,), dtype=int64)), 'id': <tf.Tensor: shape=(2,), dtype=int64, numpy=array([123, 123])>, 'name': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Alice', b'Alice'], dtype=object)>}


In [45]:
# `parsed_examples` is still bound to the batch from the preceding take(1) loop.
# The sparse values are stored flat: the e-mails of both examples in one vector.
parsed_examples['emails'].values

<tf.Tensor: shape=(4,), dtype=string, numpy=array([b'a@b.c', b'f@d.p', b'a@b.c', b'f@d.p'], dtype=object)>

In [49]:
# The fixed-length "id" feature of the same batch: one int64 per example.
parsed_examples['id']

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([123, 123])>

## For general data like images or raw numeric data a BytesList can be used.
### For images, e.g. tf.io.encode_jpeg() can encode an image as JPEG, which can then be stored in a BytesList.
### Later during example parsing a tf.io.decode_jpeg() or tf.io.decode_image() will restore the serialized image
### In general any tensor can be serialized with tf.io.serialize_tensor() and then stored in a BytesList feature
### Later during example parsing such tensor can be restored with tf.io.parse_tensor()
### Examples of storing images and tensors https://homl.info/colab3

## For lists of lists a SequenceExample protobuf is used

### SequenceExample protobuf

In [None]:
// An ordered list of Feature messages.
message FeatureList {
    repeated Feature feature = 1;
}

// Maps a name to a FeatureList.
message FeatureLists {
    map<string, FeatureList> feature_list = 1;
}

// A Features message for the context plus named feature lists.
message SequenceExample {
    Features context = 1;
    FeatureLists feature_lists = 2;
}

## Parsing SequenceExamples is analogous to ordinary Examples parsing but one must use
* ### tf.io.parse_single_sequence_example() or tf.io.parse_sequence_example()

## Upon parsing, a tuple with the context features and the feature lists is returned

In [None]:
# Schema for the feature_lists part of the SequenceExample. Without it the
# parser returns no feature lists, and the "content" lookup below would fail.
# NOTE(review): assumes "content" holds variable-length string features;
# adjust the dtype to match how the SequenceExample was built.
sequence_feature_descriptions = {
    "content": tf.io.VarLenFeature(tf.string),
}

# `serialized_sequence_example` and `context_feature_descriptions` are not
# defined in this notebook and must be provided beforehand.
# The call returns a pair: (context features, feature lists).
parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(
    serialized_sequence_example,
    context_feature_descriptions,
    sequence_feature_descriptions)

# A VarLenFeature is parsed into a SparseTensor; convert it to a RaggedTensor.
parsed_content = tf.RaggedTensor.from_sparse(parsed_feature_lists["content"])