# TFRecord Serialization (Without Apache Beam)

In this notebook, we learn how to:
- Write TFRecord files
- Parse TFRecord files
- Time TFRecord reading, as a baseline for comparing against raw image loading


In [3]:
import os
import tensorflow as tf
import tensorflow_datasets as tfds

# ---------------------------------
# Configuration
# ---------------------------------
IMG_SIZE = 224  # side length (pixels) of the square images used below
BATCH_SIZE = 32  # number of examples per batch
AUTOTUNE = tf.data.AUTOTUNE  # let tf.data pick the parallelism level
TFRECORD_PATH = "sample_tf_flowers.tfrecord"  # output file for the loaded subset

# ---------------------------------
# Load small subset of dataset
# ---------------------------------
# Only the first 10% of the train split is requested; as_supervised=True
# makes each element an (image, label) pair.
train_raw = tfds.load(name="tf_flowers", split="train[:10%]", as_supervised=True)

# ---------------------------------
# Preprocessing
# ---------------------------------
def preprocess(image, label):
    """Resize an image to IMG_SIZE x IMG_SIZE and divide its pixels by 255.

    Args:
        image: Image tensor to resize (presumably uint8 pixel values in
            [0, 255], which would make the result lie in [0, 1] - confirm
            against the dataset).
        label: Passed through unchanged.

    Returns:
        A `(image, label)` tuple where `image` is float32.
    """
    target_size = (IMG_SIZE, IMG_SIZE)
    resized = tf.image.resize(image, target_size)
    scaled = tf.cast(resized, tf.float32) / 255.0
    return scaled, label

# Apply preprocess() to every example in parallel, then group the results
# into batches of up to BATCH_SIZE examples.
preprocessed = train_raw.map(preprocess, num_parallel_calls=AUTOTUNE)
train_ds = preprocessed.batch(BATCH_SIZE)

# ---------------------------------
# TFRecord Serialization Function
# ---------------------------------
def serialize_example(image, label):
    """Serialize one (image, label) pair into a `tf.train.Example` byte string.

    This must run eagerly because it calls `.numpy()` on tensors.

    Args:
        image: Float image tensor with values scaled to [0, 1] (the output of
            `preprocess`), shaped (height, width, channels) as required by
            `tf.io.encode_jpeg`.
        label: Scalar integer tensor holding the class id.

    Returns:
        bytes: The serialized `tf.train.Example` with two features:
        "image" (JPEG-encoded bytes) and "label" (a single int64).
    """
    # Scale back to the [0, 255] range. Round and clip before casting:
    # tf.cast truncates toward zero, so a value such as 254.99998 (float error
    # left over from the earlier /255.0) would otherwise be stored as 254 and
    # every interpolated pixel would be biased downward.
    pixels = tf.clip_by_value(tf.round(image * 255.0), 0.0, 255.0)
    image_uint8 = tf.cast(pixels, tf.uint8)

    # Encode to JPEG and pull the raw bytes out of the string tensor
    image_bytes = tf.io.encode_jpeg(image_uint8).numpy()

    feature = {
        "image": tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[image_bytes])
        ),
        "label": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[int(label.numpy())])
        ),
    }

    example = tf.train.Example(
        features=tf.train.Features(feature=feature)
    )

    return example.SerializeToString()

# ---------------------------------
# Write TFRecord File
# ---------------------------------
# Undo the batching so that each record holds exactly one example, then
# stream every example into the TFRecord file.
examples = train_ds.unbatch()
with tf.io.TFRecordWriter(TFRECORD_PATH) as writer:
    for image, label in examples:
        writer.write(serialize_example(image, label))

print(f"TFRecord file written to: {TFRECORD_PATH}")

# ---------------------------------
# Quick Verification
# ---------------------------------
# Serialize a single example and report the Python type of the result.
first_example = train_ds.unbatch().take(1)
for image, label in first_example:
    serialized = serialize_example(image, label)
    print("Serialized example type:", type(serialized))


TFRecord file written to: sample_tf_flowers.tfrecord
Serialized example type: <class 'bytes'>


In [5]:
# Write file

tfrecord_path = "flowers.tfrecord"

# Write the first 100 preprocessed examples, one serialized record each.
sample_examples = train_ds.unbatch().take(100)
with tf.io.TFRecordWriter(tfrecord_path) as writer:
    for image, label in sample_examples:
        record = serialize_example(image, label)
        writer.write(record)

print("TFRecord file created.")
# os.path.getsize returns bytes; divide by 1024 to report kilobytes.
size_kb = os.path.getsize(tfrecord_path) / 1024
print("File size (KB):", round(size_kb, 2))


TFRecord file created.
File size (KB): 2448.78


In [6]:
# Read TFRecord

# Schema used when parsing records: each record carries one scalar string
# ("image") and one scalar int64 ("label").
feature_description = {
    "image": tf.io.FixedLenFeature(shape=[], dtype=tf.string),
    "label": tf.io.FixedLenFeature(shape=[], dtype=tf.int64),
}

def parse_example(example_proto):
    """Decode one serialized record into an (image, label) pair.

    Args:
        example_proto: A serialized `tf.train.Example`, parsed according to
            `feature_description`.

    Returns:
        A `(image, label)` tuple: `image` is the JPEG decoded to 3 channels,
        cast to float32 and divided by 255; `label` is the parsed int64 value.
    """
    features = tf.io.parse_single_example(example_proto, feature_description)
    decoded = tf.image.decode_jpeg(features["image"], channels=3)
    image = tf.cast(decoded, tf.float32) / 255.0
    label = features["label"]
    return image, label

# Read from the path variable set when the file was written, instead of
# repeating the file name as a second literal that could drift out of sync.
raw_dataset = tf.data.TFRecordDataset(tfrecord_path)

# Parse records in parallel, matching the AUTOTUNE setting used for the
# preprocessing map; element order is still deterministic by default.
parsed_dataset = raw_dataset.map(parse_example, num_parallel_calls=AUTOTUNE)

# Count the serialized records directly; no parsing is needed for this.
record_count = sum(1 for _ in raw_dataset)

print("Number of records in TFRecord:", record_count)


Number of records in TFRecord: 100


In [7]:
# Time TFRecord reading (parse + JPEG decode + batch)

import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted
# while the loop is running.
start = time.perf_counter()
# The file read here was written with take(100), so this loop sees at most
# 4 batches even though up to 10 are requested.
for _ in parsed_dataset.batch(BATCH_SIZE).take(10):
    pass
print("TFRecord time:", time.perf_counter() - start)


TFRecord time: 0.03161787986755371


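# Discussion
- TFRecords improve scalability and consistency: examples are stored as sequential binary records that are parsed with one explicit schema.
- For small local experiments, loading raw images directly is fine.
- For large production systems, serialized datasets are essential.
- The timing above covers only 100 small records, so treat it as a smoke test rather than a benchmark.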
