In [None]:
import os
import cv2 as cv
cv2 = cv
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
# Load the three VOC 2007 splits from TFDS in one call; the list passed as
# `split` fixes the order of the returned datasets.
train_ds, val_ds, test_ds = tfds.load('voc/2007', split=['train', 'validation', 'test'])

# Report how many examples each split holds (train, validation, test order).
for split_ds in (train_ds, val_ds, test_ds):
    print(tf.data.experimental.cardinality(split_ds))

# Create TFRecords

In [None]:
def int64_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` holding a one-element ``Int64List``."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

def int64_list_feature(value):
    """Wrap an iterable of values in a ``tf.train.Feature`` holding an ``Int64List``.

    ``value`` is handed to ``Int64List`` as-is (it is not wrapped in a list here).
    """
    payload = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=payload)

def bytes_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` holding a one-element ``BytesList``.

    Unlike ``_bytes_feature``, no tensor unwrapping is attempted: ``value`` is
    placed in the list unchanged.
    """
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def bytes_list_feature(value):
    """Wrap an iterable of values in a ``tf.train.Feature`` holding a ``BytesList``.

    ``value`` is handed to ``BytesList`` as-is (it is not wrapped in a list here).
    """
    payload = tf.train.BytesList(value=value)
    return tf.train.Feature(bytes_list=payload)

def float_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` holding a one-element ``FloatList``."""
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)

def float_list_feature(value):
    """Wrap an iterable of values in a ``tf.train.Feature`` holding a ``FloatList``.

    ``value`` is handed to ``FloatList`` as-is (it is not wrapped in a list here).
    """
    payload = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=payload)

def _bytes_feature(value):
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def serialize_tfr(img, labels, bboxes):
    """Serialize one (image, labels, boxes) triple into a scalar ``tf.train.Example`` string.

    The protobuf is built in Python inside ``tf.py_function`` so that this
    function can be used from ``Dataset.map``.

    Args:
        img: image tensor; its first two dimensions are recorded as height and
            width and it is JPEG-encoded (presumably uint8 HxWxC - confirm
            against the dataset).
        labels: integer labels stored as an ``Int64List`` under ``image/labels``.
        bboxes: tensor stored via ``tf.io.serialize_tensor`` under ``image/bbox``.

    Returns:
        A string tensor reshaped to scalar shape ``()`` holding the serialized Example.
    """
    def _serialize(img, labels, bboxes):
        height, width = img.shape[:2]
        # JPEG is used instead of tf.io.serialize_tensor for the image: per the
        # original author's note, serialize_tensor produced TFRecords at least
        # four times larger.
        jpeg_bytes = tf.io.encode_jpeg(img)
        bbox_bytes = tf.io.serialize_tensor(bboxes)

        feature_map = {
            'image/h': int64_feature(height),
            'image/w': int64_feature(width),
            'image/img': _bytes_feature(jpeg_bytes),
            'image/bbox': _bytes_feature(bbox_bytes),
            'image/labels': int64_list_feature(labels),
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature_map))
        return example.SerializeToString()

    serialized = tf.py_function(_serialize, (img, labels, bboxes), tf.string)
    # The py_function result is reshaped to an explicit scalar before being returned.
    return tf.reshape(serialized, ())


def map_fn(d):
    """Serialize one TFDS example dict into a scalar ``tf.train.Example`` string.

    NOTE(review): this pairs the top-level ``d['labels']`` with the per-object
    boxes in ``d['objects']['bbox']``. Presumably the per-box labels live in
    ``d['objects']['label']`` - confirm against the TFDS ``voc`` feature spec
    before relying on a 1:1 label/box correspondence.
    """
    return serialize_tfr(d['image'], d['labels'], d['objects']['bbox'])


# Apply the same serialization to every split, in train / val / test order.
serialized_train_ds, serialized_val_ds, serialized_test_ds = (
    split_ds.map(map_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [None]:
# Output directory for the TFRecord shards, plus the filename prefix used for each split.
folder = '/content/voc2007'
train_name, val_name, test_name = 'train', 'val', 'test'

# exist_ok=True keeps this cell re-runnable when the directory is already there.
os.makedirs(folder, exist_ok=True)

#https://stackoverflow.com/a/64540388/6393479
def write_generator(dataset, items_per_file, save_path, name):
    """Split ``dataset`` into shards and yield what is needed to write each one.

    Nothing is written here: the caller is expected to call
    ``writer.write(shard_dataset)`` for every yielded triple.

    Args:
        dataset: dataset to shard; it is batched with ``items_per_file``.
        items_per_file: maximum number of elements per shard.
        save_path: directory the shard files are placed in.
        name: filename prefix; shards are named ``{name}_{index}.tfrecord``.

    Yields:
        ``(shard_dataset, writer, filename)`` where ``shard_dataset`` is the
        batch re-sliced into a dataset, ``writer`` is a
        ``tf.data.experimental.TFRecordWriter`` for ``filename``.
    """
    prefix = os.path.join(save_path, name)
    for shard_idx, shard in enumerate(dataset.batch(items_per_file)):
        shard_path = f"{prefix}_{shard_idx}.tfrecord"
        # Turn the batched tensor back into a dataset of individual elements.
        shard_ds = tf.data.Dataset.from_tensor_slices(shard)
        shard_writer = tf.data.experimental.TFRecordWriter(shard_path)
        yield shard_ds, shard_writer, shard_path
  
# Write every split to disk as TFRecord shards of at most 512 examples each,
# in train / val / test order.
for split_ds, split_name in (
    (serialized_train_ds, train_name),
    (serialized_val_ds, val_name),
    (serialized_test_ds, test_name),
):
    for ds_chunk, wri, shard_path in write_generator(split_ds, 512, folder, split_name):
        wri.write(ds_chunk)

# Sanity check: parse the written TFRecords

In [None]:
def parse_from_tfr(element):
    """Decode a serialized ``tf.train.Example`` written by ``serialize_tfr``.

    Args:
        element: serialized Example string, as read from a ``TFRecordDataset``.

    Returns:
        ``(img, bbox, label)``: the JPEG-decoded image, the boxes restored with
        ``tf.io.parse_tensor`` as ``tf.float32``, and the labels densified from
        the variable-length int64 feature. ``image/h`` and ``image/w`` are
        parsed but not returned.
    """
    scalar_int = tf.io.FixedLenFeature([], tf.int64, default_value=0)
    scalar_bytes = tf.io.FixedLenFeature([], tf.string, default_value='')
    schema = {
        'image/h': scalar_int,
        'image/w': scalar_int,
        'image/img': scalar_bytes,
        'image/bbox': scalar_bytes,
        # Labels vary in length per example, so they are parsed as a sparse feature.
        'image/labels': tf.io.VarLenFeature(tf.int64),
    }
    parsed = tf.io.parse_example(element, schema)

    image = tf.io.decode_jpeg(parsed['image/img'])
    boxes = tf.io.parse_tensor(parsed['image/bbox'], out_type=tf.float32)
    class_ids = tf.sparse.to_dense(parsed['image/labels'])
    return image, boxes, class_ids

# Read the first training shard back and decode it with parse_from_tfr.
te = tf.data.TFRecordDataset(['/content/voc2007/train_0.tfrecord']).map(parse_from_tfr)

# Print the decoded image shape of the first ten records as a round-trip check.
# The unused tuple members get explicit names so the notebook's `_` is not rebound.
for img, _bbox, _label in te.take(10):
    print(img.shape)

# Upload to Kaggle

In [None]:
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Re-declare the shard directory so this cell does not depend on the earlier one,
# then make it the working directory before running `kaggle datasets init`.
# NOTE(review): `init` is given no path, so it presumably operates on the
# current directory (hence the %cd) - confirm against the Kaggle CLI docs.
folder = "/content/voc2007"
%cd {folder}
! kaggle datasets init

In [None]:
# Create the Kaggle dataset from the directory given by `folder`.
# NOTE(review): the metadata generated by the preceding `init` step presumably
# has to be filled in before this succeeds - confirm against the Kaggle CLI docs.
! kaggle datasets create -p {folder}