Convert the CIFAR-10 pickle files to TFRecords. The source data is stored in Google Cloud Storage (GCS).

In [None]:
import tensorflow as tf
import os
import numpy as np
import urllib.request
from six.moves import cPickle as pickle
from random import shuffle

In [None]:
# Build the X (image) and Y (label) arrays from the five CIFAR-10
# `data_batch_N` pickle files, fetched over HTTPS.

path = 'https://storage.googleapis.com/tsaikevin-data/cifar-10-batches-py'

x_temp = []
y_temp = []

for i in range(1, 6):
    # Join URL segments with '/' explicitly: os.path.join uses the platform
    # separator and would produce a broken URL on Windows.
    filename = '%s/data_batch_%d' % (path, i)
    # NOTE(review): unpickling runs arbitrary code from the payload; this is
    # only safe while the bucket contents are trusted.
    # The `with` block closes the HTTP response once the batch is loaded.
    with urllib.request.urlopen(filename) as file:
        data = pickle.load(file, encoding='latin1')
    # Reshape the flat rows to (N, 3, 32, 32), then move the size-3 axis
    # (presumably the colour channels) last, giving (N, 32, 32, 3).
    # astype("float") stores the values as float64.
    x = data['data'].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
    y = np.array(data['labels'])
    x_temp.append(x)
    y_temp.append(y)

# Stack the per-batch arrays along the first (example) axis.
X = np.concatenate(x_temp)
Y = np.concatenate(y_temp)
n = Y.shape[0]
print(X.shape, n)

In [None]:
# Shuffle a list of example indices (rather than the arrays themselves) to
# obtain a random ordering of the examples.
ix = [*range(len(X))]
shuffle(ix)
ix[:3]  # peek at the first three shuffled indices

In [None]:
# Derive the number of examples per shard from the desired shard count.
# Rounding up means `shards` files are enough to hold every example; the
# final shard simply receives whatever is left over and may be smaller.

from math import ceil

shards = 11
shard_size = ceil(len(ix) / shards)
shard_size  # display the computed size

In [None]:
# Helper functions for building the features stored in the TFRecord files

def _int64_feature(value):
    """Wrap a single integer `value` in a `tf.train.Feature` (Int64List)."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)
def _bytes_feature(value):
    """Wrap a single bytes `value` in a `tf.train.Feature` (BytesList)."""
    byte_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=byte_list)

In [None]:
# Serialize every example, in the order given by `ix`, into sharded TFRecord
# files.
#
# Each shard holds up to `shard_size` examples and is named
# <f_prefix><zero-padded shard number>.tfrecords.  Every record stores the
# example's index into X/Y ('idx'), its label ('label') and the raw bytes of
# its image array ('image').

f_prefix = 'cifar10_data_'
f_digits = 3

# range() rejects a zero step, which shard_size would be if `ix` were empty.
per_shard = max(shard_size, 1)

# Walk `ix` one shard-sized slice at a time.  Iterating over slice starts
# (instead of rolling over on a counter) avoids creating an extra, empty
# shard file when the example count is an exact multiple of the shard size.
for f_suffix, start in enumerate(range(0, len(ix), per_shard)):
    f_name = f_prefix + str(f_suffix).zfill(f_digits) + '.tfrecords'
    print('Writing...'+f_name)
    ct = 0
    # The context manager closes the writer when the shard is done, so every
    # shard -- including the last one -- is flushed to disk.
    with tf.python_io.TFRecordWriter(f_name) as writer:
        for idx in ix[start:start + per_shard]:
            image = X[idx,:,:,:]
            label = Y[idx]
            e = tf.train.Example(features=tf.train.Features(feature={
                'idx'     : _int64_feature(idx),
                'label'   : _int64_feature(label),
                # tobytes() yields the same bytes as the deprecated tostring().
                'image'   : _bytes_feature(image.tobytes())
                }))
            writer.write(e.SerializeToString())
            ct += 1
    print('finished writing '+f_name+' with '+str(ct)+' examples')
    
