In [1]:
import os
import random
import re
from pathlib import Path
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf


2023-10-21 12:52:24.635654: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-21 12:52:24.678899: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Display the TensorFlow version this notebook is running against.
tf.__version__

'2.13.0'

In [3]:
# Directory holding the Kaggle cats-vs-dogs archive; the TFRecord files written
# by the cells below are placed in the same directory.
# The location can be overridden with the CATS_VS_DOGS_DATADIR environment
# variable; the default is the path this notebook was originally run with.
DATADIR = Path(os.environ.get("CATS_VS_DOGS_DATADIR", "/scratch/tanhuanp/cats_vs_dogs"))
datasetfile = DATADIR / "kagglecatsanddogs_5340.zip"
# Fail early, and say which path was checked, if the archive is missing.
assert datasetfile.is_file(), "dataset archive not found: {}".format(datasetfile)

In [4]:
# Peek into the archive: count the JPG members and show the first few names.
with ZipFile(datasetfile) as myzip:
    filenames = myzip.namelist()

# Use the same `.endswith('.jpg')` test as the conversion cells below, so the
# number reported here is the number of members those cells will consider.
print('Total number of JPG images:', sum(f.endswith('.jpg') for f in filenames))
print('First 10 files:', filenames[:10])

Total number of JPG images: 25000
First 10 files: ['PetImages/Cat/', 'PetImages/Cat/0.jpg', 'PetImages/Cat/1.jpg', 'PetImages/Cat/10.jpg', 'PetImages/Cat/100.jpg', 'PetImages/Cat/1000.jpg', 'PetImages/Cat/10000.jpg', 'PetImages/Cat/10001.jpg', 'PetImages/Cat/10002.jpg', 'PetImages/Cat/10003.jpg']


In [5]:
def image_feature(value):
    """Wrap a single bytes value (the image payload) in a `tf.train.Feature`."""
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def str_feature(value):
    """Wrap a Python string in a bytes-list `tf.train.Feature`.

    The string is converted to bytes with `str.encode()` before being stored.
    """
    encoded = value.encode()
    payload = tf.train.BytesList(value=[encoded])
    return tf.train.Feature(bytes_list=payload)

def int64_feature(value):
    """Wrap a single integer in an int64-list `tf.train.Feature`."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

def create_example(image, filename, classname, classidx):
    """Build the `tf.train.Example` stored for one image.

    The example carries four features: "image" (bytes), "filename" and
    "classname" (strings stored as bytes) and "classidx" (int64).
    """
    features = tf.train.Features(feature=dict(
        image=image_feature(image),
        filename=str_feature(filename),
        classname=str_feature(classname),
        classidx=int64_feature(classidx),
    ))
    return tf.train.Example(features=features)

In [None]:
# Convert the archive into N_SHARDS TFRecord files named images_NNN.tfrec.
# With STORE_DECODED_IMAGES the image is decoded, resized to 256x256, cast to
# uint8 and stored as a serialized tensor; otherwise the raw JPEG bytes are stored.
N_SHARDS = 10
STORE_DECODED_IMAGES = True

# Maps class name -> integer index, assigned in order of first appearance.
classindices = {}

# The context manager closes the archive even if a shard fails part-way through.
with ZipFile(datasetfile) as myzip:
    shards = np.array_split(myzip.infolist(), N_SHARDS)

    for i, shard in enumerate(shards):
        tfrecfilename = "images_{:03d}.tfrec".format(i)
        with tf.io.TFRecordWriter(str(DATADIR / tfrecfilename)) as writer:
            n_images = 0      # records actually written to this shard
            n_not_jfif = 0    # .jpg members skipped for lacking a JFIF marker
            for member in shard:
                if not member.filename.endswith(".jpg"):
                    continue

                imagedata = myzip.read(member)
                # Only members with "JFIF" in their first 10 bytes are stored.
                if b"JFIF" not in imagedata[:10]:
                    n_not_jfif += 1
                    continue

                if STORE_DECODED_IMAGES:
                    # there are corrupt images
                    try:
                        image = tf.image.decode_image(imagedata)
                        image = tf.image.resize(image, [256, 256])
                        image = tf.cast(image, tf.uint8)
                        n_channels = tf.shape(image).numpy()[2]
                        if n_channels != 3:
                            print("  skipping {} with {} channels"
                                  .format(member.filename, n_channels))
                            continue
                        imagedata = tf.io.serialize_tensor(image).numpy()
                    except Exception as exp:
                        # Extract the offending file for inspection, report the
                        # error, and skip the member: without the `continue` a
                        # raw or half-processed image would be written below.
                        print(myzip.extract(member))
                        print(exp)
                        continue

                # The class name is the directory directly below PetImages/.
                classname = re.findall(r"^.*PetImages/(.*)/\d+\.jpg$", member.filename)[0]
                if classname not in classindices:
                    classindices[classname] = len(classindices)

                example = create_example(imagedata, member.filename,
                                         classname, classindices[classname])
                writer.write(example.SerializeToString())
                # Count only after a successful write so the summary is accurate.
                n_images += 1
        print("Wrote {} records in {} (shard {}, n_not_jfif {})".format(n_images, tfrecfilename, i, n_not_jfif))

In [None]:
# Convert the archive into separate train/val/test TFRecord files, each holding
# at most EXAMPLES_PER_FILE examples (images_<split>_NNN.tfrec).
STORE_DECODED_IMAGES = True
EXAMPLES_PER_FILE = 1000

# Maps class name -> integer index, assigned in order of first appearance.
classindices = {}

n_images = 0          # examples written (all splits)
n_test = 0
n_train = 0
n_val = 0
n_not_jfif = 0        # .jpg members skipped for lacking a JFIF marker
n_decode_errors = 0   # members that failed decoding / had != 3 channels

# split label -> (current file name, open TFRecordWriter).
# A writer must stay open while its file is being filled: opening a new
# TFRecordWriter on an existing path starts the file over, so re-opening it
# for every example would leave a single record per file.
writers = {}

myzip = ZipFile(datasetfile)
try:
    infolist = myzip.infolist()
    random.shuffle(infolist)
    for member in infolist:
        if not member.filename.endswith(".jpg"):
            continue

        imagedata = myzip.read(member)
        # Only members with "JFIF" in their first 10 bytes are stored.
        if b"JFIF" not in imagedata[:10]:
            n_not_jfif += 1
            continue

        # When STORE_DECODED_IMAGES is off the raw JPEG bytes are stored as-is.
        if STORE_DECODED_IMAGES:
            # there are corrupt images
            try:
                image = tf.image.decode_image(imagedata)
                image = tf.image.resize(image, [256, 256])
                image = tf.cast(image, tf.uint8)
                n_channels = tf.shape(image).numpy()[2]
                if n_channels != 3:
                    raise ValueError("  skipping {} with {} channels"
                                     .format(member.filename, n_channels))
                imagedata = tf.io.serialize_tensor(image).numpy()
            except Exception as exp:
                # Extract the offending file for inspection and skip it.
                n_decode_errors += 1
                print(myzip.extract(member))
                print(exp)
                continue

        n_images += 1

        # The class name is the directory directly below PetImages/.
        classname = re.findall(r"^.*PetImages/(.*)/\d+\.jpg$", member.filename)[0]
        if classname not in classindices:
            classindices[classname] = len(classindices)

        example = create_example(imagedata, member.filename,
                                 classname, classindices[classname])

        # 70/20/10% split for train/val/test datasets, decided by the running
        # example counter modulo 10 (0-6 train, 8-9 val, 7 test).
        # https://stackoverflow.com/questions/51125266/how-do-i-split-tensorflow-datasets
        if n_images % 10 < 7:
            split, n_in_split = "train", n_train
            n_train += 1
        elif n_images % 10 > 7:
            split, n_in_split = "val", n_val
            n_val += 1
        else:
            split, n_in_split = "test", n_test
            n_test += 1
        tfrecfilename = "images_{}_{:03d}.tfrec".format(split, n_in_split // EXAMPLES_PER_FILE)

        # Roll over to a new file once the current one holds EXAMPLES_PER_FILE
        # examples; otherwise keep appending through the already-open writer.
        current = writers.get(split)
        if current is None or current[0] != tfrecfilename:
            if current is not None:
                current[1].close()
            current = (tfrecfilename, tf.io.TFRecordWriter(str(DATADIR / tfrecfilename)))
            writers[split] = current
        current[1].write(example.SerializeToString())
finally:
    # Flush and close every open writer and the archive, even after an error.
    for _, open_writer in writers.values():
        open_writer.close()
    myzip.close()

print("n_train={}, n_val={}, n_test={}".format(n_train, n_val, n_test))
print("n_not_jfif={}, n_decode_errors={}".format(n_not_jfif, n_decode_errors))

In [None]:
# Scratch check: splitting l items into n parts as `l % n` parts of size
# `l // n + 1` plus `n - l % n` parts of size `l // n` accounts for all l items.
# Presumably this mirrors the shard sizes np.array_split produces - confirm.
l = 12345
n = 100
base_size, n_larger = divmod(l, n)
print(n_larger)
print(n - n_larger)
print(base_size + 1)
print(base_size)
n_larger * (base_size + 1) + (n - n_larger) * base_size

In [8]:
STORE_DECODED_IMAGES = True
EXAMPLES_PER_FILE = 1000
# Fixed seed so the train/val/test assignment is identical on every run.
SPLIT_SEED = 42

# Left open on purpose: the archive is read by the conversion code later in
# this cell and closed at the end of the cell.
myzip = ZipFile(datasetfile)

infolist = [info for info in myzip.infolist() if info.filename.endswith('.jpg')]
# A dedicated generator makes the shuffle reproducible without disturbing the
# global `random` state used elsewhere in the notebook.
random.Random(SPLIT_SEED).shuffle(infolist)

list_train = []
list_val   = []
list_test  = []

# 70/20/10% split by position in the shuffled list:
# index % 10 in 0..6 -> train, 8..9 -> val, 7 -> test.
for i, info in enumerate(infolist):
    if i % 10 < 7:
        list_train.append(info)
    elif i % 10 > 7:
        list_val.append(info)
    else:
        list_test.append(info)


# Maps class name -> integer index, assigned in order of first appearance.
classindices = {}

# Notebook-level counters; all start at zero.
n_images = 0
n_test = 0
n_train = 0
n_val = 0
n_not_jfif = 0
n_decode_errors = 0

def decode_image(imagedata, filename="<unknown>"):
    """Decode raw image bytes into a serialized 256x256x3 uint8 tensor.

    Args:
        imagedata: encoded image bytes as read from the archive.
        filename: name used in the error message only.

    Returns:
        The bytes produced by `tf.io.serialize_tensor` for the decoded image
        after resizing to 256x256 and casting to uint8.

    Raises:
        ValueError: if the decoded image is not a rank-3 tensor with 3 channels.
        Exception: whatever TensorFlow raises for data it cannot decode.
    """
    image = tf.image.decode_image(imagedata)
    image = tf.image.resize(image, [256, 256])
    image = tf.cast(image, tf.uint8)
    dims = tf.shape(image).numpy()
    if len(dims) != 3 or dims[2] != 3:
        raise ValueError("  skipping {} with {} channels"
                         .format(filename, dims[-1]))
    return tf.io.serialize_tensor(image).numpy()


def create_tfrec_files(myzip, infolist, dataset_label):
    """Write the given archive members to images_<dataset_label>_NNN.tfrec files.

    The members are spread over `len(infolist) // EXAMPLES_PER_FILE` files (at
    least one). Members without a JFIF marker in their first 10 bytes, and
    members that fail to decode when STORE_DECODED_IMAGES is on, are skipped.
    When STORE_DECODED_IMAGES is off the raw JPEG bytes are stored unchanged.
    New class names are registered in the global `classindices`.

    Args:
        myzip: open `ZipFile` to read the members from.
        infolist: sequence of `ZipInfo` members belonging to this split.
        dataset_label: split name used in the file names and the summary.

    Returns:
        The number of examples written.
    """
    n_images = 0
    n_not_jfif = 0
    n_decode_errors = 0

    # Nothing to write: avoid creating an empty TFRecord file.
    if len(infolist) == 0:
        print("n_{}={}".format(dataset_label, n_images))
        return n_images

    # np.array_split rejects 0 sections, which the plain floor division yields
    # for fewer than EXAMPLES_PER_FILE members.
    n_shards = max(1, len(infolist) // EXAMPLES_PER_FILE)

    shards = np.array_split(infolist, n_shards)
    for i, shard in enumerate(shards):
        tfrecfilename = "images_{}_{:03d}.tfrec".format(dataset_label, i)
        with tf.io.TFRecordWriter(str(DATADIR / tfrecfilename)) as writer:
            for member in shard:
                imagedata = myzip.read(member)
                if b"JFIF" not in imagedata[:10]:
                    n_not_jfif += 1
                    continue

                if STORE_DECODED_IMAGES:
                    # there are corrupt images: skip them, but keep count
                    # instead of swallowing the failure silently
                    try:
                        imagedata = decode_image(imagedata, member.filename)
                    except Exception:
                        n_decode_errors += 1
                        continue

                # The class name is the directory directly below PetImages/.
                classname = re.findall(r"^.*PetImages/(.*)/\d+\.jpg$", member.filename)[0]
                if classname not in classindices:
                    classindices[classname] = len(classindices)

                example = create_example(imagedata, member.filename,
                                         classname, classindices[classname])

                writer.write(example.SerializeToString())
                n_images += 1

    print("n_{}={}".format(dataset_label, n_images))
    print("  skipped: {} without JFIF marker, {} failed to decode"
          .format(n_not_jfif, n_decode_errors))
    return n_images


datasets = {"train":list_train, "val":list_val, "test":list_test}
try:
    written = {key: create_tfrec_files(myzip, members, key)
               for key, members in datasets.items()}
finally:
    # Close the archive even if one of the splits fails.
    myzip.close()

# Publish the written-example counts under the notebook-level counter names so
# later cells can compute the realised split fractions.
n_train, n_val, n_test = written["train"], written["val"], written["test"]
n_images = n_train + n_val + n_test

2023-10-21 12:54:18.576011: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Corrupt JPEG data: 239 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1403 extraneous bytes before marker 0xd9
Corrupt JPEG data: 214 extraneous bytes before marker 0xd9
Corrupt JPEG data: 252 extraneous bytes before marker 0xd9
Corrupt JPEG data: 65 extraneous bytes before marker 0xd9
Corrupt JPEG data: 228 extraneous bytes before marker 0xd9
Corrupt JPEG data: 162 extraneous bytes before marker 0xd9


n_train=16231


Corrupt JPEG data: 99 extraneous bytes before marker 0xd9
Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1153 extraneous bytes before marker 0xd9


n_val=4631


Corrupt JPEG data: 2226 extraneous bytes before marker 0xd9
Corrupt JPEG data: 396 extraneous bytes before marker 0xd9


n_test=2331


In [9]:
# Number of archive members assigned to train / val / test, one per line.
print(len(list_train), len(list_val), len(list_test), sep="\n")


17500
5000
2500


In [None]:
# Fraction of the counted examples that ended up in each split.
# Guarded because n_images is 0 until a conversion cell has recorded its
# counts; dividing would otherwise raise ZeroDivisionError.
if n_images:
    print(n_train / n_images)
    print(n_val / n_images)
    print(n_test / n_images)
else:
    print("n_images is 0: no example counts have been recorded yet")


In [None]:
def preprocess_image(image):
    """Turn the stored "image" feature back into a uint8 image tensor.

    With STORE_DECODED_IMAGES the feature is parsed as a serialized uint8
    tensor; otherwise it is decoded as a 3-channel JPEG, resized to 256x256
    and cast to uint8.
    """
    if not STORE_DECODED_IMAGES:
        decoded = tf.image.decode_jpeg(image, channels=3)
        resized = tf.image.resize(decoded, [256, 256])
        return tf.cast(resized, tf.uint8)
    return tf.io.parse_tensor(image, tf.uint8)

# Parsing schema for the stored examples: three scalar string features and
# one scalar int64 feature, matching what create_example writes.
feature_description = {
    name: tf.io.FixedLenFeature((), tf.string)
    for name in ("image", "filename", "classname")
}
feature_description["classidx"] = tf.io.FixedLenFeature((), tf.int64)

def load_image(example_proto):
    """Parse one serialized example into an `(image, classidx)` pair."""
    parsed = tf.io.parse_single_example(example_proto, feature_description)
    image = preprocess_image(parsed["image"])
    label = parsed["classidx"]
    return (image, label)

In [None]:
# Paths of the N_SHARDS files named images_NNN.tfrec, read in random order.
datadir_prefix = str(DATADIR)
tfrec_filenames = [datadir_prefix + "/images_{:03d}.tfrec".format(shard_no)
                   for shard_no in range(N_SHARDS)]
random.shuffle(tfrec_filenames)

# Stream the records and decode each one into an (image, classidx) pair.
full_dataset = tf.data.TFRecordDataset(tfrec_filenames).map(
    load_image, num_parallel_calls=tf.data.AUTOTUNE
)


In [None]:
batch_size = 32

# 80/20 train/test split of the full dataset.
# (An alternative is to count the elements first and split with take/skip.)
train_dataset, test_dataset = tf.keras.utils.split_dataset(full_dataset, left_size=0.8)

# Training pipeline: shuffle with a 128-element buffer, batch, then prefetch.
train_dataset = (
    train_dataset
    .shuffle(128)
    .batch(batch_size, drop_remainder=False)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

# The test pipeline is only batched.
test_dataset = test_dataset.batch(batch_size, drop_remainder=False)

In [None]:
# Cardinality reported for a dataset holding just the first training batch.
first_batch_only = train_dataset.take(1)
b = first_batch_only.cardinality()
b

In [None]:
# Inspect the dtype of the tensor returned by `cardinality()`.
train_dataset.cardinality().dtype
# Other values inspected here while exploring (left disabled):
#train_filenames
#classindices

In [None]:
# Preview up to 16 images of the first training batch in a 4x4 grid, each
# titled with its class name and class index.
# Class names ordered by class index: `classindices` assigns indices in
# insertion order, so the key order is the index order.
classnames = list(classindices.keys())
for batch, labelidx in train_dataset.take(1):
    print(batch.get_shape())
    print(labelidx)
    fig, axes = plt.subplots(4, 4, figsize=(13, 13))
    # The batch may hold fewer images than the grid has cells.
    n_shown = min(axes.size, int(batch.shape[0]))
    for i, ax in enumerate(axes.flat):
        if i >= n_shown:
            # Hide grid cells that have no image to show.
            ax.axis("off")
            continue
        ax.imshow(batch[i])
        lidx = labelidx[i].numpy()
        ax.set_title("{} ({})".format(classnames[lidx], lidx))
        ax.grid(False)
        ax.set_xticks([])
        ax.set_yticks([])
    fig.suptitle('Some images from the Kaggle Cat vs. Dog dataset', fontsize=16, y=0.93)

In [None]:
# Show the class names in class-index order.
classnames