In [1]:
import os
import numpy as np
import random
import tensorflow as tf
import pathlib

In [19]:
# Notebook-wide configuration.
SEED = 42        # single seed shared by all random number generators
BATCH_SIZE = 32  # batch size applied in configure_for_performance

# SEED used to be defined without ever being applied. Seed Python, NumPy and
# TensorFlow here so that a "Restart Kernel -> Run All" reproduces the same
# random decisions.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print(tf.__version__)

2.3.1


In [9]:
# Root of the dataset; kept as a plain string because it is also joined with
# os.path.join when the datasets are saved.
data_path = '../data'

# Images live one directory level below images_dir; masks are matched
# directly inside masks_dir.
images_dir = pathlib.Path(data_path) / 'images'
masks_dir = pathlib.Path(data_path) / 'images_masks'

# Count matches lazily instead of materialising the full list of paths.
image_count = sum(1 for _ in images_dir.glob('*/*.jpg'))
masks_count = sum(1 for _ in masks_dir.glob('*.jpg'))

# Sanity check: print both counts and whether they agree.
print(image_count, masks_count, image_count == masks_count)

17712 17712 True


In [10]:
# List the image files without shuffling, using the same '*/*.jpg' pattern
# that produced image_count so the shuffle buffer and the split sizes computed
# from image_count match the number of listed files.
list_ds = tf.data.Dataset.list_files(str(images_dir / '*/*.jpg'), shuffle=False)

# Shuffle exactly once with a fixed seed. The take()/skip() splits below slice
# this order, so an unseeded shuffle would put different files into the
# train/validation splits on every kernel run.
list_ds = list_ds.shuffle(image_count, seed=SEED, reshuffle_each_iteration=False)

In [11]:
# Peek at the first five shuffled file paths (printed as raw bytes).
for path_bytes in list_ds.take(5).as_numpy_iterator():
    print(path_bytes)

b'../data/images/good/71f2ce9ab_5.jpg'
b'../data/images/good/112942aed_1.jpg'
b'../data/images/bad/a16672b15_6.jpg'
b'../data/images/good/046c35525_6.jpg'
b'../data/images/good/8c3539cdd_16.jpg'


In [12]:
# Fraction of all images given to the autoencoder; the remainder goes to the
# convolutional classifier. Each share then gives up validation_split of its
# files for validation.
network_split = 0.6
validation_split = 0.2

ae_size = int(image_count * network_split)
ae_val_size = int(ae_size * validation_split)
conv_val_size = int((image_count - ae_size) * validation_split)
print(ae_size, ae_val_size, conv_val_size)

# Autoencoder share: the first ae_size paths of the shuffled listing.
ae_ds = list_ds.take(ae_size)
ae_val_ds = ae_ds.take(ae_val_size)
ae_train_ds = ae_ds.skip(ae_val_size)

# Classifier share: everything after the autoencoder share.
conv_ds = list_ds.skip(ae_size)
conv_val_ds = conv_ds.take(conv_val_size)
conv_train_ds = conv_ds.skip(conv_val_size)

10627 2125 1417


In [13]:
# Report the number of elements in the full listing and in every split,
# one value per line.
for split_ds in (
    list_ds,
    conv_ds,
    conv_train_ds,
    conv_val_ds,
    ae_ds,
    ae_train_ds,
    ae_val_ds,
):
    print(tf.data.experimental.cardinality(split_ds).numpy())

17712
7085
5668
1417
10627
8502
2125


In [16]:
# Class names are the names of the sub-directories of images_dir, sorted so
# the integer label of each class is stable. Filtering on is_dir() skips
# '.gitkeep' as before and also any other stray file that would otherwise be
# treated as a class.
class_names = np.array(sorted(item.name for item in images_dir.glob('*') if item.is_dir()))
print(class_names)

def get_label(file_path):
    """Return the integer label of an image from its file path.

    The label is the position in ``class_names`` of the directory that
    directly contains the file.
    """
    # The second-to-last path component is the class directory.
    class_dir = tf.strings.split(file_path, os.path.sep)[-2]
    # Element-wise comparison against class_names marks the matching class;
    # argmax turns that mask into an integer index.
    return tf.argmax(class_dir == class_names)

def decode_img(img, img_size=(256, 256)):
    """Decode JPEG bytes into a 3-channel image and resize it.

    Args:
        img: the compressed JPEG contents (as returned by ``tf.io.read_file``).
        img_size: target ``(height, width)`` passed to ``tf.image.resize``.
            Defaults to ``(256, 256)``, the size that used to be hard-coded.

    Returns:
        The decoded image resized to ``img_size``.
    """
    # convert the compressed string to a 3D uint8 tensor
    img = tf.image.decode_jpeg(img, channels=3)
    # resize the image to the desired size
    return tf.image.resize(img, img_size)

def process_path_ae(file_path):
    """Load an image for the autoencoder.

    Reads and decodes the file, then returns the image twice as an
    ``(input, target)`` pair.
    """
    image = decode_img(tf.io.read_file(file_path))
    return image, image

def process_path_conv(file_path):
    """Load an image and its class label for the classifier.

    Returns an ``(image, label)`` pair: the decoded, resized image and the
    integer label derived from the file's parent directory.
    """
    raw_bytes = tf.io.read_file(file_path)
    return decode_img(raw_bytes), get_label(file_path)

['bad' 'good']


In [17]:
# Let tf.data choose the level of parallelism.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Classifier splits: file path -> (image, label).
conv_train_ds, conv_val_ds = [
    paths.map(process_path_conv, num_parallel_calls=AUTOTUNE)
    for paths in (conv_train_ds, conv_val_ds)
]

# Autoencoder splits: file path -> (image, image).
ae_train_ds, ae_val_ds = [
    paths.map(process_path_ae, num_parallel_calls=AUTOTUNE)
    for paths in (ae_train_ds, ae_val_ds)
]

In [20]:
def configure_for_performance(ds, shuffle=True, shuffle_buffer_size=1000):
    """Cache, optionally shuffle, batch and prefetch a dataset.

    Args:
        ds: dataset to configure.
        shuffle: whether to insert a shuffle step after caching. Defaults to
            True, which matches the previous unconditional behaviour; pass
            False for splits that do not need shuffling.
        shuffle_buffer_size: buffer size handed to ``ds.shuffle``. Defaults to
            1000, the value that used to be hard-coded.

    Returns:
        The configured dataset, batched with ``BATCH_SIZE``.
    """
    ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(buffer_size=shuffle_buffer_size)
    ds = ds.batch(BATCH_SIZE)
    # Prefetch last so upcoming batches are prepared while the current one is
    # being consumed.
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds

# Run all four splits through configure_for_performance.
conv_train_ds, conv_val_ds, ae_train_ds, ae_val_ds = [
    configure_for_performance(split)
    for split in (conv_train_ds, conv_val_ds, ae_train_ds, ae_val_ds)
]

In [21]:
def save(ds, name):
    """Write a dataset to ``<data_path>/<name>`` and return it.

    Args:
        ds: dataset to persist with ``tf.data.experimental.save``.
        name: name of the output directory, created under ``data_path``.

    Returns:
        ``ds`` itself, so that ``x = save(x, ...)`` keeps the dataset bound
        instead of replacing it with None.

    Raises:
        FileExistsError: if the output path already exists; nothing is
            written in that case. (FileExistsError is a subclass of
            Exception, so existing ``except Exception`` handlers still work.)
    """
    out_path = os.path.join(data_path, name)
    # Refuse to overwrite an earlier export.
    if os.path.exists(out_path):
        raise FileExistsError(f'Path exists already: {out_path}')
    tf.data.experimental.save(ds, out_path)
    return ds

In [22]:
# Persist every split under data_path. The results are deliberately not
# assigned back: rebinding the variables to save()'s return value used to
# replace all four datasets with None.
for split_name, split_ds in (
    ('conv_train_ds', conv_train_ds),
    ('conv_val_ds', conv_val_ds),
    ('ae_train_ds', ae_train_ds),
    ('ae_val_ds', ae_val_ds),
):
    save(split_ds, split_name)