# IMPORTS

In [1]:
import os, sys, math
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import clear_output
import multiprocessing
 
from tensorflow.keras.utils import Sequence
if 'google.colab' in sys.modules: # Colab-only Tensorflow version selector
  %tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)
AUTO = tf.data.experimental.AUTOTUNE # used in tf.data.Dataset API

Tensorflow version 2.2.0


# PARAMETERS

In [3]:
FORMAT = 'jpg'
IMAGE_SIZE = 256*6
TARGET_SIZE = (IMAGE_SIZE, IMAGE_SIZE)
CLASSES = ['0', '1', '2', '3', '4', '5'] # do not change, maps to the labels in the data (folder names)
SHARD_MEM = 150 # tfrecord max size in Mb to optimize speed loading
SHARD_SIZE = math.ceil((SHARD_MEM*10**7/(IMAGE_SIZE**2*3))) # max number of image in a tfrecord
SHARD_SIZE = 512 # Num files in a tfrecord
GCS_PATH = 'gs://huynhdoo/prostate-cancer/'
GCS_PATTERN = GCS_PATH + 'mounting/256x36/*/*/*.' + FORMAT
GCS_OUTPUT = GCS_PATH + 'mounting/256x36/tfrec-' + FORMAT + '-' + str(IMAGE_SIZE) + 'x' + str(IMAGE_SIZE) + '/'  # prefix for output file names
NB_IMAGES = len(tf.io.gfile.glob(GCS_PATTERN))
SHARDS = math.ceil(NB_IMAGES / SHARD_SIZE) 
 
print('IMAGES :', NB_IMAGES)
print('SOURCE :', GCS_PATTERN)
print('DESTINATION :', GCS_OUTPUT)

IMAGES : 21230
SOURCE : gs://huynhdoo/prostate-cancer/mounting/256x36/*/*/*.jpg
DESTINATION : gs://huynhdoo/prostate-cancer/mounting/256x36/tfrec-jpg-1536x1536/


In [4]:
#@title Display utilities
def display_images_from_dataset(dataset, num):
  plt.figure(figsize=(13,13))
  subplot=331
  slide = 0
  for i, (image, label, provider) in enumerate(dataset):
    plt.subplot(subplot)
    plt.axis('off')
    plt.imshow(image.numpy().astype(np.uint8))
    plt.title(provider.numpy().decode("utf-8") + ' - ' + label.numpy().decode("utf-8"), fontsize=16)
    subplot += 1
    slide +=1
    if slide==num:
      break
  plt.tight_layout()
  plt.subplots_adjust(wspace=0.1, hspace=0.1)
  plt.show()

# GOOGLE CLOUD CREDENTIALS

In [5]:
if 'google.colab' in sys.modules:
   from google.colab import auth
   auth.authenticate_user()

# LOAD IMAGES

In [6]:
# Extract image and label
def decode_image_and_label(filename):
  bits = tf.io.read_file(filename)
  image = tf.image.decode_image(bits, channels=3)
  # parse biopsy name from containing directory
  features = tf.strings.split(tf.expand_dims(filename, axis=-1), sep='/')
  provider = features.values[6]
  label = features.values[7]
  return image, label, provider
 
# Load data from GCS
def load_data():
    NB_IMAGES = len(tf.io.gfile.glob(GCS_PATTERN))
    SHARDS = math.ceil(NB_IMAGES / SHARD_SIZE) 
    # print("Pattern matches {} images which will be rewritten as {} tfrec files containing {} images each.".format(NB_IMAGES, SHARDS, SHARD_SIZE))
 
    filenames = tf.data.Dataset.list_files(GCS_PATTERN, seed=35155) # This also shuffles the images
    return filenames.map(decode_image_and_label, num_parallel_calls=AUTO)

# RESIZE AND CROP IMAGES

In [7]:
def resize_and_crop_image(image, label, provider):
  # Resize and crop using "fill" algorithm:
  # always make sure the resulting image
  # is cut out from the source image so that
  # it fills the TARGET_SIZE entirely with no
  # black bars and a preserved aspect ratio.
  w = tf.shape(image)[0]
  h = tf.shape(image)[1]
  tw = TARGET_SIZE[0]
  th = TARGET_SIZE[1]
  resize_crit = (w * th) / (h * tw)
  print(w,h,tw,th)
  image = tf.cond(resize_crit < 1,
                  lambda: tf.image.resize(image, [w*tw/w, h*tw/w]), # if true
                  lambda: tf.image.resize(image, [w*th/h, h*th/h])  # if false
                 )
  nw = tf.shape(image)[0]
  nh = tf.shape(image)[1]
  image = tf.image.crop_to_bounding_box(image, (nw - tw) // 2, (nh - th) // 2, tw, th)
  return image, label, provider
 
def resize_data(dataset):  
    return dataset.map(resize_and_crop_image, num_parallel_calls=AUTO)

# COMPRESS IMAGES

In [8]:
# Bandwidth savings outweight the decoding CPU cost
def compress_image(image, label, provider):
  image = tf.cast(image, tf.uint8)
  image = tf.io.encode_jpeg(image, format='rgb', optimize_size=True, chroma_downsampling=True)
  return image, label, provider
 
def compress_data(dataset):
    return dataset.map(compress_image, num_parallel_calls=AUTO)

# EXPORT TO TFRECORDS

## Export utilities

In [9]:
# Three types of data can be stored in TFRecords: bytestrings, integers and floats
# They are always stored as lists, a single data element will be a list of size 1
 
def _bytestring_feature(list_of_bytestrings):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=list_of_bytestrings))
 
def _int_feature(list_of_ints): # int64
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list_of_ints))
 
def _float_feature(list_of_floats): # float32
  return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats))
  
 
def to_tfrecord(tfrec_filewriter, img_bytes, label, provider):  
  #class_num = np.argmax(np.array(CLASSES)==label) # 'roses' => 2 (order defined in CLASSES)
  class_num = int(label)
  one_hot_class = np.eye(len(CLASSES))[class_num]     # [0, 0, 1, 0, 0] for class #2, roses
  
  feature = {
      "image": _bytestring_feature([img_bytes]), # one image in the list
      "class_num": _int_feature([class_num]),        # one class in the list
      
      # additional (not very useful) fields to demonstrate TFRecord writing/reading of different types of data
      "label":         _bytestring_feature([label]),          # fixed length (1) list of strings, the text label
      "provider":      _bytestring_feature([provider]),       # fixed length (1) list of strings, the text label
      "one_hot_class": _float_feature(one_hot_class.tolist()) # variable length  list of floats, n=len(CLASSES)
  }
  return tf.train.Example(features=tf.train.Features(feature=feature))
 
# sharding: there will be one "batch" of images per file 
def batch_data(dataset):
    return dataset.batch(SHARD_SIZE)

def export_tfrecord(shard, image, label, provider):
    # batch size
    batch_size = image.numpy().shape[0]
    
    # good practice to have the number of records in the filename
    filename = GCS_OUTPUT + "{:02d}-{}.tfrec".format(shard, batch_size)
    
    with tf.io.TFRecordWriter(filename) as out_file:
        for i in range(batch_size):
            example = to_tfrecord(out_file,
                                  image.numpy()[i], # re-compressed image: already a byte string
                                  label.numpy()[i],
                                  provider.numpy()[i])      
            out_file.write(example.SerializeToString())
    return (filename, batch_size)
 
def proceed(result):
    filename, batch_size = result
    print("Wrote file {} containing {} records".format(filename, batch_size))

## Export processing

In [10]:
# Create dataset
dataset = load_data()
#dataset = resize_data(dataset)
dataset = compress_data(dataset)
dataset = batch_data(dataset)
 
# Create and send TFRecords to GCS (multiprocess)
print("Writing {} images into {} tfrec files containing {} images each...".format(NB_IMAGES, SHARDS, SHARD_SIZE))
for shard, (image, label, provider) in enumerate(dataset):
    #result = pool.apply_async(export_tfrecord, args = (shard, image, label, provider), callback = proceed)
    result = export_tfrecord(shard, image, label, provider)
    proceed(result)

Writing 21230 images into 42 tfrec files containing 512 images each...
Wrote file gs://huynhdoo/prostate-cancer/mounting/256x36/tfrec-jpg-1536x1536/00-512.tfrec containing 512 records
Wrote file gs://huynhdoo/prostate-cancer/mounting/256x36/tfrec-jpg-1536x1536/01-512.tfrec containing 512 records
Wrote file gs://huynhdoo/prostate-cancer/mounting/256x36/tfrec-jpg-1536x1536/02-512.tfrec containing 512 records
Wrote file gs://huynhdoo/prostate-cancer/mounting/256x36/tfrec-jpg-1536x1536/03-512.tfrec containing 512 records
Wrote file gs://huynhdoo/prostate-cancer/mounting/256x36/tfrec-jpg-1536x1536/04-512.tfrec containing 512 records
Wrote file gs://huynhdoo/prostate-cancer/mounting/256x36/tfrec-jpg-1536x1536/05-512.tfrec containing 512 records
Wrote file gs://huynhdoo/prostate-cancer/mounting/256x36/tfrec-jpg-1536x1536/06-512.tfrec containing 512 records
Wrote file gs://huynhdoo/prostate-cancer/mounting/256x36/tfrec-jpg-1536x1536/07-512.tfrec containing 512 records
Wrote file gs://huynhdoo/

In [None]:
# Cleaning remote files
# !gsutil -m rm -r gs://huynhdoo/prostate-cancer/mounting/128x36/tfrec-jpg-768x768/

# READ FROM TFRECORDS

In [None]:
def read_tfrecord(example):
    features = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string = bytestring (not text string)
        "class_num": tf.io.FixedLenFeature([], tf.int64),   # shape [] means scalar
        
        # additional (not very useful) fields to demonstrate TFRecord writing/reading of different types of data
        "label":         tf.io.FixedLenFeature([], tf.string),  # one bytestring
        "provider":          tf.io.FixedLenFeature([], tf.string),  # two integers
        "size":          tf.io.FixedLenFeature([2], tf.int64),  # two integers
        "one_hot_class": tf.io.VarLenFeature(tf.float32)        # a certain number of floats
    }
    # decode the TFRecord
    example = tf.io.parse_single_example(example, features)
    
    # FixedLenFeature fields are now ready to use: exmple['size']
    # VarLenFeature fields require additional sparse_to_dense decoding
    
    image = tf.image.decode_jpeg(example['image'], channels=3)
    image = tf.reshape(image, [*TARGET_SIZE, 3])
    class_num = example['class_num']
    
    label  = example['label']
    provider = example['provider']
    one_hot_class = tf.sparse.to_dense(example['one_hot_class'])
    return image, class_num, label, provider, one_hot_class
    
# read from TFRecords. For optimal performance, read from multiple
# TFRecord files at once and set the option experimental_deterministic = False
# to allow order-altering optimizations.
 
option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False
 
filenames = tf.io.gfile.glob(GCS_OUTPUT + "*.tfrec")
dataset_test = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
dataset_test = dataset_test.with_options(option_no_order)
dataset_test = dataset_test.map(read_tfrecord, num_parallel_calls=AUTO)
dataset_test = dataset_test.shuffle(300)
display_dataset = dataset_test.map(lambda image, class_num, label, provider, one_hot_class: (image, label, provider))
display_images_from_dataset(display_dataset, 3)

# SPEED TEST

## Speed test: too slow
Google Cloud Storage is capable of great throughput but has a per-file access penalty. Run the cell below and see that throughput is around 8 images per second. That is too slow. Training on thousands of individual files will not work. We have to use the **TFRecord** format to group files together. 

In [None]:
display_dataset = dataset.batch(4)
for image, label, provider in display_dataset.take(3):
  print("Image batch shape {})".format(image.numpy().shape))

Image batch shape (4, 29))
Image batch shape (4, 29))
Image batch shape (4, 29))
Image batch shape (4, 29))
Image batch shape (4, 29))


## Speed test: fast
Loading training data is not a bottleneck anymore

In [None]:
for image, class_num, label, provider, height, width, one_hot_class in dataset4.take(300):
    print("Image shape {}, {}x{} px, class={} ({:>10}, {})".format(image.numpy().shape, width, height, class_num, label.numpy().decode('utf8'), one_hot_class))

Image shape (512, 512, 3), 512x512 px, class=1 (         1, [0. 1. 0. 0. 0. 0.])
Image shape (512, 512, 3), 512x512 px, class=1 (         1, [0. 1. 0. 0. 0. 0.])
Image shape (512, 512, 3), 512x512 px, class=3 (         3, [0. 0. 0. 1. 0. 0.])
Image shape (512, 512, 3), 512x512 px, class=5 (         5, [0. 0. 0. 0. 0. 1.])
Image shape (512, 512, 3), 512x512 px, class=3 (         3, [0. 0. 0. 1. 0. 0.])
Image shape (512, 512, 3), 512x512 px, class=0 (         0, [1. 0. 0. 0. 0. 0.])
Image shape (512, 512, 3), 512x512 px, class=1 (         1, [0. 1. 0. 0. 0. 0.])
Image shape (512, 512, 3), 512x512 px, class=3 (         3, [0. 0. 0. 1. 0. 0.])
Image shape (512, 512, 3), 512x512 px, class=2 (         2, [0. 0. 1. 0. 0. 0.])
Image shape (512, 512, 3), 512x512 px, class=3 (         3, [0. 0. 0. 1. 0. 0.])
Image shape (512, 512, 3), 512x512 px, class=0 (         0, [1. 0. 0. 0. 0. 0.])
Image shape (512, 512, 3), 512x512 px, class=5 (         5, [0. 0. 0. 0. 0. 1.])
Image shape (512, 512, 3), 5

# LICENSE



---

ORIGINAL<br>
author: Martin Gorner<br>
twitter: @martin_gorner

FORK<br>
author: Do Huynh<br>
git: huynhdoo

---


Copyright 2020 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


---


This is not an official Google product but sample code provided for an educational purpose
