In [1]:
import os
import glob
from joblib import Parallel, delayed

# Converting the images to tensorflow data

## Convert classified image folders to tfrecords

For each pair of image/label files, we want to create a tensorflow Example and then write those to a .tfrecords file.

In order to be able to parallelise this step, we'll create sharded output files i.e. one .tfrecords for each worker.

## Approach 1: Translate to tfrecords via multithreaded process.

This is taken from https://github.com/tensorflow/models/blob/f87a58cd96d45de73c9a8330a06b2ab56749a7fa/research/inception/inception/data/build_image_data.py#L291

This uses TensorFlow library functions for all the time-consuming parts, such as reading the image files. Because they are written with this in mind, they can be parallelised through multithreading (they don't hold the GIL). However, this means that we're limited to what they provide in terms of supported data types etc. In particular, they don't support GeoTIFFs or images with a bit depth greater than 8. So we can only do this for our RGB imagery, and first we have to translate the GeoTIFFs to PNG (or we could create PNGs in the first place).

In [13]:
# Machine-specific absolute paths used by the conversion cells in this notebook.
# NOTE: IMAGE_FOLDER is reassigned by later cells, so its value depends on
# which cells have been run (and in what order).
IMAGE_FOLDER = r'C:/Users/harry/Documents/Data/airbus_1m_16pad//'
# Destination root for the PNG copies written by translate_tif_to_png.
new_imagefolder = r'C:/Users/harry/Documents/Data/airbus_1m_16pad_png/' 
#new_images = glob.glob(os.path.join(new_imagefolder, 'images', '*.png'))
# The folders below are not referenced by any other cell visible in this notebook.
tf_thread_folder = r'C:\Users\harry\Documents\Data\tf_threaded_jpg'
tf_mp_folder = r'C:\Users\harry\Documents\Data\tf_mp'
tf_mp_tif_folder = r'C:\Users\harry\Documents\Data\tf_tif_mp'
sentinel_img_folder = r'C:\Users\harry\Documents\Data\sentinel_10m_2pad'
tf_mp_sentinel_folder = r'C:\Users\harry\Documents\Data\tf_sentinel_mp'

In [42]:
# Overrides the IMAGE_FOLDER set in the earlier config cell and defines the
# tfrecords output folder; these are the values passed to process_dataset_mp.
IMAGE_FOLDER = r'C:/Users/harry/Documents/Data/sentinel_10m_2pad_SR_sudan/'
TF_FOLDER = r'C:/Users/harry/Documents/Data/tf_sentinel_SR_arr_sudan/'


### Translate all the tiffs to PNGs

This is only appropriate for the 3 band 8-bit imagery! Do this to translate all the tiffs in a folder to pngs.

In [None]:
import rasterio
def translate_tif_to_png(tif_path):
    """Write a PNG copy of one raster into the mirrored output folder.

    The output path is ``tif_path`` with the ``IMAGE_FOLDER`` prefix swapped
    for ``new_imagefolder`` and the file extension replaced by ``.png``.
    Size, band count, dtype, nodata, transform and CRS are copied from the
    source dataset.

    Args:
        tif_path: Path of the source raster. ``IMAGE_FOLDER`` and
            ``new_imagefolder`` are read from the notebook globals.
    """
    # Swap only the extension: str.replace('.tif', '.png') would also rewrite
    # '.tif' elsewhere in the path, turn '.tiff' into '.pngf', and leave
    # '.TIF' untouched (so the output path would equal the input path).
    stem, _ = os.path.splitext(tif_path.replace(IMAGE_FOLDER, new_imagefolder))
    out_path = stem + '.png'
    with rasterio.open(tif_path, 'r') as ds:
        out_dir = os.path.dirname(out_path)
        if out_dir:
            # exist_ok avoids the check-then-create race when several joblib
            # workers try to create the same output folder at once.
            os.makedirs(out_dir, exist_ok=True)
        with rasterio.open(out_path, 'w', driver='PNG',
                           width=ds.width, height=ds.height, count=ds.count,
                           dtype=ds.dtypes[0], nodata=ds.nodata,
                           transform=ds.transform, crs=ds.crs) as dst:
            dst.write(ds.read())

# Gather the GeoTIFF paths of both sub-folders before any conversion starts.
images, labels = [
    glob.glob(os.path.join(IMAGE_FOLDER, subfolder, '*.tif'))
    for subfolder in ('images', 'labels')
]
# Convert each list with 8 joblib workers: images first, then labels.
for tif_list in (images, labels):
    Parallel(n_jobs=8)(delayed(translate_tif_to_png)(tif) for tif in tif_list)
# From this point on IMAGE_FOLDER refers to the PNG folder.
IMAGE_FOLDER = new_imagefolder

In [19]:
import img_to_tf_threaded;from img_to_tf_threaded import process_dataset_multithreaded
import tensorflow as tf
import numpy as np

In [1]:
import tf_example_creation; from tf_example_creation import convert_to_example

We can use this function to translate a complete image folder tree to sharded tfrecords, provided the images are 8-bit PNG or JPG files (although currently it's coded to look only for PNG). PNGs can be transcoded to JPG along the way, which decreases file size but is lossy.

This process is super fast as it uses optimised TF code throughout.

In [None]:
# Convert the folder tree at IMAGE_FOLDER into 12 tfrecords shards using 12 threads.
# NOTE(review): TF_FOLDER_MP is not assigned in any cell visible in this
# notebook, so this call raises NameError on a fresh kernel -- confirm which
# output folder was intended.
process_dataset_multithreaded(name="test",  directory=IMAGE_FOLDER, out_directory=TF_FOLDER_MP, 
                              num_shards=12, num_threads=12,
                             dltile_from_filename=True, convert_png_to_jpg=False,
                             store_as_array=True)

## Approach 2: Translate to tfrecords via multiprocessing

Tensorflow native code doesn't have any readers for Tiff files and imagery stuff in general seems to be strongly oriented around 3 band 8 bit images, so isn't helpful for multispectral tiff files with higher bit depths.

We'll use rasterio to load the image data. However, we still use the tf API to read the raw data from disk, and rasterio then parses it as an in-memory dataset (this seems to be hugely faster).

We can't profitably use multithreading because the GIL won't get released so it won't be any faster. Instead we process the images using multiprocessing to split into shards / batches, each process will write one or more separate tfrecords files. 

In [4]:
import img_to_tf_mp; from img_to_tf_mp import process_dataset_mp

In [5]:
import tensorflow as tf

In [None]:
# Pick up edits made to img_to_tf_mp.py without restarting the kernel.
# `reload` is not a builtin on Python 3, so take it from importlib.
import importlib
importlib.reload(img_to_tf_mp)

In [43]:
# Convert the .tif image/label pairs under IMAGE_FOLDER into 12 tfrecords
# shards in TF_FOLDER, using 12 worker processes.
process_dataset_mp(name="sentinel_SR_arr_sudan", directory=IMAGE_FOLDER, out_directory=TF_FOLDER, 
                   num_shards=12, num_proc=12, 
                   dltile_from_filename=True, file_ext='tif', 
                   store_as_array=True) 

Determining list of input files and labels from C:/Users/harry/Documents/Data/sentinel_10m_2pad_SR_sudan/.
Found 527 tif image files and 527 label files inside C:/Users/harry/Documents/Data/sentinel_10m_2pad_SR_sudan/.


## Approach 3 : don't translate to tfrecords files, define a mapping in the pipeline


(TBD)

In [45]:
# Split the (image, label) entries of `res` into two parallel string tensors.
# NOTE(review): `res` is not assigned in any cell visible in this notebook.
img_files = tf.constant(list(map(lambda entry: entry[0], res)))
lbl_files = tf.constant(list(map(lambda entry: entry[1], res)))

In [46]:
# One dataset element per (image path, label path) pair.
ds = tf.data.Dataset.from_tensor_slices((img_files, lbl_files))

In [None]:
from tf_example_creation import convert_to_example
from img_to_tf_threaded import _process_image, ImageCoder

# Single ImageCoder instance, passed to every _process_image call in eg_from_image_paths.
coder = ImageCoder()

def eg_from_image_paths(image_file, label_file):
    """Load one image/label pair and return the arrays with their metadata.

    Both files are read with ``_process_image`` using the shared ``coder``.

    Args:
        image_file: Path of the image file.
        label_file: Path of the matching label file.

    Returns:
        A tuple ``(img_arr, lbl_arr, shp, dlkey)``. ``shp`` is the three size
        values ``_process_image`` reported for the image, and ``dlkey`` is the
        fifth value it returned.

    Raises:
        AssertionError: If the fifth value returned for the image differs from
            the one returned for the label.
    """
    img_arr, h, w, c, dlkey = _process_image(image_file, coder)
    lbl_arr, _, _, _, lbl_key = _process_image(label_file, coder)
    assert dlkey == lbl_key
    # The previous version returned lbl_arr / shp / dlkey without ever
    # assigning them (NameError), and built a convert_to_example() result that
    # was never used; that unused call has been dropped.
    # NOTE(review): assumes (h, w, c) is height/width/channels and that the
    # fifth return value is the tile key -- confirm against _process_image.
    shp = (h, w, c)
    # TODO: _process_image presumably needs eager values, so wrapping this in
    # tf.py_function is probably required before it can be used in ds.map
    # -- confirm.
    #[img_arr, lbl_arr, shp, dlkey] = tf.py_function(_process_image_and_lbl, image_file, label_file)
    #img_arr.set_shape(shp)
    return img_arr, lbl_arr, shp, dlkey

In [None]:
# Apply eg_from_image_paths to every (image path, label path) element.
# NOTE: rebinds `ds`, so re-running this cell maps the already-mapped dataset.
ds = ds.map(eg_from_image_paths)

In [None]:

# convert classified image folders to tfrecords

def tfrecord_from_images(imgpath, lblpath):
    """Unfinished stub: derives the tile key parts from the image filename.

    Only the filename of ``imgpath`` is inspected; ``lblpath`` is not used yet
    and nothing is returned.
    """
    # Take the filename component and split it on whitespace.
    _, filename = os.path.split(imgpath)
    dl_tile_key = filename.split()
# List the .tif files in a hard-coded training image folder and show how many were found.
infiles = glob.glob(r'E:\Temp\mumbai_esri_train2m\images\*.tif')
len(infiles)