In [20]:
import torch
import tables

import os,sys
import glob

import PIL
import numpy as np

import cv2
import matplotlib.pyplot as plt

from sklearn import model_selection
import sklearn.feature_extraction.image
import random


seed = 42
random.seed(seed) # set the seed
print(f"random seed (note down for reproducibility): {seed}")

random seed (note down for reproducibility): 42


In [21]:
#v2
#7/11/2018

dataname="epistroma"

patch_size=500 #size of the tiles to extract and save in the database, must be >= to training size
stride_size=250 #distance to skip between patches, 1 indicated pixel wise extraction, patch_size would result in non-overlapping tiles
mirror_pad_size=250 # number of pixels to pad *after* resize to image with by mirroring (edge's of patches tend not to be analyzed well, so padding allows them to appear more centered in the patch)
test_set_size=.1 # what percentage of the dataset should be used as a held out validation/testing set
resize=1 #resize input images
classes=[0, 1] #what classes we expect to have in the data, here we have only 2 classes but we could add additional classes and/or specify an index from which we would like to ignore

#-----Note---
#One should likely make sure that  (nrow+mirror_pad_size) mod patch_size == 0, where nrow is the number of rows after resizing
#so that no pixels are lost (any remainer is ignored)


In [22]:
# dtype in which the images will be saved, 
# this indicates that images will be saved as unsigned int 8 bit, i.e., [0,255]
img_dtype = tables.UInt8Atom()

In [23]:
# create an atom to store the filename of the image, 
# just incase we need it later, 
filenameAtom = tables.StringAtom(itemsize=255) 

### list of mask files 

In [8]:
# create a list of the files, in this case 
# we're only interested in files which have masks so
files=glob.glob('/Users/ilyarudyak/data/epi/masks/*.png') 

In [10]:
files[:5]

['/Users/ilyarudyak/data/epi/masks/12884_00018_mask.png',
 '/Users/ilyarudyak/data/epi/masks/10264_00056_mask.png',
 '/Users/ilyarudyak/data/epi/masks/12867_00005_mask.png',
 '/Users/ilyarudyak/data/epi/masks/12929_00017_mask.png',
 '/Users/ilyarudyak/data/epi/masks/10291_00012_mask.png']

In [11]:
len(files)

42

In [12]:
!ls /Users/ilyarudyak/data/epi/masks/ | wc -l

      42


### create training and validation stages

In [13]:
phases={}
phases["train"], phases["val"] = next(iter(model_selection.ShuffleSplit(
    n_splits=1,test_size=test_set_size).split(files)))

In [15]:
phases['train']

array([20,  3, 16, 21, 31, 39, 26, 34,  9, 15, 36, 13,  0, 40, 25, 41, 28,
       12, 23, 30, 22, 19,  1,  8, 35, 17, 24, 11, 32,  4, 14, 38, 10, 37,
        7, 27, 18])

In [16]:
phases['val']

array([33,  2, 29,  6,  5])

In [24]:
#specify that we'll be saving 2 different image types to the database, 
# an image and its associated masked
imgtypes=["img", "mask"]

### main loop

In [26]:
#holder for future pytables
storage = {} 

In [28]:
# block shape specifies what we'll be saving into the pytable array, 
# here we assume that masks are 2d and images are 3d
block_shape = {} 
block_shape["img"] = np.array((patch_size, patch_size, 3))
block_shape["mask"] = np.array((patch_size, patch_size)) 

In [29]:
block_shape

{'img': array([500, 500,   3]), 'mask': array([500, 500])}

In [30]:
# we can also specify filters, such as compression, to improve storage speed
filters=tables.Filters(complevel=6, complib='zlib') 

In [31]:
phases.keys()

dict_keys(['train', 'val'])

In [45]:
# now for each of the phases, we'll loop through the files
for phase in phases.keys(): 
    print(phase)
    
    totals = np.zeros((2, len(classes))) 
    totals[0, :] = classes               

    # open the respective pytable
    hdf5_file = tables.open_file(f"/Users/ilyarudyak/data/epi/{dataname}_{phase}.pytable", mode='w') 
    
    # create the array for storage
    storage["filename"] = hdf5_file.create_earray(hdf5_file.root, 'filename', filenameAtom, (0,)) 
    
    # for each of the image types, 
    # in this case mask and image, we need to create the associated earray
    for imgtype in imgtypes: 
        storage[imgtype]= hdf5_file.create_earray(hdf5_file.root, imgtype, img_dtype,  
                                                  shape=np.append([0],block_shape[imgtype]), 
                                                  chunkshape=np.append([1],block_shape[imgtype]),
                                                  filters=filters)
    # now for each of the files
    for filei in phases[phase]: 
        fname = files[filei] 
        print(fname)
        
        for imgtype in imgtypes:
            if(imgtype=="img"): #if we're looking at an img, it must be 3 channel, but cv2 won't load it in the correct channel order, so we need to fix that
                io = cv2.cvtColor(cv2.imread("/Users/ilyarudyak/data/epi/images/" + os.path.basename(fname).replace("_mask.png",".tif")), cv2.COLOR_BGR2RGB)
                interp_method=PIL.Image.BICUBIC
                
            else: #if its a mask image, then we only need a single channel (since grayscale 3D images are equal in all channels)
                io = cv2.imread(fname)/255 #the image is loaded as {0,255}, but we'd like to store it as {0,1} since this represents the binary nature of the mask easier
                interp_method = PIL.Image.NEAREST #want to use nearest! otherwise resizing may cause non-existing classes to be produced via interpolation (e.g., ".25")
                
                for i,key in enumerate(classes): #sum the number of pixels, this is done pre-resize, the but proportions don't change which is really what we're after
                    totals[1,i] += sum(sum(io[:,:,0]==key))

            
            io = cv2.resize(io,(0,0),fx=resize,fy=resize, interpolation=interp_method) #resize it as specified above
            io = np.pad(io, [(mirror_pad_size, mirror_pad_size), (mirror_pad_size, mirror_pad_size), (0, 0)], mode="reflect")

            # convert input image into overlapping tiles, size is ntiler x ntilec x 1 x patch_size x patch_size x3
            io_arr_out=sklearn.feature_extraction.image.extract_patches(io,(patch_size,patch_size,3),stride_size)
            
            # resize it into a ntile x patch_size x patch_size x 3
            io_arr_out=io_arr_out.reshape(-1,patch_size,patch_size,3)
            
            
            
            #save the 4D tensor to the table
            if(imgtype=="img"):
                storage[imgtype].append(io_arr_out)
            else:
                storage[imgtype].append(io_arr_out[:,:,:,0].squeeze()) #only need 1 channel for mask data

        storage["filename"].append([fname for x in range(io_arr_out.shape[0])]) #add the filename to the storage array
        
    #lastly, we should store the number of pixels
    npixels=hdf5_file.create_carray(hdf5_file.root, 'numpixels', tables.Atom.from_dtype(totals.dtype), totals.shape)
    npixels[:]=totals
    hdf5_file.close()

train
/Users/ilyarudyak/data/epi/masks/12907_00003_mask.png
/Users/ilyarudyak/data/epi/masks/12929_00017_mask.png
/Users/ilyarudyak/data/epi/masks/12909_00003_mask.png
/Users/ilyarudyak/data/epi/masks/12820_00005_mask.png
/Users/ilyarudyak/data/epi/masks/9043_00045_mask.png
/Users/ilyarudyak/data/epi/masks/12930_00008_mask.png
/Users/ilyarudyak/data/epi/masks/9346_00019_mask.png
/Users/ilyarudyak/data/epi/masks/8975_00017_mask.png
/Users/ilyarudyak/data/epi/masks/9250_00025_mask.png
/Users/ilyarudyak/data/epi/masks/10295_00012_mask.png
/Users/ilyarudyak/data/epi/masks/12826_00003_mask.png
/Users/ilyarudyak/data/epi/masks/8974_00014_mask.png
/Users/ilyarudyak/data/epi/masks/12884_00018_mask.png
/Users/ilyarudyak/data/epi/masks/10260_00022_mask.png
/Users/ilyarudyak/data/epi/masks/12818_00006_mask.png
/Users/ilyarudyak/data/epi/masks/10282_00016_mask.png
/Users/ilyarudyak/data/epi/masks/8951_00005_mask.png
/Users/ilyarudyak/data/epi/masks/10304_00005_mask.png
/Users/ilyarudyak/data/epi/m

useful reference
http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html