In [None]:
'''
Generate training data for use with @jkimmel's fork 
of @vanvalen's DeepCell

DeepCell trains vanilla CNNs on many small 'receptive fields',
each centered on a pixel of interest. The network is trained
to predict the likely class ('feature') of the center pixel, 
based on the information in its surrounding neighborhood.

To train the networks using Keras tools, we first need to build
a numpy .npz archive of these receptive fields, paired with the 
ground truth class they belong to. This notebook outlines that 
process.

To start, you'll need a set of raw images, paired with ground truth
masks labeling the class of each pixel.
A typical data set might be several images of cells, with ground truth
masks labeling cell pixels as 1 and background pixels as 0. 

Filenames for raw images and masks should correspond, with the exception
of a single string identifying the channel or feature the image indicates.

Example directory of training images:

    img_0_DIC.tif
    img_0_feature0.tif
    img_0_feature1.tif
    img_1_DIC.tif
    ...
'''

from __future__ import print_function, division  # for Python 2 compatibility
from trainingData import *

## Parameters describing the dataset

# Directory that holds the training images.
# Features and channels may also live in separate directories; in that case
# just change the paths passed to the subsequent commands.
direc_name = '/path/to/image/directory'
# Substrings in the filenames of the binary masks, one per feature
feature_names = ['feature0', 'feature1']
# Substrings in the filenames of the raw images, one per imaging channel
channel_names = ['DIC']

# Half-width of the receptive field in X & Y, i.e. the number of pixels
# taken on each side (+/-) of the center pixel:
#   receptive_field_dim_size = 2 * window_dim + 1
# e.g. window_x = 40 describes a receptive field of 81 in the X dimension
#
# DeepCell training networks are specific to the receptive field size.
# Unless you want to design your own networks, use receptive fields sized
# 41x41, 61x61, or 81x81.
# Bigger fields generally increase accuracy, but take longer to train
# and may decrease fine-grained spatial resolution.
window_x = window_y = 40  # 81x81 receptive field

# Maximum number of images to derive training examples from.
# Training data generation is memory intensive;
# reduce this number if you run into memory issues.
max_direcs = 50
# Maximum number of receptive fields to train on,
# with examples taken from the number of images specified above.
# ~1 million is a good starting point.
max_training_examples = int(1e6)

# Path where the resulting .npz archive of training data is saved
file_name_save = '/path/to/save.npz'

In [None]:
'''
Read the raw channel images into a numpy array, preprocessing them
on the way in

By default, preprocessing subtracts the median from each image and
then smooths it with an averaging kernel.

Additional preprocessing options are described in the trainingData.py
source.
'''
channels = load_channel_imgs(
    direc_name,
    channel_names,
    window_x,
    window_y,
    max_direcs,
)

In [None]:
'''
Read the ground truth feature masks into a numpy array
'''
feature_mask = load_feature_masks(
    direc_name,
    feature_names,
    window_x,
    window_y,
    max_direcs,
)

In [None]:
'''
Find the feature with the fewest available examples

The result is used downstream for class balancing.
'''
min_num = determine_min_examples(
    feature_mask,
    window_x,
    window_y,
)

In [None]:
'''
Build the 4 x N feature matrix of training pixels

Each column is a different example pixel; each row holds a different
piece of information about it.

Layout:
    rows -- row indices for pixels to train on.
    cols -- col indices for pixels to train on.
    batch -- batch indices for pixels to train on. Only necessary if
                subdirectories in the training directory are utilized.
    labels -- ground truth class labels for pixels to train on.

NOTE: This step is memory intensive.
'''

feature_matrix = identify_training_pixels(
    feature_mask,
    min_num,
    window_x,
    window_y,
    max_training_examples,
)

In [None]:
'''
Write the generated training data out as a .npz archive
'''
save_training_data(
    file_name_save,
    channels,
    feature_matrix,
    window_x,
    window_y,
)