# Separate the scans

We scanned multiple fish in [a special sample holder](https://github.com/TomoGraphics/Hol3Drs/blob/master/STL/Stickleback.Multiple.stl).
This notebook is used to separate them into different sets of reconstructions, one for each fish.

The cells below are used to set up the whole notebook.
They load needed libraries and set some default values.

In [None]:
# Load the modules we need
import platform
import os
import glob
import pandas
import imageio
import numpy
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_scalebar.scalebar import ScaleBar
import seaborn
import dask
import dask_image.imread
from dask.distributed import Client, LocalCluster
import skimage
from tqdm.auto import tqdm, trange

In [None]:
# Load our own log file parsing code
# This is loaded as a submodule to alleviate excessive copy-pasting between *all* projects we do
# See https://github.com/habi/BrukerSkyScanLogfileRuminator for details on its inner workings
from BrukerSkyScanLogfileRuminator.parsing_functions import *

In [None]:
# Point dask at a folder for its temporary files
# This has to happen before a client is created: https://stackoverflow.com/a/62804525/323100
# Where it is available we use the fast internal SSD, for speed reasons
import tempfile
if 'Linux' in platform.system():
    # Use the FastSSD if it is mounted, otherwise fall back to the standard temporary directory
    tmp = (os.path.join(os.sep, 'media', 'habi', 'Fast_SSD', 'tmp')
           if os.path.exists(os.path.join(os.sep, 'media', 'habi', 'Fast_SSD'))
           else tempfile.gettempdir())
elif 'Darwin' in platform.system():
    tmp = tempfile.gettempdir()
elif 'anaklin' in platform.node():
    # Neither Linux nor macOS, and the node name contains 'anaklin'
    tmp = os.path.join('F:\\tmp')
else:
    tmp = os.path.join('D:\\tmp')
dask.config.set({'temporary_directory': tmp})
print('Dask temporary files go to %s' % dask.config.get('temporary_directory'))

In [None]:
# Start a dask.distributed client with its default settings
# NOTE: `Client` is already imported (together with `LocalCluster`) in the first import cell
from dask.distributed import Client
client = Client()

In [None]:
client

In [None]:
seaborn.set_context('notebook')

In [None]:
# Set up figure defaults
plt.rc('image', cmap='gray', interpolation='nearest')  # Display all images in b&w and with 'nearest' interpolation
# plt.rcParams['figure.figsize'] = (16, 9)  # Size up figures a bit
plt.rcParams['figure.dpi'] = 200

In [None]:
# Setup scale bar defaults
plt.rcParams['scalebar.location'] = 'lower right'
plt.rcParams['scalebar.frameon'] = False
plt.rcParams['scalebar.color'] = 'white'

In [None]:
# Display all plots identically
lines = 3
# And then do something like
# plt.subplot(lines, int(numpy.ceil(len(Data) / float(lines))), c + 1)

Since the (tomographic) data can reside on different drives, we set the folder to use below.

In [None]:
# The data lives in different locations, depending on the operating system we are running on
# Set FastSSD to False to use the alternative location of the respective system
FastSSD = True
if 'Linux' in platform.system():
    if FastSSD:
        BasePath = os.path.join(os.sep, 'media', 'habi', 'Fast_SSD')
    else:
        BasePath = os.path.join(os.path.sep, 'home', 'habi', '2214')
elif 'Windows' in platform.system():
    if FastSSD:
        BasePath = os.path.join('F:\\')
    else:
        BasePath = os.path.join('N:\\')
else:
    # No location is known for any other system (e.g. macOS).
    # Fail with a clear message instead of a NameError on `BasePath` in the line below.
    raise NotImplementedError('No data location is configured for %s' % platform.system())
Root = os.path.join(BasePath, 'IEE Stickleback')
print('We are loading all the data from %s' % Root)

Now that we are set up, actually start to load/ingest the data.

In [None]:
# Make us a dataframe for saving all that we need
Data = pandas.DataFrame()

In [None]:
# Collect *all* log files found anywhere below Root
# os.walk is way faster than a recursive glob.glob, see DataWrangling.ipynb for details
# The found log files are deliberately left unsorted, which makes this quicker still
Data['LogFile'] = [os.path.join(folder, filename)
                   for folder, _, filenames in os.walk(Root)
                   for filename in filenames
                   if filename.endswith('.log')]

In [None]:
# Get all folders
Data['Folder'] = [os.path.dirname(f) for f in Data['LogFile']]

In [None]:
# Show a (small) sampler of the loaded data as a first check
Data.sample(n=5)

In [None]:
# Get rid of all the log files that might be on disk but that we don't want to load the data from
# We build a boolean mask and select with it, instead of dropping rows from `Data`
# while we are iterating over it.
# A row is kept only if
# - its folder contains 'Bucket' (this notebook only deals with the 'BucketOfFish' scans),
# - its folder contains 'rec' (only look at logs in any rec folder),
# - its folder does not contain '_regions' (log files that we write in this notebook),
# - its folder does not contain 'SubScan' (log files from rsyncing temporary data),
# - its log file name does not contain 'rectmp.log' (log files from rsyncing temporary data).
# NOTE: a filter on folders containing '_D' was only applied temporarily and is not active.
keep = numpy.array([('Bucket' in folder
                     and 'rec' in folder
                     and '_regions' not in folder
                     and 'SubScan' not in folder
                     and 'rectmp.log' not in logfile)
                    for folder, logfile in zip(Data['Folder'], Data['LogFile'])],
                   dtype=bool)
# Select the wanted rows and reset the index, so the dataframe looks like
# what we would get if we had only loaded the 'rec' files
Data = Data.loc[keep].reset_index(drop=True)

In [None]:
# Generate us some meaningful columns in the dataframe
# 'Sample' is the name of the log file without '_rec.log',
# 'Scan' is the name of the folder the log file is in
Data['Sample'] = [os.path.basename(logfile).replace('_rec.log', '') for logfile in Data['LogFile']]
Data['Scan'] = [os.path.basename(folder) for folder in map(os.path.dirname, Data['LogFile'])]

In [None]:
# Quickly show the data from the last loaded scans
Data.tail()

In [None]:
# Collect the (sorted) file names of all the reconstructions of each scan
Data['Filenames Reconstructions'] = [sorted(glob.glob(os.path.join(folder, '*rec0*.png')))
                                     for folder in Data['Folder']]
# Count the reconstructions we found for each scan
Data['Number of reconstructions'] = list(map(len, Data['Filenames Reconstructions']))

In [None]:
# Drop samples which have either not been reconstructed yet or of which we deleted the reconstructions,
# i.e. every scan for which no reconstruction file was found
Data = Data[Data['Number of reconstructions'] > 0]
# Reset the dataframe count/index for easier indexing afterwards
Data.reset_index(drop=True, inplace=True)
print('We have %s folders with reconstructions' % (len(Data)))

In [None]:
# Read the parameters we want to double-check from the log files
# Each column is filled by calling the parsing function next to it on every log file
for column, parser in (('Voxelsize', pixelsize),
                       ('Filter', whichfilter),
                       ('Exposuretime', exposuretime),
                       ('Scanner', scanner),
                       ('Averaging', averaging),
                       ('ProjectionSize', projection_size),
                       ('RotationStep', rotationstep),
                       ('Grayvalue', reconstruction_grayvalue),
                       ('RingartefactCorrection', ringremoval),
                       ('BeamHardeningCorrection', beamhardening),
                       ('DefectPixelMasking', defectpixelmasking),
                       ('Scan date', scandate)):
    Data[column] = [parser(log) for log in Data['LogFile']]

In [None]:
# Load all reconstructions into ephemeral DASK arrays, with a nice progress bar...
# We use the same '*rec0*.png' pattern as for Data['Filenames Reconstructions'], so that
# slice i of a stack always belongs to file name i of that scan. A broader pattern could
# pick up additional PNG files in the folder and shift the slices against the file names.
# The list is filled by position, so it does not depend on the index labels of `Data`.
Reconstructions = [dask_image.imread.imread(os.path.join(folder, '*rec0*.png'))
                   for folder in tqdm(Data['Folder'],
                                      desc='Loading reconstructions',
                                      total=len(Data))]

In [None]:
# Extract bucket name
Data['Bucket'] = [(l).split(os.sep)[-3].split('_')[-1] for l in Data['LogFile']]

In [None]:
# The three cardinal directions, one per axis of the reconstruction stacks
# The names are adapted to fishes: https://en.wikipedia.org/wiki/Fish_anatomy#Body
directions = ['Anteroposterior', 'Lateral', 'Dorsoventral']

In [None]:
# Read or calculate the directional MIPs, put them into the dataframe and save them to disk
# First create one (empty) column per direction, the cells are filled one by one below
for d, direction in enumerate(directions):
    Data['MIP_' + direction] = ''
for c, row in tqdm(Data.iterrows(), desc='Working on MIPs', total=len(Data)):
    for d, direction in tqdm(enumerate(directions),
                             desc='%s/%s' % (row['Sample'], row['Scan']),
                             leave=False,
                             total=len(directions)):
        # The MIP images are written to the parent folder of the folder with the reconstructions
        outfilepath = os.path.join(os.path.dirname(row['Folder']),
                                   '%s.%s.MIP.%s.png' % (row['Sample'], row['Scan'], direction))
        if not os.path.exists(outfilepath):
            # Generate MIP
            # drop last axis, since dask.imread insists on reading reconstructions PNGs as RGB
            # The position `d` of the direction in `directions` is the stack axis we project along
            mip = Reconstructions[c][:,:,:,0].max(axis=d).compute()
            imageio.imwrite(outfilepath, mip.astype('uint8'))
        # Always read the MIP back in from disk, whether it was just written or already existed
        Data.at[c, 'MIP_' + direction] = dask_image.imread.imread(outfilepath).squeeze()

In [None]:
def getLargestCC(segmentation):
    '''
    Return a boolean mask of the largest connected component of `segmentation`.
    Based on https://stackoverflow.com/a/55110923

    Raises a ValueError if `segmentation` does not contain any connected component.
    '''
    labels = skimage.measure.label(segmentation)
    # Check explicitly instead of with `assert`, which is skipped when Python runs with -O
    if labels.max() == 0:
        raise ValueError('No connected component found in the segmentation')
    # Entry 0 of the bincount is the background: skip it and shift the position back to the label value
    component_sizes = numpy.bincount(labels.flat)[1:]
    largestCC = labels == numpy.argmax(component_sizes) + 1
    return largestCC

In [None]:
def vial_label_extractor(whichscan, threshold=33, part=333, verbose=True):
    '''
    Extract the vial labels (the numbers) of one scan as a binary image.

    The first `part` reconstructions of the scan are collapsed into a MIP, which is
    thresholded, stripped of its largest connected component and cleaned of small objects.

    Parameters:
    whichscan: position of the scan in `Reconstructions` and `Data`
    threshold: gray value above which a pixel counts as foreground.
        For any falsy value (e.g. None or 0) the highest multi-Otsu threshold is used instead.
    part: number of reconstructions, counted from the start of the stack, that go into the MIP
    verbose: if True, plot an overview figure, save it next to the MIPs and show it

    Returns the cleaned boolean image.
    '''
    # Let's get out the numbers, they are 'hidden' in the lower part
    # ([:,:,0] drops the RGB axis of the reconstructions)
    bottom_mip = Reconstructions[whichscan][:part].max(axis=0)[:,:,0].compute()
    if not threshold:
        # Calculate multi Otsu with three classes, use highest threshold
        threshold = skimage.filters.threshold_multiotsu(bottom_mip)[-1]
    # remove largest component from thresholded bottom MIP
    # The largest component are the separation walls of the bucket
    numbers = numpy.bitwise_xor(bottom_mip>threshold, getLargestCC(bottom_mip>threshold))
    # Clean up the image by removing small objects
    numbers_cleaned = skimage.morphology.remove_small_objects(numbers, min_size=10000)
    # only labels should remain
    if verbose:
        # Left: dorsoventral MIP with a rectangle marking the `part` reconstructions we used
        plt.subplot(121)
        plt.imshow(Data['MIP_Dorsoventral'][whichscan])
        plt.gca().add_artist(ScaleBar(Data['Voxelsize'][whichscan], 'um'))
        plt.gca().add_artist(matplotlib.patches.Rectangle((0, 0), Data['MIP_Dorsoventral'][whichscan].shape[1], part, alpha=0.618))
        plt.title('Bucket %s' % Data['Bucket'][whichscan])
        plt.axis('off')
        # Right: anteroposterior MIP with the extracted labels overlaid
        plt.subplot(122)
        plt.imshow(Data['MIP_Anteroposterior'][whichscan])
        plt.imshow(numpy.ma.masked_equal(numbers_cleaned, 0), cmap='viridis_r', alpha=0.618)
        plt.title('MIP of marked region\n%s recs >%s - their largest CC' % (part, threshold))
        plt.gca().add_artist(ScaleBar(Data['Voxelsize'][whichscan], 'um'))
        plt.axis('off')
        plt.savefig('%s.%s.Labels.Extracted.png' % (os.path.join(os.path.dirname(Data['Folder'][whichscan]), Data.Sample[whichscan]), Data.Scan[whichscan]))
        plt.show()
    return(numbers_cleaned)

In [None]:
Data['VialLabels'] = [vial_label_extractor(i, verbose=True) for i in range(len(Data))]

In [None]:
def detect_fish_position(whichscan, threshold=None, verbose=False):
    '''
    Detect the fish in one scan as connected regions in its anteroposterior MIP.

    Parameters:
    whichscan: position of the scan in `Data`
    threshold: gray value above which a pixel counts as fish.
        For any falsy value (e.g. None or 0) the Otsu threshold of all MIP pixels brighter than 10 is used.
    verbose: if True, plot the MIP with the vial labels next to the detected regions,
        save the figure next to the MIPs and show it

    Returns the list of region properties (from skimage.measure.regionprops) of the detected regions.
    '''
    import matplotlib.patches
    # Detect the fish positions based on blobs in the top-down MIP
    regions = None
    td_mip = Data['MIP_Anteroposterior'][whichscan].compute()
    if not threshold:
        threshold = skimage.filters.threshold_otsu(td_mip[td_mip>10])
    td_mip_thresholded = td_mip>threshold
    # Remove central part, on some scans the connector shows up...
    # (a square of 2 * region_radius pixels around the image center is set to background)
    region_radius=200
    td_mip_thresholded[td_mip_thresholded.shape[0]//2-region_radius:td_mip_thresholded.shape[0]//2+region_radius,
                       td_mip_thresholded.shape[1]//2-region_radius:td_mip_thresholded.shape[1]//2+region_radius] = 0
    # Clean speckles, assuming all fish are larger than 5000 px
    cleaned = skimage.morphology.remove_small_objects(td_mip_thresholded,
                                                      min_size=5000)
    # Remove central part, on some scans the connector shows up...
    # This second pass uses a larger square and is applied to the cleaned image
    region_radius=275
    cleaned[cleaned.shape[0]//2-region_radius:cleaned.shape[0]//2+region_radius,cleaned.shape[1]//2-region_radius:cleaned.shape[1]//2+region_radius] = 0
    # Label image
    label_image = skimage.measure.label(cleaned)
    # Detect regions
    regions = skimage.measure.regionprops(label_image)
    if verbose:
        # Left: the MIP with the extracted vial labels overlaid
        plt.subplot(121)
        plt.imshow(td_mip)
        plt.imshow(numpy.ma.masked_equal(Data['VialLabels'][whichscan], 0), cmap='viridis_r', alpha=0.618)        
        plt.title('Bucket %s' % Data['Bucket'][whichscan])
        plt.gca().add_artist(ScaleBar(Data['Voxelsize'][whichscan], 'um'))
        plt.axis('off')
        # Right: the detected regions, colored and annotated with their label
        plt.subplot(122)
        # to make the background transparent, pass the value of `bg_label`,
        # and leave `bg_color` as `None` and `kind` as `overlay`   
        plt.imshow(skimage.color.label2rgb(label_image, image=td_mip, bg_label=0))
        for c, region in enumerate(regions):
            # draw rectangle around segmented fish
            minr, minc, maxr, maxc = region.bbox
            rect = matplotlib.patches.Rectangle((minc, minr), maxc - minc, maxr - minr,
                                      fill=False, edgecolor='white', ls='--')
            # plt.scatter(region.centroid[1], region.centroid[0])
            plt.gca().add_patch(rect)
            # Write the label of the region centered above its rectangle
            plt.annotate('%s' % region.label, xy=((minc + maxc) / 2, minr - 15), color='white', ha='center')
        plt.title('Detected fish')
        plt.gca().add_artist(ScaleBar(Data['Voxelsize'][whichscan], 'um'))
        plt.axis('off')
        plt.savefig('%s.%s.Labels.Detected.png' % (os.path.join(os.path.dirname(Data['Folder'][whichscan]), Data.Sample[whichscan]), Data.Scan[whichscan]))        
        plt.show()
    return(regions)

In [None]:
# Calculate the region properties of the single fish
Data['Regions'] = [detect_fish_position(i, verbose=True) for i in range(len(Data))]

In [None]:
def reorder_list(list, neworder=(0, 2, 4, 5, 3, 1)):
    '''
    Shuffle a list to a defined order
    We *deliberately* want a new list, so we can keep the old one around for double-checks
    https://stackoverflow.com/questions/2177590/how-can-i-reorder-a-list#comment106984501_2177607

    Parameters:
    list: the sequence to reorder, it is left untouched
        (the name shadows the builtin, but is kept so that existing calls continue to work)
    neworder: positions in `list`, in the order in which the items should appear in the result.
        The default is an (immutable) tuple, so it cannot be changed by accident between calls.

    Returns a new list with one item per entry of `neworder`.
    Raises an IndexError if `neworder` contains a position that `list` does not have.
    '''
    return [list[position] for position in neworder]

In [None]:
# We had a *very* hard time to correctly shuffle the list
# If we print out both the label from the image and the shuffled list, it's easier to check :)
print('X: from image', [i+1 for i in range(6)])
# For most buckets
print('Y: from label', reorder_list([i+1 for i in range(6)]))
# For bucket C
print('C: from label', reorder_list([i+1 for i in range(6)], neworder = [0, 2, 3, 5, 4, 1]))
# For bucket D
print('D: from label', reorder_list([i+1 for i in range(6)], neworder = [0, 2, 3, 5, 4, 1]))
# For bucket E
print('E: from label', reorder_list([i+1 for i in range(6)], neworder = [1, 3, 5, 4, 2, 0]))

In [None]:
# Reorder regions
# Since we always reconstructed vial 1 up north, the order is consistent (most of the time)
# We can thus just reshuffle *all* the detected regions for writing them out correctly afterwards
Data['Regions_Ordered'] = [reorder_list(regions) for regions in Data['Regions']]
# WARNING the region.labels now do NOT correspond to the correct label anymore :)

In [None]:
# If the reshuffling above does not work for one scan, we can reorder it deliberately again.
# We write with `Data.at[row, column]`: assigning through `Data[column][row]` is a chained
# assignment, which pandas does not guarantee to write back into the dataframe.
# NOTE(review): the rows are addressed by their position in `Data`, which depends on the
# (unsorted) order in which the log files were found - TODO confirm that rows 2, 3 and 4
# really are buckets C, D and E
# Bucket C: label 4 and 5 are swapped, due to one having a bigger fish than the others
Data.at[2, 'Regions_Ordered'] = reorder_list(Data.at[2, 'Regions'], neworder=[0, 2, 3, 5, 4, 1])
# Bucket D: label 4 and 5 are swapped, due to one having a bigger fish than the others
Data.at[3, 'Regions_Ordered'] = reorder_list(Data.at[3, 'Regions'], neworder=[0, 2, 3, 5, 4, 1])
# Bucket E: we (temporarily) reconstructed 6 north
Data.at[4, 'Regions_Ordered'] = reorder_list(Data.at[4, 'Regions'], neworder=[1, 3, 5, 4, 2, 0])

In [None]:
# Construct us a fish number
# The numbers run on from bucket to bucket: the label of each region is offset by the
# number of regions in the bucket times the position `c` of the bucket in `Data`, minus one.
# NOTE(review): because of the `- 1`, the first bucket is numbered from 0. This is presumably
# deliberate, since the numbers of the first two buckets are corrected by hand in the next cell - TODO confirm
# NOTE(review): this iterates `Data.Regions`, i.e. the regions in their detected (not reordered) order
Data['FishNumber'] = [[reg.label + (len(region) * c - 1) for reg in region] for c, region in enumerate(Data.Regions)]

In [None]:
# Overwrite blunders in first two batches with their correct numbers
# We build the corrected lists first and write them back with `Data.at[row, column]`:
# assigning through `Data[column][row]` is a chained assignment, which pandas does not
# guarantee to write back into the dataframe.
# Bucket A: numbered from 1 on, except for the third entry, which is number 30
bucket_a_numbers = list(range(1, len(Data.at[0, 'FishNumber']) + 1))
bucket_a_numbers[2] = 30
Data.at[0, 'FishNumber'] = bucket_a_numbers
# Bucket B: the first entry is number 3, all others keep their calculated number
bucket_b_numbers = list(Data.at[1, 'FishNumber'])
bucket_b_numbers[0] = 3
Data.at[1, 'FishNumber'] = bucket_b_numbers

In [None]:
# Construct us a fish ID
# This should correspond to the fish ID in Bens XLS sheet
Data['FishID'] = [['FG.X23.%03d' % n for n in number] for number in Data.FishNumber]

In [None]:
def doublecheck_fish_position(whichscan):
    '''
    Plot the detected and the reordered regions of one scan side by side, for visual double-checking.

    Left: the anteroposterior MIP with the vial labels, each region of Data['Regions']
    annotated with its position (counted from 1) in that list.
    Right: the same image, each region of Data['Regions_Ordered'] annotated with its
    position (counted from 1) and the fish ID at the same position in Data['FishID'].
    The figure is saved next to the MIPs and shown.

    Parameters:
    whichscan: position of the scan in `Data`

    Returns an empty tuple.
    '''
    plt.subplot(121)
    plt.imshow(Data['MIP_Anteroposterior'][whichscan])
    plt.imshow(numpy.ma.masked_equal(Data['VialLabels'][whichscan], False), cmap='viridis_r')
    for c, region in enumerate(Data['Regions'][whichscan]):
        # The annotation is shifted 100 px to the right of the centroid
        plt.annotate('%s' % str(c+1),
                     xy=(region.centroid[1] + 100, region.centroid[0]),
                     color='black',
                     va='center',
                     bbox=dict(fc="white", alpha=0.618))
    plt.title('Bucket %s, calculated labels' % Data['Bucket'][whichscan])
    plt.gca().add_artist(ScaleBar(Data['Voxelsize'][whichscan], 'um'))
    plt.axis('off')
    plt.subplot(122)
    plt.imshow(Data['MIP_Anteroposterior'][whichscan])
    plt.imshow(numpy.ma.masked_equal(Data['VialLabels'][whichscan], False), cmap='viridis_r')
    for c, region in enumerate(Data['Regions_Ordered'][whichscan]):
        plt.annotate('%s:%s' % (c+1, Data['FishID'][whichscan][c]),                     
                     xy=(region.centroid[1], region.centroid[0]),
                     color='black',
                     fontsize=8,
                     va='center',
                     bbox=dict(fc="white", alpha=0.618))
    plt.title('Resorted label:mapped IDs')
    plt.gca().add_artist(ScaleBar(Data['Voxelsize'][whichscan], 'um'))
    plt.axis('off')
    plt.savefig('%s.%s.Labels.Sorted.png' % (os.path.join(os.path.dirname(Data['Folder'][whichscan]), Data.Sample[whichscan]), Data.Scan[whichscan]))
    plt.show()
    return()

In [None]:
for i in range(len(Data)):
    doublecheck_fish_position(i)

In [None]:
def regionextractor(whichscan, buffer=50, verbose=True):
    '''
    Write each region of one scan to its own .zarr file and optionally plot it.

    For every region in Data['Regions_Ordered'][whichscan], the bounding box grown by
    `buffer` pixels on each side is cropped from all reconstructions of the scan and
    written to '<Folder>_regions/region_<position>_<FishID>.zarr', unless that file already exists.

    Parameters:
    whichscan: position of the scan in `Reconstructions` and `Data`
    buffer: number of pixels added on each side of the bounding box of a region
    verbose: if True, show the cropped MIP (with the vial labels) of each region above a
        freshly calculated MIP of the written region, all in one figure, and save that figure.
        The figure layout has room for six regions.

    Returns an empty tuple.
    '''
    regionfolder = Data.Folder[whichscan] + '_regions'
    os.makedirs(regionfolder, exist_ok=True)
    ordered_regions = Data['Regions_Ordered'][whichscan]
    for c, region in tqdm(enumerate(ordered_regions),
                          total=len(ordered_regions),
                          desc='Extracting and visualizing regions'):
        outputfilename = os.path.join(regionfolder, 'region_%s_%s.zarr' % (str(c+1),
                                                                           Data['FishID'][whichscan][c]))
        # The crop window is the bounding box of the region, grown by the buffer.
        # The start is clamped to 0: for a region closer than `buffer` to the top or left edge
        # it would be negative, which slicing counts from the *end* of the axis.
        # An end beyond the image needs no clamping, slicing simply stops at the edge.
        minr, minc, maxr, maxc = region.bbox
        top, left = max(minr - buffer, 0), max(minc - buffer, 0)
        bottom, right = maxr + buffer, maxc + buffer
        if not os.path.exists(outputfilename):
            # Crop current region out of reconstructions stack, drop RGB axis and rechunk, making for even more efficient access
            currentregion = Reconstructions[whichscan][:, top:bottom, left:right][:,:,:,0].rechunk('auto')
            print('Writing to %s. This takes a while.' % outputfilename[len(Root)+1:])
            dask.array.to_zarr(currentregion, outputfilename)
        if verbose:
            # Read written file back in, so we can profit from the rechunking
            currentregion = dask.array.from_zarr(outputfilename)
            # Top row: the same window cut out of the MIP of the full scan, with the vial labels
            plt.subplot(2,6,c+1)
            plt.imshow(Data['MIP_Anteroposterior'][whichscan][top:bottom, left:right])
            plt.imshow(numpy.ma.masked_equal(Data['VialLabels'][whichscan][top:bottom, left:right],
                                             False),
                       cmap='viridis_r')
            plt.gca().add_artist(ScaleBar(Data['Voxelsize'][whichscan], 'um'))
            plt.title('%s:Cut original' % str(c + 1))
            plt.axis('off')
            # Bottom row, directly below
            plt.subplot(2,6,c+1+6)
            # Recalculate MIP for double-checking
            plt.imshow(currentregion.max(axis=0))
            plt.gca().add_artist(ScaleBar(Data['Voxelsize'][whichscan], 'um'))
            plt.title(Data['FishID'][whichscan][c])
            plt.axis('off')
    if verbose:
        plt.savefig('%s.%s.Regions.png' % (os.path.join(os.path.dirname(Data['Folder'][whichscan] + '_regions'), Data.Sample[whichscan]), Data.Scan[whichscan]))
        plt.show()
    return()

In [None]:
# Explicitly state the buffer, we want it later for adding the crop region to the regional log files
buffer = 50
for i in range(len(Data)):
    regionextractor(i, buffer=buffer, verbose=True)

In [None]:
# Load overview and labbook image, if present
def _load_image_or_noise(filename):
    '''
    Load `filename` (lazily) if it exists, otherwise return a random 256 x 256 placeholder image.
    '''
    if os.path.exists(filename):
        return dask_image.imread.imread(filename).squeeze()
    return numpy.random.random((2**8, 2**8))


# Both images are looked for in the parent folder of the folder with the reconstructions.
# A missing labbook image is handled just like a missing overview image,
# instead of stopping the notebook when trying to read it.
Data['LabbookImage'] = [_load_image_or_noise(os.path.join(os.path.dirname(f), '_labbook.jpg'))
                        for f in Data['Folder']]
Data['OverviewImage'] = [_load_image_or_noise(os.path.join(os.path.dirname(f), '_overview.jpg'))
                         for f in Data['Folder']]

In [None]:
# Show all information we have, to double-check the mapping
# One figure per scan: labbook image, overview image and the MIP with the mapped fish IDs
for c, row in Data.iterrows():
    plt.subplot(131)
    plt.imshow(row.LabbookImage)
    plt.title('Bucket %s: Labbook' % row.Bucket)
    plt.axis('off')
    plt.subplot(132)
    plt.imshow(row.OverviewImage)
    plt.title('Bucket %s: Tubes' % row.Bucket)
    plt.axis('off')
    plt.subplot(133)
    plt.imshow(Data['MIP_Anteroposterior'][c])
    plt.imshow(numpy.ma.masked_equal(Data['VialLabels'][c], False), cmap='viridis_r')
    # Annotate each reordered region with its position (counted from 1) and the fish ID at that position
    for d, region in enumerate(Data['Regions_Ordered'][c]):
        plt.annotate('%s:%s' % (d+1, Data['FishID'][c][d]),                     
                     xy=(region.centroid[1], region.centroid[0]),
                     color='black',
                     ha='center',
                     va='center',
                     fontsize=8,
                     bbox=dict(fc="white", alpha=0.618))
    plt.title('Bucket %s: IDs' % row.Bucket)
    plt.gca().add_artist(ScaleBar(Data['Voxelsize'][c], 'um'))
    plt.axis('off')
    # The figure is saved to the parent folder of the folder with the reconstructions
    plt.savefig('%s.%s.Mapping.png' % (os.path.join(os.path.dirname(row['Folder']), row.Sample), row.Scan))
    plt.show()

In [None]:
# Read in regions again
# glob.glob returns the files in arbitrary order, but below the regions of a scan are matched
# to the fish IDs by their position in this list. We thus sort the file names, which all start
# with 'region_<position>_' (this puts them in the right order for up to nine regions per scan).
RegionZarrFiles = [sorted(glob.glob(os.path.join(folder + '_regions', '*.zarr'))) for folder in Data['Folder']]
Regions = [[dask.array.from_zarr(f) for f in files] for files in RegionZarrFiles]

In [None]:
# We want to generate log files for the cutout regions
# Aeons ago, we wrote a little wrapper function to log stuff at TOMCAT
# https://github.com/habi/TOMCAT/blob/master/postscan/StackedScanOverlapFinder.py#L104
# The function below is slightly tweaked from there
def myLogger(logfilename, verbose=False):
    '''
    Return a logger that writes messages of level INFO and above to `logfilename`.

    The file is opened in 'w' mode, so an existing file is overwritten.
    Calling the function again for the same file gives a logger that writes
    each message exactly once, to a freshly started file.

    Parameters:
    logfilename: path of the log file, also used as the name of the logger
    verbose: if True, print the name of the log file
    '''
    import logging
    logger = logging.getLogger(logfilename)
    logger.setLevel(logging.INFO)
    # logging.getLogger hands out the *same* logger for the same name. Remove and close
    # the handlers of earlier calls, otherwise every message is written once per call
    # and the file handles of the old handlers stay open.
    for oldhandler in list(logger.handlers):
        logger.removeHandler(oldhandler)
        oldhandler.close()
    handler = logging.FileHandler(logfilename, 'w')
    logger.addHandler(handler)
    if verbose:
        print('Logging to %s' % logfilename)
    return logger
# Then write to the file with
# logfile = myLogger(Filename)
# logfile.info('Put this into the log file')

In [None]:
# Save out a log file for each region
# We iterate over the *reordered* regions: vial number and fish ID are given by the position
# in 'Regions_Ordered', which is also how the regions are paired with the fish IDs when
# they are extracted. The originally detected order would log the centroid and bounding box
# of a different region for every vial that was moved by the reordering.
for c, row in tqdm(Data.iterrows(), total=len(Data), desc='Writing log files for regions'):
    for d, region in tqdm(enumerate(row.Regions_Ordered),
                          total=len(row.Regions_Ordered),
                          desc=Data.Folder[c][len(Root)+1:]):
        # Generate output directory
        outputdir = os.path.join(row.Folder + '_regions', row['FishID'][d])
        os.makedirs(outputdir, exist_ok=True)
        # Generate logfile name
        logfilename = os.path.join(outputdir, row['FishID'][d] + '.log')
        # An already existing log file does not need to be deleted first,
        # myLogger opens the file in 'w' mode and thus starts it afresh
        logfile = myLogger(logfilename)
        logfile.info('Scan = %s' % os.path.join(row.Sample, row.Scan))
        logfile.info('Voxel size = %s um' % row.Voxelsize)
        logfile.info('ID = %s' % row['FishID'][d])
        logfile.info('Vial = %s' % str(d + 1))
        logfile.info('Centroid (x,y) in the original stack = %s, %s' % (int(round(region.centroid[1])), int(round(region.centroid[0]))))
        # `buffer` is the value the regions were extracted with.
        # The start of the box cannot lie before the first pixel, so it is clamped to 0
        logfile.info('Bounding box (x1:x2, y1:y2) of this region in the original stack = %s:%s, %s:%s' % (max(region.bbox[1]-buffer, 0), region.bbox[3]+buffer,
                                                                                                          max(region.bbox[0]-buffer, 0), region.bbox[2]+buffer))
        # We are done with this file: detach and close its handlers, so the file is released
        for handler in list(logfile.handlers):
            logfile.removeHandler(handler)
            handler.close()

In [None]:
from joblib import Parallel, delayed
def imsaver(image, filename):
    '''
    Function for parallelizing writing out images

    Write `image` to `filename` as 8 bit image, unless the file already exists
    or the mean of the image is zero (e.g. an image of only zeros).

    Parameters:
    image: the image to write, anything that numpy.asarray can turn into an array
    filename: path of the image file to write
    '''
    if os.path.exists(filename):  # only do something if there's no image on disk
        return
    # Turn the image into a numpy array *once*. A lazy (dask) array would otherwise be
    # evaluated twice, first for the mean and then again for writing it out.
    image = numpy.asarray(image)
    if image.mean():  # only write something if there's really an image
        imageio.imwrite(filename, image.astype('uint8'))

In [None]:
# Save out PNG slices
for c, row in tqdm(Data.iterrows(),
                   total=len(Data),
                   desc='Saving out PNGs for each region of each bucket'):
    for d, zarrfile in tqdm(enumerate(Regions[c]),
                            total=len(Regions[c]),
                            desc=Data.Folder[c][len(Root)+1:]):
        # Make output directory
        outputdir = os.path.join(row.Folder + '_regions', row['FishID'][d])
        os.makedirs(outputdir, exist_ok=True)
        # Name the slices like the original reconstructions, with the sample name swapped for the fish ID.
        # The swap is done in the *file name only*: done on the full path, it would also change the
        # output directory whenever the sample name is part of the path to the scan.
        outputfilenames = [os.path.join(outputdir,
                                        os.path.basename(fn).replace(row.Sample, row['FishID'][d]))
                           for fn in row['Filenames Reconstructions']]
        parallelize = True
        if parallelize:
            # Hat tip to Oleksiy for providing a snippet to parallelize the PNG writing
            # It is paramount that the filenames are sorted though!
            Parallel(n_jobs=-1)(delayed(imsaver)(zarrfile[slicenumber], outputfilename)
                                for slicenumber, outputfilename in enumerate(outputfilenames))
        else:
            # `slicenumber` instead of `slice`, to not shadow the Python builtin for the rest of the notebook
            for slicenumber, outputfilename in tqdm(enumerate(outputfilenames),
                                                    total=len(outputfilenames),
                                                    desc='%s' % os.path.splitext(RegionZarrFiles[c][d])[0][len(Root)+1:],
                                                    leave=False):
                if not os.path.exists(outputfilename):
                    imageio.imwrite(outputfilename, zarrfile[slicenumber].astype('uint8'))

In [None]:
def thresholder(stack, discard=5, verbose=False):
    '''
    Calculate the gray value threshold which is meant to select *only* the bones in `stack`.

    All voxels with a gray value of `discard` or less are ignored. The remaining voxels
    are split into four classes with a multi-Otsu threshold, which gives three threshold
    values. The middle one of those three values is returned.

    If `verbose` is True, the (logarithmic) histogram of the whole stack is shown,
    together with `discard` and all three threshold values.
    '''
    relevant_voxels = stack[stack>discard].compute()
    thresholds = skimage.filters.threshold_multiotsu(relevant_voxels, classes=4)
    if verbose:
        counts, _ = dask.array.histogram(stack, bins=2**8, range=[0, 2**8])
        plt.semilogy(counts)
        plt.axvline(discard, label='completely discarded, below %s' % discard)
        for value in thresholds:
            plt.axvline(value, label='threshold %s' % value)
        plt.xlim([0, 2**8])
        plt.legend()
        seaborn.despine()
        plt.show()
    # Hand back only the middle one of the three threshold values
    middle_threshold = thresholds[1]
    return(middle_threshold)

In [None]:
thresholds = thresholder(Regions[0][0], verbose=False)
print(thresholds)

In [None]:
# Show one slice of the first region: as it is, thresholded, and as detail with the thresholded part overlaid
# The slice number is deliberately not called `slice`, which would shadow the Python builtin
slice_number = 1500
plt.subplot(131)
plt.imshow(Regions[0][0][slice_number])
plt.axis('off')
plt.subplot(132)
plt.imshow((Regions[0][0][slice_number]>thresholds))
# Show the threshold we used as title
plt.title('> %s' % thresholds)
plt.axis('off')
plt.subplot(133)
plt.imshow(Regions[0][0][slice_number][150:600,50:-50])
plt.imshow(dask.array.ma.masked_equal((Regions[0][0][slice_number]>thresholds)[150:600,50:-50], 0), cmap='viridis_r', alpha=0.618)
plt.axis('off')
plt.show()

In [None]:
# Calculate threshold for each separated region
# `thresholder` already returns only the middle one of its three multi-Otsu threshold values,
# so we save one single value per region
Data['RegionThreshold'] = [[thresholder(rg) for rg in regions] for regions in Regions]

In [None]:
Data['RegionThreshold']

In [None]:
# Show one thresholded slice (at a fifth of the stack) of each region of each bucket
for c, row in Data.iterrows():
    print(row.Bucket)
    for d, region in enumerate(Regions[c]):
        # NOTE(review): `*` binds stronger than `>`, so the slice is compared to *twice* the
        # region threshold, while the title below states the threshold itself - TODO confirm which one is meant
        plt.imshow(region[len(region)//5]>row['RegionThreshold'][d] * 2)
        plt.title('Slice %s of %s > %s' % (len(region)//5, row['FishID'][d], row['RegionThreshold'][d]))
        plt.gca().add_artist(ScaleBar(Data['Voxelsize'][c], 'um'))        
        plt.axis('off')
        plt.show()

In [None]:
# Write out thresholded regions
for c, row in tqdm(Data.iterrows(),
                   total=len(Data),
                   desc='Saving out bucket'):
    for d, region in tqdm(enumerate(Regions[c]),
                          total=len(Regions[c]),
                          desc='Saving out regions',
                          leave=False):
        # Derive the output name from the name of the region file:
        # the folder 'rec_regions' becomes 'rec_regions_thresholded' and
        # the fish ID gets '_thresholded_' and the threshold (as three-digit integer) appended
        # NOTE(review): this assumes that region d of `Regions[c]` belongs to fish ID d of the scan,
        # and that the path of the region file contains 'rec_regions' - TODO confirm
        outputfilename = RegionZarrFiles[c][d].replace('rec_regions',
                                                       'rec_regions_thresholded').replace(row.FishID[d],
                                                                                          '%s_thresholded_%03d' % (row.FishID[d], row.RegionThreshold[d]))
        if not os.path.exists(outputfilename):
            print('Writing %s > %s to %s.' % (row.FishID[d],
                                              row.RegionThreshold[d],
                                              outputfilename[len(Root)+1:]))
            # The comparison gives a boolean stack, which is what we write to disk
            dask.array.to_zarr((region>row.RegionThreshold[d]), outputfilename)

In [None]:
# Load the thresholded regions
# glob.glob returns the files in arbitrary order, but below the regions of a scan are matched
# to the fish IDs by their position in this list. We thus sort the file names, which all start
# with 'region_<position>_' (this puts them in the right order for up to nine regions per scan).
ThresholdedRegionZarrFiles = [sorted(glob.glob(os.path.join(folder + '_regions_thresholded', '*.zarr'))) for folder in Data['Folder']]
ThresholdedRegions = [[dask.array.from_zarr(f) for f in files] for files in ThresholdedRegionZarrFiles]

In [None]:
# Save out thresholded PNG slices
for c, row in tqdm(Data.iterrows(),
                   total=len(Data),
                   desc='Saving out PNGs for each thresholded region of each bucket'):
    for d, zarrfile in tqdm(enumerate(ThresholdedRegions[c]),
                            total=len(ThresholdedRegions[c]),
                            desc=Data.Folder[c][len(Root)+1:]):
        # Make output directory
        outputdir = os.path.join(row.Folder + '_regions_thresholded', row['FishID'][d])
        os.makedirs(outputdir, exist_ok=True)
        # Name the slices like the original reconstructions, with the sample name swapped for the fish ID.
        # The swap is done in the *file name only*: done on the full path, it would also change the
        # output directory whenever the sample name is part of the path to the scan.
        outputfilenames = [os.path.join(outputdir,
                                        os.path.basename(fn).replace(row.Sample, row['FishID'][d]))
                           for fn in row['Filenames Reconstructions']]
        # NOTE(review): the thresholded stacks are boolean, so the images are written with
        # the values 0 and 1 - TODO confirm that this is what is wanted
        parallelize = True
        if parallelize:
            # Hat tip to Oleksiy for providing a snippet to parallelize the PNG writing
            # It is paramount that the filenames are sorted though!
            Parallel(n_jobs=-1)(delayed(imsaver)(zarrfile[slicenumber], outputfilename)
                                for slicenumber, outputfilename in enumerate(outputfilenames))
        else:
            # `slicenumber` instead of `slice`, to not shadow the Python builtin for the rest of the notebook
            for slicenumber, outputfilename in tqdm(enumerate(outputfilenames),
                                                    total=len(outputfilenames),
                                                    desc='%s' % os.path.splitext(ThresholdedRegionZarrFiles[c][d])[0][len(Root)+1:],
                                                    leave=False):
                if not os.path.exists(outputfilename):
                    imageio.imwrite(outputfilename, zarrfile[slicenumber].astype('uint8'))

In [None]:
def labeler(stack):
    """Label the connected components of an image stack.

    The function used to return the name `labeled_stack` without ever defining it,
    so calling it raised a NameError (unless a global of that name happened to exist).

    NOTE(review): the labelling mirrors the `skimage.morphology.label(...)` calls
    used in other cells of this notebook - confirm that this is what was intended.

    Parameters
    ----------
    stack : array-like
        The (presumably already thresholded) stack. It is passed on to
        `skimage.morphology.label` unchanged.

    Returns
    -------
    The labeled stack, as returned by `skimage.morphology.label`.
    """
    labeled_stack = skimage.morphology.label(stack)
    return labeled_stack

In [None]:
# NOTE(review): this cell only contained `aSDFASDFASDF==£`, which is not valid Python.
# It presumably served as a deliberate stop marker for 'Run all' - confirm.
# Raising an exception halts the execution at the same place, but keeps the file parseable.
raise RuntimeError('Stop marker: execution deliberately halted at this cell')

In [None]:
# Minimize .zarr files to only fish-extent

In [None]:
# Show and save the maximum intensity projections (MIPs) of each region.
# Regions for which the overview image exists on disk are only reported.
for c, region in enumerate(Regions):
    outfilename = RegionZarrFiles[c].replace('_rec.zarr', '.MIPs.png')
    if os.path.exists(outfilename):
        print('MIP already saved to %s' % outfilename)
        continue
    # One panel per direction, projected along the corresponding axis
    for d, direction in enumerate(directions):
        panel = plt.subplot(1, 3, d + 1)
        plt.imshow(region.max(axis=d))
        panel.set_title('Region %s\n%s MIP' % (c, direction))
        panel.axis('off')
        panel.add_artist(ScaleBar(voxelsize, 'um'))
    plt.savefig(outfilename)
    plt.show()

In [None]:
# Calculate the gray value histogram (256 bins between 0 and 256) of each region.
# dask.array.histogram() returns the counts AND the bin edges.
# Only the counts are used below, so they are computed right away and the bin edges are discarded.
Histograms = [dask.array.histogram(dask.array.array(region),
                                   bins=2**8,
                                   range=[0, 2**8])[0].compute()
              for region in Regions]

In [None]:
# Calculate the Otsu threshold of each region.
# Only index 0 of the last axis is used, and only the voxels with a gray value above 10 are taken into account.
Thresholds = []
for region in Regions:
    channel = region[:, :, :, 0]
    Thresholds.append(skimage.filters.threshold_otsu(channel[channel > 10].compute()))

In [None]:
# Plot the histogram of each region (logarithmic counts) together with its threshold.
# Histogram and threshold line of a region share the same color.
palette = seaborn.color_palette()
for c, hist in enumerate(Histograms):
    color = palette[c]
    threshold = Thresholds[c]
    plt.semilogy(hist, c=color)
    plt.axvline(threshold, label='R%s: %s' % (c, threshold), c=color)
plt.legend()
plt.show()

In [None]:
# Show the MIPs of the thresholded regions
for c, region in enumerate(Regions):
    outfilename = RegionZarrFiles[c].replace('_rec.zarr', '.MIPsasdfasdfa.png')
    # Load index 0 of the last axis into memory
    region = region[:, :, :, 0].compute()
    if os.path.exists(outfilename):
        print('MIP already saved to %s' % outfilename[len(Root)+1:])
        continue
    # Threshold once, then project the binary volume along each direction
    thresholded = region > Thresholds[c]
    for d, direction in enumerate(directions):
        panel = plt.subplot(1, 3, d + 1)
        plt.imshow(thresholded.max(axis=d))
        panel.set_title('Region %s\n%s MIP' % (c, direction))
        panel.axis('off')
        panel.add_artist(ScaleBar(voxelsize, 'um'))
    # The figure is only shown, saving it is disabled
    # plt.savefig(outfilename)
    plt.show()

In [None]:
# Show the calculated thresholds
Thresholds

In [None]:
# Threshold the first region (index 0 of its last axis) and label the connected components
thresholded = Regions[0][:, :, :, 0] > Thresholds[0]
labels = skimage.morphology.label(thresholded)

In [None]:
import zarr

In [None]:
# Label fish and save out as .zarr
labeldir = Data.Folder[whichscan] + '_labeled'
os.makedirs(labeldir, exist_ok=True)
# The total for the progress bar has to come from the list we iterate over (`Regions`)
for c, region in tqdm(enumerate(Regions), total=len(Regions)):
    # The overview figure has six panels in one row
    plt.subplot(1, 6, c+1)
    # Threshold the region and label the connected components
    currentregion = skimage.morphology.label(region[:,:,:,0]>Thresholds[c])
    # The files are numbered from 1, the plot titles below from 0
    outputfilename = os.path.join(labeldir, 'region_%s_rec_labeled.zarr' % str(c+1))
    if not os.path.exists(outputfilename):
        print('writing to', outputfilename)
        zarr.save(outputfilename, currentregion)
    else:
        print(outputfilename[len(Root)+1:], 'already exists')
    # Show the maximum of the labels along the first axis
    currentmip = currentregion.max(axis=0)
    plt.imshow(currentmip)
    plt.gca().add_artist(ScaleBar(voxelsize, 'um'))
    plt.title('Region %s' % c)
    plt.axis('off')
plt.show()

In [None]:
# Read in labels again.
# glob.glob() returns its matches in arbitrary order, while the position `c` in these
# lists is used to build 'region_<c+1>' names when the PNG slices are written.
# The list is thus sorted to make the position reproducible.
# NOTE(review): the sorting is lexicographic, so 'region_10' would come before 'region_2'.
LabelZarrFiles = sorted(glob.glob(os.path.join(Data.Folder[whichscan] + '_labeled', '*.zarr')))
Labels = [dask.array.from_zarr(file) for file in LabelZarrFiles]

In [None]:
# Show and save the MIPs of the labeled regions.
# Regions for which the overview image exists on disk are only reported.
for c, region in enumerate(Labels):
    outfilename = LabelZarrFiles[c].replace('_rec_labeled.zarr', '.MIPs.labeled.png')
    print(outfilename)
    if os.path.exists(outfilename):
        print('MIP overview image already saved to %s' % outfilename[len(Root)+1:])
        continue
    # One panel per direction, projected along the corresponding axis
    for d, direction in enumerate(directions):
        panel = plt.subplot(1, 3, d + 1)
        plt.imshow(region.max(axis=d))
        panel.set_title('Region %s\n%s MIP' % (c, direction))
        panel.axis('off')
        panel.add_artist(ScaleBar(voxelsize, 'um'))
    plt.savefig(outfilename)
    plt.show()

In [None]:
# Show and save the sum projections of the labeled regions.
# Regions for which the image exists on disk are only reported.
for c, region in enumerate(Labels):
    outfilename = LabelZarrFiles[c].replace('_rec_labeled.zarr', '.Summed.labeled.png')
    if os.path.exists(outfilename):
        print('Summed image already saved to %s' % outfilename[len(Root)+1:])
        continue
    # One panel per direction, summed along the corresponding axis
    for d, direction in enumerate(directions):
        panel = plt.subplot(1, 3, d + 1)
        plt.imshow(region.sum(axis=d))
        panel.set_title('Region %s\n%s Sum' % (c, direction))
        panel.axis('off')
        panel.add_artist(ScaleBar(voxelsize, 'um'))
    plt.savefig(outfilename)
    plt.show()

In [None]:
# Overlay the labeled, thresholded data over one slice of each region
# NOTE(review): `slice` shadows the builtin of the same name; the name is kept, since other cells may read it
slice = 333
for c, r in enumerate(Regions):
    plt.subplot(2, 3, c + 1)
    # Original data of the slice
    plt.imshow(r[slice])
    # Half-transparent labels of the thresholded slice (index 0 of the last axis) on top
    overlay = skimage.morphology.label(r[:, :, :, 0][slice] > Thresholds[c])
    plt.imshow(overlay, alpha=0.5, cmap='viridis')
    plt.title('R%s' % c)
    plt.axis('off')
plt.show()

In [None]:
# Show the first entry of the labels we read in
Labels[0]

In [None]:
# Save out PNG slices for later use
for c, zarrfile in tqdm(enumerate(Labels),
                        total=len(Labels),
                        desc=Data.Folder[whichscan][len(Root)+1:]):
    # Make output directory, named like the .zarr file without its extension
    labeldir = os.path.splitext(LabelZarrFiles[c])[0]
    os.makedirs(labeldir, exist_ok=True)
    # The loop variable is not called `slice`, to not shadow the builtin of that name
    for d, labeledslice in tqdm(enumerate(zarrfile),
                                total=len(zarrfile),
                                desc='Saving to %s' % labeldir[len(Root)+1:],
                                leave=False):
        # Name the file like the corresponding reconstruction, with the region number added.
        # The replacement is done on the file name only, the directory part is left alone.
        pngname = os.path.basename(Data['Filenames Reconstructions'][whichscan][d]).replace('_rec00', '_region_%s_labeled_rec00' % str(c+1))
        outfilepath = os.path.join(labeldir, pngname)
        # Only write the files that are not on disk yet
        if not os.path.exists(outfilepath):
            # NOTE(review): casting to uint8 wraps label values above 255 - confirm that this is acceptable
            imageio.imwrite(outfilepath, labeledslice.compute().astype('uint8'))

In [None]:
# Detect blobs in `clean` with the Difference of Gaussian method, using the default parameters
blobs = skimage.feature.blob_dog(clean)

In [None]:
# Show the detected blobs
blobs

In [None]:
# Show `clean` (left) and `mip` (right) side by side
plt.subplot(1, 2, 1)
plt.imshow(clean)
plt.subplot(1, 2, 2)
plt.imshow(mip)