# Parse for Cellpose
Restructure the PNAS dataset so that it matches the input format expected by Cellpose.

The neural network in Cellpose works on 2D images, so each z-slice is stored as a separate file in the folder 'PNAS-Cellpose' under the file name 'sample_<sample_i>_layer_<z_i>.tif'.
Each mask file has the same name, with '_masks' appended before the file extension.



In [1]:
import os
from os.path import join
import imageio
# Root folder of the original PNAS dataset; its sub-directories are listed as plants.
source_path = '/scratch/ottosson/datasets/SAM/data/PNAS'
# Folder that receives the per-slice 2D TIFF files written by this notebook.
destination_path = '/scratch/ottosson/datasets/SAM/data/PNAS-Cellpose'

In [2]:
# Get all directories in the source path; each one is treated as one plant.
# Sorted because os.listdir returns entries in arbitrary order, and the plant
# index is used in the output file names further down.
plants = sorted(
    entry for entry in os.listdir(source_path)
    if os.path.isdir(join(source_path, entry))
)
# Sub-folder names (inside each plant directory) for input and label volumes
intermediate_input_path = 'processed_tiffs'
intermediate_target_path = 'segmentation_tiffs'
# Substrings which are unique for input and output files, to tell them apart
determiner_input_string = 'acylYFP'
determiner_output_string = ''

In [3]:
def find_label_file(file, directory):
    """
    Find the entry in 'directory' which shares the timestamp and plant name of 'file'.

    File names are assumed to look like "time_plant_XXXXXX.XX"; only the first
    two underscore-separated fields are compared.

    Args:
        file: Name (not path) of the file to find a partner for.
        directory: Directory that is searched for the matching entry.

    Returns:
        The name of the matching entry, or None if nothing matches. A message
        is printed whenever the number of matches is not exactly one; if
        several entries match, the alphabetically first one is returned.

    Raises:
        IndexError: If 'file' contains no underscore.
    """
    parts = file.split("_")
    wanted = (parts[0], parts[1])
    matched_files = []
    # Sorted so the returned entry is deterministic when several match.
    for f in sorted(os.listdir(directory)):
        f_parts = f.split("_")
        # Entries without a "time_plant" prefix (hidden files, notes, ...)
        # can never match; skip them instead of failing on the missing field.
        if len(f_parts) < 2:
            continue
        if (f_parts[0], f_parts[1]) == wanted:
            matched_files.append(f)
    if len(matched_files) != 1:
        print(f"Wrong number of file matched: {matched_files}\nfile: {file}\nDir: {directory}\n")
    if not matched_files:
        return None
    return matched_files[0]

In [4]:
# COPY WITHOUT! THE FLOW
# Writes every z-slice of every labelled volume as its own 2D image/mask pair.
sample_i = 0
# Make sure the output folder exists before anything is written into it
os.makedirs(destination_path, exist_ok=True)
# Go through all plant 'movies'
for plant in plants:
    plant_path = join(source_path, plant)
    target_dir = join(plant_path, intermediate_target_path)
    # Get all frames in the 'movie'; sorted so the sample numbering is reproducible
    files = sorted(os.listdir(join(plant_path, intermediate_input_path)))
    # Go through all frames
    for file in files:
        # Skip files which are not training inputs
        if determiner_input_string not in file:
            continue
        # Find the target file corresponding to the input file
        target_name = find_label_file(file, target_dir)
        # If there is no target file, continue with the next frame
        if not target_name:
            continue

        # Read the input volume and its label volume
        input_path = join(plant_path, intermediate_input_path, file)
        target_path = join(target_dir, target_name)
        inputs = imageio.volread(input_path)
        targets = imageio.volread(target_path)
        # Every input slice needs its own label slice; skip pairs that disagree
        # instead of failing halfway through writing a sample.
        if inputs.shape[0] != targets.shape[0]:
            print(f"Layer count mismatch, skipping: {input_path} ({inputs.shape[0]}) "
                  f"vs {target_path} ({targets.shape[0]})")
            continue

        # Store each z-slice as a separate 2D image together with its mask
        for layer_i in range(inputs.shape[0]):
            name = f'sample_{str(sample_i).zfill(3)}_layer_{str(layer_i).zfill(3)}'
            new_input_path = join(destination_path, name + '.tif')
            new_target_path = join(destination_path, name + '_masks.tif')
            imageio.imwrite(new_input_path, inputs[layer_i])
            imageio.imwrite(new_target_path, targets[layer_i])
        sample_i = sample_i + 1
        

Wrong number of file matched: []
file: 40hrs_plant18_trim-acylYFP.tif
Dir: /scratch/ottosson/datasets/SAM/data/PNAS/plant18/segmentation_tiffs



In [4]:
# COPY WITH! FLOW
# Writes every z-slice as a 2D image and lets cellpose compute the flows
# for the matching label slices.
# Import the submodule explicitly so that 'cellpose.dynamics' is guaranteed to
# be available, rather than relying on the package importing it implicitly.
import cellpose.dynamics
# Make sure the output folder exists before anything is written into it
os.makedirs(destination_path, exist_ok=True)
# Go through all plant 'movies'
for plant_i, plant in enumerate(plants):
    plant_path = join(source_path, plant)
    target_dir = join(plant_path, intermediate_target_path)
    # Get all frames in the 'movie'; sorted so the sample numbering is reproducible
    samples = sorted(os.listdir(join(plant_path, intermediate_input_path)))
    # Samples are numbered per plant
    sample_i = 0
    # Go through all frames
    for sample in samples:
        # Skip files which are not training inputs
        if determiner_input_string not in sample:
            continue
        # Find the target file corresponding to the input file
        target_name = find_label_file(sample, target_dir)
        # If there is no target file, continue with the next frame
        if not target_name:
            continue

        # Read the input volume and its label volume
        input_path = join(plant_path, intermediate_input_path, sample)
        target_path = join(target_dir, target_name)
        inputs = imageio.volread(input_path)
        targets = imageio.volread(target_path)
        # Every input slice needs its own label slice; skip pairs that disagree
        # instead of failing halfway through writing a sample.
        if inputs.shape[0] != targets.shape[0]:
            print(f"Layer count mismatch, skipping: {input_path} ({inputs.shape[0]}) "
                  f"vs {target_path} ({targets.shape[0]})")
            continue

        target_list = []
        new_target_file_list = []
        for layer_i in range(inputs.shape[0]):
            target_list.append(targets[layer_i])
            name = f'plant_{str(plant_i).zfill(3)}_sample_{str(sample_i).zfill(3)}_layer_{str(layer_i).zfill(3)}'
            new_input_path = join(destination_path, name + '.tif')
            # The file name handed to cellpose is the same as the image path.
            # NOTE(review): labels_to_flows presumably derives its own output
            # name from this path (e.g. by adding a suffix) - confirm against
            # the installed cellpose version.
            new_target_path = new_input_path
            imageio.imwrite(new_input_path, inputs[layer_i])
            new_target_file_list.append(new_target_path)

        # Compute the flows for all label slices of this sample
        cellpose.dynamics.labels_to_flows(target_list,
                                          new_target_file_list)
        sample_i = sample_i + 1
    print(f"Plant {plant_i} is done!")
        

2021-11-04 14:16:58,590 [INFO] WRITING LOG OUTPUT TO /home/ottosson/.cellpose/run.log
2021-11-04 14:16:59,257 [INFO] NOTE: computing flows for labels (could be done before to save time)


 72%|███████▏  | 155/214 [37:51<18:50, 19.16s/it]