# Parser for PlantSeg
Parse the data in PNAS to fit with PlantSeg input specifications.

The data will be copied into the folder 'PNAS-PlantSeg'. There, each 3D volume will be stored in a file 'sample_<i>.h5'
This file contains both the raw data, stored in 'raw', and the labels, stored in 'label'.
TODO: perhaps split the samples into training and evaluation sets.

In [1]:
import os
from os.path import join

import h5py
import imageio
# Root folder of the original PNAS data set and the folder that receives the
# PlantSeg-formatted 'sample_<i>.h5' files.
source_path, destination_path = (
    '/scratch/ottosson/datasets/SAM/data/PNAS',
    '/scratch/ottosson/datasets/SAM/data/PNAS-PlantSeg',
)

In [2]:
# Get all directories in the source path. os.listdir returns entries in
# arbitrary order, so sort them to keep the sample numbering reproducible.
plants = sorted(
    entry for entry in os.listdir(source_path)
    if os.path.isdir(join(source_path, entry))
)
# Sub-folders (inside each plant directory) that hold the input and target files
intermediate_input_path = 'processed_tiffs'
intermediate_target_path = 'segmentation_tiffs'
# Substrings that are unique to input and target file names, so they can be told apart
determiner_input_string = 'acylYFP'
determiner_output_string = ''

In [4]:
def find_label_file(file, directory):
    """
    Find the file in 'directory' that shares the timestamp and plant name of 'file'.

    File names are assumed to look like "time_plant_XXXXXX.XX"; two names match
    when their first two underscore-separated parts are equal.

    Parameters:
        file: name of the file to find a partner for.
        directory: path of the directory to search.

    Returns:
        The name (not the full path) of the matching file, or None if nothing
        matches. A message is printed whenever the number of matches is not
        exactly one; with several matches the alphabetically first is returned.
    """
    key = file.split("_")[:2]
    # Names without a "time_plant" prefix (e.g. hidden or auxiliary files, or a
    # badly named 'file') can never match; they must not crash the search.
    has_valid_key = len(key) == 2
    # Sort the listing so the result is deterministic when several files match.
    matched_files = [
        candidate
        for candidate in sorted(os.listdir(directory))
        if has_valid_key and candidate.split("_")[:2] == key
    ]
    if len(matched_files) != 1:
        print(f"Wrong number of file matched: {matched_files}\nfile: {file}\nDir: {directory}\n")
    if len(matched_files) == 0:
        return None
    return matched_files[0]

In [7]:
# Create the folder for the restructured data; h5py.File(..., 'w') does not
# create missing parent directories.
os.makedirs(destination_path, exist_ok=True)

sample_i = 0
# Go through all plant 'movies' (sorted so the sample numbering is reproducible)
for plant in sorted(plants):
    plant_path = join(source_path, plant)
    input_dir = join(plant_path, intermediate_input_path)
    target_dir = join(plant_path, intermediate_target_path)
    # Get all frames in the 'movie', in a deterministic order
    files = sorted(os.listdir(input_dir))
    # Go through all frames
    for file in files:
        # Skip files which are not input volumes
        if determiner_input_string not in file:
            continue
        # Find the target file corresponding to the input file
        target_name = find_label_file(file, target_dir)
        # If there is no target file, skip this frame
        if not target_name:
            continue

        # Create paths to the input, the target and the output sample
        input_path = join(input_dir, file)
        target_path = join(target_dir, target_name)
        sample_path = join(destination_path, f"sample_{sample_i}.h5")

        # Read both volumes and store them together in one HDF5 file
        raw = imageio.volread(input_path)
        label = imageio.volread(target_path)
        with h5py.File(sample_path, 'w') as hf:
            hf.create_dataset('raw', data=raw)
            hf.create_dataset('label', data=label)
        # Only frames that were actually written consume a sample index
        sample_i += 1

Wrong number of file matched: []
file: 40hrs_plant18_trim-acylYFP.tif
Dir: /scratch/ottosson/datasets/SAM/data/PNAS/plant18/segmentation_tiffs

