# pysmFISH pipeline running template

This JupyterLab notebook runs the automated data analysis via papermill. The data are processed through the entire pipeline (full run). A copy of the executed notebook is stored in the `notebooks` subfolder of the processed experiment folder.

In [1]:
from pathlib import Path
import time

from pysmFISH.pipeline import Pipeline
from pysmFISH.utils import end_processing_file

In [None]:
# THIS CELL IS TAGGED PARAMETERS
# The values below are the template defaults; every one of them is passed
# to Pipeline(...) further down, except `notes`, which is only printed.

# REQUIRED ARGUMENTS
# -------------------

# Path to the experiment folder
experiment_fpath = '' 

# Type of run: 'new' or 're-run' (default: new)
run_type = 'new'

# Define the parsing type. Can be: 
# original/no_parsing/reparsing_from_processing_folder/reparsing_from_storage
# (default: original)
parsing_type = 'original'

# OPTIONAL KWARGS
# ----------------

# Path to the cold storage hard drive (default: /fish/rawdata)
raw_data_folder_storage_path = '/fish/rawdata'

# Tag to identify the zarr file with parsed images (default: img_data)
parsed_image_tag = 'img_data'

# Tag to identify the zarr file with preprocessed images (default: preprocessed_img_data)
preprocessed_image_tag = 'preprocessed_img_data'

# Path to the location where the datasets are stored (default: /fish/fish_datasets)
dataset_folder_storage_path = '/fish/fish_datasets'

# Path to the location where the results are stored (default: /fish/fish_results)
results_folder_storage_path = '/fish/fish_results'

# Determine if the processed images will be saved (default: True)
save_intermediate_steps = True

# Path to an existing dataset that will be used in the processing
dataset_path = ''

# Number of FOVs to process in parallel (default: 50)
chunk_size = 50

# Search distance within which two dots are considered identical (default: 10)
same_dot_radius_duplicate_dots = 10

# Define the stitched counts from which the overlapping dots will be removed
# (default: microscope_stitched) 
stitching_selected = 'microscope_stitched'

# Value used to select the barcodes that pass the
# screening (< hamming_distance). (default: 3)
hamming_distance = 3

# Define the name of the system that will run the processing. Can be local/htcondor
# (default: htcondor). If processing_engine == 'local' the parameters that define
# the cluster will be ignored
processing_engine = 'htcondor'

# Number of cores to use in htcondor (default: 20)
cores = 20

# Total memory for all the cores in htcondor (default: 200GB)
memory = '200GB'

# Size of the spillover disk for dask in htcondor (default: 0.1GB)
disk = '0.1GB'

# Directory where to spill over on the node in htcondor (default: /tmp)
local_directory = '/tmp'

# Directory where to store dask and htcondor logs
logs_directory = ''

# Save the intensity of the bits and the flipping direction (default: False)
save_bits_int = False


# Free-text note about this run; it is printed in a later cell
notes = 'no notes'

In [None]:
# Build the pipeline run name as "<timestamp>_<experiment folder stem>".
# experiment_fpath is converted to a Path because later cells use its
# .parent attribute.
experiment_fpath = Path(experiment_fpath)
date_tag = time.strftime("%y%m%d_%H_%M_%S")
pipeline_run_name = f"{date_tag}_{experiment_fpath.stem}"

In [None]:
# Echo the user-supplied run notes in the cell output
print(notes)

In [None]:
# Set up the pipeline run.
# All run settings are gathered in a single mapping and then handed to
# Pipeline as keyword arguments.
pipeline_kwargs = dict(
    pipeline_run_name=pipeline_run_name,
    experiment_fpath=experiment_fpath,
    run_type=run_type,
    parsing_type=parsing_type,
    processing_engine=processing_engine,
    cores=cores,
    memory=memory,
    disk=disk,
    local_directory=local_directory,
    chunk_size=chunk_size,
    raw_data_folder_storage_path=raw_data_folder_storage_path,
    parsed_image_tag=parsed_image_tag,
    preprocessed_image_tag=preprocessed_image_tag,
    dataset_folder_storage_path=dataset_folder_storage_path,
    results_folder_storage_path=results_folder_storage_path,
    save_intermediate_steps=save_intermediate_steps,
    dataset_path=dataset_path,
    same_dot_radius_duplicate_dots=same_dot_radius_duplicate_dots,
    stitching_selected=stitching_selected,
    hamming_distance=hamming_distance,
    logs_directory=logs_directory,
    save_bits_int=save_bits_int,
)

running_pipeline = Pipeline(**pipeline_kwargs)

In [None]:
# Full pipeline run: call run_full() on the Pipeline object configured
# in the previous cell.

running_pipeline.run_full()

In [None]:
# Signal completion of the analysis: end_processing_file is called with the
# parent directory of the experiment folder and the completion file pattern.
end_processing_file(
    path_destination=experiment_fpath.parent,
    completion_pattern='processing_completed.txt',
)

print('Processing completed')