In [1]:
import glob
import os

import numpy as np
import pandas as pd
import scipy.ndimage as ndi
import tifffile
import torch
from cellpose import models
from skimage import data
from skimage import feature
from skimage import filters
from skimage import measure
from skimage import morphology
from skimage import segmentation

In [2]:
# Collect the input images. glob returns matches in arbitrary (filesystem-dependent)
# order, so sort them to keep the processing order and the row order of the
# combined results table reproducible between runs.
fnames = sorted(glob.glob('Data/*.tif'))

# Set up SLURM cluster for Dask

In [3]:
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import time
import dask

# Describe what each SLURM job should request. Scheduling options come first,
# then the per-job resources, then the directories used by the workers.
cluster = SLURMCluster(
    account="smc",                 # Project/account name
    queue="gpu",                   # Queue/partition name
    walltime="02:00:00",           # Job time limit
    cores=2,                       # Number of cores per job
    memory="128GB",                # Memory per job
    job_extra_directives=[
        '--gres=gpu:1',            # Number of GPUs per job (alternative form: '--gpus=1')
    ],
    local_directory="$TMPDIR",     # Temporary directory (optional)
    log_directory="logs",          # Directory for log files (optional)
)

# Attach a client to the cluster. It is printed before any jobs are requested,
# so the summary reports zero workers at this point.
client = Client(cluster)
print(client)

# Request 4 jobs
cluster.scale(jobs=4)

<Client: 'tcp://10.0.53.5:34107' processes=0 threads=0, memory=0 B>


# Set up and run a simple image processing workflow

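This function takes a filename as an argument, performs cellpose segmentation on the last channel of the image, then calculates shape parameters and the mean intensity for channel index 1 and the mean intensity for channel index 2 (zero-based).  The results are returned as a pandas DataFrame (the function itself does not write a csv file).  A labels file with the cellpose results is also saved next to the input file with the suffix `_labels.tiff`, along with a small `.txt` file recording whether CUDA was available where the function ran.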

In [4]:
def process_file(fname):
    """Segment one image stack with Cellpose and measure every labelled region.

    Steps, in order:

    1. Write ``<stem>.txt`` next to the input, recording the value of
       ``torch.cuda.is_available()`` in the process that runs this function.
    2. Read the image and, for each plane along axis 0, subtract that plane's
       5th percentile and clip the result to ``[0, plane maximum]``.
    3. Run the Cellpose ``cyto`` model on the last plane to obtain a label image.
    4. Measure shape properties and mean intensity using ``img[1]``, and mean
       intensity using ``img[2]``.
    5. Save the label image as uint16 to ``<stem>_labels.tiff``.

    ``<stem>`` is ``fname`` with its final extension removed. The extension is
    split off with ``os.path.splitext`` rather than ``str.replace`` so that a
    name without a lowercase ``.tif`` (for example ``.TIF``) can never resolve
    to the input path itself and be overwritten, and so that ``.tif`` appearing
    elsewhere in the path is left alone.

    Parameters
    ----------
    fname : str
        Path of the image to process. Assumed to be a stack with channels on
        axis 0 and at least three planes -- TODO confirm against the data.

    Returns
    -------
    pandas.DataFrame
        One row per labelled region. Columns are those produced by
        ``regionprops_table`` for the requested properties, with the two
        ``mean_intensity`` columns renamed ``mean_intensity1`` (from ``img[1]``)
        and ``mean_intensity2`` (from ``img[2]``).
    """
    stem = os.path.splitext(fname)[0]

    # Context manager guarantees the handle is closed even if the write fails.
    with open(stem + '.txt', 'w') as my_file:
        my_file.write('Cuda available:  ' + str(torch.cuda.is_available()))

    model = models.Cellpose(gpu=True, model_type='cyto')
    img = tifffile.imread(fname)

    # Per-plane background subtraction. Both percentiles are taken from the
    # plane before it is modified; the result is written back into `img`, so
    # it is cast to the array's original dtype.
    for idx in range(img.shape[0]):
        low, high = np.percentile(img[idx], [5, 100])
        img[idx] = np.clip(img[idx] - low, 0, high)

    # model.eval returns several outputs; element 0 is used as the label image.
    labels = model.eval(img[-1], diameter=200, channels=[0, 0], flow_threshold=0.9, min_size=4000)[0]

    shape_properties = ['label', 'area', 'centroid', 'eccentricity', 'major_axis_length',
                        'minor_axis_length', 'orientation', 'mean_intensity']
    df1 = pd.DataFrame(
        measure.regionprops_table(labels, img[1], properties=shape_properties)
    ).rename(columns={'mean_intensity': 'mean_intensity1'})
    df2 = pd.DataFrame(
        measure.regionprops_table(labels, img[2], properties=['mean_intensity'])
    ).rename(columns={'mean_intensity': 'mean_intensity2'})

    # Both tables come from the same label image, so they are joined side by side.
    df = pd.concat([df1, df2], axis=1)

    tifffile.imwrite(stem + '_labels.tiff', labels.astype(np.uint16))
    return df

Run `process_file` over all of the files in the list.  Set `using_dask` to `True` or `False` to run this with or without dask/slurm and compare the wall times to see the speedup advantage.

In [5]:
%%time
using_dask = True
all_data = []


if using_dask:
    delayed = dask.delayed(process_file)
    for i, f in enumerate(fnames):
        all_data.append(delayed(f))
    all_data = dask.compute(*all_data)  
    df = pd.concat(all_data)
else:
    for i, f in enumerate(fnames):
        all_data.append(process_file(f))
        print([np.floor(i*100.0/len(fnames)), f])
    df = pd.concat(all_data)

CPU times: user 851 ms, sys: 202 ms, total: 1.05 s
Wall time: 59.6 s


# De-allocate cluster resources

In [6]:
# Close the Dask client connection first, then close the cluster object
# that was created for this session.
client.close()
cluster.close()
