In [3]:
from typing import Tuple
import os
import glob
import shutil
import scipy

import zarr
from cryoet_data_portal import Client, Dataset
import numpy as np

class CziiCollector():
    def __init__(
        self, 
        tmp_dir: str = "d:/flagellar/temp/", 
        out_dir: str = "/kaggle/working/volumes/", 
        img_size: Tuple[int] = (128, 512, 512),
    ):
        super().__init__()
        self.tmp_dir = tmp_dir
        self.out_dir = out_dir
        self.img_size = img_size

        # Tmp dir
        if os.path.exists(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)
        os.makedirs(self.tmp_dir)

        # Out dir
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)

    def percentile_norm(self, x):
        lower, upper= np.percentile(x, 0.1), np.percentile(x, 99.9)
        x = np.clip(x, lower, upper)
        x = (x - lower) / (upper - lower)
        return (x*255).astype(np.uint8)

    def process_tomogram(self, x):
        # Norm
        x = self.percentile_norm(x)
        print("original_shape: {}".format(x.shape))
    
        # Optional: Evenly sampled across Z
        indices = np.linspace(0, x.shape[0]-1, 128).astype(int)
        x= x[indices]
    
        # Resize
        t,h,w= self.img_size
        zoom_factor= (t / x.shape[0], h / x.shape[1], w / x.shape[2])
        x= scipy.ndimage.zoom(x, zoom_factor, order=3)
        x= np.clip(x, 0, 255)
        x= x.astype(np.uint8)
        print("final_shape: {}".format(x.shape))
        return x
        

    def run(self,):

        # ========== Query Datasets ==========
        client = Client()

        # Datasets by Author
        # ds_all = Dataset.find(client, [Dataset.authors.name == "Yi-Wei Chang"])
        # ds_all = Dataset.find(client, [Dataset.authors.name == "Ariane Briegel"])
        ds_all = Dataset.find(client, [Dataset.authors.name == "Morgan Beeby"])
        print("="*25)
        print("N_DATASETS:", len(ds_all))
        print("="*25)


        # ========= Process Single Dataset ==========
        for ds in ds_all:
            s= "TOTAL: {:<10}     TITLE: {}".format(
                len(ds.runs),
                ds.title,
            )
            print(s)

            # Create dataset dir
            ds_dir= os.path.join(self.out_dir, str(ds.id))
            if not os.path.exists(ds_dir):
                os.mkdir(ds_dir)

            # Process runs
            for run in ds.runs:
                
                # Check if tomo already downloaded
                outpath= os.path.join(ds_dir, str(run.name)) + ".npy"
                if os.path.exists(outpath):
                    continue

                # Download tomo
                try:
                    tomo= run.tomograms[0]
                    tomo.download_omezarr(dest_path=self.tmp_dir)                    

                    # Load tomo
                    fpath= glob.glob(self.tmp_dir + "*")[0]
                    arr = zarr.open(fpath, mode='r')
                    raise 'stop'
                    arr = arr[0]                    

                    # Preprocess
                    arr= self.process_tomogram(arr)
    
                    # Save
                    np.save(outpath, arr)
                
                except Exception as e:
                    raise 'stop'
                    print(e)
                    print("FAILED:", outpath)

                # Clear tmp
                shutil.rmtree(self.tmp_dir)
                os.makedirs(self.tmp_dir)
                
                break # <-------------- COMMENT OUT HERE FOR FULL COLLECTION
            break  # <-------------- COMMENT OUT HERE FOR FULL COLLECTION
                
        return

# Build a collector with the default tmp/out directories and volume size.
# Note: constructing it deletes and recreates the tmp directory.
p= CziiCollector()
# Query the portal and start collecting tomograms.
p.run()

N_DATASETS: 45
TOTAL: 183            TITLE: High magnification tomograms of whole WT Campylobacter cells



100%|██████████████████████████████████████████████████████████████████████████████| 265M/265M [01:40<00:00, 2.63MiB/s]

TypeError: exceptions must derive from BaseException