 # vEM-Mitochondria: Data Preparation
 
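 This notebook downloads the MitoEM-H dataset, copies the images into a zarr volume, and then downscales and crops the data.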

## Google Colab

IMPORTANT: Run the following cells, up to the section `Download MitoEM data and create volume`, only if you execute this notebook on Google Colab. If you run this notebook locally, you need to set up a Python environment with the correct dependencies beforehand; see [these instructions](TODO).

In [None]:
# TODO mount google drive

## Download MitoEM data and create volume

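The cells in this section download the MitoEM-H zip archive (about 13.8 GB), verify its SHA-256 checksum, and copy the images into a zarr volume. Steps whose output already exists are skipped.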

In [4]:
# imports required for downloading and volume creation
import hashlib
import os
import requests
from glob import glob
from shutil import copyfileobj

import dask
import dask.array as da
import imageio
from tqdm import tqdm

In [5]:
# MitoEM-H archive: where to fetch it from and the SHA-256 digest the zip file must have
mitoem_url = "https://www.dropbox.com/sh/p5xn9e4gderjtm6/AADfUMzAA38XBvcXDTG1kAGGa/MitoEM-H.zip?dl=1"
checksum = "f4ad14e098697be78d3ea13f263f76d5ba81a27e354c9edc906adfe728c765bd"

# output folders: `data_root` holds the final data, `tmp_path` holds intermediate files
# these paths are meant for google colab, adapt them when running the notebook locally
data_root = "./mito-em-data"  # TODO colab path
tmp_path = "./mito-em-tmp"

# create both folders up front; exist_ok makes re-running this cell harmless
for folder in (data_root, tmp_path):
    os.makedirs(folder, exist_ok=True)

In [5]:
# flag telling the cells below whether download / unzip can be skipped
# because the initial volume already exists
# TODO determine this from the files on disk; for now we always assume the volume is missing
have_mitoem_vol = False

# location of the volume inside the data folder
data_path = os.path.join(data_root, "mitoem-h.ome.zarr")

In [None]:
# download the data
# The archive is streamed to "<zip_path>.part" and only renamed to `zip_path` once the
# transfer has finished, so an interrupted download is never mistaken for a complete one
# by the `os.path.exists(zip_path)` check on the next run.
zip_path = os.path.join(tmp_path, "MitoEM-H.zip")
if not have_mitoem_vol and not os.path.exists(zip_path):
    partial_path = zip_path + ".part"
    with requests.get(mitoem_url, stream=True) as r:
        # fail on HTTP errors instead of saving an error page as the zip file
        r.raise_for_status()
        # Content-Length may be missing; the progress bar then gets a total of 0
        filesize = int(r.headers.get("Content-Length", 0))
        desc = f"Download {mitoem_url} to {zip_path}"
        with tqdm.wrapattr(r.raw, "read", total=filesize, desc=desc) as r_raw, open(partial_path, "wb") as f:
            copyfileobj(r_raw, f)
    os.replace(partial_path, zip_path)

    # compute the sha256 checksum in 1 MiB blocks; reading the whole archive
    # at once would need as much memory as the file is large
    sha256 = hashlib.sha256()
    with open(zip_path, "rb") as f:
        for block in iter(lambda: f.read(1 << 20), b""):
            sha256.update(block)
    this_checksum = sha256.hexdigest()

    if this_checksum == checksum:
        print("Download to", zip_path, "was successful.")
    else:
        print("The file was downloaded to", zip_path, "but the file is likely corrupted!")
        print("Please remove", zip_path, "and try the download again.")

Download https://www.dropbox.com/sh/p5xn9e4gderjtm6/AADfUMzAA38XBvcXDTG1kAGGa/MitoEM-H.zip?dl=1 to ./mito-em-tmp/mitoem-h.zip: 100%|██████████| 13.8G/13.8G [1:17:32<00:00, 3.18MB/s]  


In [None]:
# unzip the data
# NOTE(review): os.path.join(tmp_path, "") is just `tmp_path` with a trailing path separator.
# `tmp_path` is created with os.makedirs in the configuration cell, so
# os.path.exists(unzipped_path) is already True and the branch below is never entered.
# TODO point `unzipped_path` at the folder the archive is extracted to.
unzipped_path = os.path.join(tmp_path, "")
if not have_mitoem_vol and not os.path.exists(unzipped_path):
    pass  # TODO extract the downloaded archive into `unzipped_path`

In [None]:
# copy the data into a ome.zarr file
# TODO actually use ome.zarr instead of normal zarr

# chunk shape of the array stored on disk
zarr_chunks = (32, 128, 128)

# like the download and unzip steps, this is skipped if the volume already exists
if not have_mitoem_vol:
    # we use dask to copy the data into the ome.zarr array lazily
    # see https://docs.dask.org/en/stable/array-creation.html
    # https://docs.dask.org/en/latest/generated/dask.array.to_zarr.html
    imread = dask.delayed(imageio.imread, pure=True)
    # sort the paths so that the images are stacked in filename order
    image_paths = sorted(glob(os.path.join(unzipped_path, "*.png")))
    if not image_paths:
        # without this check the failure would be a bare IndexError on images[0]
        raise RuntimeError(
            f"No png images found in {unzipped_path}, make sure the data was unzipped."
        )
    images = [imread(path) for path in image_paths]

    # find the shape and datatype from the first image
    # (this actually reads one image; all images are assumed to share its shape and dtype)
    sample = images[0].compute()

    # load individual images to get a dask array for each image and stack to get a volume
    arrays = [da.from_delayed(im, dtype=sample.dtype, shape=sample.shape)
              for im in images]
    volume = da.stack(arrays, axis=0)

    # write to zarr
    # to_zarr lives in dask.array (not in the top-level dask module) and takes the chunks
    # of the stored array from the dask array itself, so we rechunk before writing
    # instead of passing `chunks` as a keyword
    volume = volume.rechunk(zarr_chunks)
    da.to_zarr(volume, data_path, component="s0")

## Downscale and crop the data