In [1]:
from tqdm import tqdm
from glob import glob
import tifffile
import numpy as np
import os
from EmbedSeg.utils.preprocess_data import extract_data, split_train_val, split_train_test
from EmbedSeg.utils.generate_crops import *

### Download Data

The images and corresponding masks are downloaded from an external URL, specified by `zip_url`, to the path specified by the variables `data_dir` and `project_name`. The following structure is generated after executing the `extract_data` function below:

```
data
├───bbbc010-2012
│   └───download
│       ├───train
│       └───test
└───bbbc010-2012.zip
```

In [2]:
# Name of the project sub-folder and the root data folder that contains it;
# later cells join these two to locate the images and masks.
project_name = 'bbbc010-2012'
data_dir = '../../../data'

In [3]:
# Download the zipped dataset given by `zip_url` and unpack it; the cell
# output reports where the archive and the extracted files end up.
extract_data(
    project_name=project_name,
    data_dir=data_dir,
    zip_url='https://github.com/juglab/EmbedSeg/releases/download/v1.0/bbbc010-2012.zip',
)

Downloaded data as ../../../data/bbbc010-2012.zip
Unzipped data to ../../../data/bbbc010-2012/download/


### Split Data into `train`, `val` \& `test`

Since the original data does not come with a `train`-`test` partition of its own, we execute the following cell to reserve part of the data (here 50 %, set through `subset = 0.5`) as evaluation (test) data.

In [4]:
# Hold out part of the downloaded data as test data.
# NOTE(review): `subset` is presumably the fraction moved to the test set;
# confirm against the EmbedSeg implementation.
split_train_test(
    project_name=project_name,
    data_dir=data_dir,
    subset=0.5,
    train_test_name='train',
)

Created new directory : ../../../data/bbbc010-2012/train/images/
Created new directory : ../../../data/bbbc010-2012/train/masks/
Created new directory : ../../../data/bbbc010-2012/val/images/
Created new directory : ../../../data/bbbc010-2012/val/masks/
Created new directory : ../../../data/bbbc010-2012/test/images/
Created new directory : ../../../data/bbbc010-2012/test/masks/
Created new directory : ../../../data/bbbc010-2012/download/test/images
Created new directory : ../../../data/bbbc010-2012/download/test/masks
Train-Test Images/Masks saved at ../../../data/bbbc010-2012


Next, we reserve a small fraction (15 % by default) of the provided train dataset as validation data. If you would like to repeat multiple experiments with the same partition, simply press <kbd>Shift</kbd> + <kbd>Enter</kbd> on the next cell. If you would instead like a different partition each time, pass the `seed` argument with a different integer, for example:
```
split_train_val(
    data_dir = data_dir,
    project_name = project_name,
    train_val_name = 'train',
    subset = 0.15,
    seed = 1000)
```

In [5]:
# Set aside 15 % of the train data as validation data. No `seed` is passed,
# so the library's default seed decides the partition.
split_train_val(
    project_name=project_name,
    data_dir=data_dir,
    subset=0.15,
    train_val_name='train',
)

Train-Val-Test Images/Masks saved at ../../../data/bbbc010-2012


### Specify desired centre location for spatial embedding of pixels

Interior pixels of an object instance can be embedded at the `centroid` (evaluated in $\mathcal{O}(n)$ operations, where $n$ is the number of pixels in an object instance), at the `approximate-medoid` (also evaluated in $\mathcal{O}(n)$ operations), or at the `medoid` (evaluated in $\mathcal{O}(n^{2})$ operations). Please note that evaluating the `medoid` of the instances can be slow, especially if you choose a large `crop_size` later; in such a scenario, a quicker alternative is the `approximate-medoid` option, which gives comparable results.

In [6]:
# Location at which the interior pixels of each instance are embedded.
center = 'centroid'

# Validate with an explicit check instead of `assert`: assertions are removed
# when Python runs with -O, which would let an invalid value pass unnoticed.
allowed_centers = ('medoid', 'approximate-medoid', 'centroid')
if center not in allowed_centers:
    raise ValueError(
        "Please specify center as one of : {} (got {!r})".format(allowed_centers, center)
    )
print("Spatial Embedding Location chosen as : {}".format(center))



Spatial Embedding Location chosen as : centroid


### Specify cropping configuration parameters

Images and the corresponding masks are cropped into patches centred around an object instance, and these patches are saved to disk before training starts. **Run the following two cells twice**: the first time with `data_subset = 'train'` and the second time with `data_subset = 'val'`. Note that the cropped images, masks and center-images are saved at the path specified by `crops_dir`. Please set `one_hot = True` if the instances are encoded in a one-hot style.

In [9]:
# Cropping configuration read by the crop-generation cell.
one_hot = True           # True when the instance masks are one-hot encoded
crop_size = 256          # crop size handed to the cropping routine
data_subset = 'train'    # 'train' on the first pass, 'val' on the second
crops_dir = 'crops'      # folder under which the crops are written

### Generate Crops



In [10]:
# Collect the images and instance masks of the chosen subset. Both lists are
# sorted and then paired position by position, so this relies on images and
# masks sharing the same sort order.
image_dir = os.path.join(data_dir, project_name, data_subset, 'images')
instance_dir = os.path.join(data_dir, project_name, data_subset, 'masks')
image_names = sorted(glob(os.path.join(image_dir, '*.tif')))
instance_names = sorted(glob(os.path.join(instance_dir, '*.tif')))

# Fail early instead of reporting "done!" after processing nothing, e.g. when
# `data_subset` is mistyped or the split cells have not been run yet.
if not image_names:
    raise FileNotFoundError("No `*.tif` images found in {}".format(image_dir))
# A count mismatch would otherwise pair images with the wrong masks or stop
# part-way through with an IndexError.
if len(image_names) != len(instance_names):
    raise ValueError(
        "Found {} images in {} but {} masks in {}".format(
            len(image_names), image_dir, len(instance_names), instance_dir
        )
    )

# The cropping routine depends only on `one_hot`, so pick it once.
crop_fn = process_one_hot if one_hot else process
crop_out_dir = os.path.join(crops_dir, project_name)
# `zip` has no length, so `total` is passed to keep the progress bar accurate.
for image_name, instance_name in tqdm(zip(image_names, instance_names), total=len(image_names)):
    crop_fn(image_name, instance_name, crop_out_dir, data_subset, crop_size, center, one_hot=one_hot)
print("Cropping of images, instances and centre_images for data_subset = `{}` done!".format(data_subset))

100%|██████████| 43/43 [00:14<00:00,  2.96it/s]

Cropping of images, instances and centre_images for data_subset = `train` done!



