# Building the dataset

- This notebook depends heavily on the kind of dataset you want to train on. Its output is a folder (`data_dir`) that contains the whole training dataset together with its annotations.
- The current [`2_Train.ipynb`](2_Train.ipynb) notebook uses the [`MicrotubuleDataset`](../mask_lib/dataset.py) class to load the data from within the training dataset folder (`data_dir`).

In [53]:
%matplotlib inline

from pathlib import Path
import os
import sys
import itertools
import datetime
import shutil
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

import tqdm

from simuscope import Model

# Root folder of the project data. The default is the original author's
# location; set the MICROTUBULE_ROOT_DIR environment variable to point the
# notebook at another folder without editing this cell.
root_dir = Path(os.environ.get("MICROTUBULE_ROOT_DIR", "/home/hadim/.data/Neural_Network/Mask-RCNN/Microtubules/"))
# Output folder that receives the training images and their annotations.
data_dir = root_dir / "Data"
# Markdown file, stored next to the data, that describes the dataset.
description_path = data_dir / "DESCRIPTION.md"

In [3]:
# Reset the data folder
# Create the folder first so the cell also works on a fresh machine, then
# empty it with a plain loop instead of a list built only for side effects.
data_dir.mkdir(parents=True, exist_ok=True)
for entry in data_dir.iterdir():
    if entry.is_dir() and not entry.is_symlink():
        # os.remove() raises on a directory, so drop the whole subtree.
        shutil.rmtree(entry)
    else:
        entry.unlink()

## Generate fake microtubule images

We generate microtubule images over a wide range of SNR (signal-to-noise ratio) values and numbers of microtubules per image.

In [5]:
model_name = "simple_microtubule"
model = Model.load_model(model_name)

# Simulate a single frame and drop "channel_2" from the acquisition.
model.acquisition.n_frames = 1
model.acquisition.channels.pop("channel_2")

# Print a summary of what the builder is going to generate.
builder = model.get_builder()
print(builder)

# Parameter grid: `n` is the number of images generated for each
# (SNR, number of microtubules) combination.
n = 1
snr_range = np.arange(1.3, 4, 0.2)
n_mts_range = np.arange(1, 60, 5)

total_images = n * len(snr_range) * len(n_mts_range)
print(total_images)

Image shape: (1, 1, 512, 512)
Image memory size: 2.00 MB
Channels: ['channel_1']
Objects: [<simuscope.builder.object_builder.microtubule_builder.SimpleMicrotubuleBuilder object at 0x7f0693793550>]

168


In [6]:
# Generate the dataset

def create(*args):
    """Simulate and save the images for one (SNR, microtubule count) pair.

    The function is called with a single positional argument, the tuple
    ``(snr, n_mts)``, i.e. ``create((snr, n_mts))``.

    It relies on the notebook globals ``model`` (the simulation model, which
    is modified in place), ``n`` (number of images per pair) and ``data_dir``
    (output folder). Nothing is returned; results are saved under
    ``data_dir``.
    """
    snr, n_mts = args[0]

    model.acquisition.channels["channel_1"].snr = snr

    mt_params = model.objects["microtubule"].parameters
    mt_params["nucleation_rate"]["parameters"]["loc"] = 0
    mt_params["n_microtubules"]["parameters"]["loc"] = n_mts
    mt_params["initial_length"]["parameters"]["loc"] = 6
    mt_params["initial_length"]["parameters"]["scale"] = 5

    for i in range(n):
        basename = f"image_snr_{snr:.1f}_n-mts_{n_mts}_id_{i}"

        # Square camera chip whose side is drawn uniformly from [512, 1200).
        random_size = np.random.randint(512, 1200)
        model.microscope.camera.chip_size_height = random_size
        model.microscope.camera.chip_size_width = random_size

        # A new builder is requested after the camera size has been changed.
        builder = model.get_builder()
        # The return value of build() is not needed here; the output is
        # written by save().
        builder.build(keep_images=False)
        builder.save(str(data_dir / basename))
    
# One task per (snr, n_mts) combination, run with 8 parallel jobs.
parameters = list(itertools.product(snr_range, n_mts_range))
p = joblib.Parallel(n_jobs=8, verbose=1)
_ = p(joblib.delayed(create)(pair) for pair in parameters)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    4.3s
[Parallel(n_jobs=8)]: Done 168 out of 168 | elapsed:   19.2s finished


## Copy Manually Annotated Dataset

Here we copy a manually annotated dataset into the final training dataset folder and convert its ROI archives (`.zip`) into JSON annotation files.

In [74]:
import tifffile
import read_roi

# Folder of the manually annotated dataset, its "Processed" sub-folder and
# the description file that accompanies it.
manual_data_dir = root_dir / "Manual Training Dataset"
processed_data_dir = manual_data_dir / "Processed"
manual_description_path = manual_data_dir.joinpath("DESCRIPTION.md")

In [71]:
def _rois_to_annotations(rois):
    """Convert a mapping of ROIs into the JSON annotation dictionary.

    Parameters
    ----------
    rois : dict
        Mapping of ROI name to ROI dictionary, as returned by
        ``read_roi.read_roi_zip``. Only the values are used.

    Returns
    -------
    dict
        ``{"microtubule": {...}}`` with one sub-dictionary per column
        (``end_x``, ``end_y``, ``frame``, ``mt_id``, ``start_x``, ``start_y``,
        ``type``), each keyed by the ROI index as a string.
    """
    # Column order is kept stable so the written JSON has a fixed layout.
    columns = ("end_x", "end_y", "frame", "mt_id", "start_x", "start_y", "type")
    mt = {column: {} for column in columns}

    for i, roi in enumerate(rois.values()):
        key = str(i)
        mt["type"][key] = "seed"
        mt["frame"][key] = 0
        mt["mt_id"][key] = i

        if "x1" in roi:
            # ROI carries explicit end points (x1, y1) and (x2, y2).
            mt["end_x"][key] = roi["x2"]
            mt["end_y"][key] = roi["y2"]
            mt["start_y"][key] = roi["y1"]
            mt["start_x"][key] = roi["x1"]
        else:
            # Otherwise use the first and last entries of the "x"/"y" lists.
            mt["end_x"][key] = roi["x"][-1]
            mt["end_y"][key] = roi["y"][-1]
            mt["start_y"][key] = roi["y"][0]
            mt["start_x"][key] = roi["x"][0]

    return {"microtubule": mt}


# The image list is derived from the ROI archives: each "*.zip" file is
# expected to sit next to a ".tif" image with the same base name.
fnames = [fname.with_suffix(".tif") for fname in processed_data_dir.glob("*.zip")]
for fname in fnames:
    new_fname = shutil.copy(fname, data_dir)
    json_path = Path(new_fname).with_suffix(".json")

    # Convert ZIP rois to JSON rois
    rois = read_roi.read_roi_zip(fname.with_suffix(".zip"))
    json_roi = _rois_to_annotations(rois)

    # Context manager so the file is flushed and closed after each image.
    with open(json_path, "w") as json_file:
        json.dump(json_roi, json_file, indent=True)

## Description file

Append a description of this dataset to `DESCRIPTION.md`.

In [80]:
date_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")

# Read the manual description before touching the output file, so a missing
# or unreadable source does not leave a half-written entry behind.
with open(manual_description_path) as d:
    desc = d.readlines()

# Append mode: running this cell again without resetting the data folder
# adds a second entry to the file.
with open(description_path, "a") as f:
    f.write("Description of the Training Dataset\n\n")
    f.write(f"- Date: {date_str}\n\n")
    f.write(f"- Manually Annotated Dataset ({len(fnames)} images):\n\n")

    # readlines() keeps each line's trailing newline, so the indented lines
    # are concatenated directly; joining them with "\n" would insert an
    # empty line after every line of the description.
    f.write("".join("\t" + line for line in desc))

    f.write("\n")
    f.write(f"- Simulated Dataset Parameters using the Python library `simuscope` ({total_images} images):\n\n")
    f.write("```yaml\n")
    f.write(f"{str(model)}\n")
    f.write("```\n")