# KIT-Loe-GE Cell Segmentation and Tracking


Simultaneous cell segmentation and tracking method used for our submission as team KIT-Loe-GE to the [Cell Tracking Challenge](http://celltrackingchallenge.net/) in 2022.

The code is publicly available at https://github.com/kaloeffler/EmbedTrack.

----

Publication:
K. Löffler and R. Mikut (2022). EmbedTrack -- Simultaneous Cell Segmentation and Tracking Through Learning Offsets and Clustering Bandwidths. arXiv preprint. DOI: [10.48550/arXiv.2204.10713](https://doi.org/10.48550/arXiv.2204.10713)

----


## 1) Setting up the environment

This section creates the environment, clones the code and adds some utilities for downloading the CTC data. Everything (data, code, trained models) will be stored in your personal Google Drive ('/content/drive/MyDrive') in a folder named "EmbedTrack", so you can access it later.

In [None]:
# Mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd drive/MyDrive/
!git clone https://github.com/kaloeffler/EmbedTrack.git


In [None]:
# Print the conda version to see whether conda is already available in this runtime
!conda --version

In [None]:
# Install the condacolab package (quietly) and run its installer.
# NOTE(review): condacolab is expected to restart the kernel once it has
# finished - confirm against the condacolab documentation.
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
# Create the conda environment described by the environment.yml of the cloned repository
!conda env create -f /content/drive/MyDrive/EmbedTrack/environment.yml

In [None]:
%%shell
# Activate the environment "venv_embedtrack" inside this shell cell and
# report which Python interpreter (and version) it provides.
eval "$(conda shell.bash hook)" # make the `conda` shell function (needed for `conda activate`) available in this shell
conda activate venv_embedtrack
which python
python --version


In [None]:
# Install imagecodecs without pulling in its dependencies and pin cffi to version 1.15.0
!pip install imagecodecs --no-dependencies
!pip install cffi=="1.15.0"

In [None]:
# Restart the runtime by shutting the kernel down with restart=True
# (presumably so that the packages installed above are picked up - verify).
get_ipython().kernel.do_shutdown(True)

**Check that CUDA is available - if it is not, go to "Runtime" -> "Change runtime type" in Colab and change the runtime from "None" to "GPU"**



In [None]:
# Prints True if PyTorch reports CUDA as available, False otherwise
import torch
print(torch.cuda.is_available())

**Utilities to facilitate downloading data from the Cell Tracking Challenge**

Please note: you need to run this cell before jumping to the training / inference sections!

In [None]:
import requests
import zipfile
import os

def retrieve_ctc_data(url, save_dir):
    """Download a zip archive from ``url`` and unpack it into ``save_dir``.

    The archive is streamed to ``save_dir`` (file name = last component of
    ``url``), extracted in place and deleted afterwards.

    Args:
        url: address of the zip archive to download.
        save_dir: folder that receives the extracted content; it is created
            if it does not exist yet.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        zipfile.BadZipFile: if the downloaded file is not a valid zip archive.
    """
    # open() below does not create missing parent folders
    os.makedirs(save_dir, exist_ok=True)
    zip_file = os.path.join(save_dir, url.split("/")[-1])
    try:
        with requests.get(url, stream=True) as req:
            req.raise_for_status()
            with open(zip_file, "wb") as file:
                for chunk in req.iter_content(chunk_size=8192):
                    file.write(chunk)
        print(f"Unzip data set {os.path.basename(zip_file)}")
        with zipfile.ZipFile(zip_file) as z:
            z.extractall(save_dir)
    finally:
        # Always remove the archive - also after a failed download or
        # extraction - so no partial zip file is left behind in save_dir.
        if os.path.exists(zip_file):
            os.remove(zip_file)
      

## 2.) Training and Inference

### 2.1.) Select a data set to do training / inference on

EmbedTrack was trained and tested on the following 2D data sets. They all provide an additional Silver Truth (ST), which is processed together with the Gold Truth (GT) annotations to obtain fully labelled cell segmentation masks of reasonable annotation quality:
"Fluo-N2DH-SIM+",
  "Fluo-C2DL-MSC",
    "Fluo-N2DH-GOWT1",
    "PhC-C2DL-PSC",
    "BF-C2DL-HSC",
    "Fluo-N2DL-HeLa",
    "BF-C2DL-MuSC",
    "DIC-C2DH-HeLa", and
    "PhC-C2DH-U373".

In [None]:
# Name of the data set used by the download, training and inference cells below.
# Possible data sets:

#[    "Fluo-N2DH-SIM+",
#    "Fluo-C2DL-MSC",
#    "Fluo-N2DH-GOWT1",
#    "PhC-C2DL-PSC",
#    "BF-C2DL-HSC",
#    "Fluo-N2DL-HeLa",
#    "BF-C2DL-MuSC",
#    "DIC-C2DH-HeLa",
#    "PhC-C2DH-U373",
#]

data_set = "Fluo-N2DH-SIM+"


### 2.2.) Download the selected data set from the Cell Tracking Challenge

In [None]:
# Change into the EmbedTrack folder that has been created in your Drive and list
# its contents (the next cell derives its paths from the current working directory)
%cd /content/drive/MyDrive/EmbedTrack/
!ls

In [None]:
import os
from pathlib import Path

ctc_data_url = "http://data.celltrackingchallenge.net"
ctc_metrics_url = "http://public.celltrackingchallenge.net/software/EvaluationSoftware.zip"

training_data_url = os.path.join(ctc_data_url, "training-datasets/")
challenge_data_url = os.path.join(ctc_data_url, "challenge-datasets/")

current_path = Path.cwd()
data_path = current_path / 'ctc_raw_data'
ctc_metrics_path = os.path.join(current_path, "embedtrack", "ctc_metrics", "CTC_eval")

# Download training data set
if not os.path.exists(data_path / "train" / data_set):
  dp = os.path.join(data_path, "train", data_set)
  print(f"Downloading training data set to {dp} ...")
  data_url = training_data_url + data_set + ".zip"
  retrieve_ctc_data(data_url, os.path.join(data_path, "train"))

# Download challenge data set
if not os.path.exists(data_path / "challenge" / data_set):
  dp = os.path.join(data_path, "challenge", data_set)
  print(f"Downloading challenge data set to {dp} ...")
  data_url = challenge_data_url + data_set + ".zip"
  retrieve_ctc_data(data_url, os.path.join(data_path, "challenge"))

# Download evaluation software
if len(os.listdir(ctc_metrics_path)) <= 1:
  print(f"Downloading  ctc metrics to {ctc_metrics_path} ...")
  retrieve_ctc_data(ctc_metrics_url, ctc_metrics_path)
 
# make CTC metrics executable
!chmod -R 755 $ctc_metrics_path

### 2.3.) Train a model for the selected data set

In [None]:
# Change into the EmbedTrack folder that has been created in your Drive and list
# its contents (the training cell derives its paths from the current working directory)
%cd /content/drive/MyDrive/EmbedTrack/
!ls

In [None]:
import matplotlib
import matplotlib.pyplot as plt

# Select the non-interactive "Agg" backend before the training code is imported
matplotlib.use("Agg")
from embedtrack.train.run_training_pipeline import (
    DataConfig,
    ModelConfig,
    TrainConfig,
    run_pipeline,
)
import os
from pathlib import Path

# ---- data configuration ----
# All folders are resolved relative to the current working directory.
PROJECT_PATH = Path.cwd()
RAW_DATA_PATH = os.path.join(PROJECT_PATH, "ctc_raw_data/train")
DATA_PATH_DEST = os.path.join(PROJECT_PATH, "data")
MODEL_PATH = os.path.join(PROJECT_PATH, "models")

USE_SILVER_TRUTH = True
TRAIN_VAL_SEQUNCES = ["01", "02"]
TRAIN_VAL_SPLIT = 0.1

N_EPOCHS = 15
# Naming scheme: Adam optimizer; normalize images; OneCycle LR scheduler; N epochs
MODEL_NAME = f"adam_norm_onecycle_{N_EPOCHS}"

# "Fluo-N2DH-SIM+" is always processed without the silver truth
use_silver_truth = False if data_set == "Fluo-N2DH-SIM+" else USE_SILVER_TRUTH

data_config = DataConfig(
    RAW_DATA_PATH,
    data_set,
    DATA_PATH_DEST,
    use_silver_truth=use_silver_truth,
    train_val_sequences=TRAIN_VAL_SEQUNCES,
    train_val_split=TRAIN_VAL_SPLIT,
)

# ---- training configuration ----
MODEL_SAVE_DIR = os.path.join(MODEL_PATH, data_set, MODEL_NAME)

# "Fluo-C2DL-MSC" uses larger crops, smaller batches and a shorter display
# interval than all other data sets
if data_set == "Fluo-C2DL-MSC":
    CROP_SIZE, TRAIN_BATCH_SIZE, VAL_BATCH_SIZE, DISPLAY_IT = 512, 8, 8, 200
else:
    CROP_SIZE, TRAIN_BATCH_SIZE, VAL_BATCH_SIZE, DISPLAY_IT = 256, 16, 16, 1000

CENTER = "medoid"
RESUME_TRAINING = False
# None: use the full train data set; otherwise still train on the full data
# set but only use a fraction of the data per epoch
TRAIN_SIZE = None
# None: use the full val data set; otherwise still validate on the full data
# set but only use a fraction of the data per epoch
VAL_SIZE = None
VIRTUAL_TRAIN_BATCH_MULTIPLIER = 1
VIRTUAL_VAL_BATCH_MULTIPLIER = 1
DISPLAY = False

train_settings = dict(
    crop_size=CROP_SIZE,
    center=CENTER,
    resume_training=RESUME_TRAINING,
    train_size=TRAIN_SIZE,
    train_batch_size=TRAIN_BATCH_SIZE,
    virtual_train_batch_multiplier=VIRTUAL_TRAIN_BATCH_MULTIPLIER,
    val_size=VAL_SIZE,
    val_batch_size=VAL_BATCH_SIZE,
    virtual_val_batch_multiplier=VIRTUAL_VAL_BATCH_MULTIPLIER,
    n_epochs=N_EPOCHS,
    display=DISPLAY,
    display_it=DISPLAY_IT,
)
train_config = TrainConfig(MODEL_SAVE_DIR, **train_settings)

# ---- model configuration ----
INPUT_CHANNELS = 1
N_SEG_CLASSES = [4, 1]
N_TRACK_CLASSES = 2

model_config = ModelConfig(INPUT_CHANNELS, N_SEG_CLASSES, N_TRACK_CLASSES)

# Run the training pipeline, then close all matplotlib figures
run_pipeline(data_config, train_config, model_config)
plt.close("all")


### 2.4.) Inference using the just trained model

In [None]:
# Change into the EmbedTrack folder that has been created in your Drive and list
# its contents (the inference cells derive their paths from the current working directory)
%cd /content/drive/MyDrive/EmbedTrack/
!ls

In [None]:
# make CTC metrics executable
current_path = Path.cwd()
ctc_metrics_path = os.path.join(current_path, "embedtrack", "ctc_metrics", "CTC_eval")
!chmod -R 755 $ctc_metrics_path

In [None]:
import os
from datetime import datetime
from pathlib import Path
from time import time
import shutil
from embedtrack.ctc_metrics.eval_ctc import calc_ctc_scores
from embedtrack.infer.infer_ctc_data import inference

PROJECT_PATH = Path.cwd()

RAW_DATA_PATHS = [os.path.join(PROJECT_PATH, "ctc_raw_data/challenge"),
                  os.path.join(PROJECT_PATH, "ctc_raw_data/train")]
MODEL_PATH = os.path.join(PROJECT_PATH, "models")
RES_PATH = os.path.join(PROJECT_PATH, "results")

# Adam optimizer; normalize images; OneCycle LR scheduler; N epochs
MODEL_NAME = "adam_norm_onecycle_15"
BATCH_SIZE = 32
# naming scheme of the per-run sub folders inside the model folder
TIMESTAMP_FORMAT = "%Y-%m-%d---%H-%M-%S"


def _latest_training_run(model_dir):
    """Return the name of the newest time-stamped sub folder of ``model_dir``.

    Entries whose name does not match TIMESTAMP_FORMAT (e.g. hidden folders)
    are ignored. Returns None if ``model_dir`` does not exist or contains no
    time-stamped entry.
    """
    if not os.path.isdir(model_dir):
        return None
    runs = []
    for entry in os.listdir(model_dir):
        try:
            runs.append((datetime.strptime(entry, TIMESTAMP_FORMAT), entry))
        except ValueError:
            # not a training run folder
            continue
    if not runs:
        return None
    return max(runs)[1]


# The model does not depend on the sequence, so look it up once
model_dir = os.path.join(MODEL_PATH, data_set, MODEL_NAME)
last_model = _latest_training_run(model_dir)

if last_model is None:
    print(f"no trained model for data set {data_set}")
else:
    model_path = os.path.join(model_dir, last_model, "best_iou_model.pth")
    config_file = os.path.join(model_dir, last_model, "config.json")

    for raw_data_path in RAW_DATA_PATHS:
        subset = os.path.basename(raw_data_path)  # "challenge" or "train"
        for data_id in ["01", "02"]:
            img_path = os.path.join(raw_data_path, data_set, data_id)

            t_start = time()
            inference(img_path, model_path, config_file, batch_size=BATCH_SIZE)
            t_end = time()

            run_time = t_end - t_start
            print(f"Image sequence: {img_path}")
            print(f"Inference Time {img_path}: {run_time}s")

            # move the results from "<sequence>_RES" next to the raw data into the results folder
            res_path = os.path.join(RES_PATH, data_set, MODEL_NAME, last_model, subset, data_id + "_RES")
            os.makedirs(os.path.dirname(res_path), exist_ok=True)
            if os.path.exists(res_path):
                # shutil.move would place the new results *inside* an existing
                # folder (".../01_RES/01_RES"), so drop results of a previous run first
                shutil.rmtree(res_path)
            shutil.move(img_path + "_RES", res_path)

            # scores are only computed for the training sequences,
            # using the "<sequence>_GT" folder next to the raw data
            if subset == "train":
                metrics = calc_ctc_scores(Path(res_path), Path(img_path + "_GT"))
                print(metrics)



## 3.) Inference using the models submitted to the CTC

Download the trained models submitted to the CTC and use them for inference.

In [None]:
# Select the data set to do inference on with the models submitted to the CTC.
# Possible data sets:
#[    "Fluo-N2DH-SIM+",
#    "Fluo-C2DL-MSC",
#    "Fluo-N2DH-GOWT1",
#    "PhC-C2DL-PSC",
#    "BF-C2DL-HSC",
#    "Fluo-N2DL-HeLa",
#    "BF-C2DL-MuSC",
#    "DIC-C2DH-HeLa",
#    "PhC-C2DH-U373",
#]

data_set = "Fluo-N2DH-SIM+"

In [None]:
# Change into the EmbedTrack folder that has been created in your Drive and list
# its contents (the download cells below derive their paths from the current working directory)
%cd /content/drive/MyDrive/EmbedTrack/
!ls

In [None]:
import os
from pathlib import Path

# Archive with the trained models and executables submitted to the CTC;
# it is unpacked into the current working directory.
executables_url = "http://public.celltrackingchallenge.net/participants/KIT-Loe-GE.zip"
executables_path = Path.cwd()

# Skip the download when the "KIT-Loe-GE" folder is already there
submission_dir = os.path.join(executables_path, "KIT-Loe-GE")
if not os.path.exists(submission_dir):
    dp = submission_dir
    print(f"Downloading trained models and excetuables of KIT-Loe-GE to {dp} ...")
    retrieve_ctc_data(executables_url, executables_path)

In [None]:
import os
from pathlib import Path

# Servers of the Cell Tracking Challenge (CTC)
ctc_data_url = "http://data.celltrackingchallenge.net"
ctc_metrics_url = "http://public.celltrackingchallenge.net/software/EvaluationSoftware.zip"

# URLs always use "/", so build them with string formatting rather than
# os.path.join, which inserts the platform-specific path separator.
training_data_url = f"{ctc_data_url}/training-datasets/"
challenge_data_url = f"{ctc_data_url}/challenge-datasets/"

current_path = Path.cwd()
data_path = current_path / 'ctc_raw_data'
ctc_metrics_path = os.path.join(current_path, "embedtrack", "ctc_metrics", "CTC_eval")

# Download the training and the challenge data set unless they are already present
for subset, label, base_url in (
    ("train", "training", training_data_url),
    ("challenge", "challenge", challenge_data_url),
):
    if os.path.exists(data_path / subset / data_set):
        continue
    dp = os.path.join(data_path, subset, data_set)
    print(f"Downloading {label} data set to {dp} ...")
    data_url = base_url + data_set + ".zip"
    subset_dir = os.path.join(data_path, subset)
    # the archive is written into this folder before extraction, so it must exist
    os.makedirs(subset_dir, exist_ok=True)
    retrieve_ctc_data(data_url, subset_dir)

# Download evaluation software.
# At most one entry in the folder is treated as "not downloaded yet"
# (presumably a placeholder file shipped with the repository - verify).
os.makedirs(ctc_metrics_path, exist_ok=True)  # os.listdir raises on a missing folder
if len(os.listdir(ctc_metrics_path)) <= 1:
    print(f"Downloading  ctc metrics to {ctc_metrics_path} ...")
    retrieve_ctc_data(ctc_metrics_url, ctc_metrics_path)

In [None]:
# make CTC metrics executable
current_path = Path.cwd()
ctc_metrics_path = os.path.join(current_path, "embedtrack", "ctc_metrics", "CTC_eval")
!chmod -R 755 $ctc_metrics_path

In [None]:
import os

from pathlib import Path
from time import time
import shutil
from embedtrack.ctc_metrics.eval_ctc import calc_ctc_scores
from embedtrack.infer.infer_ctc_data import inference

PROJECT_PATH = "/content/drive/MyDrive/EmbedTrack/"

RAW_DATA_PATHS = [os.path.join(PROJECT_PATH, "ctc_raw_data/challenge"),
                  os.path.join(PROJECT_PATH, "ctc_raw_data/train")]
# models shipped with the downloaded "KIT-Loe-GE" submission
MODEL_PATH = os.path.join(PROJECT_PATH, "KIT-Loe-GE", "models")
RES_PATH = os.path.join(PROJECT_PATH, "results")

BATCH_SIZE = 32

# The model does not depend on the sequence, so look it up once
model_dir = os.path.join(MODEL_PATH, data_set)
if not os.path.exists(model_dir):
    print(f"no trained model for data set {data_set}")
else:
    model_path = os.path.join(model_dir, "best_iou_model.pth")
    config_file = os.path.join(model_dir, "config.json")

    for raw_data_path in RAW_DATA_PATHS:
        subset = os.path.basename(raw_data_path)  # "challenge" or "train"
        for data_id in ["01", "02"]:
            img_path = os.path.join(raw_data_path, data_set, data_id)

            t_start = time()
            inference(img_path, model_path, config_file, batch_size=BATCH_SIZE)
            t_end = time()

            run_time = t_end - t_start
            print(f"Image sequence: {img_path}")
            print(f"Inference Time {img_path}: {run_time}s")

            # move the results from "<sequence>_RES" next to the raw data into the results folder
            res_path = os.path.join(RES_PATH, data_set, "KIT-Loe-GE", subset, data_id + "_RES")
            os.makedirs(os.path.dirname(res_path), exist_ok=True)
            if os.path.exists(res_path):
                # shutil.move would place the new results *inside* an existing
                # folder (".../01_RES/01_RES"), so drop results of a previous run first
                shutil.rmtree(res_path)
            shutil.move(img_path + "_RES", res_path)

            # scores are only computed for the training sequences,
            # using the "<sequence>_GT" folder next to the raw data
            if subset == "train":
                metrics = calc_ctc_scores(Path(res_path), Path(img_path + "_GT"))
                print(metrics)

