In [0]:

## A Fully Annotated Dataset for Nuclei Instance Segmentation in H&E-Stained Images
# https://www.kaggle.com/datasets/ipateam/nuinsseg/data
# https://github.com/masih4/NuInsSeg/tree/main

# https://learnopencv.com/yolov9-instance-segmentation-on-medical-dataset/
# https://github.com/spmallick/learnopencv/blob/master/YOLOv9-Instance-Segmentation-on-Medical-Dataset/Yolov9e_1024.ipynb ***

# https://gist.github.com/Praneet9/5c182383466308b5bbb8cceac7b3b95c

In [0]:
# !pip install -q ultralytics==8.1.45 pycocotools scikit-learn matplotlib kaggle
!pip install -q ultralytics pycocotools scikit-learn matplotlib kaggle

In [0]:
# Restart the Python process (presumably so the packages installed in the
# previous cell are importable — TODO confirm this is still required).
# dbutils.library.restartPython()
%restart_python

In [0]:
import os
import copy
import random
import json
import yaml
import glob
import cv2
import numpy as np
import time
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import requests
from   zipfile import ZipFile
import argparse
from PIL import Image
import PIL.Image
import shutil
from IPython.display import Image
from sklearn.model_selection import train_test_split

import torch
import torch.utils.data
from torch import nn
import torchvision
from ultralytics import YOLO

from torchvision import transforms as T


from pycocotools import mask as coco_mask
from pycocotools.coco import COCO

In [0]:
## DO 1x
# Download the dataset
# !kaggle datasets download -d ipateam/nuinsseg -p /Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/

In [0]:
# Define the paths
# Path of the downloaded dataset archive inside the Unity Catalog volume.
zip_file_path = '/Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/nuinsseg.zip'
# Volume directory that the archive is extracted into.
uc_vol_path = '/Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/'

## DO 1x
# Unzip the dataset to the Unity Catalog volume using bash ## took 6hrs!
# dbutils.fs.mkdirs(uc_vol_path)  # Ensure the target directory exists
# !unzip -o {zip_file_path} -d {uc_vol_path}

--------------------------

In [0]:
# Root of the extracted dataset and the sub-folders inspected for each entry.
base_dir = '/Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/'
directories2use = ['tissue images', 'mask binary without border', 'label masks modify']

# For every directory directly under base_dir, display the listing of each
# folder named in directories2use. Plain files under base_dir are skipped.
# NOTE(review): every sub-directory is assumed to contain all three folders —
# confirm, since dbutils.fs.ls is called on each path unconditionally.
for subdirectory in dbutils.fs.ls(base_dir):
    if not subdirectory.isDir():
        continue
    for directory in directories2use:
        full_path = f"{subdirectory.path}{directory}/"
        print(f"Contents of {full_path}:")
        display(dbutils.fs.ls(full_path))

--------------------------

In [0]:
def get_image_mask_pairs(data_dir):
    """Collect parallel lists of tissue-image paths and their label-mask paths.

    Walks ``data_dir`` and, for every ``.png`` file located in a directory whose
    path contains ``'tissue images'``, derives the mask path by replacing that
    path segment with ``'label masks modify'`` and the ``.png`` extension with
    ``.tif``. Whether the mask file actually exists is not checked here.

    Args:
        data_dir: Root directory to search recursively.

    Returns:
        tuple[list[str], list[str]]: ``(image_paths, mask_paths)`` where
        ``mask_paths[i]`` belongs to ``image_paths[i]``. The pairs are sorted by
        image path so the result does not depend on the directory traversal
        order; this keeps a seeded train/val split reproducible across runs.
    """
    png_ext = '.png'
    pairs = []

    for root, _, files in os.walk(data_dir):
        if 'tissue images' not in root:
            continue
        mask_root = root.replace('tissue images', 'label masks modify')
        for file in files:
            if not file.endswith(png_ext):
                continue
            # Swap only the trailing extension; str.replace would also rewrite
            # any earlier '.png' occurrence inside the file name.
            mask_name = file[:-len(png_ext)] + '.tif'
            pairs.append((os.path.join(root, file), os.path.join(mask_root, mask_name)))

    pairs.sort()
    image_paths = [image_path for image_path, _ in pairs]
    mask_paths = [mask_path for _, mask_path in pairs]
    return image_paths, mask_paths

In [0]:
def mask_to_polygons(mask,epsilon=1.0):
    """Convert a binary mask into a list of flattened contour polygons.

    Contours are extracted with ``cv2.RETR_TREE`` and
    ``cv2.CHAIN_APPROX_SIMPLE``. Each contour with more than two points is
    flattened to ``[x0, y0, x1, y1, ...]``; only flattened lists longer than
    four values are returned.

    Args:
        mask: Array passed straight to ``cv2.findContours``.
        epsilon: Accepted for interface compatibility; not used by this
            implementation.

    Returns:
        list[list[int]]: One flat coordinate list per retained contour.
    """
    contours, _hierarchy = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)  # accounts for occlusions
    flattened = (contour.reshape(-1).tolist() for contour in contours if len(contour) > 2)
    # Keep only coordinate lists long enough to describe a valid polygon.
    return [coords for coords in flattened if len(coords) > 4]

In [0]:
def process_data(image_paths, mask_paths, output_images_dir, output_labels_dir):
    """Copy images and write YOLO segmentation label files for one data split.

    For every (image, mask) pair the image is copied into ``output_images_dir``
    and each non-zero value of the label mask is treated as one object. The
    contours of each object are written as one line of
    ``<class> x0 y0 x1 y1 ...`` with coordinates normalised by image width and
    height (6 decimals) into ``<image stem>.txt`` under ``output_labels_dir``.
    The class index is always ``0`` (single category: Nuclei). Images that
    yield no polygon get no label file.

    Pairs whose image or mask cannot be read by OpenCV are reported and skipped
    instead of aborting the whole run.

    Args:
        image_paths: Paths of the source images.
        mask_paths: Paths of the label masks, parallel to ``image_paths``.
        output_images_dir: Existing directory that receives the image copies.
        output_labels_dir: Existing directory that receives the ``.txt`` labels.

    Returns:
        None. The function only writes files.
    """
    class_index = 0  # Only one category: Nuclei

    for img_path, mask_path in zip(image_paths, mask_paths):
        img = cv2.imread(img_path)
        mask = cv2.imread(mask_path, cv2.IMREAD_UNCHANGED)
        # cv2.imread signals failure by returning None rather than raising.
        if img is None or mask is None:
            print(f"Skipping {img_path}: could not read image or mask ({mask_path})")
            continue

        file_name = os.path.basename(img_path)
        shutil.copy(img_path, os.path.join(output_images_dir, file_name))
        img_h, img_w = img.shape[0], img.shape[1]

        # NOTE(review): assumes the mask is a single-channel label image in
        # which 0 is background — confirm against the dataset.
        label_lines = []
        for value in np.unique(mask):
            if value == 0:  # Ignore background
                continue

            object_mask = (mask == value).astype(np.uint8) * 255
            for poly in mask_to_polygons(object_mask):
                # Even indices are x (divide by width), odd indices are y.
                normalized_polygon = [
                    format(coord / img_w if i % 2 == 0 else coord / img_h, '.6f')
                    for i, coord in enumerate(poly)
                ]
                label_lines.append(f"{class_index} " + " ".join(normalized_polygon) + "\n")

        # Labels are written per image as they are produced, so no global
        # annotation list has to be re-scanned for every image.
        if label_lines:
            label_path = os.path.join(output_labels_dir, os.path.splitext(file_name)[0] + '.txt')
            with open(label_path, 'w') as file_object:
                file_object.writelines(label_lines)

--------------------------

In [0]:
def yolo_dataset_preparation():
    """Build the train/val YOLO dataset folders from the NuInsSeg volume.

    Creates ``<yolo_dataset>/{train,val}/{images,labels}``, collects the
    image/mask pairs under the project directory, splits them 80/20 with a
    fixed random state and converts each split with ``process_data``.
    The dataset YAML is not written here; ``create_yaml`` is called separately.
    """
    proj_dir = '/Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/'
    yolo_data_dir = '/Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/yolo_dataset/'

    split_names = ('train', 'val')

    # Output folders, keyed by split and then by content kind.
    split_dirs = {
        split: {
            kind: os.path.join(yolo_data_dir, split, kind)
            for kind in ('images', 'labels')
        }
        for split in split_names
    }
    for kind in ('images', 'labels'):
        for split in split_names:
            os.makedirs(split_dirs[split][kind], exist_ok=True)

    # Gather the source pairs and split them into train and val.
    image_paths, mask_paths = get_image_mask_pairs(proj_dir)
    train_img_paths, val_img_paths, train_mask_paths, val_mask_paths = train_test_split(
        image_paths, mask_paths, test_size=0.2, random_state=42
    )
    split_inputs = {
        'train': (train_img_paths, train_mask_paths),
        'val': (val_img_paths, val_mask_paths),
    }

    # Convert and save each split in YOLO format (train first, then val).
    for split in split_names:
        img_paths, msk_paths = split_inputs[split]
        process_data(img_paths, msk_paths, split_dirs[split]['images'], split_dirs[split]['labels'])


In [0]:
def create_yaml(output_yaml_path, train_images_dir, val_images_dir, nc=1, names=None):
    """Write a YOLO dataset YAML with ``names``, ``nc``, ``train`` and ``val`` keys.

    Args:
        output_yaml_path: Destination path of the YAML file.
        train_images_dir: Value stored under ``train``.
        val_images_dir: Value stored under ``val``.
        nc: Number of classes. Defaults to 1.
        names: Optional sequence of class names. Defaults to ``["Nuclei"]``,
            the single class used by this notebook.

    Raises:
        ValueError: If the number of class names does not equal ``nc``. The
            check runs before the file is opened, so nothing is written.
    """
    names = ["Nuclei"] if names is None else list(names)

    # names and nc describe the same thing; refuse to write a YAML in which
    # they disagree instead of producing an inconsistent dataset config.
    if len(names) != nc:
        raise ValueError(
            f"nc={nc} does not match the number of class names ({len(names)}): {names}"
        )

    # No 'test' key is written: there is no test directory, and the original
    # code's `' '` value for it was causing errors.
    yaml_data = {
        'names': names,
        'nc': nc,  # Number of classes
        'train': train_images_dir,
        'val': val_images_dir,
    }

    # Write the dictionary to a YAML file
    with open(output_yaml_path, 'w') as file:
        yaml.dump(yaml_data, file, default_flow_style=False)

In [0]:
import os
import yaml

# Candidate locations: the Unity Catalog volume and the user's workspace folder.
Vols_proj_dir = '/Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/'
yolo_data_dir = '/Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/yolo_dataset/'
WS_proj_dir = '/Workspace/Users/may.merkletan@databricks.com/db_CV/NuInsSeg/'

yaml_data_dir = ''
ws_data_dir = 'datasets' #v9e_1024

# Toggle: True writes the YAML into the volume with absolute image paths,
# False writes it into the workspace with relative 'train/images' / 'val/images'.
# useVols = True
useVols = False

# (directory receiving data.yaml, prefix of the image folders written into it)
yaml_root, images_root = (yolo_data_dir, yolo_data_dir) if useVols else (WS_proj_dir, yaml_data_dir)

output_yaml_path = os.path.join(yaml_root, 'data.yaml')
train_path = os.path.join(images_root, 'train', 'images')
val_path = os.path.join(images_root, 'val', 'images')
create_yaml(output_yaml_path, train_path, val_path)



In [0]:
# Print the dataset YAML stored in the Unity Catalog volume.
# NOTE(review): the config cell only writes this file when useVols is True —
# confirm it exists before relying on this output.
!cat /Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/yolo_dataset/data.yaml

In [0]:
# !cat /Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/data.yaml
# Print the dataset YAML stored in the workspace project folder.
!cat /Workspace/Users/may.merkletan@databricks.com/db_CV/NuInsSeg/data.yaml

In [0]:
## Do Once
# Builds the train/val YOLO folders in the volume; re-running repeats the full
# copy and conversion.
yolo_dataset_preparation() # 35-40mins!

--------------------------

In [0]:
# def set_seeds():
#     # fix random seeds
#     SEED_VALUE = 42

#     random.seed(SEED_VALUE)
#     np.random.seed(SEED_VALUE)
#     torch.manual_seed(SEED_VALUE)
    
#     if torch.cuda.is_available():
#         torch.cuda.manual_seed(SEED_VALUE)
#         torch.cuda.manual_seed_all(SEED_VALUE)
#         torch.backends.cudnn.deterministic = True
#         torch.backends.cudnn.benchmark = True
            
# set_seeds()

In [0]:
# Import the YOLO class from the ultralytics library
from ultralytics import YOLO

# Instance
# Load the YOLOv9e segmentation checkpoint as the starting point for training.
model = YOLO("yolov9e-seg.pt") # Transfer the weights from a pretrained model (recommended for training)

In [0]:
# model

In [0]:
# Read the class count ('nc') from the volume's dataset YAML and keep it as a string.
with open("/Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/yolo_dataset/data.yaml", 'r') as stream:
    dataset_cfg = yaml.safe_load(stream)
num_classes = str(dataset_cfg['nc'])

In [0]:
# Display the class count read from data.yaml (stored as a string).
num_classes

In [0]:
# dist.destroy_process_group()

In [0]:
# import os
# import subprocess

# # Kill any existing TensorBoard processes
# subprocess.run(["pkill", "-f", "tensorboard"])

# # Clear the TensorBoard logs
# tensorboard_log_dir = "/dbfs/tmp/tensorboard_logs"
# if os.path.exists(tensorboard_log_dir):
#     subprocess.run(["rm", "-rf", tensorboard_log_dir])

# # Create the log directory
# os.makedirs(tensorboard_log_dir, exist_ok=True)

# # Start TensorBoard
# subprocess.Popen(["tensorboard", "--logdir", tensorboard_log_dir, "--host", "0.0.0.0", "--port", "6006"])

In [0]:
# Record and display the notebook's current working directory; relative paths
# used later (e.g. "data.yaml", "runs/...") resolve against it.
ABS_wsPATH = os.getcwd()
ABS_wsPATH

In [0]:
import torch.distributed as dist
import mlflow
import os

# Make sure a torch.distributed process group exists before training.
if not dist.is_initialized():
    dist.init_process_group(backend='nccl')  # required for cuda

# Training settings for the run driven by the dataset YAML in the volume.
# Automatic Mixed Precision (AMP)
train_args = dict(
    data=os.path.join("/Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/yolo_dataset/", "data.yaml"),
    epochs=70,
    patience=0,  # setting patience=0 to disable early stopping
    batch=3,
    imgsz=1024,
)

try:
    # Train the model
    results = model.train(**train_args)
finally:
    # Tear the process group down whether or not training succeeded.
    dist.destroy_process_group()

In [0]:
## It's possible that reading data from Vols would work and only the logging needs to be done on local/workspace paths -- TBD

--------------------------

In [0]:
## If ultralytics requires local paths, this could be challenging with respect to file sizes. Need to check (with Engineering?) why the Vols path is not accessible when WS paths work. It would be odd if UC Volumes were this restrictive; otherwise, do we need to mount an external storage location instead?


# Some Obs:
# Vols path seems inaccessible for logging and results -- this could be problematic for larger data -- maybe dbfs/mnt + external location is needed
# git repo paths not ideal for mlflow tracking/logs
# ultralytics seem to have some preferred default workspace paths...



In [0]:
# def set_seeds():
#     # fix random seeds
#     SEED_VALUE = 42

#     random.seed(SEED_VALUE)
#     np.random.seed(SEED_VALUE)
#     torch.manual_seed(SEED_VALUE)
    
#     if torch.cuda.is_available():
#         torch.cuda.manual_seed(SEED_VALUE)
#         torch.cuda.manual_seed_all(SEED_VALUE)
#         torch.backends.cudnn.deterministic = True
#         torch.backends.cudnn.benchmark = True
            
# set_seeds()

In [0]:
# Import the YOLO class from the ultralytics library
from ultralytics import YOLO

# Re-create the model from the pretrained YOLOv9e segmentation checkpoint for the workspace-path run.
model = YOLO("yolov9e-seg.pt") # Transfer the weights from a pretrained model (recommended for training)

In [0]:
# !cp /Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/yolo_dataset/train/ -r /Workspace/Users/may.merkletan@databricks.com/db_CV/NuInsSeg/datasets/train/

# !cp /Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/yolo_dataset/val/ -r /Workspace/Users/may.merkletan@databricks.com/db_CV/NuInsSeg/datasets/val/

In [0]:
## output_dir = '/Volumes/mmt_mlops_demos/cv/data/Nuclei_Instance_Dataset/yolo_dataset/'

# project_dir = '/Workspace/Users/may.merkletan@databricks.com/db_CV/NuInsSeg/yolo_dataset/'

# output_yaml_path = os.path.join(project_dir, 'data.yaml')
# train_path = os.path.join(project_dir, 'train', 'images')
# val_path = os.path.join(project_dir, 'val', 'images')
# create_yaml(output_yaml_path, train_path, val_path)

In [0]:
# !cat /Workspace/Users/may.merkletan@databricks.com/db_CV/NuInsSeg/yolo_dataset/data.yaml

# Print the workspace dataset YAML that the next training cell passes as `data`.
!cat /Workspace/Users/may.merkletan@databricks.com/db_CV/NuInsSeg/data.yaml

In [0]:
# Import the YOLO class from the ultralytics library
from ultralytics import YOLO
# import torch
# from torchsummary import summary

# Instance
# Alternative kept for reference: build the architecture from a YAML definition.
# model = YOLO("yolov9e.seg.yaml") # build a model from YAML
model = YOLO("yolov9e-seg.pt") # Transfer the weights from a pretrained model (recommended for training)

In [0]:
import torch.distributed as dist

# Initialize the process group
if not dist.is_initialized():
    dist.init_process_group(backend='nccl')  ## required for cuda

# project = project.strip()

try:
    # Train the model using the dataset YAML stored in the workspace.
    results = model.train(
        data="/Workspace/Users/may.merkletan@databricks.com/db_CV/NuInsSeg/data.yaml",  # Ensure no trailing spaces
        # project=#project,
        # name=name,
        epochs=70,
        patience=0,  # setting patience=0 to disable early stopping
        batch=3,
        imgsz=1024
    )
finally:
    # Always release the process group, even when training raises; otherwise
    # it stays alive in the kernel after this cell finishes or fails.
    if dist.is_initialized():
        dist.destroy_process_group()

# Automatic Mixed Precision (AMP) 


# ERROR related to github issue -- maybe need a shared location for 'project' -- to try 
# Experiment with name '/Workspace/Users/may.merkletan@databricks.com/db_CV/NuInsSeg/yolo_dataset/results/' does not exist. Creating a new experiment.
# RestException: INVALID_PARAMETER_VALUE: MLflow experiment creation is not permitted in a Git folder (repo). Use the default experiment for a notebook in a Git folder (repo) or create an MLflow experiment in the workspace.

--------------------------

In [0]:
# mlflow tracking is same as project path?? : /Shared/Ultralytics ??

###### Transfer learning from WS `data.yaml + datasets path` seem to work with `default` mlflow folders 

In [0]:
import torch.distributed as dist
from ultralytics import YOLO

# Make sure a torch.distributed process group exists before training.
if not dist.is_initialized():
    dist.init_process_group(backend='nccl') ## required for cuda

# Training settings; "data.yaml" is resolved relative to the working directory.
train_kwargs = dict(
    data="data.yaml",
    epochs=70,
    patience=0,  # setting patience=0 to disable early stopping
    batch=3,
    imgsz=1024,
    # optimizers="adam", #"sgd",
)

try:
    model = YOLO("yolov8n-seg.pt")
    results = model.train(**train_kwargs)
finally:
    # Tear the process group down whether or not training succeeded.
    dist.destroy_process_group()

In [0]:
# Ultralytics 8.3.70 🚀 Python-3.11.0rc1 torch-2.3.1+cu121 CUDA:0 (NVIDIA A10-24Q, 24298MiB)
# YOLOv8n-seg summary (fused): 195 layers, 3,258,259 parameters, 0 gradients, 12.0 GFLOPs
#                  Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 23/23 [00:05<00:00,  4.33it/s]
#                    all        133       6958      0.841       0.77      0.855      0.503      0.831      0.765      0.844      0.462
# Speed: 0.5ms preprocess, 3.0ms inference, 0.0ms loss, 3.4ms postprocess per image
# Results saved to runs/segment/train2

In [0]:
# Load the TensorBoard notebook extension and open it on the logs of run `train2`.
%load_ext tensorboard
%tensorboard --logdir runs/segment/train2

In [0]:
# Display the results figure saved in the `train2` run directory.
# NOTE(review): `Image` is imported from both PIL and IPython.display at the
# top of the notebook; the later IPython import is the one in effect here.
Image("runs/segment/train2/results.png")

--------------------------

In [0]:
from ultralytics import YOLO
# Load the best checkpoint of run `train2` for inference.
inference_model = YOLO('runs/segment/train2/weights/best.pt')

In [0]:
# Run prediction on one validation image at a 0.7 confidence threshold and
# the 1024 image size used for training; save=True also writes the rendered result to disk.
inference_img_path = "datasets/val/images/human_spleen_07.png"
inference_result = inference_model.predict(inference_img_path,conf=0.7,save=True,imgsz=1024)

In [0]:
# Dump the full list of prediction results for inspection.
print(inference_result)

In [0]:
# Inspect the boxes of the first (only) result.
print("Boxes:   \n",inference_result[0].boxes)

In [0]:
# Inspect the masks of the first (only) result.
print("Masks:  \n",inference_result[0].masks)

In [0]:
# Render the prediction overlay for the first result.
inference_result_array = inference_result[0].plot()
plt.figure(figsize=(9,9))
# Results.plot() returns a BGR array while matplotlib expects RGB, so the
# channel axis is reversed for display to avoid swapped red/blue colours.
plt.imshow(inference_result_array[..., ::-1])
plt.show()

In [0]:
def get_outputs(image, model, threshold=0.5):
    """Run ``model.predict`` on ``image`` and return the detections above ``threshold``.

    The model is called with ``imgsz=1024`` and ``conf=threshold``; the first
    result's confidences are then filtered again with a strict ``> threshold``.

    Args:
        image: Input forwarded unchanged to ``model.predict``.
        model: Object exposing ``predict(image, imgsz=..., conf=...)``.
        threshold: Confidence threshold. Defaults to 0.5.

    Returns:
        tuple: ``(masks, boxes, labels)`` — the kept entries of ``masks.xy``,
        boxes as ``[(x1, y1), (x2, y2)]`` integer corner pairs, and class names
        looked up in the result's ``names``. All three are empty lists when
        nothing passes the threshold.
    """
    outputs = model.predict(image, imgsz=1024, conf=threshold)
    print("Outputs",outputs)

    prediction = outputs[0]
    scores = prediction.boxes.conf.detach().cpu().numpy()
    keep = [position for position, score in enumerate(scores) if score > threshold]
    print(f"Total detections: {len(scores)}, Passed threshold: {len(keep)}")

    # Nothing survived the threshold: return three empty lists.
    if len(keep) == 0:
        return [], [], []

    masks = [prediction.masks.xy[position] for position in keep]
    kept_xyxy = prediction.boxes.xyxy.detach().cpu().numpy()[keep]
    boxes = [
        [(int(corners[0]), int(corners[1])), (int(corners[2]), int(corners[3]))]
        for corners in kept_xyxy
    ]
    labels = [prediction.names[int(prediction.boxes.cls[position])] for position in keep]
    return masks, boxes, labels

In [0]:
def draw_segmentation_map(image, masks, boxes, labels):
    """Overlay instance polygons and bounding boxes on a copy of ``image``.

    The input is converted to a NumPy array and from RGB to BGR. For each
    (mask, box, label) triple a filled polygon is blended onto the canvas
    (canvas weight 1.0, overlay weight 0.5) and the box outline is drawn with a
    thickness of 2. ``labels`` only determines how many triples are processed;
    no text is drawn.

    Args:
        image: RGB image convertible with ``np.array``.
        masks: Per-instance polygon point arrays; ``None`` or empty entries
            skip the fill but the box is still drawn.
        boxes: Per-instance ``[(x1, y1), (x2, y2)]`` corner pairs.
        labels: Per-instance labels (unused apart from the zip length).

    Returns:
        numpy.ndarray: The annotated canvas in BGR channel order.
    """
    image_weight, overlay_weight, offset = 1.0, 0.5, 0
    fill_color = (0, 255, 0)  # Green fill for the instance polygons
    # (255, 0, 0) on the BGR canvas: blue when written with cv2, red only if
    # the array is displayed as if it were RGB.
    box_color = (255, 0, 0)

    canvas = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    for mask, box, _label in zip(masks, boxes, labels):
        overlay = np.zeros_like(canvas)

        has_polygon = mask is not None and len(mask) > 0
        if has_polygon:
            cv2.fillPoly(overlay, [np.array(mask, dtype=np.int32)], fill_color)

        # Blend in place so later instances are drawn over earlier ones.
        cv2.addWeighted(canvas, image_weight, overlay, overlay_weight, offset, canvas)
        cv2.rectangle(canvas, box[0], box[1], color=box_color, thickness=2)

    return canvas

In [0]:
# !pip install pillow

from ultralytics import YOLO
# Reload the best checkpoint of run `train2` (same weights as loaded earlier).
inference_model = YOLO('runs/segment/train2/weights/best.pt')

In [0]:
## TODO compare labels and inference ?

In [0]:
fig = plt.figure(figsize=(3, 3), layout="constrained")
image_path = "datasets/val/images/human_placenta_24.png"
# Force 3-channel RGB so the RGB->BGR conversion in draw_segmentation_map
# also works for palette, grayscale or RGBA files.
image = PIL.Image.open(image_path).convert("RGB")
orig_image = image.copy()  # Keep a copy of the original image for OpenCV functions and applying masks

masks, boxes, labels = get_outputs(image, inference_model, threshold=0.5)
result = draw_segmentation_map(orig_image, masks, boxes, labels)

# Create the output folder up front: cv2.imwrite does not create missing
# directories and only reports failure through its return value.
inference_dir = "./inference"
os.makedirs(inference_dir, exist_ok=True)
image_stem = os.path.splitext(os.path.basename(image_path))[0]
# save_path = f"./inference/nuclei_instance_out{image_stem}.jpg"
save_path = f"{inference_dir}/nuclei_instance_out{image_stem}.png"
if not cv2.imwrite(save_path, result):
    raise IOError(f"Could not write inference result to {save_path}")

# `result` is BGR (OpenCV order); convert to RGB for matplotlib so the
# displayed colours match the saved file.
plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))

plt.axis('off')
plt.show()

In [0]:
import os
import matplotlib.pyplot as plt

# List of image paths
inference_img_paths = [
    "datasets/val/images/human_bladder_03.png",
    "datasets/val/images/human_spleen_13.png",
    "datasets/val/images/human_cerebellum_6.png",
    "datasets/val/images/human_epiglottis_4.png",
    "datasets/val/images/human_jejunum_02.png",
    "datasets/val/images/human_kidney_01.png",
    "datasets/val/images/human_melanoma_03.png",
    "datasets/val/images/human_placenta_24.png",
]

inference_model = YOLO('runs/segment/train2/weights/best.pt')

# Number of images
N = len(inference_img_paths)

# Calculate the number of rows and columns for subplots (3 per row, rounded up)
rows = (N + 2) // 3
cols = 3

# Create subplots
fig, axes = plt.subplots(
    rows,
    cols,
    # figsize=(9 * cols, 9 * rows)
    figsize=(3 * cols, 3 * rows)
)

# Flatten the axes array
axes = axes.flatten()

# Iterate over image paths and plot results
for i, img_path in enumerate(inference_img_paths):
    inference_result = inference_model.predict(
        img_path,
        conf=0.7,
        visualize=False,
        save=True
    )
    inference_result_array = inference_result[0].plot()
    # Results.plot() returns BGR; reverse the channel axis so matplotlib
    # (which expects RGB) shows the true colours.
    axes[i].imshow(inference_result_array[..., ::-1])
    axes[i].set_title(os.path.basename(img_path))
    axes[i].axis('off')

# Hide any unused subplots; indexed from N so this does not depend on the
# loop variable leaking out of the loop above.
for j in range(N, len(axes)):
    axes[j].axis('off')

plt.show()