In [2]:
COLAB = False
# Set this depending on whether running in colab or not

In [3]:
if COLAB:
    from google.colab import drive

    drive.mount("/content/drive", force_remount=True)

    DATAPATH = Path("/content/drive/MyDrive/DataSets/data")
else:
    DATAPATH = "data"

In [4]:
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
from keras.models import Model
from keras.layers import (
    Input,
    Conv2D,
    MaxPooling2D,
    Conv2DTranspose,
    GaussianNoise,
    Lambda,
    Dropout,
    UpSampling2D,
)
from keras import backend as K
from keras.optimizer_v2 import adam
from keras.metrics import RootMeanSquaredError
from sklearn.model_selection import train_test_split
from pathlib import Path
from PIL import Image
from skimage import io
import math
from scipy.sparse import coo_matrix
import pickle

2021-12-27 13:05:00.767173: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/jamesholcombe/git/personal/cell-instance-segmentation/venv/lib/python3.8/site-packages/cv2/../../lib64:
2021-12-27 13:05:00.767236: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Data Preparation 

I am defining a few helper functions used as part of the data prep pipeline.

In [5]:
def rle_decode(
    mask_rle, shape=(520, 704), color=1
):  # function to convert tabular mask data to image
    """
    mask_rle: run-length as string formated (start length)
    shape: (height, width, channels) of array to return
    color: color for the mask
    Returns numpy array (mask)

    """
    s = mask_rle.split()

    starts = list(map(lambda x: int(x) - 1, s[0::2]))
    lengths = list(map(int, s[1::2]))
    ends = [x + y for x, y in zip(starts, lengths)]

    img = np.zeros((shape[0] * shape[1]), dtype=np.float32)

    for start, end in zip(starts, ends):
        img[start:end] = color

    return img.reshape(shape)

In [6]:
def get_centroid(arr):
    non_zero = np.where(arr == 1)
    x, y = non_zero
    return int(np.median(x)), int(np.median(y))


def make_mask(centroid: tuple):
    x, y = centroid
    a = np.zeros(
        (
            int(520 / 8),
            int(704 / 8),
        )
    )
    a[int(x / 8), int(y / 8)] = 1
    return a

### Custom Data Generator

As the dataset is large, I am implementing a custom data generator to load and prepare the data on the fly during model training. This ensures that RAM usage does not exceed hardware limitations.

In [7]:
from tensorflow.keras.utils import Sequence


class DataGenerator(Sequence):
    """Generates data for Keras
    Sequence based data generator. Suitable for building data generator for training and prediction.
    """

    def __init__(
        self,
        image_directory,
        data_path,
        to_fit=True,
        batch_size=32,
        dim=(520, 704),
    ):
        """Initialization"""
        self.data = pd.read_csv(data_path)
        self.sample_ids = self.data["id"].unique()
        self.image_indexes = dict(zip(range(len(self.sample_ids)), self.sample_ids))
        self.indexes = list(range(len(self.image_indexes)))
        self.to_fit = to_fit
        self.batch_size = batch_size
        self.dim = dim

        self.on_epoch_end()

    def __len__(self):
        """Denotes the number of batches per epoch
        :return: number of batches per epoch
        """
        return int(np.floor(len(self.sample_ids) / self.batch_size))

    def __getitem__(self, index):
        """Generate one batch of data
        :param index: index of the batch
        :return: X and y when fitting. X only when predicting
        """
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]

        # Generate data
        X = self._generate_X(indexes)

        if self.to_fit:
            y = self._generate_y(indexes)
            return X, y
        else:
            return X

    def _generate_X(self, indexes):
        """Generates data containing batch_size images
        :param list_IDs_temp: list of label ids to load
        :return: batch of images
        """
        # Initialization
        X = np.empty((self.batch_size, *self.dim, 3))

        for i, label in enumerate(indexes):
            image_id = self.image_indexes[label]
            X[i] = np.repeat(
                np.asarray(io.imread(DATAPATH / Path(f"train/{image_id}.png")))[
                    :, :, np.newaxis
                ],
                repeats=3,
                axis=2,
            )
        return X

    def _generate_y(self, indexes):
        """Generates data containing batch_size masks
        :param list_IDs_temp: list of label ids to load
        :return: batch if masks
        """
        y = np.empty((self.batch_size, *self.dim), dtype=int)

        image_ids = [self.image_indexes[i] for i in indexes]
        df = self.data[self.data["id"].isin(image_ids)]

        decoded = df["annotation"].apply(rle_decode)
        df["x_cent"], df["y_cent"] = zip(*decoded.apply(get_centroid))
        df["centroid"] = list(zip(df["x_cent"], df["y_cent"]))
        df["masks"] = df["centroid"].apply(make_mask)
        y = df.groupby("id")["masks"].sum()

        return np.stack(y)

Building the model

In [8]:
x_in = Input(shape=(520, 704, 3))  # input shape: (height, width, 3 bands of RGB)


x_temp = Conv2D(64, (3, 3), activation="relu", padding="same")(x_in)
x_temp = Dropout(0.25)(x_temp)
x_temp = Conv2D(64, (3, 3), activation="relu", padding="same")(x_temp)
x_temp = MaxPooling2D((2, 2))(x_temp)
x_temp = Conv2D(64, (3, 3), dilation_rate=(2, 2), activation="relu", padding="same")(
    x_temp
)  # dilated convolutions
x_temp = Dropout(0.25)(x_temp)
x_temp = Conv2D(64, (3, 3), dilation_rate=(2, 2), activation="relu", padding="same")(
    x_temp
)  # dilated convolutions
x_temp = MaxPooling2D((2, 2))(x_temp)
x_temp = Conv2D(64, (3, 3), dilation_rate=(2, 2), activation="relu", padding="same")(
    x_temp
)  # dilated convolutions
x_temp = Dropout(0.25)(x_temp)
x_temp = Conv2D(64, (3, 3), dilation_rate=(2, 2), activation="relu", padding="same")(
    x_temp
)  # dilated convolutions
x_temp = MaxPooling2D((2, 2))(x_temp)
x_temp = Conv2D(64, (3, 3), dilation_rate=(2, 2), activation="relu", padding="same")(
    x_temp
)  # dilated convolutions
x_temp = Dropout(0.25)(x_temp)
x_temp = Conv2D(64, (3, 3), dilation_rate=(2, 2), activation="relu", padding="same")(
    x_temp
)  # dilated convolutions

x_temp = Conv2D(64, (1, 1), activation="relu", padding="same")(x_temp)
x_temp = Conv2D(64, (1, 1), activation="relu", padding="same")(x_temp)
x_out = Conv2D(1, (1, 1), activation="relu", padding="same")(x_temp)

2021-12-27 13:05:04.192526: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/jamesholcombe/git/personal/cell-instance-segmentation/venv/lib/python3.8/site-packages/cv2/../../lib64:
2021-12-27 13:05:04.192564: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-27 13:05:04.192580: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (LDNLTCND1333CZ3): /proc/driver/nvidia/version does not exist
2021-12-27 13:05:04.192789: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropr

In [9]:
def customLoss(yTrue, yPred):
    return K.sqrt(
        K.sum(
            K.flatten(K.tf.multiply(K.square(yTrue - yPred), yTrue + 1) / (1 * 2 + 1))
        )
        / (64 * 64)
    )


model = Model(inputs=x_in, outputs=x_out)
model.compile(loss=customLoss, optimizer=adam.Adam())  # setting loss and optimizer
model.summary()  # printing the model summary

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 520, 704, 3)]     0         
                                                                 
 conv2d (Conv2D)             (None, 520, 704, 64)      1792      
                                                                 
 dropout (Dropout)           (None, 520, 704, 64)      0         
                                                                 
 conv2d_1 (Conv2D)           (None, 520, 704, 64)      36928     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 260, 352, 64)     0         
 )                                                               
                                                                 
 conv2d_2 (Conv2D)           (None, 260, 352, 64)      36928     
                                                             

In [10]:
data_gen = DataGenerator(
    DATAPATH / Path("train"),
    DATAPATH / Path("train/.csv"),
)
model.fit(data_gen, epochs=50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["x_cent"], df["y_cent"] = zip(*decoded.apply(get_centroid))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["centroid"] = list(zip(df["x_cent"], df["y_cent"]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["masks"] = df["centroid"].apply(make_mask)


Epoch 1/50
 2/18 [==>...........................] - ETA: 9:35 - loss: 5.6084  

In [None]:
model_json = model.to_json()
with open("./models/Centroid_Estimation.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("./models/Centroid_Estimation.h5")

2021-12-24 17:13:53.949075: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: models/Centroid model/assets
