<a href="https://colab.research.google.com/github/haythamlabrini/drusen-images-segmentation/blob/main/Run_model_on_all_flattened_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect to Drive

In [1]:
# Mount Google Drive at /content/drive; the paths under /content/drive/MyDrive used below depend on it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install/load required libraries

Install all required libraries

In [2]:
# Install the third-party packages used by this notebook. No versions are pinned here.
# NOTE(review): tensorflow is not imported by any cell visible in this notebook; presumably it is
# installed for TensorBoard logging — confirm it is still needed.
!pip install segmentation-models-pytorch
!pip install pytorch-lightning
!pip install torchvision
!pip install tensorflow
!pip install pandas




## Load libraries

In [3]:
import os
import random
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import segmentation_models_pytorch as smp
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import DataLoader

# Validate the available data
- Validate the number of images
- Validate there are no duplicates
- List all available extensions in all files
- Confirm all images are of the same shape

In [4]:
# Folder on the mounted Drive that the cells below scan for input images.
complete_batch_path = '/content/drive/MyDrive/Drusen Project/complete-batch-flat-finale'
# Alternative input folder, kept for reference:
# complete_batch_path = '/content/drive/MyDrive/5 Line Scans'

### File count and duplicate count

In [5]:
import os

def list_files_recursive(directory):
    """
    Walk `directory` recursively and collect every file found.

    Parameters:
    - directory (str): Root folder to scan.

    Returns:
    - tuple: (file_list, file_count, file_duplicates) where
        * file_list holds the full path of every file, in os.walk order,
        * file_count is the number of files visited,
        * file_duplicates maps each bare filename to the number of times it
          was seen across all sub-folders.
    """
    paths = []
    seen_names = {}

    for current_dir, _subdirs, names in os.walk(directory):
        for name in names:
            paths.append(os.path.join(current_dir, name))
            # Tally by bare filename so the same name in two folders shows up as a duplicate.
            seen_names[name] = seen_names.get(name, 0) + 1

    return paths, len(paths), seen_names

# Example usage:
# Scan the batch folder once; images_list (full paths) is reused by the size-validation cell below.
images_list, images_count, images_duplicates = list_files_recursive(complete_batch_path)

print("Total number of files found:", images_count)
# A filename seen k times contributes k - 1 duplicates.
print("Number of duplicates found:", sum(count - 1 for count in images_duplicates.values()))


Total number of files found: 4172
Number of duplicates found: 0


### List the filename extensions

In [6]:
import os

def count_file_extensions(directory):
    """
    Count how many files under `directory` (recursively) carry each extension.

    Parameters:
    - directory (str): Root folder to scan.

    Returns:
    - dict: Lower-cased extension (including the leading dot) -> number of files.
      Files without an extension are not counted.
    """
    counts = {}

    for _root, _dirs, names in os.walk(directory):
        for name in names:
            # Lower-case so ".JPG" and ".jpg" are tallied together.
            ext = os.path.splitext(name)[1].lower()
            if not ext:
                continue
            counts[ext] = counts.get(ext, 0) + 1

    return counts

# Tally the extensions found under the batch folder and print one "extension : count" line each.
extension_count = count_file_extensions(complete_batch_path)

print("File extensions and their counts:")
for extension, count in extension_count.items():
    print(extension, ":", count)


File extensions and their counts:
.jpg : 4172


### Validate that all images are of the same shape

In [7]:
import os
import matplotlib.pyplot as plt
from PIL import Image

# Lists to categorize images by their size
images_with_1024_885 = []
images_without_1024_885 = []



total_images = len(images_list)

for idx, filename in enumerate(images_list, start=1):
    path = os.path.join(complete_batch_path, filename)
    print(f"[{idx}/{total_images}] Handling image - {path}")

    try:
        with Image.open(path) as img:
            if img.size == (1024, 885):
                images_with_1024_885.append(filename)
            else:
                images_without_1024_885.append(filename)
    except IOError:
        print(f"Failed to open or process image {filename}")

print("Processing completed.")

# Output the results
print("Count of images with size 1024x885:", len(images_with_1024_885))
print("Count of images without size 1024x885:", len(images_without_1024_885))
print("Images without size 1024x885:", images_without_1024_885)

image_file_paths = images_with_1024_885
image_filenames = [os.path.basename(path) for path in image_file_paths]

print(image_filenames)

[1/4172] Handling image - /content/drive/MyDrive/Drusen Project/complete-batch-flat-finale/025_2022-08-15_OD_B_7.jpg
[2/4172] Handling image - /content/drive/MyDrive/Drusen Project/complete-batch-flat-finale/003_2022-06-16_OD_T_2.jpg
[3/4172] Handling image - /content/drive/MyDrive/Drusen Project/complete-batch-flat-finale/025_2022-08-15_OD_B_8.jpg
[4/4172] Handling image - /content/drive/MyDrive/Drusen Project/complete-batch-flat-finale/003_2023-03-06_OS_T_2.jpg
[5/4172] Handling image - /content/drive/MyDrive/Drusen Project/complete-batch-flat-finale/025_2022-08-15_OD_B_9.jpg
[6/4172] Handling image - /content/drive/MyDrive/Drusen Project/complete-batch-flat-finale/003_2023-05-31_OD_T_7.jpg
[7/4172] Handling image - /content/drive/MyDrive/Drusen Project/complete-batch-flat-finale/025_2022-08-15_OD_B_10.jpg
[8/4172] Handling image - /content/drive/MyDrive/Drusen Project/complete-batch-flat-finale/025_2022-08-15_OS_T_1.jpg
[9/4172] Handling image - /content/drive/MyDrive/Drusen Project

# Define Helpers

### Handle Image Transformations

In [8]:
def get_images_from_folder(folder_path, image_filenames):
    """
    Load the given image files from a folder as PIL images.

    Parameters:
    - folder_path (str): Folder containing the images.
    - image_filenames (list): Filenames (relative to folder_path) to load, in order.

    Returns:
    - list: One fully loaded PIL image per filename, in the same order.
    """
    image_list = []

    for filename in image_filenames:
        file_path = os.path.join(folder_path, filename)

        # Image.open is lazy and keeps the file open until the pixels are read.
        # copy() reads the pixel data into a new image, so the context manager can
        # close the file right away instead of leaving one open handle per image.
        with Image.open(file_path) as image:
            image_list.append(image.copy())

    return image_list


def resize_image(image, width, height):
    """
    Resize `image` to a width that is a multiple of 32, keeping the width:height ratio given.

    Parameters:
    - image: Object exposing a PIL-style resize((width, height)) method.
    - width (int): Reference width; rounded down to the nearest multiple of 32.
    - height (int): Reference height; scaled by new_width / width and rounded down.

    Returns:
    - The result of image.resize((new_width, new_height)).

    Note: only the width is forced to a multiple of 32. When `width` is already a
    multiple of 32, the output size is exactly (width, height).
    """
    new_width = width - (width % 32)
    # Integer arithmetic on purpose: the float round trip int(new_width / (width / height))
    # can land one pixel short (e.g. 863.999... -> 863) purely from rounding error.
    new_height = int(new_width * height // width)

    return image.resize((new_width, new_height))

import os
from PIL import Image

def resize_image_for_model(image, should_augment):
    """
    Resize a PIL image for the model and optionally apply random augmentation.

    Both sides of image.size are rounded down to a multiple of 32 and handed to
    resize_image. When should_augment is truthy, the resized image is additionally
    passed through a random rotation (-10..10 degrees), a random horizontal flip
    and a random vertical flip.

    Parameters:
    - image: PIL image to resize.
    - should_augment (bool): Whether to apply the random transforms.

    Returns:
    - The resized (and possibly augmented) image.
    """
    width, height = image.size
    # Target dimensions: each side rounded down to the nearest multiple of 32.
    resized = resize_image(image, width - (width % 32), height - (height % 32))

    if should_augment:
        ## TODO: Move this somewhere else
        augment = transforms.Compose([
            transforms.RandomRotation(degrees=(-10, 10)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
        ])
        return augment(resized)

    return resized

def get_image_path_from_folder(folder_path, filename):
    """Return `folder_path` and `filename` joined by a single forward slash."""
    return "/".join((folder_path, filename))

def get_images_filename_from_folder(folder_path):
    """
    List the names of the regular files directly inside a directory.

    Parameters:
    - folder_path (str): Directory to inspect (not recursive).

    Returns:
    - list: Names of the entries that are files. An empty list is returned, after
      printing a message, when the directory is missing or cannot be accessed.
    """
    try:
        # Keep only entries that are files; sub-directories are skipped.
        return [
            name
            for name in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, name))
        ]
    except FileNotFoundError:
        print(f"The directory {folder_path} does not exist.")
    except PermissionError:
        print(f"Permission denied: cannot access the directory {folder_path}.")
    return []

### Prepare the data for prediction

In [9]:
def set_image(filename, should_resize = True, should_augment = False):
    """
    Load one image from complete_batch_path and package it as a dataset element.

    Parameters:
    - filename (str): Image filename inside complete_batch_path.
    - should_resize (bool): Resize through resize_image_for_model when True.
    - should_augment (bool): Forwarded to resize_image_for_model.

    Returns:
    - dict with keys:
        * "filename": the filename passed in,
        * "image": pixel array transposed from (H, W, C) to (C, H, W),
        * "mask": the same pixels divided by their maximum, with a leading axis added.
          No separate mask file is read here; the image itself fills the "mask" slot.
    """
    # Open inside a context manager so the file handle is released promptly, and force
    # three channels so the (H, W, C) -> (C, H, W) transpose below also works for
    # grayscale or RGBA files (for files that are already RGB this is just a copy).
    with Image.open(get_image_path_from_folder(complete_batch_path, filename)) as source:
        image = source.convert("RGB")
    if should_resize:
        image = resize_image_for_model(image, should_augment)
    image_array = np.array(image)

    # Scale to [0, 1]; an all-black image keeps a divisor of 1 to avoid dividing by zero.
    mask_max = np.max(image_array)
    if mask_max == 0:
        mask_max = 1.0
    normalized_mask_array = image_array / mask_max

    element = {
        "filename": filename,
        "image": image_array.transpose(2,0,1),
        "mask": np.expand_dims(normalized_mask_array, axis=0),
    }

    return element


def set_data(array, should_resize = True, should_augment = False):
    """
    Build the dataset list by calling set_image on every filename in `array`.

    Parameters:
    - array: Iterable of image filenames.
    - should_resize (bool): Forwarded to set_image.
    - should_augment (bool): Forwarded to set_image.

    Returns:
    - list: One set_image element per filename, in input order.
    """
    return [set_image(name, should_resize, should_augment) for name in array]


def get_indexes(n):
  """
  Return the boundary indexes [0, 70% mark, 85% mark, last] for a list of length n.

  The 70% and 85% entries are int(n * fraction) - 1; the last entry is n - 1.
  """
  cut_points = [int(n * fraction) - 1 for fraction in (0.70, 0.85)]
  return [0, *cut_points, n - 1]

def calculate_accuracy(tp, tn, fp, fn):
    """
    Calculate the accuracy metric: (tp + tn) / (tp + fp + tn + fn).

    Parameters:
    - tp (int): True Positives
    - tn (int): True Negatives
    - fp (int): False Positives
    - fn (int): False Negatives

    Returns:
    - accuracy (float): The calculated accuracy, or 0 when all four counts are zero
      (same convention as calculate_precision, instead of raising ZeroDivisionError).
    """
    total = tp + fp + tn + fn
    # Guard the empty case so the two metric helpers behave consistently.
    if total == 0:
        return 0
    return (tp + tn) / total

def calculate_precision(tp, fp):
    """
    Calculate the precision metric: tp / (tp + fp).

    Parameters:
    - tp (int): True Positives
    - fp (int): False Positives

    Returns:
    - precision (float): The calculated precision, or 0 when tp + fp is zero.
    """
    predicted_positives = tp + fp
    # With no predicted positives the ratio is undefined; report 0 instead of dividing by zero.
    return tp / predicted_positives if predicted_positives != 0 else 0

def mean_of_tensors(tensor_list):
    """
    Return the element-wise mean of a list of tensors.

    The tensors are stacked along a new leading dimension and averaged over it.

    Raises:
    - ValueError: if tensor_list is empty.
    """
    if not tensor_list:
        raise ValueError("The tensor list is empty.")

    return torch.stack(tensor_list).mean(dim=0)


### Prepare the data for the model

In [10]:
def prepare_data(input_filenames):
  """
  Build a DataLoader over the given image filenames.

  Parameters:
  - input_filenames (list): Filenames inside complete_batch_path to load.

  Returns:
  - DataLoader yielding batches of up to 10 set_image elements, in input order.
  """
  # The dataset comes from set_data alone. The earlier extra PIL loads of every file
  # were never used and left one open file handle per image, so they are gone.
  properly_loaded_dataset = set_data(input_filenames)

  # os.cpu_count() can return None; fall back to loading in the main process.
  n_cpu = os.cpu_count() or 0
  ## Probably come back here and change the type of data to load with {image,mask}
  dataloader = DataLoader(properly_loaded_dataset, batch_size=10, shuffle=False, num_workers=n_cpu)

  return dataloader

In [11]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


# Data & folder structure validation

In [12]:
print(complete_batch_path)
# Root output folder: store_results writes one sub-folder per image underneath it.
predictions_path = "/content/drive/MyDrive/Drusen Project/feb7_results"

/content/drive/MyDrive/Drusen Project/complete-batch-flat-finale


In [13]:
# image_filenames: basenames of the images that had the expected size in the validation cell.
# Build the input path and the prediction path for each of them.
# (The filename used to be appended twice, producing "<dir>/<name>/<name>".)
image_filenames_paths = [os.path.join(complete_batch_path, filename) for filename in image_filenames]
image_prediction_filenames_paths = [os.path.join(predictions_path, filename) for filename in image_filenames]

assert len(image_filenames_paths) == len(image_prediction_filenames_paths)
assert len(image_filenames_paths) == len(image_filenames)

print("The filenames list, path list and prediction path list have all the same length", len(image_filenames_paths))

The filenames list, path list and prediction path list have all the same length 4170


# Define the model

In [14]:
class DrusenModel(pl.LightningModule):
    """
    LightningModule wrapping a segmentation_models_pytorch model for binary segmentation.

    The forward pass normalizes the input with the encoder's preprocessing mean/std and
    returns raw logits. Train/validation/test steps share one implementation that computes
    a Dice loss plus per-image tp/fp/fn/tn statistics, which are aggregated at epoch end.
    """

    def __init__(self, arch, encoder_name, in_channels, out_classes, **kwargs):
        """
        Parameters:
        - arch: Architecture name passed to smp.create_model.
        - encoder_name: Encoder name; also used to look up the preprocessing mean/std.
        - in_channels: Number of input channels for the model.
        - out_classes: Number of output classes (passed as `classes`).
        - **kwargs: Forwarded unchanged to smp.create_model.
        """
        super().__init__()
        # Per-stage buffers of step outputs; each is drained in the matching on_*_epoch_end hook.
        self.validation_step_outputs = []
        self.training_step_outputs = []
        self.save_hyperparameters()
        self.test_step_outputs = []
        self.model = smp.create_model(
            arch, encoder_name=encoder_name, in_channels=in_channels, classes=out_classes, **kwargs
        )

        # preprocessing parameters for image; the (1, 3, 1, 1) view broadcasts over 3-channel batches
        params = smp.encoders.get_preprocessing_params(encoder_name)
        self.register_buffer("std", torch.tensor(params["std"]).view(1, 3, 1, 1))
        self.register_buffer("mean", torch.tensor(params["mean"]).view(1, 3, 1, 1))

        # for image segmentation dice loss could be the best first choice
        self.loss_fn = smp.losses.DiceLoss(smp.losses.BINARY_MODE, from_logits=True)

    def forward(self, image):
        """Normalize `image` with the stored mean/std and return the model's logits."""
        # normalize image here
        image = (image - self.mean) / self.std
        mask = self.model(image)
        return mask

    def shared_step(self, batch, stage):
        """
        Run one batch through the model and compute the loss and pixel statistics.

        Parameters:
        - batch: dict with "image" (batch, channels, H, W) and "mask" (batch, classes, H, W).
        - stage: Stage label supplied by the caller (not used inside this method).

        Returns:
        - dict with "loss", "tp", "fp", "fn", "tn", "accuracy", "precision", "recall".
        """
        # Tensor has no `toCuda` method (the old call raised AttributeError). Move the inputs
        # to the module's own device instead; this is a no-op when they are already there.
        image = batch["image"].to(self.device)

        # Shape of the image should be (batch_size, num_channels, height, width)
        # if you work with grayscale images, expand channels dim to have [batch_size, 1, height, width]
        assert image.ndim == 4

        # Check that image dimensions are divisible by 32,
        # encoder and decoder connected by `skip connections` and usually encoder have 5 stages of
        # downsampling by factor 2 (2 ^ 5 = 32); e.g. if we have image with shape 84x84 we will have
        # following shapes of features in encoder and decoder: 84, 42, 21, 10, 5 -> 5, 10, 20, 40, 80
        # and we will get an error trying to concat these features
        h, w = image.shape[2:]
        assert h % 32 == 0 and w % 32 == 0

        mask = batch["mask"].to(self.device)

        # Shape of the mask should be [batch_size, num_classes, height, width]
        # for binary segmentation num_classes = 1
        assert mask.ndim == 4

        # Check that mask values in between 0 and 1, NOT 0 and 255 for binary segmentation
        assert mask.max() <= 1.0 and mask.min() >= 0

        logits_mask = self.forward(image)

        # Predicted mask contains logits, and loss_fn param `from_logits` is set to True
        loss = self.loss_fn(logits_mask, mask)

        # Lets compute metrics for some threshold
        # first convert mask values to probabilities, then
        # apply thresholding
        prob_mask = logits_mask.sigmoid()
        pred_mask = (prob_mask > 0.5).float()

        # We will compute IoU metric by two ways
        #   1. dataset-wise
        #   2. image-wise
        # but for now we just compute true positive, false positive, false negative and
        # true negative 'pixels' for each image and class
        # these values will be aggregated in the end of an epoch
        tp, fp, fn, tn = smp.metrics.get_stats(pred_mask.long(), mask.long(), mode="binary")
        accuracy = smp.metrics.accuracy(tp, fp, fn, tn)
        precision = smp.metrics.precision(tp, fp, fn, tn)
        recall = smp.metrics.recall(tp, fp, fn, tn)

        local_metrics = {
            "loss": loss,
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "tn": tn,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall
        }

        return local_metrics

    def shared_epoch_end(self, outputs, stage):
        """
        Aggregate the step outputs collected during an epoch and log them.

        Parameters:
        - outputs: list of dicts returned by shared_step.
        - stage: Prefix for the logged metric names (e.g. "train").
        """
        # Nothing was collected (no batches ran): torch.cat would fail on an empty list.
        if not outputs:
            return

        # aggregate step metrics
        loss = [x["loss"] for x in outputs]

        tp = torch.cat([x["tp"] for x in outputs])
        fp = torch.cat([x["fp"] for x in outputs])
        fn = torch.cat([x["fn"] for x in outputs])
        tn = torch.cat([x["tn"] for x in outputs])
        accuracy = torch.cat([x["accuracy"] for x in outputs])
        precision = torch.cat([x["precision"] for x in outputs])
        recall = torch.cat([x["recall"] for x in outputs])

        # per image IoU means that we first calculate IoU score for each image
        # and then compute mean over these scores
        per_image_iou = smp.metrics.iou_score(tp, fp, fn, tn, reduction="micro-imagewise")
        mean_accuracy = accuracy.mean().item()
        mean_precision = precision.mean().item()
        mean_recall = recall.mean().item()
        mean_loss = torch.stack(loss).mean().item()

        # dataset IoU means that we aggregate intersection and union over whole dataset
        # and then compute IoU score. The difference between dataset_iou and per_image_iou scores
        # in this particular case will not be much, however for dataset
        # with "empty" images (images without target class) a large gap could be observed.
        # Empty images influence a lot on per_image_iou and much less on dataset_iou.
        dataset_iou = smp.metrics.iou_score(tp, fp, fn, tn, reduction="micro")

        # The means are plain Python floats here, so a zero denominator would raise
        # ZeroDivisionError; report NaN for an undefined ratio instead.
        precision_over_recall = mean_precision / mean_recall if mean_recall else float("nan")
        recall_over_precision = mean_recall / mean_precision if mean_precision else float("nan")

        metrics = {
            f"{stage}_per_image_iou": per_image_iou,
            f"{stage}_dataset_iou": dataset_iou,
            f"{stage}_mean_accuracy": mean_accuracy,
            f"{stage}_mean_precision": mean_precision,
            f"{stage}_mean_recall": mean_recall,
            f"{stage}_mean_loss": mean_loss,
            f"{stage}_mean_precision_over_recall": precision_over_recall,
            f"{stage}_mean_recall_over_precision": recall_over_precision
        }

        self.log_dict(metrics, prog_bar=True)

    def training_step(self, batch, batch_idx):
        """Run shared_step for a training batch and buffer its output for epoch end."""
        training_step_outputs = self.shared_step(batch, "train")
        self.training_step_outputs.append(training_step_outputs)
        return training_step_outputs

    def on_train_epoch_end(self):
        """Aggregate and log the buffered training outputs, then clear the buffer."""
        self.shared_epoch_end(self.training_step_outputs, "train")
        self.training_step_outputs.clear()

    def validation_step(self, batch, batch_idx):
        """Run shared_step for a validation batch and buffer its output for epoch end."""
        validation_step_outputs = self.shared_step(batch, "validation")
        self.validation_step_outputs.append(validation_step_outputs)
        return validation_step_outputs

    def on_validation_epoch_end(self):
        """Aggregate and log the buffered validation outputs, then clear the buffer."""
        self.shared_epoch_end(self.validation_step_outputs, "validation")
        self.validation_step_outputs.clear()

    def test_step(self, batch, batch_idx):
        """Run shared_step for a test batch and buffer its output for epoch end."""
        test_step_outputs = self.shared_step(batch, "test")
        self.test_step_outputs.append(test_step_outputs)
        return test_step_outputs

    def on_test_epoch_end(self):
        """Aggregate and log the buffered test outputs, then clear the buffer."""
        self.shared_epoch_end(self.test_step_outputs, "test")
        self.test_step_outputs.clear()

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=0.0001."""
        return torch.optim.Adam(self.parameters(), lr=0.0001)


# Load the model from the last available checkpoint

In [15]:
from pytorch_lightning.loggers import TensorBoardLogger

# Directory handed to the TensorBoard logger.
logger_path = "/content/syncedLatest/syncedLatest/logs/tb_logs"
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

logger = TensorBoardLogger(logger_path, name="drusen_model_v1")
# Checkpoint file on Drive from which the trained model is restored.
latest_version_checkpoint = "/content/drive/MyDrive/Drusen Project/syncedLatest/syncedLatest/logs/tb_logs/drusen_model_v1/version_17/checkpoints/epoch=9-step=180.ckpt"
model = DrusenModel.load_from_checkpoint(latest_version_checkpoint)
model = model.to(device)

# NOTE(review): no cell visible in this notebook calls this Trainer (the prediction cells use
# `model` directly) — confirm it is still needed.
trainer = pl.Trainer(
      max_epochs=10,
      logger=logger,
      accumulate_grad_batches=10,
  )


cuda


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


# Start the Prediction

## Helper to store result images

In [16]:
import matplotlib.pyplot as plt

def save_images_with_plt(images, filenames, folder_path):
    """
    Saves given images to the specified folder with the provided filenames, using Matplotlib.

    Parameters:
    - images: List of images (as NumPy arrays).
    - filenames: List of filenames as strings (without extension, .png will be added).
    - folder_path: Path to the folder where images will be saved (created if missing).

    Images and filenames are paired positionally; if the lists differ in length, the
    extra entries of the longer one are ignored.
    """
    # exist_ok avoids the check-then-create race and makes repeated calls harmless.
    os.makedirs(folder_path, exist_ok=True)

    for image, filename in zip(images, filenames):
        file_path = os.path.join(folder_path, f"{filename}.png")
        # Save the image using Matplotlib
        plt.imsave(file_path, image, cmap='gray')  # Use appropriate colormap (cmap) if needed




## Helper for CSV handling

In [17]:
def generate_csv(headers, values, csv_filename):
    """
    Write rows to a CSV file with the given column headers.

    Parameters:
    - headers: List of column names.
    - values: List of rows, each row a list with one value per header.
    - csv_filename: Destination path of the CSV file (written without an index column).
    """
    pd.DataFrame(values, columns=headers).to_csv(csv_filename, index=False)

## Helper to create folder if not exist

In [18]:
def create_folder_if_not_exist(folder_path):
    """
    Make sure a directory exists, creating it (and any missing parents) when needed.

    Prints what happened: already present, created, or the error that prevented creation.
    Errors are reported on stdout rather than raised.

    Parameters:
    - folder_path (str): Path to the directory to be created if it does not exist.
    """
    if os.path.exists(folder_path):
        print(f"Directory already exists: {folder_path}")
        return

    try:
        os.makedirs(folder_path)
        print(f"Directory created: {folder_path}")
    except OSError as error:
        print(f"Failed to create directory {folder_path}: {error}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Helper to Predict all the results

In [19]:
def store_results(filename, image_to_plot, predicted_mask_to_plot):
    """
    Save the model output for one image under predictions_path/<filename without extension>/.

    Three files are written into that folder:
    - original.png and prediction.png, through save_images_with_plt,
    - figure.png, a side-by-side plot of the image and the predicted mask.

    Parameters:
    - filename (str): Source image filename; its extension is stripped to name the folder.
    - image_to_plot: Array displayed as the original image.
    - predicted_mask_to_plot: Single-channel array displayed in grayscale.
    """
    filename_without_extension = os.path.splitext(filename)[0]
    folder_path = os.path.join(predictions_path, filename_without_extension)
    create_folder_if_not_exist(folder_path)

    # NOTE(review): save_images_with_plt calls plt.imsave without vmin/vmax, so Matplotlib
    # rescales prediction.png to the mask's own min/max. Confirm that is intended, since
    # the CSV step later thresholds this file at a fixed grey level.
    save_images_with_plt(
        [image_to_plot, predicted_mask_to_plot],
        ['original', 'prediction'],
        folder_path,
    )

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    # Plot the original image
    axs[0].imshow(image_to_plot)
    axs[0].set_title('Original Image')
    axs[0].axis('off')  # Hide axis

    # Plot the predicted mask
    # Since predicted_mask_to_plot is single-channel, use cmap='gray' to display it in grayscale
    axs[1].imshow(predicted_mask_to_plot, cmap='gray')
    axs[1].set_title('Predicted Mask')
    axs[1].axis('off')  # Hide axis

    # Save and close this specific figure instead of relying on pyplot's "current figure".
    # One figure is created per image, so each must be closed to keep memory flat.
    fig.savefig(os.path.join(folder_path, "figure.png"), bbox_inches='tight')
    plt.close(fig)

In [20]:
# NEW CODE
import torch
import torch.nn.functional as F

# Iterate over dataloader
def process_dataloader(dataloader):
    """
    Run the global `model` over every batch of `dataloader` and store each result on disk.

    For every image in a batch, the input and the sigmoid of the model output are both
    resized to 1024x1024 (bilinear) and handed to store_results straight away, so no
    per-batch list of results is kept in memory.

    Parameters:
    - dataloader: Iterable of batches with "filename" and "image" entries.
    """
    image_counter = 0  # Number of images stored so far
    total_batches = len(dataloader)

    # Follow the device the model actually lives on instead of assuming CUDA, so the
    # CPU fallback chosen when the model was loaded keeps working.
    target_device = model.device
    # Evaluation mode only has to be set once, not on every batch.
    model.eval()

    for batch_number, batch in enumerate(dataloader, start=1):
        with torch.no_grad():
            image = batch["image"].to(target_device, non_blocking=True)
            logits = model(image)
            pr_masks = logits.sigmoid()

        print(f"Batch {batch_number}/{total_batches}")

        # Process images individually (no large in-memory list)
        for filename, img, pr_mask in zip(batch["filename"], batch["image"], pr_masks):
            # Convert from (C, H, W) to (H, W, C) for plotting
            image_to_plot = (
                F.interpolate(img.unsqueeze(0), size=(1024, 1024), mode='bilinear', align_corners=False)
                .squeeze(0)
                .cpu()
                .numpy()
                .squeeze()
                .transpose(1, 2, 0)
            )

            predicted_mask_to_plot = (
                F.interpolate(pr_mask.unsqueeze(0), size=(1024, 1024), mode='bilinear', align_corners=False)
                .squeeze(0)
                .cpu()
                .numpy()
                .squeeze()
            )

            # Store results immediately (no list accumulation)
            store_results(filename, image_to_plot, predicted_mask_to_plot)
            image_counter += 1

        # Release the batch tensors once per batch. Emptying the CUDA cache after every
        # single image only added overhead.
        del image, logits, pr_masks
        torch.cuda.empty_cache()

    print(f"Processing completed. {image_counter} images processed.")


In [21]:
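# OLD CODE — NOTE: this cell redefines process_dataloader, so when it runs after the previous cell it replaces the NEW implementation defined there.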
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import pandas as pd
import os

# Iterate over dataloader
def process_dataloader(dataloader):
  """
  Run the global `model` over every batch of `dataloader` and store the results on disk.

  For each batch, the inputs and the sigmoid of the model outputs are resized to
  1024x1024 (bilinear), collected in a list, and then written with store_results.

  NOTE(review): this notebook defines process_dataloader twice; whichever definition is
  executed last is the one the prediction loop calls.

  Parameters:
  - dataloader: Iterable of batches with "filename" and "image" entries.
  """
  image_counter = 0  # Number of images handled so far
  total_batches = len(dataloader)

  # Follow the device the model actually lives on instead of assuming CUDA, so the
  # CPU fallback chosen when the model was loaded keeps working.
  target_device = model.device
  # Evaluation mode only has to be set once, not on every batch.
  model.eval()

  for batch_number, batch in enumerate(dataloader, start=1):
      with torch.no_grad():
          batch_images = batch["image"].to(target_device)
          logits = model(batch_images)
          pr_masks = logits.sigmoid()

      print(f"Batch {batch_number}/{total_batches}")

      image_mask_pair = []

      # `img` is the CPU copy from the batch. It gets its own name so it no longer
      # shadows the device tensor fed to the model.
      for filename, img, pr_mask in zip(batch["filename"], batch["image"], pr_masks):
          # Convert from (C, H, W) to (H, W, C) for plotting
          image_to_plot = F.interpolate(img.unsqueeze(0), size=(1024, 1024), mode='bilinear', align_corners=False).squeeze(0).cpu().numpy().squeeze().transpose(1, 2, 0)
          predicted_mask_to_plot = F.interpolate(pr_mask.unsqueeze(0), size=(1024, 1024), mode='bilinear', align_corners=False).squeeze(0).cpu().numpy().squeeze()

          image_mask_pair.append((filename, image_to_plot, predicted_mask_to_plot))

          image_counter += 1  # Increment the counter

      for filename, image_to_plot, predicted_mask_to_plot in image_mask_pair:
          store_results(filename, image_to_plot, predicted_mask_to_plot)

# Predict the drusen in images

In [22]:
# Make sure the root output folder exists before any prediction is written into it.
create_folder_if_not_exist(predictions_path)

Directory already exists: /content/drive/MyDrive/Drusen Project/feb7_results


In [None]:
import os
import glob

# Number of filenames handed to prepare_data / process_dataloader per iteration.
chunk_size = 10
length = len(image_filenames)

for i in range(0, length, chunk_size):
    filenames = image_filenames[i:i+chunk_size]
    print(f'{i}/{length}')
    # Resume support: keep only the images whose prediction.png has not been written yet.
    remaining_filenames = [filename for filename in filenames if not os.path.exists(os.path.join(predictions_path, os.path.splitext(filename)[0], "prediction.png"))]

    # Everything in this chunk is already done: skip it rather than building a
    # DataLoader (and its worker processes) for an empty list.
    if not remaining_filenames:
        continue

    dataloader = prepare_data(remaining_filenames)
    process_dataloader(dataloader)


0/4170
10/4170
20/4170
30/4170
40/4170
50/4170
60/4170
70/4170
80/4170
90/4170
100/4170
110/4170
120/4170
130/4170
140/4170
150/4170
160/4170
170/4170
180/4170
190/4170
200/4170
210/4170
220/4170
230/4170
240/4170
250/4170
260/4170
270/4170
280/4170
290/4170
300/4170
310/4170
320/4170
330/4170
340/4170
350/4170
360/4170
370/4170
380/4170
390/4170
400/4170
410/4170
420/4170
430/4170
440/4170
450/4170
460/4170
470/4170
480/4170
490/4170
500/4170
510/4170
520/4170
530/4170
540/4170
550/4170
560/4170
570/4170
580/4170
590/4170
600/4170
610/4170
620/4170
630/4170
640/4170
650/4170
660/4170
670/4170
680/4170
690/4170
700/4170
710/4170
720/4170
730/4170
740/4170
750/4170
760/4170
770/4170
780/4170
790/4170
800/4170
810/4170
820/4170
830/4170
840/4170
850/4170
860/4170
870/4170
880/4170
890/4170
900/4170
910/4170
920/4170
930/4170
940/4170
950/4170
960/4170
970/4170
980/4170
990/4170
1000/4170
1010/4170
1020/4170
1030/4170
1040/4170
1050/4170
1060/4170
1070/4170
1080/4170
1090/4170
1100/4170
1

# Generate the CSV

## Define Helpers

In [None]:
import cv2
import numpy as np

def count_white_objects_and_area(image_path):
    """
    Count the white objects in an image and the number of white pixels they cover.

    The image is read in grayscale. Unless it already contains only the values 0 and 255,
    it is binarized with a fixed threshold of 127. Both results are then measured on that
    same binary mask. For an image that is already strictly black/white, the result is
    identical to counting its 255-valued pixels directly.

    Parameters:
    - image_path (str): Path of the image file to analyse.

    Returns:
    - tuple: (num_white_objects, num_white_pixels) where num_white_objects is the number
      of external contours found and num_white_pixels is the count of 255-valued pixels.

    Raises:
    - FileNotFoundError: if the image cannot be read.
    """
    # Load the image in grayscale mode
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None instead of raising when the file is missing or unreadable.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Binarize first, so the object count and the area come from the same mask.
    # Counting `== 255` on the raw grayscale image would only catch fully saturated pixels.
    if np.unique(image).tolist() not in ([0, 255], [255]):
        _, image = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)

    # Count the white pixels using numpy
    num_white_pixels = np.sum(image == 255)

    # Find contours in the binary image; each external contour is one white object.
    contours, _ = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    num_white_objects = len(contours)

    return num_white_objects, num_white_pixels


## Combine all CSVs

In [None]:
# get the list of all folders within predictions_path
# List the sub-folders of predictions_path.
folders = [f for f in os.listdir(predictions_path) if os.path.isdir(os.path.join(predictions_path, f))]

print(len(folders))

# Write a metrics.csv into every folder that does not have one yet.
for folder in folders:
    # Build paths with os.path.join: predictions_path has no trailing slash, so plain
    # string concatenation produced ".../feb7_results<folder>/..." and never matched.
    folder_path = os.path.join(predictions_path, folder)
    csv_path = os.path.join(folder_path, "metrics.csv")
    if os.path.exists(csv_path):
        continue

    prediction_path = os.path.join(folder_path, "prediction.png")
    # A folder without a prediction has nothing to measure yet.
    if not os.path.exists(prediction_path):
        print(f"Skipping {folder}: no prediction.png found")
        continue

    print(folder)
    prediction_count, prediction_area = count_white_objects_and_area(prediction_path)

    headers = ['filename', 'prediction_count', 'prediction_area']
    values = [[folder, prediction_count, prediction_area]]
    generate_csv(
        headers,
        values,
        csv_path
    )

['007_2020-12-17_OS_T_7.jpg', '007_2020-12-17_OS_T_9.jpg', '007_2020-12-17_OS_T_10.jpg', '007_2023-02-27_OD_T_1.jpg', '007_2023-02-27_OD_T_2.jpg', '007_2023-02-27_OD_T_3.jpg', '007_2023-02-27_OD_T_4.jpg', '007_2023-02-27_OD_T_5.jpg', '007_2022-07-03_OD_T_1.jpg', '007_2022-07-03_OD_T_2.jpg', '007_2022-07-03_OD_T_4.jpg', '007_2022-07-03_OD_T_5.jpg', '007_2022-07-03_OD_T_6.jpg', '007_2022-07-03_OD_T_7.jpg', '007_2022-07-03_OD_T_8.jpg', '007_2022-07-03_OD_T_9.jpg', '007_2022-07-03_OD_T_10.jpg', '007_2022-07-03_OS_B_1.jpg', '007_2022-07-03_OS_B_2.jpg', '007_2022-07-03_OS_B_4.jpg', '007_2022-07-03_OS_B_5.jpg', '007_2022-07-03_OS_B_6.jpg', '007_2022-07-03_OS_B_7.jpg', '007_2022-07-03_OS_B_8.jpg', '007_2022-07-03_OS_B_9.jpg', '007_2022-07-03_OS_B_10.jpg', '007_2021-03-17_OD_T_1.jpg', '007_2021-03-17_OD_T_2.jpg', '007_2021-03-17_OD_T_4.jpg', '007_2021-03-17_OD_T_5.jpg', '007_2021-03-17_OD_T_6.jpg', '007_2021-03-17_OD_T_7.jpg', '007_2021-03-17_OD_T_8.jpg', '007_2021-03-17_OD_T_9.jpg', '007_2021-

In [None]:
# Combine all the csv files together in one
import pandas as pd
import glob
import os

# Build the path with os.path.join: predictions_path has no trailing slash, so plain string
# concatenation would write ".../feb7_resultscombined_metrics.csv" next to the results folder.
combined_csv_path = os.path.join(predictions_path, 'combined_metrics.csv')

# Pattern to match all metrics.csv files within the sub-folders of predictions_path
pattern = os.path.join(predictions_path, '*/metrics.csv')

# Find all files that match the pattern (sorted so the combined file has a stable row order)
metrics_files = sorted(glob.glob(pattern))

# List to hold the data from each metrics.csv file
dataframes = []

# Iterate over the list of file paths & read each file into a pandas DataFrame
for file_path in metrics_files:
    df = pd.read_csv(file_path)
    # Record the name of the folder each row came from
    df['source'] = os.path.basename(os.path.dirname(file_path))
    dataframes.append(df)

if not dataframes:
    # pd.concat raises on an empty list; report the situation instead.
    print(f'No metrics.csv files found under {predictions_path}')
else:
    # Concatenate all the DataFrames into a single DataFrame
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Save the combined DataFrame to a new CSV file
    combined_df.to_csv(combined_csv_path, index=False)

    print(f'Combined metrics saved to {combined_csv_path}')


Combined metrics saved to /content/drive/MyDrive/Drusen Project/complete-batch-flat-predictions/combined_metrics.csv
