# Dataset Preparation for YOLOv8 models

In this notebook, some tools are provided to generate datasets directly usable by YOLOv8.

## Setup

### Imports

In [1]:
import os
import random
import shutil
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from tqdm.notebook import tqdm

### Global variables

Adjust the `INPUT_DIR_YOLOV8_INSTANCE_SEGMENTATION_LABELS`, `OUTPUT_DIR_YOLOV8_SEGMENTATION`, `INPUT_LABEL_MISMATCH_CSV_DATA` and `INPUT_DIR_DATASET_ROOT_RS_LABELLED` path to match your filesystem setup.

In [2]:
# IMPORTANT: Modify these four paths to point to your own data
INPUT_DIR_DATASET_ROOT_RS_LABELLED = Path(
    "/home/chouffe/playground/datasets/benthic_datasets/mask_labels/rs_labelled"
)
INPUT_DIR_YOLOV8_INSTANCE_SEGMENTATION_LABEL = Path(
    "/home/chouffe/playground/datasets/yolov8/benthic_datasets/instance_segmentation"
)
INPUT_LABEL_MISMATCH_CSV_DATA = Path(
    "/home/chouffe/fruitpunch/challenges/coralreefs2/datasets/benthic_datasets/label_mismatch/data.csv"
)
OUTPUT_DIR_YOLOV8_SEGMENTATION = Path(
    "/home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/"
)

ALL_DATASETS = [
    "SEAFLOWER_BOLIVAR",
    "SEAFLOWER_COURTOWN",
    "SEAVIEW_ATL",
    "SEAVIEW_IDN_PHL",
    "SEAVIEW_PAC_AUS",
    "SEAVIEW_PAC_USA",
    "TETES_PROVIDENCIA",
]

LABEL_TO_CLASS_MAPPING = {"soft_coral": 0, "hard_coral": 1}
CLASS_TO_LABEL_MAPPING = {v: k for k, v in LABEL_TO_CLASS_MAPPING.items()}

# For type hints
Quadratid = int
Contour = np.ndarray
Mask = np.ndarray
Polygon = np.ndarray
Entry = list[dict]

### Check folder structures

Make sure that the following command returns something like this:

```
├── SEAFLOWER_BOLIVAR
│   └── labels
│       ├── images
│       ├── individual
│       └── stitched
├── SEAFLOWER_COURTOWN
│   └── labels
│       ├── images
│       ├── individual
│       └── stitched
├── SEAVIEW_ATL
│   └── labels
│       ├── images
│       ├── individual
│       └── stitched
├── SEAVIEW_IDN_PHL
│   └── labels
│       ├── images
│       ├── individual
│       └── stitched
├── SEAVIEW_PAC_AUS
│   └── labels
│       ├── images
│       ├── individual
│       └── stitched
├── SEAVIEW_PAC_USA
│   └── labels
│       ├── images
│       ├── individual
│       └── stitched
└── TETES_PROVIDENCIA
    └── labels
        ├── images
        ├── individual
        └── stitched
```

In [3]:
!tree -d -L 3 $INPUT_DIR_YOLOV8_INSTANCE_SEGMENTATION_LABEL

[01;34m/home/chouffe/playground/datasets/yolov8/benthic_datasets/instance_segmentation[0m
├── [01;34mSEAFLOWER_BOLIVAR[0m
│   └── [01;34mlabels[0m
│       ├── [01;34mimages[0m
│       ├── [01;34mindividual[0m
│       └── [01;34mstitched[0m
├── [01;34mSEAFLOWER_COURTOWN[0m
│   └── [01;34mlabels[0m
│       ├── [01;34mimages[0m
│       ├── [01;34mindividual[0m
│       └── [01;34mstitched[0m
├── [01;34mSEAVIEW_ATL[0m
│   └── [01;34mlabels[0m
│       ├── [01;34mimages[0m
│       ├── [01;34mindividual[0m
│       └── [01;34mstitched[0m
├── [01;34mSEAVIEW_IDN_PHL[0m
│   └── [01;34mlabels[0m
│       ├── [01;34mimages[0m
│       ├── [01;34mindividual[0m
│       └── [01;34mstitched[0m
├── [01;34mSEAVIEW_PAC_AUS[0m
│   └── [01;34mlabels[0m
│       ├── [01;34mimages[0m
│       ├── [01;34mindividual[0m
│       └── [01;34mstitched[0m
├── [01;34mSEAVIEW_PAC_USA[0m
│   └── [01;34mlabels[0m
│       ├── [01;34mimages[0m
│       ├── [01;34mindivid

Make sure that the following command returns something like this:

```
├── SEAFLOWER_BOLIVAR
│   ├── images
│   ├── masks
│   └── masks_stitched
├── SEAFLOWER_COURTOWN
│   ├── images
│   ├── masks
│   └── masks_stitched
├── SEAVIEW_ATL
│   ├── images
│   ├── masks
│   └── masks_stitched
├── SEAVIEW_IDN_PHL
│   ├── images
│   ├── masks
│   └── masks_stitched
├── SEAVIEW_PAC_AUS
│   ├── images
│   ├── masks
│   └── masks_stitched
├── SEAVIEW_PAC_USA
│   ├── images
│   ├── masks
│   └── masks_stitched
└── TETES_PROVIDENCIA
    ├── images
    ├── masks
    └── masks_stitched
```

In [4]:
!tree -d -L 3 $INPUT_DIR_DATASET_ROOT_RS_LABELLED

[01;34m/home/chouffe/playground/datasets/benthic_datasets/mask_labels/rs_labelled[0m
├── [01;34mSEAFLOWER_BOLIVAR[0m
│   ├── [01;34mimages[0m
│   ├── [01;34mmasks[0m
│   └── [01;34mmasks_stitched[0m
├── [01;34mSEAFLOWER_COURTOWN[0m
│   ├── [01;34mimages[0m
│   ├── [01;34mmasks[0m
│   └── [01;34mmasks_stitched[0m
├── [01;34mSEAVIEW_ATL[0m
│   ├── [01;34mimages[0m
│   ├── [01;34mmasks[0m
│   └── [01;34mmasks_stitched[0m
├── [01;34mSEAVIEW_IDN_PHL[0m
│   ├── [01;34mimages[0m
│   ├── [01;34mmasks[0m
│   └── [01;34mmasks_stitched[0m
├── [01;34mSEAVIEW_PAC_AUS[0m
│   ├── [01;34mimages[0m
│   ├── [01;34mmasks[0m
│   └── [01;34mmasks_stitched[0m
├── [01;34mSEAVIEW_PAC_USA[0m
│   ├── [01;34mimages[0m
│   ├── [01;34mmasks[0m
│   └── [01;34mmasks_stitched[0m
└── [01;34mTETES_PROVIDENCIA[0m
    ├── [01;34mimages[0m
    ├── [01;34mmasks[0m
    └── [01;34mmasks_stitched[0m

28 directories


In [5]:
def rm_r(path: Path) -> None:
    """Equivalent to the bash command `rm -r $path`.

    Warning: Make sure you know which folder you are clearing before running it.
    The erased files won't go to the Trash folder.
    """

    if not os.path.exists(path):
        return
    if os.path.isfile(path) or os.path.islink(path):
        os.unlink(path)
    else:
        shutil.rmtree(path)


class MyDumper(yaml.Dumper):
    def increase_indent(self, flow=False, indentless=False):
        return super(MyDumper, self).increase_indent(flow, False)


def write_config_yaml(
    path: Path,
    X_train,
    X_val,
    dataset_names: list[str],
    seed: int,
    train_size_ratio: float,
) -> None:
    """Writes the `config.yaml` file that describes the generated dataset."""

    def entries_to_dict(entries):
        result = defaultdict(list)
        for entry in entries:
            result[entry["dataset_name"]].append(entry["image_filepath"].name)
        return dict(result)

    data = {
        "dataset_names": dataset_names,
        "seed": seed,
        "train_size_ratio": train_size_ratio,
        "train_dataset_size": len(X_train),
        "val_dataset_size": len(X_val),
        "train_dataset": entries_to_dict(X_train),
        "val_dataset": entries_to_dict(X_val),
    }

    with open(path / "config.yaml", "x") as f:
        yaml.dump(data, f, Dumper=MyDumper, default_flow_style=False, sort_keys=False)


def slurp(filepath: Path) -> str:
    with open(filepath, "r") as f:
        return f.read()


def write_data_yaml(path: Path) -> None:
    """Writes the `data.yaml` file necessary for YOLOv8 training at `path`
    location."""
    data = {
        "train": "./train/images",
        "val": "./val/images",
        "nc": 2,
        "names": [CLASS_TO_LABEL_MAPPING[i] for i in range(2)],
    }
    with open(path / "data.yaml", "x") as f:
        yaml.dump(data, f, Dumper=MyDumper, default_flow_style=False, sort_keys=False)


def write_readme(path: Path) -> None:
    """Writes the README.md file of the dataset that describes how to train a
    YOLOv8 model on it."""
    content = [
        "# README",
        "",
        "## Basic training",
        "To train a yolo model on this dataset, follow the steps:",
        "1. Install ultralytics in a virtualenv:",
        "> pip install ultralytics",
        (
            "2. open data.yaml and edit `train` and `val` value to indicate an absolute"
            " path (eg. /home/user/fruitpunch/datasets/train/images)"
        ),
        (
            "3. run the following basic command to train yolo for object detection for"
            " 1 epoch on the dataset:"
        ),
        "> yolo train data=./data.yaml model=yolov8n.pt epochs=1",
        (
            "4. run the following basic command to train yolo for instance segmentation"
            " for 1 epoch on the dataset:"
        ),
        "> yolo train data=./data.yaml model=yolov8n-seg.pt epochs=1",
        "",
        "## More advanced training",
        "One can use different model sizes for yolo (n, s, m, l, x):",
        "Eg. Train for 10 epochs the `m` size yolo model for instance segmentation:",
        "> yolo train data=./data.yaml model=yolov8m-seg.pt epochs=10",
        "Eg. Train for 10 epochs the `x` size yolo model for object detection:",
        "> yolo train data=./data.yaml model=yolov8x.pt epochs=10",
    ]
    with open(path / "README.md", "x") as f:
        f.write("\n".join(content))


def init_yolov8_dataset_folder_structure(
    output_dir: Path = OUTPUT_DIR_YOLOV8_SEGMENTATION, clear: bool = True
) -> None:
    """Creates the right yolov8 dataset empty folder structure."""
    if clear:
        print(f"clearing folder {output_dir}")
        rm_r(output_dir)

    dirs = [
        output_dir / "train/images/",
        output_dir / "train/labels/",
        output_dir / "val/images/",
        output_dir / "val/labels/",
    ]

    for dir in dirs:
        if not os.path.isdir(dir):
            print(f"Making directory: {dir}")
            os.makedirs(dir)

    print("Writing data.yaml file")
    write_data_yaml(output_dir)
    print("Writing README.md file")
    write_readme(output_dir)

In [13]:
def list_image_filepaths(
    dataset_name: str, input_dir: Path = INPUT_DIR_DATASET_ROOT_RS_LABELLED
) -> list[Path]:
    """Returns a list of paths that are the list of all image names for a given
    `dataset_name`."""
    path = input_dir / dataset_name / "images"
    return [path / f for f in os.listdir(path) if os.path.isfile(path / f)]


def is_label_mismatch(
    dataset_name: str, invalid_seaview_quadratids: set[Quadratid], filepath: Path
) -> bool:
    """Returns whether the `filepath` has a label mismatch."""
    if not dataset_name.startswith("SEAVIEW"):
        return False
    elif int(filepath.stem) in invalid_seaview_quadratids:
        return True
    else:
        return False


def is_only_black_pixels(mask: Mask) -> bool:
    """Returns True if the mask image is only black pixels."""
    non_black_pixels = np.any(mask != [0, 0, 0], axis=-1)
    black_pixels = ~non_black_pixels
    return black_pixels.all()


def get_invalid_seaview_quadratids(
    csv_data_path: Path = INPUT_LABEL_MISMATCH_CSV_DATA,
) -> set[Quadratid]:
    """Returns a set of quadratids from the seaview folders that contain label
    mismatches.

    Note:
    ReefSupport suggested to discared the following datapoints:
    - For Seaview, discard images with a mismatch of maximum 10 points
      (20% if 50 annotation points or 10% if 100 annotation points)
    - Seaflower and Tetes labelling results are best
    """
    df = pd.read_csv(csv_data_path)
    df_mismatch_labels = df[
        df["folder"].str.startswith("SEAVIEW") & (df["points_mismatch_count"] >= 10)
    ]
    return set(df_mismatch_labels["quadratid"])


def is_empty_label(label_filepath: Path) -> bool:
    """Returns true if the label file is empty (== black mask)"""
    return (
        (not os.path.isfile(label_filepath))
        or slurp(label_filepath) is None
        or slurp(label_filepath) == ""
    )


def image_filepath_to_label_filepath(dataset_name: str, image_filepath: Path) -> Path:
    label_filename = f"{image_filepath.stem}.txt"
    label_filepath = (
        INPUT_DIR_YOLOV8_INSTANCE_SEGMENTATION_LABEL
        / dataset_name
        / "labels"
        / "images"
        / label_filename
    )
    return label_filepath


def get_image_filepaths_with_empty_masks(
    dataset_names: list[str] = ALL_DATASETS,
) -> set[Path]:
    """Returns a list of image filepaths that have empty masks (= empty
    labels)."""
    image_filepaths_with_empty_label = set()
    for dataset_name in dataset_names:
        print(f"Looking for empty masks in {dataset_name}")
        all_image_filepaths = list_image_filepaths(dataset_name)
        empty_labels = [
            image_filepath
            for image_filepath in all_image_filepaths
            if is_empty_label(
                image_filepath_to_label_filepath(dataset_name, image_filepath)
            )
        ]
        print(f"    Found {len(empty_labels)} empty label files")
        if len(empty_labels) > 0:
            image_filepaths_with_empty_label = image_filepaths_with_empty_label.union(
                empty_labels
            )
    return image_filepaths_with_empty_label


def get_X(
    dataset_names: list[str],
    invalid_seaview_quadratids: set[Quadratid],
    invalid_image_filepaths: set[Path] = set(),
) -> list[Entry]:
    """Returns a list of {dataset_name, image_filepath, label_filepath} that
    constitues the X dataset.

    Excludes the datapoints that contain data label mismatch.
    """
    X = []
    for dataset_name in dataset_names:
        all_image_filepaths = list_image_filepaths(dataset_name)
        image_filepaths = [
            p
            for p in all_image_filepaths
            # Remove filepaths that are known to have label mismatches
            if (not is_label_mismatch(dataset_name, invalid_seaview_quadratids, p))
            # Remove filepaths that are invalid (empty masks for instance)
            and (not p in invalid_image_filepaths)
        ]

        if len(all_image_filepaths) > len(image_filepaths):
            print(
                f"Excluding {len(all_image_filepaths) - len(image_filepaths)} files"
                f" from {dataset_name} because of label mismatch or empty masks."
            )

        for image_filepath in image_filepaths:
            label_filename = f"{image_filepath.stem}.txt"
            label_filepath = (
                INPUT_DIR_YOLOV8_INSTANCE_SEGMENTATION_LABEL
                / dataset_name
                / "labels"
                / "images"
                / label_filename
            )
            entry = {
                "dataset_name": dataset_name,
                "image_filepath": image_filepath,
                "label_filepath": label_filepath,
            }
            X.append(entry)
    return X


def split_train_val(
    X: list[Entry], train_size_ratio: float = 0.8, seed: int = 42
) -> tuple[list[Entry], list[Entry]]:
    """Returns a splitted dataset X into X_train and X_val using the
    `train_size_ratio` and the random `seed`."""
    N = len(X)
    random.seed(seed)
    random.shuffle(X)
    split_index = int(N * train_size_ratio)

    X_train, X_val = X[:split_index], X[split_index:]
    return X_train, X_val


def write_entry(
    entry, mode: str = "train", output_dir: Path = OUTPUT_DIR_YOLOV8_SEGMENTATION
) -> None:
    """Given an `entry` and a mode in #{`train`, `val`}, it writes it in a
    YOLOv8 format."""
    source_image_filepath = entry["image_filepath"]
    source_label_filepath = entry["label_filepath"]
    destination_image_filepath = (
        output_dir / mode / "images" / source_image_filepath.name
    )
    destination_label_filepath = (
        output_dir / mode / "labels" / source_label_filepath.name
    )

    assert os.path.exists(
        source_image_filepath
    ), f"should exist {source_image_filepath}"
    assert os.path.exists(
        source_label_filepath
    ), f"should exist {source_label_filepath}"
    assert os.path.exists(
        output_dir / mode / "images"
    ), f"the images folder should exist in {output_dir}"
    assert os.path.exists(
        output_dir / mode / "labels"
    ), f"the labels folder should exist  in {output_dir}"

    shutil.copyfile(source_image_filepath, destination_image_filepath)
    shutil.copyfile(source_label_filepath, destination_label_filepath)


def write_dataset(
    X_train: list[Entry],
    X_val: list[Entry],
    output_dir: Path = OUTPUT_DIR_YOLOV8_SEGMENTATION,
) -> None:
    """Writes the dataset splitted in X_train and X_val into the right folder
    structure for YOLOv8."""
    print(f"Generating train set - {len(X_train)} datapoints")
    for entry in X_train:
        write_entry(entry, mode="train", output_dir=output_dir)

    print(f"Generating val set - {len(X_val)} datapoints")
    for entry in X_val:
        write_entry(entry, mode="val", output_dir=output_dir)


def generate(
    dataset_names: list[str],
    seed: int = 42,
    train_size_ratio: float = 0.8,
    output_dir: Path = OUTPUT_DIR_YOLOV8_SEGMENTATION,
) -> None:
    """Main function to generate the full dataset ready for YOLOv8 to be
    trained on."""
    init_yolov8_dataset_folder_structure(output_dir=output_dir)
    print(
        "Splitting datapoints between train and val sets for the datasets:"
        f" {' '.join(dataset_names)}"
    )
    invalid_seaview_quadratids = get_invalid_seaview_quadratids(
        csv_data_path=INPUT_LABEL_MISMATCH_CSV_DATA
    )
    print(f"Found {len(invalid_seaview_quadratids)} mislabelled quadratid")
    invalid_image_filepaths = get_image_filepaths_with_empty_masks(dataset_names)
    print(f"Found {len(invalid_image_filepaths)} empty masks")
    X = get_X(
        dataset_names,
        invalid_seaview_quadratids,
        invalid_image_filepaths=invalid_image_filepaths,
    )
    X_train, X_val = split_train_val(X, train_size_ratio=train_size_ratio, seed=seed)
    print(f"Writing the data in {output_dir}")
    write_dataset(X_train, X_val, output_dir=output_dir)
    print("Writing config.yaml file")
    write_config_yaml(
        path=output_dir,
        X_train=X_train,
        X_val=X_val,
        dataset_names=dataset_names,
        seed=seed,
        train_size_ratio=train_size_ratio,
    )

## Generation

### Script

In [7]:
# Add the dataset names in that list to inlude them in the generated set
dataset_names = [
    # "SEAFLOWER_BOLIVAR",
    # 'SEAFLOWER_COURTOWN',
    "SEAVIEW_ATL",
    # 'SEAVIEW_IDN_PHL',
    # 'SEAVIEW_PAC_AUS',
    # 'SEAVIEW_PAC_USA',
    # 'TETES_PROVIDENCIA',
]

# Parameters to generate the dataset
seed = 42
train_size_ratio = 0.80
# This will generate all the folder structure in `OUTPUT_DIR_YOLOV8_SEGMENTATION` for yolov8 to consume
generate(
    dataset_names,
    seed=seed,
    train_size_ratio=train_size_ratio,
    output_dir=OUTPUT_DIR_YOLOV8_SEGMENTATION,
)

clearing folder /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/train/images
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/train/labels
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/val/images
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/val/labels
Writing data.yaml file
Writing README.md file
Splitting datapoints between train and val sets for the datasets: SEAVIEW_ATL
Found 1054 mislabelled quadratid
Looking for empty masks in SEAVIEW_ATL
    Found 328 empty label files
Found 328 empty masks
Excluding 72 files from SEAVIEW_ATL because of label mismatch or empty masks.
Writing the data in /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation


### Sanity check

Below is what it should look like on your filesystem.

```
├── config.yaml
├── data.yaml
├── README.md
├── train
│   ├── images
│   └── labels
└── val
    ├── images
    └── labels
```

In [8]:
!tree -L 2 $OUTPUT_DIR_YOLOV8_SEGMENTATION

[01;34m/home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation[0m
├── config.yaml
├── data.yaml
├── README.md
├── [01;34mtrain[0m
│   ├── [01;34mimages[0m
│   └── [01;34mlabels[0m
└── [01;34mval[0m
    ├── [01;34mimages[0m
    └── [01;34mlabels[0m

6 directories, 3 files


### Export

Export the generated dataset as a zip file to make it available anywhere (Eg. Colab instance to train on GPUs).

In [9]:
def make_archive(
    output_dir: Path = OUTPUT_DIR_YOLOV8_SEGMENTATION, archive_name: str = "archive"
) -> None:
    """Generates an archive file from the `output_dir`"""
    shutil.make_archive(str(output_dir.parent / archive_name), "zip", output_dir)


def get_archive_name(dataset_names: list[str]) -> str:
    return f"archive_{'_and_'.join(dataset_names)}"

In [121]:
make_archive(archive_name=get_archive_name(dataset_names))

#### Export some datasets combinations

##### All individual dataset regions

In [14]:
# Add the dataset names in that list to inlude them in the generated set
all_dataset_names = {
    "SEAFLOWER_BOLIVAR",
    "SEAFLOWER_COURTOWN",
    "SEAVIEW_ATL",
    "SEAVIEW_IDN_PHL",
    "SEAVIEW_PAC_AUS",
    "SEAVIEW_PAC_USA",
    "TETES_PROVIDENCIA",
}

# Parameters to generate the dataset
seed = 42
train_size_ratio = 0.80

for dataset_name in tqdm(all_dataset_names):
    print(f"Generating dataset for {dataset_name}")
    generate(
        [dataset_name],
        seed=seed,
        train_size_ratio=train_size_ratio,
        output_dir=OUTPUT_DIR_YOLOV8_SEGMENTATION,
    )
    archive_name = get_archive_name([dataset_name])
    print(f"Making archive {archive_name}.zip")
    make_archive(archive_name=archive_name)

  0%|          | 0/7 [00:00<?, ?it/s]

Generating dataset for TETES_PROVIDENCIA
clearing folder /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/train/images
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/train/labels
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/val/images
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/val/labels
Writing data.yaml file
Writing README.md file
Splitting datapoints between train and val sets for the datasets: TETES_PROVIDENCIA
Found 1054 mislabelled quadratid
Looking for empty masks in TETES_PROVIDENCIA
    Found 0 empty label files
Found 0 empty masks
Writing the data in /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation
Generating train set - 84 dat

##### combinations of all regions but SEAVIEW_PAC_AUS

In [15]:
all_dataset_names_but_seaview_pac_aus = [
    "SEAFLOWER_BOLIVAR",
    "SEAFLOWER_COURTOWN",
    "SEAVIEW_ATL",
    "SEAVIEW_IDN_PHL",
    "SEAVIEW_PAC_AUS",
    "TETES_PROVIDENCIA",
]

# Parameters to generate the dataset
seed = 42
train_size_ratio = 0.80

print(f"Generating dataset")
generate(
    all_dataset_names_but_seaview_pac_aus,
    seed=seed,
    train_size_ratio=train_size_ratio,
    output_dir=OUTPUT_DIR_YOLOV8_SEGMENTATION,
)
archive_name = get_archive_name(all_dataset_names_but_seaview_pac_aus)
print(f"Making archive {archive_name}.zip")
make_archive(archive_name=archive_name)

Generating dataset
clearing folder /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/train/images
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/train/labels
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/val/images
Making directory: /home/chouffe/playground/datasets/yolov8_ready/benthic_datasets/instance_segmentation/val/labels
Writing data.yaml file
Writing README.md file
Splitting datapoints between train and val sets for the datasets: SEAFLOWER_BOLIVAR SEAFLOWER_COURTOWN SEAVIEW_ATL SEAVIEW_IDN_PHL SEAVIEW_PAC_AUS TETES_PROVIDENCIA
Found 1054 mislabelled quadratid
Looking for empty masks in SEAFLOWER_BOLIVAR
    Found 1 empty label files
Looking for empty masks in SEAFLOWER_COURTOWN
    Found 0 empty label files
Looking for empty ma