# YOLOv8 Training Notebook

In this notebook, we set up the training pipeline for the YOLOv8 model.

## Setup

### Dependencies

In [1]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.0.225-py3-none-any.whl (660 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/660.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/660.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m655.4/660.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m660.1/660.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: thop, ultralytics
Successfully installed thop-0.1.1.post2209072238 ultralytics-8.0.225


### Imports

In [2]:
from pathlib import Path
import os
import shutil
from google.colab import drive
import yaml

### Utils

In [3]:
class MyDumper(yaml.Dumper):
  """
  YAML dumper whose `increase_indent` always forwards `indentless=False`.

  This is the usual PyYAML idiom for getting list items indented under their
  parent key instead of being aligned with it.
  """

  def increase_indent(self, flow=False, indentless=False):
    # The caller's `indentless` value is deliberately ignored: the parent
    # implementation always receives False.
    return super().increase_indent(flow, False)


def yaml_content(path: Path) -> dict:
  """
  Parse the YAML file located at `path` with `yaml.safe_load` and return the
  parsed content (a python dict for a mapping document such as data.yaml).
  """
  with open(path) as yaml_file:
    parsed = yaml.safe_load(yaml_file)
  return parsed


def write_yaml(path: Path, data: dict) -> None:
  """
  Serialize `data` as YAML into the file `path`, overwriting it.

  The dump uses the MyDumper class, block style (no flow style) and keeps the
  keys in the order they have in `data`.
  """
  with open(path, 'w') as yaml_file:
    dump_options = {
        'Dumper': MyDumper,
        'default_flow_style': False,
        'sort_keys': False,
    }
    yaml.dump(data, yaml_file, **dump_options)


def absolute_train_val_paths(extract_folder_path: Path, data: dict) -> dict:
  """
  Return a copy of `data` whose `train` and `val` entries point to the
  `train/images` and `val/images` folders under `extract_folder_path`.

  The input `data` is left untouched; every other key is carried over as is.
  """
  train_images_dir = extract_folder_path / 'train' / 'images'
  val_images_dir = extract_folder_path / 'val' / 'images'
  return dict(data, train=str(train_images_dir), val=str(val_images_dir))


def swap_coral_class_order(extract_folder_path: Path, data: dict) -> dict:
  """
  Return a copy of `data` in which the class names at index 0 and 1 of
  `data['names']` are exchanged.

  Note: an error was made using opencv when making the dataset.
  blue and red classes got inverted and this function fixes the class string
  labels.

  `extract_folder_path` is not used; it is kept so the signature stays the
  same for existing callers.

  The input `data` (including its `names` container) is never modified: the
  names are copied before being swapped, so calling this function twice on the
  same input gives the same result.
  """
  names = data['names']
  # `dict(data)` alone is only a shallow copy, so swapping in place would also
  # reorder the caller's `names`. Copy the container first (a mapping keyed by
  # class index is supported as well as a plain list).
  swapped_names = dict(names) if isinstance(names, dict) else list(names)
  swapped_names[0], swapped_names[1] = swapped_names[1], swapped_names[0]

  result = dict(data)
  result['names'] = swapped_names
  return result


def archive_path_to_extract_folder_path(path_archive: Path, output_dir: str = '/content/datasets_ready_for_yolov8_training') -> Path:
  """
  Build the folder path an archive should be extracted into.

  The folder name is the archive file name cut at its first dot, with every
  occurrence of `archive_` removed; the folder is located under `output_dir`.
  """
  name_before_first_dot = path_archive.name.partition('.')[0]
  dataset_name = name_before_first_dot.replace('archive_', '')
  return Path(output_dir).joinpath(dataset_name)


def extract_archive(path_archive: Path) -> dict:
  """
  Unpack `path_archive` and rewrite the `data.yaml` found in the extracted
  folder.

  The extraction folder is derived from the archive name and created if it is
  missing. The `data.yaml` inside it is then loaded, its train/val paths are
  replaced by paths inside the extraction folder, its first two class names
  are swapped, and the result is written back to the same file.

  Returns a dict with two keys:
    - 'extract_folder_path': the folder the archive was extracted into
    - 'new_data_yaml': the content written to data.yaml
  """
  # Unpack the archive into its dedicated folder
  target_dir = archive_path_to_extract_folder_path(path_archive)
  os.makedirs(target_dir, exist_ok=True)
  shutil.unpack_archive(path_archive, target_dir)
  print(f'archive {path_archive} extracted in {target_dir}')

  # Patch data.yaml: fix the image paths first, then the class name order
  yaml_file: Path = target_dir / 'data.yaml'
  original_config: dict = yaml_content(yaml_file)
  config_with_paths: dict = absolute_train_val_paths(target_dir, original_config)
  patched_config: dict = swap_coral_class_order(target_dir, config_with_paths)
  write_yaml(yaml_file, patched_config)
  print(f'updating absolute paths in data.yaml content {patched_config}')

  return dict(extract_folder_path=target_dir, new_data_yaml=patched_config)

## Training YOLOv8

### Getting the dataset ready

We first need to mount Google Drive and extract the archive file into a temporary directory.
By default, the archive is extracted to `/content/datasets_ready_for_yolov8_training`.
Update the `GDRIVE_ARCHIVE_ROOT_DIR` variable so that it points to the folder containing all the archive files, and the `ARCHIVE_NAME` variable so that it holds the filename of the archive to use.

In [4]:
# Mount Google Drive at /content/drive; the dataset archives are read from
# a folder below this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Folder on the mounted Drive that holds the dataset archives (.zip files).
GDRIVE_ARCHIVE_ROOT_DIR = '/content/drive/MyDrive/fruitpunchai/coralreefs/datasets_ready_for_yolov8_training/'
# File name (inside GDRIVE_ARCHIVE_ROOT_DIR) of the archive to train on.
# Single-region alternative: ARCHIVE_NAME = 'archive_SEAFLOWER_BOLIVAR.zip'
ARCHIVE_NAME = 'archive_SEAFLOWER_BOLIVAR_and_SEAFLOWER_COURTOWN_and_SEAVIEW_ATL_and_SEAVIEW_IDN_PHL_and_SEAVIEW_PAC_AUS_and_TETES_PROVIDENCIA.zip'

In [6]:
# Show every .zip archive found in the Drive archive folder.
# Copy the name of the one you want to use into `ARCHIVE_NAME`.
[entry for entry in os.listdir(GDRIVE_ARCHIVE_ROOT_DIR) if entry.endswith('.zip')]

['archive_SEAFLOWER_COURTOWN.zip',
 'archive_SEAVIEW_PAC_USA.zip',
 'archive_SEAVIEW_ATL.zip',
 'archive_TETES_PROVIDENCIA.zip',
 'archive_SEAVIEW_IDN_PHL.zip',
 'archive_SEAVIEW_PAC_AUS.zip',
 'archive_SEAFLOWER_BOLIVAR_and_SEAFLOWER_COURTOWN_and_SEAVIEW_ATL_and_SEAVIEW_IDN_PHL_and_SEAVIEW_PAC_AUS_and_TETES_PROVIDENCIA.zip',
 'archive_SEAFLOWER_BOLIVAR.zip']

In [7]:
# Unpack the selected archive and patch its data.yaml; the returned dict is
# displayed as the cell output and reused by the training cells.
path_archive = Path(GDRIVE_ARCHIVE_ROOT_DIR, ARCHIVE_NAME)
archive_result = extract_archive(path_archive)
archive_result

archive /content/drive/MyDrive/fruitpunchai/coralreefs/datasets_ready_for_yolov8_training/archive_SEAFLOWER_BOLIVAR_and_SEAFLOWER_COURTOWN_and_SEAVIEW_ATL_and_SEAVIEW_IDN_PHL_and_SEAVIEW_PAC_AUS_and_TETES_PROVIDENCIA.zip extracted in /content/datasets_ready_for_yolov8_training/SEAFLOWER_BOLIVAR_and_SEAFLOWER_COURTOWN_and_SEAVIEW_ATL_and_SEAVIEW_IDN_PHL_and_SEAVIEW_PAC_AUS_and_TETES_PROVIDENCIA
updating absolute paths in data.yaml content {'train': '/content/datasets_ready_for_yolov8_training/SEAFLOWER_BOLIVAR_and_SEAFLOWER_COURTOWN_and_SEAVIEW_ATL_and_SEAVIEW_IDN_PHL_and_SEAVIEW_PAC_AUS_and_TETES_PROVIDENCIA/train/images', 'val': '/content/datasets_ready_for_yolov8_training/SEAFLOWER_BOLIVAR_and_SEAFLOWER_COURTOWN_and_SEAVIEW_ATL_and_SEAVIEW_IDN_PHL_and_SEAVIEW_PAC_AUS_and_TETES_PROVIDENCIA/val/images', 'nc': 2, 'names': ['hard_coral', 'soft_coral']}


{'extract_folder_path': PosixPath('/content/datasets_ready_for_yolov8_training/SEAFLOWER_BOLIVAR_and_SEAFLOWER_COURTOWN_and_SEAVIEW_ATL_and_SEAVIEW_IDN_PHL_and_SEAVIEW_PAC_AUS_and_TETES_PROVIDENCIA'),
 'new_data_yaml': {'train': '/content/datasets_ready_for_yolov8_training/SEAFLOWER_BOLIVAR_and_SEAFLOWER_COURTOWN_and_SEAVIEW_ATL_and_SEAVIEW_IDN_PHL_and_SEAVIEW_PAC_AUS_and_TETES_PROVIDENCIA/train/images',
  'val': '/content/datasets_ready_for_yolov8_training/SEAFLOWER_BOLIVAR_and_SEAFLOWER_COURTOWN_and_SEAVIEW_ATL_and_SEAVIEW_IDN_PHL_and_SEAVIEW_PAC_AUS_and_TETES_PROVIDENCIA/val/images',
  'nc': 2,
  'names': ['hard_coral', 'soft_coral']}}

### Training

To establish our baseline models, we picked the following parameters:

```python
MODEL_SIZE: str = 'm'          
EPOCHS: int = 20              
CV_TASK: str = 'segmentation'  
```

In [8]:
# Training parameters: edit these before running the cells below
MODEL_SIZE: str = 'm'          # YOLOv8 model size, one of n, s, m, l, x
EPOCHS: int = 20               # Number of training epochs (positive integer)
CV_TASK: str = 'segmentation'  # `segmentation` or `object_detection`; only the exact value
                               # `segmentation` selects the `-seg` weights in the next cell


# TODO: add others like learning_rate, Optimizer, etc.

In [9]:
# Values computed from the parameters and the extraction result above:
# - the pretrained weights file name (with a `-seg` suffix for segmentation)
# - the path of the patched data.yaml inside the extraction folder
yolo_model = 'yolov8{}{}.pt'.format(MODEL_SIZE, '-seg' if CV_TASK == 'segmentation' else '')
yolo_data_yaml_path = str(archive_result['extract_folder_path'].joinpath('data.yaml'))

In [None]:
# Launch training through the `yolo` command line tool. The `$name` tokens are
# replaced by the values of the Python variables defined in the cells above.
# NOTE(review): the values are interpolated unquoted, so a path containing
# spaces would presumably be split by the shell; confirm before changing paths.
!yolo train data=$yolo_data_yaml_path model=$yolo_model epochs=$EPOCHS

Downloading https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8m-seg.pt to 'yolov8m-seg.pt'...
100% 52.4M/52.4M [00:00<00:00, 77.5MB/s]
Ultralytics YOLOv8.0.225 🚀 Python-3.10.12 torch-2.1.0+cu118 CUDA:0 (Tesla T4, 15102MiB)
[34m[1mengine/trainer: [0mtask=segment, mode=train, model=yolov8m-seg.pt, data=/content/datasets_ready_for_yolov8_training/SEAFLOWER_BOLIVAR_and_SEAFLOWER_COURTOWN_and_SEAVIEW_ATL_and_SEAVIEW_IDN_PHL_and_SEAVIEW_PAC_AUS_and_TETES_PROVIDENCIA/data.yaml, epochs=20, patience=50, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=train, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, d

In [None]:
# Zip the whole /content/runs folder so the training results can be downloaded.
# The archive name is hard-coded for the all-regions dataset; for a
# single-region run use e.g.
# '/content/SEAFLOWER_BOLIVAR_baseline_yolov8_session_runs' as base name.
shutil.make_archive(
  base_name='/content/ALL_REGIONS_baseline_yolov8_session_runs',
  format='zip',
  root_dir='/content/runs',
)