In [1]:
import datetime
import os
import shutil
from pathlib import Path
from collections import Counter

import yaml
import numpy as np
import pandas as pd
# from ultralytics import YOLO
from sklearn.model_selection import KFold

# Generate dataset

In [2]:
dataset_path = Path('/media/jess/DATA/PhD/data/ecoflow/yolo_labels/14_classes/train/') # replace with 'path/to/dataset' for your custom data
labels = sorted(dataset_path.rglob("*labels/*.txt")) # all data in 'labels'

In [3]:
yaml_file = '/media/jess/DATA/PhD/cnn_bakeoff2/yolo/config/trial_1.yaml'  # your data YAML with data directories and names dictionary
with open(yaml_file, 'r', encoding="utf8") as y:
    classes = yaml.safe_load(y)['names']
cls_idx = sorted(classes.keys())

In [4]:
indx = [l.stem for l in labels] # uses base filename as ID (no extension)
labels_df = pd.DataFrame([], columns=cls_idx, index=indx)

In [5]:
for label in labels:
    lbl_counter = Counter()

    with open(label,'r') as lf:
        lines = lf.readlines()

    for l in lines:
        # classes for YOLO label uses integer at first position of each line
        lbl_counter[int(l.split(' ')[0])] += 1

    # labels_df.loc[label.stem] = lbl_counter
    labels_df.loc[label.stem] = [lbl_counter[cls] for cls in classes]

labels_df = labels_df.fillna(0.0) # replace `nan` values with `0.0`

In [6]:
labels_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
001A_2017_Hook_IMG_0016_028e1462,0,0,0,0,0,0,0,1,0,0,0,0,0,0
001A_2017_Hook_IMG_0436_efe2e308,0,0,0,0,0,0,0,1,0,0,0,0,0,0
001B_2017_Hook_IMG_0022_d0a6e9bc,0,0,0,0,0,0,0,1,0,0,0,0,0,0
001B_2017_Hook_IMG_0026_0b1f2d88,0,0,0,0,0,0,0,1,0,0,0,0,0,0
001B_2017_Hook_IMG_0123_225f268d,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TW_01_20180325_IMG_2157_66155066,0,0,0,0,0,0,1,0,0,0,0,0,0,0
TW_01_20180325_IMG_2160_96f64882,0,0,0,0,0,0,1,0,0,0,0,0,0,0
TW_02_20181019_IMG_0167_1fe4e6f9,0,0,0,0,0,0,1,0,0,0,0,0,0,0
TW_03_20171115_IMG_1112_ac8d3b89,0,0,0,0,0,0,0,0,1,0,0,0,0,0


# 5-fold data split

In [7]:
ksplit = 5
kf = KFold(n_splits=ksplit, shuffle=True, random_state=20)   # setting random_state for repeatable results

kfolds = list(kf.split(labels_df))

In [8]:
folds = [f'split_{n}' for n in range(1, ksplit + 1)]
folds_df = pd.DataFrame(index=indx, columns=folds)

for idx, (train, val) in enumerate(kfolds, start=1):
    folds_df[f'split_{idx}'].loc[labels_df.iloc[train].index] = 'train'
    folds_df[f'split_{idx}'].loc[labels_df.iloc[val].index] = 'val'

In [9]:
fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)

for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
    train_totals = labels_df.iloc[train_indices].sum()
    val_totals = labels_df.iloc[val_indices].sum()

    # To avoid division by zero, we add a small value (1E-7) to the denominator
    ratio = val_totals / (train_totals + 1E-7)
    fold_lbl_distrb.loc[f'split_{n}'] = ratio

In [10]:
supported_extensions = ['.jpg', '.jpeg', '.png']

# Initialize an empty list to store image file paths
images = []

# Loop through supported extensions and gather image files
for ext in supported_extensions:
    images.extend(sorted((dataset_path / 'images').rglob(f"*{ext}")))

# Create the necessary directories and dataset YAML files (unchanged)
save_path = Path(dataset_path / f'{datetime.date.today().isoformat()}_{ksplit}-Fold_Cross-val')
save_path.mkdir(parents=True, exist_ok=True)
ds_yamls = []

for split in folds_df.columns:
    # Create directories
    split_dir = save_path / split
    split_dir.mkdir(parents=True, exist_ok=True)
    (split_dir / 'train' / 'images').mkdir(parents=True, exist_ok=True)
    (split_dir / 'train' / 'labels').mkdir(parents=True, exist_ok=True)
    (split_dir / 'val' / 'images').mkdir(parents=True, exist_ok=True)
    (split_dir / 'val' / 'labels').mkdir(parents=True, exist_ok=True)

    # Create dataset YAML files
    dataset_yaml = split_dir / f'{split}_dataset.yaml'
    ds_yamls.append(dataset_yaml)

    with open(dataset_yaml, 'w') as ds_y:
        yaml.safe_dump({
            'path': split_dir.as_posix(),
            'train': 'train',
            'val': 'val',
            'names': classes
        }, ds_y)

In [11]:
for image, label in zip(images, labels):
    for split, k_split in folds_df.loc[image.stem].items():
        # Destination directory
        img_to_path = save_path / split / k_split / 'images'
        lbl_to_path = save_path / split / k_split / 'labels'

        # Copy image and label files to new directory (SamefileError if file already exists)
        shutil.copy(image, img_to_path / image.name)
        shutil.copy(label, lbl_to_path / label.name)

In [75]:
#save records
folds_df.to_csv(save_path / "kfold_datasplit.csv")
fold_lbl_distrb.to_csv(save_path / "kfold_label_distribution.csv")

## Checking text files

In [17]:
def dataset_summary_b(path):
    
    classes_dict = {}

    for filename in os.listdir(path):
        if filename.endswith('.txt'):  # Ensure it's a text file
            with open(os.path.join(path, filename), 'r') as file:
                for line in file:
                    sp_class = line.split(' ')[0]
                    classes_dict[sp_class] = classes_dict.get(sp_class, 0) + 1

    return dict(sorted(classes_dict.items()))

In [20]:
print(dataset_summary_b('/media/jess/DATA/PhD/data/ecoflow/yolo_labels/14_classes/train/2024-05-09_5-Fold_Cross-val/split_1/train/labels'))
print(dataset_summary_b('/media/jess/DATA/PhD/data/ecoflow/yolo_labels/14_classes/train/2024-05-09_5-Fold_Cross-val/split_2/train/labels'))
print(dataset_summary_b('/media/jess/DATA/PhD/data/ecoflow/yolo_labels/14_classes/train/2024-05-09_5-Fold_Cross-val/split_3/train/labels'))
print(dataset_summary_b('/media/jess/DATA/PhD/data/ecoflow/yolo_labels/14_classes/train/2024-05-09_5-Fold_Cross-val/split_4/train/labels'))
print(dataset_summary_b('/media/jess/DATA/PhD/data/ecoflow/yolo_labels/14_classes/train/2024-05-09_5-Fold_Cross-val/split_5/train/labels'))

{'0': 807, '10': 791, '11': 798, '14': 780, '15': 809, '16': 822, '17': 788, '19': 793, '2': 810, '22': 787, '3': 816, '6': 805, '7': 793, '8': 801}
{'0': 801, '10': 818, '11': 802, '14': 807, '15': 796, '16': 794, '17': 806, '19': 800, '2': 794, '22': 811, '3': 795, '6': 791, '7': 813, '8': 772}
{'0': 815, '10': 792, '11': 797, '14': 796, '15': 803, '16': 804, '17': 797, '19': 807, '2': 818, '22': 784, '3': 797, '6': 794, '7': 792, '8': 804}
{'0': 809, '10': 803, '11': 805, '14': 808, '15': 780, '16': 794, '17': 819, '19': 786, '2': 799, '22': 804, '3': 793, '6': 807, '7': 786, '8': 807}
{'0': 768, '10': 796, '11': 798, '14': 809, '15': 812, '16': 786, '17': 790, '19': 814, '2': 779, '22': 814, '3': 799, '6': 803, '7': 816, '8': 816}


# Run YOLO

In [9]:
#change to yolov8 env
import torch
from ultralytics import YOLO

device = '0' if torch.cuda.is_available() else 'cpu'
if device=='0':
    torch.cuda.set_device(0)
print(f'Device: {device}')

model = YOLO('yolov8n.yaml')
model = YOLO('yolov8n.pt')
model = YOLO('yolov8n.yaml').load('yolov8n.pt')

Device: 0
Transferred 355/355 items from pretrained weights


In [15]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [16]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

ksplit = 5
results = {}

# Define your additional arguments here
batch = 64
project = 'kfold_1000images'
epochs = 100
patience = 10

for k in range(ksplit):
    ds_yaml = f'/media/jess/DATA/PhD/data/ecoflow/yolo_labels/14_classes/train/2024-05-09_5-Fold_Cross-val/split_{k+1}/split_{k+1}_dataset.yaml'
    dataset_yaml = ds_yaml
    model.train(data=dataset_yaml,
                epochs=epochs,
                batch=batch,
                patience=patience,
                project=project)  # include any train arguments
    results[k+1] = model.metrics  # save output metrics for further analysis

print("Training results:", results)

New https://pypi.org/project/ultralytics/8.2.11 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.2.10 🚀 Python-3.10.13 torch-2.3.0+cu121 CUDA:0 (NVIDIA RTX A6000, 48669MiB)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
