In [1]:
import pandas as pd
import yaml
from collections import Counter
from pathlib import Path
from sklearn.model_selection import KFold
from ultralytics import YOLO
import os
import glob
import shutil

In [2]:
TARGET_IMAGES_PATH = './images/'
TARGET_LABELS_PATH = './labels/'

image_paths = glob.glob(TARGET_IMAGES_PATH + "*.jpg")
label_paths = glob.glob(TARGET_LABELS_PATH + "*.txt")

print(image_paths)
print(label_paths)


# source_folder_path = "./axial/cross_data/{split}/{type}"
# splits = ["train", "valid"]
# types = ["images/*.jpg", "labels/*.txt"]

# os.makedirs(TARGET_IMAGES_PATH, exist_ok=True)
# os.makedirs(TARGET_LABELS_PATH, exist_ok=True)

# image_paths = list()
# label_paths = list()

# for split in splits:
#     for data_type in types:
#         files = glob.glob(source_folder_path.format(split=split, type=data_type))
#         for file_ in files:
#             if "image" in data_type:
#                 shutil.copy(file_, TARGET_IMAGES_PATH)
#             else:
#                 shutil.copy(file_, TARGET_LABELS_PATH)

# # Store image and label paths for future use
# image_paths = glob.glob(TARGET_IMAGES_PATH + "*.jpg")
# label_paths = glob.glob(TARGET_LABELS_PATH + "*.txt")

# print(f"Copied {len(image_paths)} images to {TARGET_IMAGES_PATH}")
# print(f"Copied {len(label_paths)} labels to {TARGET_LABELS_PATH}")

['./images/00110_162.jpg', './images/00154_229.jpg', './images/00071_259.jpg', './images/00059_122.jpg', './images/00177_125.jpg', './images/00133_247.jpg', './images/00138_235.jpg', './images/00184_127.jpg', './images/00098_266.jpg', './images/00192_191.jpg', './images/00128_243.jpg', './images/00123_241.jpg', './images/00110_178.jpg', './images/00104_168.jpg', './images/00138_217.jpg', './images/00165_176.jpg', './images/00060_72.jpg', './images/00074_249.jpg', './images/00159_189.jpg', './images/00177_221.jpg', './images/00137_264.jpg', './images/00143_236.jpg', './images/00095_201.jpg', './images/00060_69.jpg', './images/00090_204.jpg', './images/00159_133.jpg', './images/00165_140.jpg', './images/00066_259.jpg', './images/00177_241.jpg', './images/00156_196.jpg', './images/00132_130.jpg', './images/00071_240.jpg', './images/00178_158.jpg', './images/00060_71.jpg', './images/00056_147.jpg', './images/00100_119.jpg', './images/00142_144.jpg', './images/00090_112.jpg', './images/0011

In [3]:
dataset_path = Path("./")  # replace with 'path/to/dataset' for your custom data
labels = sorted(dataset_path.rglob("*labels/*.txt"))  # all data in 'labels'
labels

[PosixPath('labels/00054_145.txt'),
 PosixPath('labels/00054_164.txt'),
 PosixPath('labels/00056_110.txt'),
 PosixPath('labels/00056_129.txt'),
 PosixPath('labels/00056_147.txt'),
 PosixPath('labels/00056_239.txt'),
 PosixPath('labels/00056_92.txt'),
 PosixPath('labels/00058_102.txt'),
 PosixPath('labels/00058_122.txt'),
 PosixPath('labels/00058_141.txt'),
 PosixPath('labels/00058_179.txt'),
 PosixPath('labels/00059_103.txt'),
 PosixPath('labels/00059_122.txt'),
 PosixPath('labels/00059_141.txt'),
 PosixPath('labels/00059_84.txt'),
 PosixPath('labels/00060_69.txt'),
 PosixPath('labels/00060_70.txt'),
 PosixPath('labels/00060_71.txt'),
 PosixPath('labels/00060_72.txt'),
 PosixPath('labels/00061_108.txt'),
 PosixPath('labels/00061_127.txt'),
 PosixPath('labels/00061_146.txt'),
 PosixPath('labels/00061_184.txt'),
 PosixPath('labels/00063_197.txt'),
 PosixPath('labels/00063_217.txt'),
 PosixPath('labels/00064_101.txt'),
 PosixPath('labels/00064_121.txt'),
 PosixPath('labels/00064_140.txt')

In [4]:
yaml_file = './no_val.yaml'  # your data YAML with data directories and names dictionary
with open(yaml_file, 'r', encoding="utf8") as y:
    classes = yaml.safe_load(y)['names']
cls_idx = list(range(len(classes)))
print(list(zip(classes, cls_idx)))

[('negative', 0), ('positive', 1)]


In [5]:
indx = [l.stem for l in labels] # uses base filename as ID (no extension)
labels_df = pd.DataFrame([], columns=cls_idx, index=indx)
labels_df

Unnamed: 0,0,1
00054_145,,
00054_164,,
00056_110,,
00056_129,,
00056_147,,
...,...,...
00195_208,,
00195_225,,
00201_102,,
00201_121,,


In [6]:
for label in labels:
    lbl_counter = Counter()

    with open(label,'r') as lf:
        lines = lf.readlines()

    for l in lines:
        # classes for YOLO label uses integer at first position of each line
        lbl_counter[int(l.split(' ')[0])] += 1

    labels_df.loc[label.stem] = lbl_counter

labels_df = labels_df.fillna(0.0) # replace `nan` values with `0.0`
labels_df

  labels_df = labels_df.fillna(0.0) # replace `nan` values with `0.0`


Unnamed: 0,0,1
00054_145,0.0,1.0
00054_164,0.0,1.0
00056_110,0.0,2.0
00056_129,0.0,1.0
00056_147,0.0,1.0
...,...,...
00195_208,1.0,0.0
00195_225,1.0,0.0
00201_102,1.0,0.0
00201_121,1.0,0.0


In [7]:
ksplit = 10
kf = KFold(n_splits=ksplit, shuffle=True, random_state=20)   # setting random_state for repeatable results

kfolds = list(kf.split(labels_df))

In [8]:
folds = [f'split_{n}' for n in range(1, ksplit + 1)]
fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)

for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
    train_totals = labels_df.iloc[train_indices].sum()
    val_totals = labels_df.iloc[val_indices].sum()

    # To avoid division by zero, we add a small value (1E-7) to the denominator
    ratio = val_totals / (train_totals + 1E-7)
    fold_lbl_distrb.loc[f'split_{n}'] = ratio

In [9]:
fold_lbl_distrb

Unnamed: 0,0,1
split_1,0.103175,0.157025
split_2,0.103175,0.138211
split_3,0.14876,0.076923
split_4,0.112,0.12
split_5,0.103175,0.111111
split_6,0.14876,0.068702
split_7,0.085937,0.129032
split_8,0.085937,0.12
split_9,0.103175,0.102362
split_10,0.120968,0.09375


In [10]:
import os
import shutil
import yaml
from pathlib import Path

kfold_base_path = Path('./kfold_sagittal_sharp_10')
shutil.rmtree(kfold_base_path) if kfold_base_path.is_dir() else None  # 기존 폴더가 있으면 삭제
os.makedirs(str(kfold_base_path))  # 새 폴더 생성

yaml_paths = list()
train_txt_paths = list()
val_txt_paths = list()

# 이미지 및 라벨 파일 경로를 절대 경로로 변경
absolute_image_paths = [str(Path(p).absolute()) for p in image_paths]

for i, (train_idx, val_idx) in enumerate(kfolds):
    # Get image paths for train-val split
    train_paths = [absolute_image_paths[j] for j in train_idx]
    val_paths = [absolute_image_paths[j] for j in val_idx]
    
    # Create text files to store image paths
    train_txt = kfold_base_path / f"train_{i}.txt"
    val_txt = kfold_base_path / f"val_{i}.txt"
    
    # Write images paths for training and validation in split i
    with open(str(train_txt), 'w') as f:
        f.writelines(s + '\n' for s in train_paths)
    with open(str(val_txt), 'w') as f:
        f.writelines(s + '\n' for s in val_paths)
    
    train_txt_paths.append(str(train_txt))
    val_txt_paths.append(str(val_txt))
    
    # Create dataset yaml file
    dataset_yaml = kfold_base_path / f"data_{i}.yaml"
    yaml_content = {
        'path': str(kfold_base_path),  # Root path for dataset
        'train': str(train_txt),       # Train txt file path
        'val': str(val_txt),           # Validation txt file path
        'test': "/home/under1/Detect/jeongui/cross_val/sagittal_sharp_10/yolo_data/images/test",
        'names': classes               # Class names
    }
    with open(dataset_yaml, 'w') as f:
        yaml.dump(yaml_content, f)
    
    yaml_paths.append(str(dataset_yaml))

# 디버깅을 위한 출력 추가
for i in range(len(kfolds)):
    print(f"Using dataset YAML: {yaml_paths[i]}")
    with open(yaml_paths[i], 'r') as f:
        print(f.read())
    print(f"Train file contents: {train_txt_paths[i]}")
    with open(train_txt_paths[i], 'r') as f:
        print(f.read())
    print(f"Val file contents: {val_txt_paths[i]}")
    with open(val_txt_paths[i], 'r') as f:
        print(f.read())


Using dataset YAML: kfold_sagittal_sharp_10/data_0.yaml
names:
- negative
- positive
path: kfold_sagittal_sharp_10
test: /home/under1/Detect/jeongui/cross_val/sagittal_sharp_10/yolo_data/images/test
train: kfold_sagittal_sharp_10/train_0.txt
val: kfold_sagittal_sharp_10/val_0.txt

Train file contents: kfold_sagittal_sharp_10/train_0.txt
/home/under1/Detect/jeongui/cross_val/sagittal_sharp_10/cross_data/images/00110_162.jpg
/home/under1/Detect/jeongui/cross_val/sagittal_sharp_10/cross_data/images/00154_229.jpg
/home/under1/Detect/jeongui/cross_val/sagittal_sharp_10/cross_data/images/00071_259.jpg
/home/under1/Detect/jeongui/cross_val/sagittal_sharp_10/cross_data/images/00059_122.jpg
/home/under1/Detect/jeongui/cross_val/sagittal_sharp_10/cross_data/images/00177_125.jpg
/home/under1/Detect/jeongui/cross_val/sagittal_sharp_10/cross_data/images/00133_247.jpg
/home/under1/Detect/jeongui/cross_val/sagittal_sharp_10/cross_data/images/00138_235.jpg
/home/under1/Detect/jeongui/cross_val/sagitta

In [12]:
from IPython.display import clear_output
import time

batch = 16
project = 'kfold_demo'
epochs = 200

results = list()

for i in range(ksplit):
    model = YOLO('yolov8n.pt')
    dataset_yaml =yaml_paths[i]
    print(f"Training for fold={i} using {dataset_yaml}")
    model.train(data=dataset_yaml, batch=batch, project=project, epochs=epochs, device=0, exist_ok=True, conf=0.25, nms=True, agnostic_nms=True, save_txt=True, save_conf=True, val=False)
    result = model.metrics # Metrics on validation set
    results.append(result) # save output metrics for further analysis
    clear_output()

In [13]:
metric_values = dict()

for result in results:
    for metric, metric_val in result.results_dict.items():
        if metric not in metric_values:
            metric_values[metric] = []
        metric_values[metric].append(metric_val)

metric_df = pd.DataFrame.from_dict(metric_values)
visualize_metric = ['mean', 'std', 'min', 'max']
metric_df.describe().loc[visualize_metric]

Unnamed: 0,metrics/precision(B),metrics/recall(B),metrics/mAP50(B),metrics/mAP50-95(B),fitness
mean,0.752057,0.676193,0.723579,0.561019,0.577275
std,0.098,0.095154,0.088251,0.08003,0.079572
min,0.579167,0.589286,0.601459,0.443467,0.461548
max,0.891139,0.854396,0.84955,0.648748,0.668828


In [14]:
metric_values

{'metrics/precision(B)': [0.6942564735191463,
  0.5791666666666666,
  0.7209566141466792,
  0.6373540880508755,
  0.7352536849450431,
  0.8911387493056537,
  0.8479929313262646,
  0.7932631270726509,
  0.7814453247303366,
  0.8397435897435898],
 'metrics/recall(B)': [0.7113229767878724,
  0.5982142857142857,
  0.5906862745098039,
  0.5892857142857143,
  0.6933675142099253,
  0.8543956043956045,
  0.6971153846153846,
  0.611764705882353,
  0.60625,
  0.8095238095238095],
 'metrics/mAP50(B)': [0.7409414285714286,
  0.6242699999999999,
  0.6123763392857143,
  0.6014592731829574,
  0.7542580197580198,
  0.8495499999999999,
  0.7842849927849926,
  0.7230885897435897,
  0.712304761904762,
  0.8332551282051283],
 'metrics/mAP50-95(B)': [0.6183924624819624,
  0.44346749442409433,
  0.4534247962870464,
  0.49984643928439987,
  0.6265821092067654,
  0.6487481295251466,
  0.5595241147186147,
  0.6299673038766789,
  0.5001963748996101,
  0.6300368772893773],
 'fitness': [0.6306473590909091,
  0.46