In [1]:
import csv
import pandas as pd
import os
from ultralytics.data.split import autosplit, split_classify_dataset
import shutil

In [2]:
# Notebook-level configuration. None of these names are referenced by the
# cells visible below; they are kept for whatever other code relies on them.
path_vnl = "training_data/kaggle_vnl"
path_ltv = ""  # empty placeholder
output_path = "training_data/dataset_detection"
classes = [
    "player",
    "referee",
    "ball",
]

In [11]:
def change_classes_from_kaggle(directory="training_data/kaggle_vnl_all/labels", class_map=None):
    """Remap the class id (first token) of every label line in ``directory``.

    Each ``*.txt`` file directly inside ``directory`` is read, blank lines are
    dropped, the first whitespace-separated token of every remaining line is
    translated through ``class_map`` and the file is overwritten in place.

    Args:
        directory (str): Folder holding the label files. Defaults to the
            Kaggle VNL label folder this notebook was written for.
        class_map (dict | None): ``{old_id: new_id}``. Keys and values are
            compared as strings, so ints are accepted too. Ids that are not
            in the mapping are left unchanged. Defaults to
            ``{"0": "2", "2": "1"}``.

    Warning:
        The rewrite is destructive and NOT idempotent: with the default
        mapping a second run turns the ids produced by the first run
        (0 -> 2) into 1. Original ids 1 and 2 both end up as 1.
    """
    if class_map is None:
        class_map = {"0": "2", "2": "1"}
    # Normalise so callers may pass {0: 2} as well as {"0": "2"}.
    id_map = {str(old): str(new) for old, new in class_map.items()}

    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)

        new_lines = []
        with open(filepath, "r") as f:
            for line in f:
                parts = line.split()
                if not parts:
                    continue  # skip empty lines

                # Change only the first token (class id); the rest is kept.
                parts[0] = id_map.get(parts[0], parts[0])
                new_lines.append(" ".join(parts))

        # Overwrite the file. A file without label lines stays empty instead
        # of receiving a lone newline.
        with open(filepath, "w") as f:
            f.writelines(row + "\n" for row in new_lines)

In [27]:
def change_classes_from_yolo(directory="runs/detect/predict9/labels", class_map=None):
    """Remap the class id (first token) of every label line in ``directory``.

    Each ``*.txt`` file directly inside ``directory`` is read, blank lines are
    dropped, the first whitespace-separated token of every remaining line is
    translated through ``class_map`` and the file is overwritten in place.

    Args:
        directory (str): Folder holding the label files. Defaults to the
            prediction run folder this notebook was written for.
        class_map (dict | None): ``{old_id: new_id}``. Keys and values are
            compared as strings, so ints are accepted too. Ids that are not
            in the mapping are left unchanged. Defaults to
            ``{"0": "1", "32": "0"}``.
            NOTE(review): 0 and 32 are presumably the COCO ids for
            person / sports ball - confirm against the model that produced
            the predictions.

    Warning:
        The rewrite is destructive and NOT idempotent: with the default
        mapping a second run turns the ids produced by the first run
        (32 -> 0) into 1. Any other id present in the files is kept as-is,
        so an original id 1 becomes indistinguishable from a remapped 0.
    """
    if class_map is None:
        class_map = {"0": "1", "32": "0"}
    # Normalise so callers may pass {0: 1} as well as {"0": "1"}.
    id_map = {str(old): str(new) for old, new in class_map.items()}

    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)

        new_lines = []
        with open(filepath, "r") as f:
            for line in f:
                parts = line.split()
                if not parts:
                    continue  # skip empty lines

                # Change only the first token (class id); the rest is kept.
                parts[0] = id_map.get(parts[0], parts[0])
                new_lines.append(" ".join(parts))

        # Overwrite the file. A file without label lines stays empty instead
        # of receiving a lone newline.
        with open(filepath, "w") as f:
            f.writelines(row + "\n" for row in new_lines)

In [21]:
# Write the autosplit_*.txt image lists for this image folder using the
# three split weights below.
autosplit(
    path='training_data/different_data/all/images',
    weights=(0.87, 0.1, 0.03),
)

Autosplitting images from training_data/different_data/all/images


100%|██████████| 189/189 [00:00<00:00, 12678.50it/s]


In [23]:
# Image-list files to copy from, in the order test / train / val.
# NOTE(review): the autosplit call in this notebook targets
# 'training_data/different_data/all/images' - confirm these paths are correct
# for the working directory the notebook runs in.
split_files = [
    'different_data/all/autosplit_test.txt',
    'different_data/all/autosplit_train.txt',
    'different_data/all/autosplit_val.txt'
]


# Default folder that copy_split_files copies into.
destination_dir = "dataset"

def copy_split_files(file, dir=None):
    """Copy every file listed in the text file ``file`` into the folder ``dir``.

    ``file`` holds one path per line; blank lines are skipped. Each path is
    tried as written (relative to the current working directory) and, if that
    does not exist, relative to the folder containing ``file``. Entries that
    cannot be found either way are reported and skipped.
    NOTE(review): the second lookup is there because ultralytics' autosplit
    appears to write ``./images/...`` entries relative to the list file -
    confirm against the installed version.

    Args:
        file (str): Path to the list file.
        dir (str | None): Destination folder; created if missing. Defaults to
            the module-level ``destination_dir``.
    """
    if dir is None:
        dir = destination_dir

    # Without an existing directory shutil.copy would treat `dir` as a file
    # name and every copy would overwrite the previous one.
    os.makedirs(dir, exist_ok=True)

    list_root = os.path.dirname(os.path.abspath(file))

    with open(file, "r") as f:
        for line in f:
            src_path = line.strip()
            if not src_path:
                continue  # skip empty lines

            if not os.path.isfile(src_path):
                candidate = os.path.normpath(os.path.join(list_root, src_path))
                if not os.path.isfile(candidate):
                    print(f"⚠️ File not found: {src_path}")
                    continue
                src_path = candidate

            # copy file into destination (keep same filename)
            shutil.copy(src_path, dir)
            print(f"✅ Copied: {src_path} -> {dir}")

In [None]:
# Copy the files listed in the first split list (the test list) into
# destination_dir. The destination is passed explicitly because
# copy_split_files takes two arguments.
copy_split_files(split_files[0], destination_dir)

In [3]:
# Split the exported classification project 80/20; per the recorded output
# the result is written to a sibling "<source>_split" folder.
split_classify_dataset(
    'training_data/different_data/project-3-at-2025-09-02-21-57-f5ebf23b',
    0.8,
)

Splitting training_data/different_data/project-3-at-2025-09-02-21-57-f5ebf23b (2 classes, 1272 images) into 80% train, 20% val...
Split complete in training_data/different_data/project-3-at-2025-09-02-21-57-f5ebf23b_split ✅


PosixPath('training_data/different_data/project-3-at-2025-09-02-21-57-f5ebf23b_split')

In [28]:
# Rewrites the label files under runs/detect/predict9/labels in place.
# WARNING: run this exactly once per prediction folder - a second run remaps
# the ids produced by the first run (32 -> 0, then 0 -> 1).
change_classes_from_yolo()

In [5]:
import os
import random
import shutil
from pathlib import Path
import yaml

def split_yolo_dataset(
    images_dir,
    labels_dir,
    output_dir,
    train_ratio=0.7,
    val_ratio=0.2,
    seed=42
):
    """
    Split a YOLO-format dataset into train/val/test folders.

    Images (``.jpg``, ``.png``, ``.jpeg``, case-insensitive) found directly in
    ``images_dir`` are shuffled with a fixed seed and copied to
    ``output_dir/images/<split>``; the label file with the same stem is copied
    to ``output_dir/labels/<split>`` when it exists, otherwise a warning is
    printed. Whatever is left after the train and val shares goes to a
    ``test`` split, which is only created when it is non-empty.

    Args:
        images_dir (str): Path to images folder.
        labels_dir (str): Path to labels folder.
        output_dir (str): Path where split dataset will be saved.
        train_ratio (float): Proportion of training set.
        val_ratio (float): Proportion of validation set.
        seed (int): Random seed for reproducibility. Note that this reseeds
            the global ``random`` module.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected by rounding.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            f"train_ratio and val_ratio must be >= 0 and sum to at most 1 "
            f"(got {train_ratio} and {val_ratio})"
        )

    random.seed(seed)

    # Collect all image files. os.listdir order is arbitrary, so sort first;
    # otherwise the same seed could produce different splits.
    images = sorted(
        f for f in os.listdir(images_dir)
        if f.lower().endswith((".jpg", ".png", ".jpeg"))
    )
    random.shuffle(images)

    n_total = len(images)
    n_train = int(n_total * train_ratio)
    n_val = int(n_total * val_ratio)

    train_files = images[:n_train]
    val_files = images[n_train:n_train + n_val]
    # Remainder (including images lost to int() truncation) becomes the test set.
    test_files = images[n_train + n_val:]

    splits = {"train": train_files, "val": val_files}
    if test_files:
        splits["test"] = test_files

    # Create output folders
    for split in splits:
        (Path(output_dir) / "images" / split).mkdir(parents=True, exist_ok=True)
        (Path(output_dir) / "labels" / split).mkdir(parents=True, exist_ok=True)

    # Copy files
    for split, files in splits.items():
        for img_file in files:
            # Image
            src_img = os.path.join(images_dir, img_file)
            dst_img = os.path.join(output_dir, "images", split, img_file)
            shutil.copy2(src_img, dst_img)

            # Label (same name but .txt)
            label_file = os.path.splitext(img_file)[0] + ".txt"
            src_label = os.path.join(labels_dir, label_file)
            dst_label = os.path.join(output_dir, "labels", split, label_file)

            if os.path.exists(src_label):
                shutil.copy2(src_label, dst_label)
            else:
                print(f"⚠️ Warning: missing label for {img_file}")

    print(f"✅ Split complete: {len(train_files)} train, {len(val_files)} val, {len(test_files)} test.")



In [7]:
# 80/20 train/val split of the gala_ball data, written to datasets/new2.
split_yolo_dataset(
    images_dir='datasets/different_data/gala_ball/images',
    labels_dir='datasets/different_data/gala_ball/labels',
    output_dir='datasets/new2',
    train_ratio=0.8,
    val_ratio=0.2,
)

✅ Split complete: 508 train, 127 val, 1 test.


In [9]:
import os

# Sanity check for one label folder: print every label line whose class id is
# non-numeric or falls outside the range [0, num_classes).
labels_dir = "datasets/new2/labels/val"  # change to val/test as needed
num_classes = 3  # must match nc in data.yaml

for file in os.listdir(labels_dir):
    if not file.endswith(".txt"):
        continue

    path = os.path.join(labels_dir, file)
    with open(path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 1:
                continue  # blank line, nothing to check

            try:
                cls = int(float(parts[0]))
            except ValueError:
                print(f"⚠️ Non-numeric class in {file}: {line.strip()}")
                continue

            if not 0 <= cls < num_classes:
                print(f"⚠️ Invalid class {cls} in {file}")