In [1]:
import os

os.environ["CHECKPOINTS_PATH"] = "../checkpoints"
os.environ["TRACKING_RESULTS_PATH"] = "data/processed_tracking_results"

import itertools
import shutil
from pathlib import Path

import matplotlib.pyplot as plt

from experiment.settings import (
    CLASS_ID_TO_NAME,
    FULLY_LABELED_RECORDINGS,
    LABELING_REC_SAME_BACKGROUND_ID,
    TRAINING_DATASETS_PATH,
    SIMROOM_ID,
    RECORDINGS_PATH,
    RECORDING_FRAMES_PATH,
    IGNORED_CLASS_IDS
)
from src.api.db import Session, engine
from src.api.repositories import simrooms_repo
from src.api.services import simrooms_service
import cv2
import numpy as np
from src.utils import extract_frames_to_dir
import albumentations as A
from tqdm import tqdm

%matplotlib inline

# Global Config

In [19]:
OBJECT_DETECTION_DATASETS_PATH = TRAINING_DATASETS_PATH / "object_detection"

In [20]:
def get_tracking_results_per_class(session: Session, labeling_recording_id: str):
    calibration_id = simrooms_repo.get_calibration_recording(
        session, simroom_id=SIMROOM_ID, recording_id=labeling_recording_id
    ).id
    tracked_classes = simrooms_repo.get_tracked_classes(session, calibration_id)

    if len(tracked_classes) != 15:
        raise ValueError(f"Expected 15 tracked classes but got {len(tracked_classes)}")

    tracking_results_per_class = {
        tracked_class.id: simrooms_repo.get_tracking_result_paths(
            session, calibration_id, tracked_class.id
        )
        for tracked_class in tracked_classes
    }

    return tracking_results_per_class, tracked_classes

def get_per_class_metadata(
    session: Session,
    labeling_recording_id: str,
    ignore_classes: list = None,
):
    tracking_results_per_class, tracked_classes = get_tracking_results_per_class(
        session, labeling_recording_id
    )
    per_class_metadata = {}

    for tracked_class in tracked_classes:
        if ignore_classes and tracked_class.id in ignore_classes:
            print(
                f"Class {CLASS_ID_TO_NAME[tracked_class.id]} is ignored, skipping it"
            )
            continue

        class_id = tracked_class.id
        tracking_results = tracking_results_per_class[class_id]

        frame_indexes = []
        laplacian_variances = []
        mask_areas = []
        bboxes = []
        for tracking_result in tracking_results:
            file = np.load(tracking_result)
            if int(tracking_result.stem) != int(file["frame_idx"]):
                raise ValueError(
                    f"Frame index mismatch: {tracking_result.stem} != {file['frame_idx']}"
                )

            frame_indexes.append(int(tracking_result.stem))
            laplacian_variances.append(file["laplacian_variance"])
            mask_areas.append(np.sum(file["mask"]))
            bboxes.append(file["box"])

        per_class_metadata[class_id] = {
            "class_name": tracked_class.class_name,
            "color": tracked_class.color,
            "frame_indexes": frame_indexes,
            "laplacian_variances": laplacian_variances,
            "mask_areas": mask_areas,
            "bboxes": bboxes,
        }

    return per_class_metadata


# Create Training Data

In [21]:
def select_samples_per_class(per_class_metadata, num_samples_per_class):
    selected_samples_per_class = {}

    for class_id, metadata in per_class_metadata.items():
        frame_indexes = metadata["frame_indexes"]
        bboxes = metadata["bboxes"]

        sample_list = list(zip(frame_indexes, bboxes))
        num_available = len(sample_list)
                
        selected_indices_list = []

        if num_available >= num_samples_per_class:
            # Enough samples available, sample WITHOUT replacement
            selected_indices_list = np.random.choice(
                num_available, 
                size=num_samples_per_class, 
                replace=False
            ).tolist()
        else:
            # Not enough samples (num_available < num_samples_per_class), so oversample.
            # First, include all available samples once.
            selected_indices_list.extend(list(range(num_available)))
            
            num_remaining_to_sample = num_samples_per_class - num_available
            
            if num_remaining_to_sample > 0:
                # Sample the remainder with replacement from the available samples.
                oversampled_indices = np.random.choice(
                    num_available,
                    size=num_remaining_to_sample,
                    replace=True
                ).tolist()
                selected_indices_list.extend(oversampled_indices)
            
            # Shuffle the combined list to mix original and oversampled instances.
            np.random.shuffle(selected_indices_list)

        # Retrieve the actual sample data using the selected indices.
        final_selected_samples_tuples = [sample_list[i] for i in selected_indices_list]
        selected_frame_indexes, selected_bboxes = zip(*final_selected_samples_tuples)
        
        selected_samples_per_class[class_id] = {
            "frame_indexes": list(selected_frame_indexes),
            "bboxes": list(selected_bboxes),
        }

    return selected_samples_per_class
    

def get_samples_per_frame(per_class_metadata):
    samples_per_frame = {}

    for class_id, metadata in per_class_metadata.items():
        frame_indexes = metadata["frame_indexes"]
        bboxes = metadata["bboxes"]

        # zip all metadata together
        sample_list = list(zip(frame_indexes, bboxes))

        for frame_index, bbox in sample_list:
            if frame_index not in samples_per_frame:
                samples_per_frame[frame_index] = []
            samples_per_frame[frame_index].append((class_id, bbox))

    return samples_per_frame


def get_train_val_split(
    selected_samples_per_class,
    train_ratio=0.8,
):
    train_samples_per_class = {}
    val_samples_per_class = {}

    for class_id, metadata in selected_samples_per_class.items():
        frame_indexes = metadata["frame_indexes"]
        bboxes = metadata["bboxes"]

        # zip all metadata together
        sample_list = list(zip(frame_indexes, bboxes))

        # shuffle the samples
        np.random.shuffle(sample_list)

        # split the samples into train and val
        split_index = int(len(sample_list) * train_ratio)
        train_samples = sample_list[:split_index]
        val_samples = sample_list[split_index:]

        # unzip the selected samples
        train_frame_indexes, train_bboxes = zip(*train_samples)
        val_frame_indexes, val_bboxes = zip(*val_samples)

        train_samples_per_class[class_id] = {
            "frame_indexes": train_frame_indexes,
            "bboxes": train_bboxes,
        }
        val_samples_per_class[class_id] = {
            "frame_indexes": val_frame_indexes,
            "bboxes": val_bboxes,
        }

    return train_samples_per_class, val_samples_per_class


def plot_per_class_laplacian_variance(per_class_metadata):
    # Plot boxplots of laplacian variances per class in a single graph (one boxplot per class)
    fig, ax = plt.subplots(figsize=(12, 6))
    for class_id, metadata in per_class_metadata.items():
        laplacian_variances = metadata["laplacian_variances"]
        ax.boxplot(
            laplacian_variances,
            positions=[class_id],
            widths=0.5,
            patch_artist=True,
            boxprops=dict(facecolor=metadata["color"]),
        )
    ax.set_xticks(list(per_class_metadata.keys()))
    ax.set_xticklabels([
        metadata["class_name"] for metadata in per_class_metadata.values()
    ])
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha="right")
    ax.set_xlabel("Class")
    ax.set_ylabel("Laplacian Variance")
    ax.set_title("Laplacian Variance per Class")
    plt.grid()
    plt.show()

    # Plot histograms of laplacian variances per class in a single figure (one histogram per row)
    fig, axs = plt.subplots(
        len(per_class_metadata), 1, figsize=(12, 6 * len(per_class_metadata))
    )
    for i, (class_id, metadata) in enumerate(per_class_metadata.items()):
        laplacian_variances = metadata["laplacian_variances"]
        axs[i].hist(
            laplacian_variances,
            bins=50,
            color=metadata["color"],
            alpha=0.7,
            edgecolor="black",
        )
        axs[i].set_title(f"Laplacian Variance Histogram - {metadata['class_name']}")
        axs[i].set_xlabel("Laplacian Variance")
        axs[i].set_ylabel("Frequency")
        axs[i].grid()
    plt.tight_layout()
    plt.show()

In [22]:
frames, tmp_frames_dir = simrooms_service.extract_tmp_frames(
    LABELING_REC_SAME_BACKGROUND_ID
)

In [None]:
with Session(engine) as session:
    per_class_metadata = get_per_class_metadata(session, LABELING_REC_SAME_BACKGROUND_ID, IGNORED_CLASS_IDS)

plot_per_class_laplacian_variance(per_class_metadata)

Class ampulepoeder is ignored, skipping it


In [23]:
def draw_bboxes(
    image_np, bboxes, labels, class_name_map=None, color=(0, 255, 0), thickness=2
):
    img_res = image_np.copy()
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1

    if not isinstance(bboxes, (list, np.ndarray)):
        print(f"Warning: bboxes is not a list or ndarray: {type(bboxes)}")
        return img_res
    if not isinstance(labels, (list, np.ndarray)):
        print(f"Warning: labels is not a list or ndarray: {type(labels)}")
        # Attempt to proceed if labels seem usable, otherwise return
        if len(bboxes) != len(labels):
            print("Warning: bbox and label length mismatch, cannot draw labels.")
            labels = ["?" for _ in bboxes]  # Placeholder
        elif not all(isinstance(l, (str, int, float)) for l in labels):
            print("Warning: labels contain non-primitive types, cannot draw reliably.")
            labels = ["?" for _ in bboxes]

    for bbox, label in zip(bboxes, labels):
        # Assuming bbox format allows direct conversion to int x_min, y_min, x_max, y_max
        # This might need adjustment based on the ACTUAL format in your bboxes list
        # Example for pascal_voc or albumentations (after denormalizing)
        try:
            # Check if bbox has at least 4 elements
            if len(bbox) < 4:
                print(f"Warning: Skipping invalid bbox (fewer than 4 coords): {bbox}")
                continue
            x_min, y_min, x_max, y_max = map(int, bbox[:4])
        except (ValueError, TypeError) as e:
            print(f"Warning: Could not convert bbox coords to int: {bbox}, Error: {e}")
            continue  # Skip this bbox

        cv2.rectangle(img_res, (x_min, y_min), (x_max, y_max), color, thickness)

        label_name = (
            str(label)
            if class_name_map is None
            else class_name_map.get(label, str(label))
        )
        # Simple text placement above the box
        (text_width, text_height), baseline = cv2.getTextSize(
            label_name, font, font_scale, font_thickness
        )
        text_y = (
            y_min - baseline if y_min - baseline > text_height else y_min + text_height
        )
        cv2.putText(
            img_res, label_name, (x_min, text_y), font, font_scale, color, font_thickness
        )

    return img_res

In [24]:
def samples_per_class_to_samples_per_frame(samples_per_class):
    samples_per_frame = {}
    for class_id, metadata in samples_per_class.items():
        frame_indexes = metadata["frame_indexes"]
        bboxes = metadata["bboxes"]
        sample_list = list(zip(frame_indexes, bboxes))

        for frame_index, bbox in sample_list:
            if frame_index not in samples_per_frame:
                samples_per_frame[frame_index] = []
            samples_per_frame[frame_index].append((class_id, bbox))

    return samples_per_frame


def create_data_files(
    labels_path: Path,
    images_path: Path,
    class_label_to_model_id,
    sample_idx,
    image,
    bboxes,
    class_labels,
):
    padded_sample_idx = str(sample_idx).zfill(10)
    image_width, image_height = image.shape[1], image.shape[0]

    # Save image
    image_path = images_path / f"{padded_sample_idx}.jpg"
    cv2.imwrite(image_path, image)

    # Save labels
    labels_path = labels_path / f"{padded_sample_idx}.txt"

    # transform bboxes to YOLO format
    file_rows = []
    for bbox, class_label in zip(bboxes, class_labels):
        x1, y1, x2, y2 = bbox

        # get xywh format
        x_center = (x1 + x2) / 2
        y_center = (y1 + y2) / 2
        width = x2 - x1
        height = y2 - y1

        # normalize the values
        x_center /= image_width
        y_center /= image_height
        width /= image_width
        height /= image_height

        model_id = class_label_to_model_id[class_label]
        file_rows.append(f"{model_id} {x_center} {y_center} {width} {height}")

    with open(labels_path, "w") as f:
        for row in file_rows:
            f.write(row + "\n")


def create_train_or_val_dataset(
    per_class_metadata,
    class_label_to_model_id,
    train_samples_per_class,
    samples_per_frame,
    frames,
    images_path,
    labels_path,
    crop_size: int,
    is_validation=False,
):
    selected_samples_per_frame = samples_per_class_to_samples_per_frame(
        train_samples_per_class
    )

    current_sample_idx = 0
    for frame_idx, frame in enumerate(tqdm(frames)):
        # check if the frame has any samples
        if selected_samples_per_frame.get(frame_idx) is None:
            continue

        image = cv2.imread(str(frame))

        # gather boxes and labels for the current frame
        class_ids, bboxes = zip(*samples_per_frame[frame_idx])
        bboxes = np.array(bboxes)
        class_labels = [
            per_class_metadata[class_id]["class_name"] for class_id in class_ids
        ]

        # for all selected samples in this frame, create crops
        for class_id, box in selected_samples_per_frame[frame_idx]:
            x1, y1, x2, y2 = box
            cx, cy = (x1 + x2) // 2, (y1 + y2) // 2

            # create a crop around the center of the box
            half_crop = crop_size // 2
            x_min = max(0, cx - half_crop)
            y_min = max(0, cy - half_crop)
            x_max = min(image.shape[1], cx + half_crop)
            y_max = min(image.shape[0], cy + half_crop)

            transform_steps = [
                A.Crop(x_min=x_min, y_min=y_min, x_max=x_max, y_max=y_max),
                A.PadIfNeeded(min_height=crop_size, min_width=crop_size),
            ]

            if not is_validation:
                transform_steps.append(A.HorizontalFlip(p=0.5))
                transform_steps.append(A.RandomBrightnessContrast(p=0.2))

            transform = A.Compose(
                transform_steps,
                bbox_params=A.BboxParams(
                    format="pascal_voc", label_fields=["class_labels"], min_visibility=0.7
                ),
            )

            # Augment the image and boxes
            augmented = transform(image=image, bboxes=bboxes, class_labels=class_labels)
            transformed_image = augmented["image"]
            transformed_bboxes = augmented["bboxes"]
            transformed_class_labels = augmented["class_labels"]

            # Save the transformed image and labels
            create_data_files(
                labels_path,
                images_path,
                class_label_to_model_id,
                current_sample_idx,
                transformed_image,
                transformed_bboxes,
                transformed_class_labels,
            )

            current_sample_idx += 1


def create_metadata_yaml(
    dataset_path: Path,
    per_class_metadata: dict,
):
    abs_dataset_path = dataset_path.resolve()

    metadata_yaml = f"""
path: {abs_dataset_path}
train: images/train
val: images/val
names:
"""

    class_label_to_model_id = {}
    for i, metadata in enumerate(per_class_metadata.values()):
        metadata_yaml += f"  {i}: {metadata['class_name']}\n"
        class_label_to_model_id[metadata["class_name"]] = i

    metadata_yaml_path = dataset_path / "data.yaml"
    with open(metadata_yaml_path, "w") as f:
        f.write(metadata_yaml)

    return class_label_to_model_id


def create_dataset(
    per_class_metadata: dict,
    frames: list[Path],
    datasets_path: Path,
    crop_size: int,
    dataset_type: str,
    num_samples_per_class: int,
):
    print(f"Creating dataset with {num_samples_per_class} samples per class")

    dataset_name = f"{dataset_type}_{crop_size}_{num_samples_per_class}"
    dataset_path = datasets_path / dataset_name
    train_images_path = dataset_path / "images/train"
    train_labels_path = dataset_path / "labels/train"
    val_images_path = dataset_path / "images/val"
    val_labels_path = dataset_path / "labels/val"

    train_images_path.mkdir(parents=True, exist_ok=True)
    train_labels_path.mkdir(parents=True, exist_ok=True)
    val_images_path.mkdir(parents=True, exist_ok=True)
    val_labels_path.mkdir(parents=True, exist_ok=True)

    # Gather necessary metadata
    selected_samples_per_class = select_samples_per_class(
        per_class_metadata, num_samples_per_class
    )
    samples_per_frame = get_samples_per_frame(per_class_metadata)
    train_samples_per_class, val_samples_per_class = get_train_val_split(
        selected_samples_per_class, train_ratio=0.8
    )

    # Create the dataset
    print(f"Creating training dataset")
    class_label_to_model_id = create_metadata_yaml(dataset_path, per_class_metadata)
    create_train_or_val_dataset(
        per_class_metadata,
        class_label_to_model_id,
        train_samples_per_class,
        samples_per_frame,
        frames,
        train_images_path,
        train_labels_path,
        crop_size=crop_size,
    )
    print(f"Creating validation dataset")
    create_train_or_val_dataset(
        per_class_metadata,
        class_label_to_model_id,
        val_samples_per_class,
        samples_per_frame,
        frames,
        val_images_path,
        val_labels_path,
        crop_size=crop_size,
        is_validation=True,
    )

    return dataset_path

In [None]:
if OBJECT_DETECTION_DATASETS_PATH.exists():
    shutil.rmtree(OBJECT_DETECTION_DATASETS_PATH)
OBJECT_DETECTION_DATASETS_PATH.mkdir(parents=True, exist_ok=True)

crop_size = [480, 640]
dataset_type = ["same_background"]#, "different_background", "mixed_background"]
num_samples_per_class = [1000, 2000, 3000, 4000]

all_combinations = list(
    itertools.product(
        crop_size,
        dataset_type,
        num_samples_per_class,
    )
)

print(f"Creating datasets with {len(all_combinations)} combinations")

print(f"Extracting metadata for recording {LABELING_REC_SAME_BACKGROUND_ID}")
with Session(engine) as session:
    per_class_metadata = get_per_class_metadata(session, LABELING_REC_SAME_BACKGROUND_ID, IGNORED_CLASS_IDS)

print(f"Extracting frames for recording {LABELING_REC_SAME_BACKGROUND_ID}")
frames, tmp_frames_dir = simrooms_service.extract_tmp_frames(
    LABELING_REC_SAME_BACKGROUND_ID
)

for combination in tqdm(all_combinations, desc="Creating datasets"):
    crop_size, dataset_type, num_samples = combination

    dataset_path = create_dataset(
        per_class_metadata=per_class_metadata,
        frames=frames,
        datasets_path=OBJECT_DETECTION_DATASETS_PATH,
        crop_size=crop_size,
        dataset_type=dataset_type,
        num_samples_per_class=num_samples,
    )

Class ampulepoeder is ignored, skipping it


Creating datasets:   0%|          | 0/8 [00:00<?, ?it/s]

Creating dataset with 1000 samples per class
Creating training dataset


100%|██████████| 14121/14121 [00:47<00:00, 298.98it/s]


Creating validation dataset


100%|██████████| 14121/14121 [00:12<00:00, 1131.99it/s]
Creating datasets:  12%|█▎        | 1/8 [00:59<06:58, 59.76s/it]

Creating dataset with 2000 samples per class
Creating training dataset


100%|██████████| 14121/14121 [01:22<00:00, 171.95it/s]


Creating validation dataset


  0%|          | 0/14121 [00:00<?, ?it/s][A

# Model Training

In [4]:
from ultralytics import YOLO

In [5]:
model = YOLO("yolo11n.pt")  # load a pretrained model

dataset_path = OBJECT_DETECTION_DATASETS_PATH / "2000_samples/data.yaml"
results = model.train(
    data=dataset_path, epochs=20, imgsz=IMG_CROP_SIZE, device="cuda", batch=32
)

New https://pypi.org/project/ultralytics/8.3.143 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.67 🚀 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 4090, 24564MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolo11n.pt, data=data/training_datasets/object_detection/2000_samples/data.yaml, epochs=20, time=None, patience=100, batch=32, imgsz=640, save=True, save_period=-1, cache=False, device=cuda, workers=8, project=None, name=train14, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, cla

E0000 00:00:1748010793.456841    2153 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748010793.473549    2153 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Overriding model.yaml nc=80 with nc=14

                   from  n    params  module                                       arguments                     
  0                  -1  1       464  ultralytics.nn.modules.conv.Conv             [3, 16, 3, 2]                 
  1                  -1  1      4672  ultralytics.nn.modules.conv.Conv             [16, 32, 3, 2]                
  2                  -1  1      6640  ultralytics.nn.modules.block.C3k2            [32, 64, 1, False, 0.25]      
  3                  -1  1     36992  ultralytics.nn.modules.conv.Conv             [64, 64, 3, 2]                
  4                  -1  1     26080  ultralytics.nn.modules.block.C3k2            [64, 128, 1, False, 0.25]     
  5                  -1  1    147712  ultralytics.nn.modules.conv.Conv             [128, 128, 3, 2]              
  6                  -1  1     87040  ultralytics.nn.modules.block.C3k2            [128, 128, 1, True]           
  7                  -1  1    295424  ultralytic

[34m[1mtrain: [0mScanning /home/zilian/projects/bachelorproef/experiment/data/training_datasets/object_detection/2000_samples/labels/train.cache... 22400 images, 0 backgrounds, 0 corrupt: 100%|██████████| 22400/22400 [00:00<?, ?it/s]


[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, method='weighted_average', num_output_channels=3), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))


[34m[1mval: [0mScanning /home/zilian/projects/bachelorproef/experiment/data/training_datasets/object_detection/2000_samples/labels/val.cache... 5600 images, 0 backgrounds, 0 corrupt: 100%|██████████| 5600/5600 [00:00<?, ?it/s]


Plotting labels to /home/zilian/projects/bachelorproef/runs/detect/train14/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000556, momentum=0.9) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.0005), 87 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1m/home/zilian/projects/bachelorproef/runs/detect/train14[0m
Starting training for 20 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/20      4.64G      0.916      1.956     0.9556        163        640: 100%|██████████| 700/700 [01:31<00:00,  7.62it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:16<00:00,  5.37it/s]


                   all       5600      19124      0.875        0.9       0.92      0.731

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/20      4.85G     0.7645     0.8427     0.9072        159        640: 100%|██████████| 700/700 [01:23<00:00,  8.37it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:16<00:00,  5.36it/s]


                   all       5600      19124      0.909      0.928      0.952      0.783

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/20      4.64G     0.7304     0.6724     0.8978        167        640: 100%|██████████| 700/700 [01:17<00:00,  9.09it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:17<00:00,  5.16it/s]

                   all       5600      19124      0.904      0.914      0.944       0.78






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/20      4.72G     0.7049     0.5956     0.8895        170        640: 100%|██████████| 700/700 [01:13<00:00,  9.56it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:16<00:00,  5.23it/s]

                   all       5600      19124      0.914      0.938      0.961      0.813






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/20      4.79G     0.6692      0.541     0.8804        127        640: 100%|██████████| 700/700 [01:13<00:00,  9.46it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:16<00:00,  5.21it/s]

                   all       5600      19124      0.923      0.929      0.959      0.814






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/20      4.65G     0.6452     0.5106     0.8747        145        640: 100%|██████████| 700/700 [01:17<00:00,  9.06it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:18<00:00,  4.81it/s]

                   all       5600      19124      0.928      0.938      0.968      0.837






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/20      4.66G     0.6276     0.4857     0.8702        157        640: 100%|██████████| 700/700 [01:13<00:00,  9.50it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:17<00:00,  5.08it/s]

                   all       5600      19124      0.925      0.943      0.972      0.847






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/20      4.65G     0.6141     0.4688     0.8685        182        640: 100%|██████████| 700/700 [01:13<00:00,  9.47it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:18<00:00,  4.84it/s]

                   all       5600      19124      0.931      0.947      0.972      0.846






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/20      4.64G     0.6008     0.4531     0.8632        125        640: 100%|██████████| 700/700 [01:15<00:00,  9.32it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:18<00:00,  4.78it/s]

                   all       5600      19124      0.932      0.951      0.975      0.862






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/20      4.64G     0.5864     0.4418     0.8601        146        640: 100%|██████████| 700/700 [01:19<00:00,  8.76it/s] 
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:15<00:00,  5.87it/s]

                   all       5600      19124      0.927      0.954      0.975      0.863





Closing dataloader mosaic
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, method='weighted_average', num_output_channels=3), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      11/20      4.61G     0.5486     0.3934      0.834         99        640: 100%|██████████| 700/700 [01:14<00:00,  9.43it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:15<00:00,  5.80it/s]

                   all       5600      19124      0.943      0.956      0.978       0.87






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      12/20      4.61G     0.5347     0.3753     0.8302         84        640: 100%|██████████| 700/700 [01:11<00:00,  9.75it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:17<00:00,  5.12it/s]

                   all       5600      19124      0.945       0.96      0.979      0.872






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      13/20      4.61G     0.5205     0.3634      0.826         98        640: 100%|██████████| 700/700 [01:11<00:00,  9.75it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:17<00:00,  5.16it/s]

                   all       5600      19124      0.947      0.961       0.98      0.877






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      14/20      4.61G       0.51      0.354     0.8235        102        640: 100%|██████████| 700/700 [01:11<00:00,  9.77it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:18<00:00,  4.81it/s]

                   all       5600      19124      0.954      0.961      0.982      0.884






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      15/20      4.61G     0.5015     0.3431     0.8208         98        640: 100%|██████████| 700/700 [01:11<00:00,  9.81it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:18<00:00,  4.81it/s]

                   all       5600      19124      0.952      0.961      0.982      0.886






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      16/20      4.61G     0.4931      0.334      0.819         93        640: 100%|██████████| 700/700 [01:11<00:00,  9.81it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:18<00:00,  4.88it/s]

                   all       5600      19124      0.953      0.967      0.984      0.889






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      17/20      4.61G     0.4817     0.3234     0.8164        109        640: 100%|██████████| 700/700 [01:10<00:00,  9.94it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:17<00:00,  5.00it/s]

                   all       5600      19124      0.958      0.966      0.985      0.893






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      18/20      4.61G     0.4725     0.3148     0.8136        111        640: 100%|██████████| 700/700 [01:11<00:00,  9.83it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:18<00:00,  4.85it/s]

                   all       5600      19124      0.958      0.967      0.985      0.895






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      19/20      4.61G     0.4657     0.3074     0.8127        119        640: 100%|██████████| 700/700 [01:14<00:00,  9.36it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:17<00:00,  4.98it/s]

                   all       5600      19124      0.962      0.964      0.986      0.896






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      20/20      4.61G     0.4555     0.2983       0.81        104        640: 100%|██████████| 700/700 [01:10<00:00,  9.92it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:18<00:00,  4.77it/s]

                   all       5600      19124       0.96      0.968      0.986      0.899






20 epochs completed in 0.517 hours.
Optimizer stripped from /home/zilian/projects/bachelorproef/runs/detect/train14/weights/last.pt, 5.5MB
Optimizer stripped from /home/zilian/projects/bachelorproef/runs/detect/train14/weights/best.pt, 5.5MB

Validating /home/zilian/projects/bachelorproef/runs/detect/train14/weights/best.pt...
Ultralytics 8.3.67 🚀 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce RTX 4090, 24564MiB)
YOLO11n summary (fused): 238 layers, 2,584,882 parameters, 0 gradients, 6.3 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 88/88 [00:23<00:00,  3.81it/s]


                   all       5600      19124       0.96      0.969      0.986      0.899
        naaldcontainer       1460       1460      0.975      0.998      0.994      0.975
                 spuit       1951       1951      0.909       0.83      0.941       0.69
             keukenmes       2123       2123      0.979      0.978       0.99       0.93
                infuus        479        479      0.895      0.943      0.977      0.867
           stethoscoop       2014       2014      0.965      0.995      0.992      0.974
               bol wol       1200       1200      0.943      0.974      0.984      0.863
                 snoep       1598       1598      0.956      0.964      0.988      0.865
               nuchter        756        756      0.955      0.989       0.99      0.974
             fotokader       1100       1100      0.986      0.994      0.994       0.98
              iced tea       1628       1628       0.99      0.993      0.994      0.887
                  bri

In [6]:
model_path = Path("data/models/object_detection") / "2000_samples.pt"
model.save(str(model_path))

## Extract frames for trial recordings

In [7]:
if not RECORDING_FRAMES_PATH.exists():
    RECORDING_FRAMES_PATH.mkdir(parents=True, exist_ok=True)

for recording_id in tqdm(FULLY_LABELED_RECORDINGS, desc="Extracting frames"):
    print(f"Extracting frames for {recording_id}")
    recording_video_path = RECORDINGS_PATH / f"{recording_id}.mp4"
    recording_frames_path = RECORDING_FRAMES_PATH / recording_id

    if recording_frames_path.exists():
        shutil.rmtree(recording_frames_path)
    recording_frames_path.mkdir(parents=True, exist_ok=True)
    
    extract_frames_to_dir(recording_video_path, recording_frames_path)

Extracting frames:   0%|          | 0/10 [00:00<?, ?it/s]

Extracting frames for 67b71a70-da64-467a-9fb6-91bc29265fd1


Extracting frames:  10%|█         | 1/10 [00:07<01:03,  7.05s/it]

Extracting frames for 32f02db7-adc0-4556-a2da-ed2ba60a58c9


Extracting frames:  20%|██        | 2/10 [00:11<00:44,  5.57s/it]

Extracting frames for b8eeecc0-06b1-47f7-acb5-89aab3c1724d


Extracting frames:  30%|███       | 3/10 [00:16<00:37,  5.39s/it]

Extracting frames for d50c5f3b-2822-4462-9880-5a8f0dd46bfb


Extracting frames:  40%|████      | 4/10 [00:21<00:31,  5.29s/it]

Extracting frames for 9fa3e3b8-ed94-4b06-ba49-e66e3997d710


Extracting frames:  50%|█████     | 5/10 [00:25<00:24,  4.86s/it]

Extracting frames for 98128cdc-ffeb-40cb-9528-573e25028e87


Extracting frames:  60%|██████    | 6/10 [00:29<00:17,  4.32s/it]

Extracting frames for 89b60530-e0e4-4f5d-9ee6-af85c8d99ff4


Extracting frames:  70%|███████   | 7/10 [00:33<00:12,  4.29s/it]

Extracting frames for 2fe01600-c057-40ee-8434-4e9e0688ca2d


Extracting frames:  80%|████████  | 8/10 [00:40<00:10,  5.07s/it]

Extracting frames for 67823ccd-a1f0-4cde-b954-3b9e5fe160c1


Extracting frames:  90%|█████████ | 9/10 [00:45<00:05,  5.14s/it]

Extracting frames for b8f453aa-5a12-4cbb-a0ec-20eb503f8797


Extracting frames: 100%|██████████| 10/10 [00:50<00:00,  5.02s/it]
Extracting frames: 100%|██████████| 10/10 [00:50<00:00,  5.02s/it]
