# Лабораторная работа 8, студент Устинов Денис Александрович М8О-406Б-21

## 1. Выбор начальных условий

### a. Набор данных для задачи сегментации
В качестве датасета был выбран BCCD (https://github.com/Shenggan/BCCD_Dataset).

Датасет включает изображения, на которых представлены различные типы клеток крови, такие как эритроциты, лейкоциты и тромбоциты. Он предназначен для задач компьютерного зрения и детекции объектов. Аннотации на изображениях содержат информацию о типах клеток и их местоположении в виде прямоугольных ограничивающих рамок 

Обоснование выбора:

С практической точки зрения, этот датасет может быть использован для разработки автоматизированных систем, которые помогают в диагностике заболеваний, таких как анемия, лейкемия, инфекции и другие расстройства крови. Модели детекции, обученные на BCCD, могут автоматически классифицировать и подсчитывать различные типы клеток крови, что значительно ускоряет процесс диагностики

### b. Выбор метрик качества

1) Precision - показывает, насколько уверены предсказания модели
2) Recall - показывает, насколько хорошо модель находит все настоящие объекты
3) F1-Score - гармоническое среднее Precision и Recall.
4) mAP - учитывает точность и позиционирование bounding box'ов 

## 2. Создание бейзлайна и оценка качества

### Подготовим датасет

In [None]:
import os
import shutil
import random
import xml.etree.ElementTree as ET

base_dir = 'BCCD_Dataset/BCCD'
images_dir = os.path.join(base_dir, 'JPEGImages')
annotations_dir = os.path.join(base_dir, 'Annotations')

output_dir = 'bccd_yolo'
images_out = os.path.join(output_dir, 'images')
labels_out = os.path.join(output_dir, 'labels')

for split in ['train', 'val']:
    os.makedirs(os.path.join(images_out, split), exist_ok=True)
    os.makedirs(os.path.join(labels_out, split), exist_ok=True)

classes = ['RBC', 'WBC', 'Platelets']
class_to_id = {name: i for i, name in enumerate(classes)}

files = [f for f in os.listdir(annotations_dir) if f.endswith('.xml')]
random.shuffle(files)
split_index = int(0.8 * len(files))
splits = {'train': files[:split_index], 'val': files[split_index:]}

def convert_bbox(size, box):
    dw = 1. / size[0]
    dh = 1. / size[1]
    x = (box[0] + box[1]) / 2.0
    y = (box[2] + box[3]) / 2.0
    w = box[1] - box[0]
    h = box[3] - box[2]
    return (x * dw, y * dh, w * dw, h * dh)

for split, xml_files in splits.items():
    for xml_file in xml_files:
        xml_path = os.path.join(annotations_dir, xml_file)
        tree = ET.parse(xml_path)
        root = tree.getroot()

        image_name = root.find('filename').text
        image_path = os.path.join(images_dir, image_name)
        shutil.copy(image_path, os.path.join(images_out, split, image_name))

        size = root.find('size')
        w = int(size.find('width').text)
        h = int(size.find('height').text)

        label_path = os.path.join(labels_out, split, image_name.replace('.jpg', '.txt'))
        with open(label_path, 'w') as f:
            for obj in root.iter('object'):
                cls = obj.find('name').text
                if cls not in classes:
                    continue
                xml_box = obj.find('bndbox')
                b = (
                    float(xml_box.find('xmin').text),
                    float(xml_box.find('xmax').text),
                    float(xml_box.find('ymin').text),
                    float(xml_box.find('ymax').text),
                )
                bb = convert_bbox((w, h), b)
                f.write(f"{class_to_id[cls]} {' '.join(map(str, bb))}\n")

yaml_path = os.path.join(output_dir, 'data.yaml')
with open(yaml_path, 'w') as f:
    f.write(f"""train: {os.path.abspath(images_out + '/train')}
val: {os.path.abspath(images_out + '/val')}

nc: 3
names: ['RBC', 'WBC', 'Platelets']
""")

### a. Обучить сверточную модель (yolo11n) из ultralytics для выбранного набора данных и оценить качество моделей по выбранным метрикам на выбранном наборе данных

In [2]:
from ultralytics import YOLO

BATCH_SIZE = 4
EPOCHS = 5

model = YOLO('yolo11n.pt')

results = model.train(
    data='bccd_yolo/data.yaml',
    epochs=EPOCHS,
    imgsz=640,
    device='cpu',
    batch=BATCH_SIZE,
    verbose=True
)

metrics = model.val()
precision = metrics.box.p.mean()
recall = metrics.box.r.mean()
f1_score = 2 * (precision * recall) / (precision + recall + 1e-9)
map50 = metrics.box.map50.mean()
map50_95 = metrics.box.map.mean()

print(f"Метрики (yolo11n):")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1_score:.4f}")
print(f"mAP@0.5: {map50:.4f}")
print(f"mAP@0.5:0.95: {map50_95:.4f}")


Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt'...


100%|██████████| 5.35M/5.35M [00:00<00:00, 20.7MB/s]


Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolo11n.pt, data=bccd_yolo/data.yaml, epochs=5, time=None, patience=100, batch=4, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train8, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None, format

[34m[1mtrain: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/train.cache... 291 images, 0 backgrounds, 0 corrupt: 100%|██████████| 291/291 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]


Plotting labels to runs/detect/train8/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.001429, momentum=0.9) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.0005), 87 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train8[0m
Starting training for 5 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/5         0G      1.332      2.868      1.349        124        640: 100%|██████████| 73/73 [01:07<00:00,  1.09it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.14it/s]

                   all         73        987      0.921       0.21      0.488      0.282

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        2/5         0G      1.292      1.745      1.246         51        640: 100%|██████████| 73/73 [01:06<00:00,  1.10it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:03<00:00,  2.62it/s]

                   all         73        987      0.764      0.895      0.863      0.541

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        3/5         0G      1.213      1.288       1.26         42        640: 100%|██████████| 73/73 [01:02<00:00,  1.17it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:03<00:00,  2.50it/s]

                   all         73        987      0.775      0.887      0.875      0.577

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        4/5         0G      1.148      1.155      1.217         48        640: 100%|██████████| 73/73 [01:02<00:00,  1.16it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.48it/s]

                   all         73        987      0.795       0.93      0.893      0.585

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        5/5         0G      1.121      1.062      1.199        104        640: 100%|██████████| 73/73 [01:12<00:00,  1.00it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.35it/s]

                   all         73        987      0.822      0.885      0.896      0.597

5 epochs completed in 0.098 hours.





Optimizer stripped from runs/detect/train8/weights/last.pt, 5.5MB
Optimizer stripped from runs/detect/train8/weights/best.pt, 5.5MB

Validating runs/detect/train8/weights/best.pt...
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
YOLO11n summary (fused): 100 layers, 2,582,737 parameters, 0 gradients, 6.3 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:03<00:00,  2.53it/s]


                   all         73        987      0.823      0.883      0.896      0.596
                   RBC         68        827      0.716      0.824       0.82      0.563
                   WBC         71         73      0.935          1      0.971      0.771
             Platelets         48         87      0.818      0.825      0.895      0.455
Speed: 0.7ms preprocess, 47.3ms inference, 0.0ms loss, 3.7ms postprocess per image
Results saved to [1mruns/detect/train8[0m
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
YOLO11n summary (fused): 100 layers, 2,582,737 parameters, 0 gradients, 6.3 GFLOPs


[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 19/19 [00:03<00:00,  4.88it/s]


                   all         73        987      0.823      0.883      0.896      0.596
                   RBC         68        827      0.716      0.824       0.82      0.563
                   WBC         71         73      0.935          1      0.971      0.771
             Platelets         48         87      0.818      0.825      0.895      0.455
Speed: 0.5ms preprocess, 45.9ms inference, 0.0ms loss, 3.9ms postprocess per image
Results saved to [1mruns/detect/train82[0m
Метрики (yolo11n):
Precision: 0.8232
Recall: 0.8832
F1-score: 0.8521
mAP@0.5: 0.8955
mAP@0.5:0.95: 0.5963


### b. Обучить трансформерную модель (rtdetr-l) из ultralytics для выбранного набора данных и оценить качество моделей по выбранным метрикам на выбранном наборе данных

Обучим модель rtdetr-l из Utralytics, так как семейство моделей YoloV11 не предоставляет трансформерных моделей

In [None]:
from ultralytics import RTDETR

BATCH_SIZE = 4
EPOCHS = 2

model = RTDETR('rtdetr-l.pt')

results = model.train(
    data='bccd_yolo/data.yaml',
    epochs=EPOCHS,
    imgsz=640,
    device='cpu',
    batch=BATCH_SIZE,
    verbose=True
)

metrics = model.val()
precision = metrics.box.p.mean()
recall = metrics.box.r.mean()
f1_score = 2 * (precision * recall) / (precision + recall + 1e-9)
map50 = metrics.box.map50.mean()
map50_95 = metrics.box.map.mean()

print(f"Метрики (rtdetr-l):")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1_score:.4f}")
print(f"mAP@0.5: {map50:.4f}")
print(f"mAP@0.5:0.95: {map50_95:.4f}")


Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=rtdetr-l.pt, data=bccd_yolo/data.yaml, epochs=2, time=None, patience=100, batch=4, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train9, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None, forma

[34m[1mtrain: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/train.cache... 291 images, 0 backgrounds, 0 corrupt: 100%|██████████| 291/291 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]

Plotting labels to runs/detect/train9/labels.jpg... 





[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.001429, momentum=0.9) with parameter groups 143 weight(decay=0.0), 206 weight(decay=0.0005), 226 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train9[0m
Starting training for 2 epochs...

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        1/2         0G     0.8144      2.808     0.4955        112        640: 100%|██████████| 73/73 [08:15<00:00,  6.79s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:33<00:00,  3.38s/it]

                   all         73        987       0.86      0.234      0.214      0.137






      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        2/2         0G     0.4867     0.7636     0.2374         46        640: 100%|██████████| 73/73 [08:06<00:00,  6.66s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:29<00:00,  3.00s/it]

                   all         73        987      0.632      0.882      0.759        0.5






2 epochs completed in 0.291 hours.
Optimizer stripped from runs/detect/train9/weights/last.pt, 66.1MB
Optimizer stripped from runs/detect/train9/weights/best.pt, 66.1MB

Validating runs/detect/train9/weights/best.pt...
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
rt-detr-l summary: 302 layers, 31,989,905 parameters, 0 gradients, 103.4 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:28<00:00,  2.84s/it]


                   all         73        987      0.628      0.881      0.758        0.5
                   RBC         68        827      0.432      0.839       0.63      0.429
                   WBC         71         73       0.83          1      0.953      0.738
             Platelets         48         87      0.622      0.805      0.692      0.334
Speed: 0.8ms preprocess, 385.3ms inference, 0.0ms loss, 0.1ms postprocess per image
Results saved to [1mruns/detect/train9[0m
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
rt-detr-l summary: 302 layers, 31,989,905 parameters, 0 gradients, 103.4 GFLOPs


[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 19/19 [00:29<00:00,  1.54s/it]


                   all         73        987      0.628      0.881      0.758        0.5
                   RBC         68        827      0.432      0.839       0.63      0.429
                   WBC         71         73       0.83          1      0.953      0.738
             Platelets         48         87      0.622      0.805      0.692      0.334
Speed: 0.6ms preprocess, 395.7ms inference, 0.0ms loss, 0.1ms postprocess per image
Results saved to [1mruns/detect/train92[0m
Метрики (yolo8-vit):
Precision: 0.6280
Recall: 0.8813
F1-score: 0.7334
mAP@0.5: 0.7582
mAP@0.5:0.95: 0.5001


## 3. Улучшение бейзлайна

### a. Сформулировать гипотезы (аугментации данных, подбор моделей, подбор гиперпараметров и т.д)

Так как модели yolo11 и rtdetr-l используют аугментацию по умолчанию, то в гипотезу по улучшению бейзлайна не включаю это.

1. **Оптимизация гиперпараметров**. Подбор learning rate, batch size и количества эпох
2. **Использование другого метода оптимизации**. Для улучшения точности можно попытаться заменить стандартный оптимизатор на SGD

### Обучение моделей, оценка качества обучения моделей по метрикам

#### Оптимизация гиперпараметров

##### Сверточная модель yolo11n

In [4]:
from ultralytics import YOLO

BATCH_SIZE = 8
EPOCHS = 7
LEARNING_RATE = 0.0005

model = YOLO('yolo11n.pt')

results = model.train(
    data='bccd_yolo/data.yaml',
    epochs=EPOCHS,
    imgsz=640,
    device='cpu',
    batch=BATCH_SIZE,
    optimizer='AdamW',
    lr0=LEARNING_RATE,
    verbose=True
)

metrics = model.val()
precision = metrics.box.p.mean()
recall = metrics.box.r.mean()
f1_score = 2 * (precision * recall) / (precision + recall + 1e-9)
map50 = metrics.box.map50.mean()
map50_95 = metrics.box.map.mean()

print(f"Метрики (yolo11n):")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1_score:.4f}")
print(f"mAP@0.5: {map50:.4f}")
print(f"mAP@0.5:0.95: {map50_95:.4f}")


Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolo11n.pt, data=bccd_yolo/data.yaml, epochs=7, time=None, patience=100, batch=8, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train10, exist_ok=False, pretrained=True, optimizer=AdamW, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None, form

[34m[1mtrain: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/train.cache... 291 images, 0 backgrounds, 0 corrupt: 100%|██████████| 291/291 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]

Plotting labels to runs/detect/train10/labels.jpg... 





[34m[1moptimizer:[0m AdamW(lr=0.0005, momentum=0.937) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.0005), 87 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train10[0m
Starting training for 7 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/7         0G      1.291      1.986      1.236        108        640: 100%|██████████| 37/37 [01:02<00:00,  1.69s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:13<00:00,  2.70s/it]

                   all         73        987     0.0359      0.542      0.337      0.201

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        2/7         0G      1.176      1.389      1.136         73        640: 100%|██████████| 37/37 [01:03<00:00,  1.71s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:11<00:00,  2.27s/it]

                   all         73        987      0.571      0.549      0.575      0.392

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        3/7         0G       1.15      1.238      1.124         43        640: 100%|██████████| 37/37 [00:54<00:00,  1.47s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:11<00:00,  2.22s/it]

                   all         73        987      0.887      0.595      0.783      0.534

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        4/7         0G      1.132      1.181      1.128         61        640: 100%|██████████| 37/37 [00:55<00:00,  1.51s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:11<00:00,  2.34s/it]

                   all         73        987      0.759      0.822      0.892      0.602

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        5/7         0G      1.109      1.103       1.11        121        640: 100%|██████████| 37/37 [01:02<00:00,  1.70s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:11<00:00,  2.37s/it]

                   all         73        987       0.84      0.839      0.884        0.6

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        6/7         0G      1.076       1.07      1.099         77        640: 100%|██████████| 37/37 [00:55<00:00,  1.49s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:11<00:00,  2.36s/it]

                   all         73        987      0.821      0.911      0.894      0.602

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        7/7         0G       1.09      1.057      1.115         83        640: 100%|██████████| 37/37 [00:57<00:00,  1.56s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:11<00:00,  2.32s/it]

                   all         73        987       0.83      0.904      0.892      0.603

7 epochs completed in 0.138 hours.
Optimizer stripped from runs/detect/train10/weights/last.pt, 5.5MB





Optimizer stripped from runs/detect/train10/weights/best.pt, 5.5MB

Validating runs/detect/train10/weights/best.pt...
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
YOLO11n summary (fused): 100 layers, 2,582,737 parameters, 0 gradients, 6.3 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:10<00:00,  2.16s/it]


                   all         73        987       0.83      0.905      0.892      0.602
                   RBC         68        827      0.706      0.843      0.816      0.566
                   WBC         71         73       0.95          1       0.99      0.784
             Platelets         48         87      0.835      0.871      0.869      0.457
Speed: 0.8ms preprocess, 139.8ms inference, 0.0ms loss, 4.6ms postprocess per image
Results saved to [1mruns/detect/train10[0m
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
YOLO11n summary (fused): 100 layers, 2,582,737 parameters, 0 gradients, 6.3 GFLOPs


[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:03<00:00,  2.95it/s]


                   all         73        987       0.83      0.905      0.892      0.602
                   RBC         68        827      0.706      0.843      0.816      0.566
                   WBC         71         73       0.95          1       0.99      0.784
             Platelets         48         87      0.835      0.871      0.869      0.457
Speed: 0.8ms preprocess, 38.5ms inference, 0.0ms loss, 4.5ms postprocess per image
Results saved to [1mruns/detect/train102[0m
Метрики (yolo11n):
Precision: 0.8305
Recall: 0.9046
F1-score: 0.8659
mAP@0.5: 0.8916
mAP@0.5:0.95: 0.6022


##### Трансформерная модель rtdetr-l

In [5]:
from ultralytics import RTDETR

BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 0.0005

model = RTDETR('rtdetr-l.pt')

results = model.train(
    data='bccd_yolo/data.yaml',
    epochs=EPOCHS,
    imgsz=640,
    device='cpu',
    batch=BATCH_SIZE,
    optimizer='AdamW',
    lr0=LEARNING_RATE,
    verbose=True
)

metrics = model.val()
precision = metrics.box.p.mean()
recall = metrics.box.r.mean()
f1_score = 2 * (precision * recall) / (precision + recall + 1e-9)
map50 = metrics.box.map50.mean()
map50_95 = metrics.box.map.mean()

print(f"Метрики (rtdetr-l):")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1_score:.4f}")
print(f"mAP@0.5: {map50:.4f}")
print(f"mAP@0.5:0.95: {map50_95:.4f}")


Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=rtdetr-l.pt, data=bccd_yolo/data.yaml, epochs=4, time=None, patience=100, batch=8, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train11, exist_ok=False, pretrained=True, optimizer=AdamW, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None, for

[34m[1mtrain: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/train.cache... 291 images, 0 backgrounds, 0 corrupt: 100%|██████████| 291/291 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]

Plotting labels to runs/detect/train11/labels.jpg... 





[34m[1moptimizer:[0m AdamW(lr=0.0005, momentum=0.937) with parameter groups 143 weight(decay=0.0), 206 weight(decay=0.0005), 226 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train11[0m
Starting training for 4 epochs...

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        1/4         0G      1.229     0.9256      0.724        103        640: 100%|██████████| 37/37 [08:04<00:00, 13.09s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [01:24<00:00, 16.83s/it]

                   all         73        987    0.00412      0.109    0.00633    0.00156






      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        2/4         0G     0.7153      1.011     0.3001         66        640: 100%|██████████| 37/37 [07:48<00:00, 12.65s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [01:24<00:00, 16.97s/it]

                   all         73        987     0.0123      0.327     0.0586     0.0275






      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        3/4         0G     0.4957      1.193     0.2144         36        640: 100%|██████████| 37/37 [07:40<00:00, 12.45s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [01:24<00:00, 16.95s/it]

                   all         73        987     0.0125      0.331     0.0541     0.0324






      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        4/4         0G     0.4443       1.22     0.1933         55        640: 100%|██████████| 37/37 [08:04<00:00, 13.09s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [01:29<00:00, 17.97s/it]

                   all         73        987     0.0125      0.331     0.0445     0.0283






4 epochs completed in 0.623 hours.
Optimizer stripped from runs/detect/train11/weights/last.pt, 66.1MB
Optimizer stripped from runs/detect/train11/weights/best.pt, 66.1MB

Validating runs/detect/train11/weights/best.pt...
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
rt-detr-l summary: 302 layers, 31,989,905 parameters, 0 gradients, 103.4 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [01:21<00:00, 16.36s/it]


                   all         73        987     0.0125      0.331     0.0534     0.0321
                   RBC         68        827     0.0375      0.994       0.16     0.0962
                   WBC         71         73          0          0          0          0
             Platelets         48         87          0          0          0          0
Speed: 0.8ms preprocess, 1116.5ms inference, 0.0ms loss, 0.1ms postprocess per image
Results saved to [1mruns/detect/train11[0m
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
rt-detr-l summary: 302 layers, 31,989,905 parameters, 0 gradients, 103.4 GFLOPs


[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:26<00:00,  2.66s/it]


                   all         73        987     0.0125      0.331     0.0534     0.0321
                   RBC         68        827     0.0375      0.994       0.16     0.0962
                   WBC         71         73          0          0          0          0
             Platelets         48         87          0          0          0          0
Speed: 1.0ms preprocess, 359.8ms inference, 0.0ms loss, 0.1ms postprocess per image
Results saved to [1mruns/detect/train112[0m
Метрики (rtdetr-l):
Precision: 0.0125
Recall: 0.3313
F1-score: 0.0241
mAP@0.5: 0.0534
mAP@0.5:0.95: 0.0321


#### Использование другого метода оптимизации

##### Сверточная модель yolo11

In [12]:
from ultralytics import YOLO

BATCH_SIZE = 4
EPOCHS = 5

model = YOLO('yolo11n.pt')

results = model.train(
    data='bccd_yolo/data.yaml',
    epochs=EPOCHS,
    imgsz=640,
    device='cpu',
    optimizer='SGD',
    batch=BATCH_SIZE,
    verbose=True
)

metrics = model.val()
precision = metrics.box.p.mean()
recall = metrics.box.r.mean()
f1_score = 2 * (precision * recall) / (precision + recall + 1e-9)
map50 = metrics.box.map50.mean()
map50_95 = metrics.box.map.mean()

print(f"Метрики (yolo11n):")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1_score:.4f}")
print(f"mAP@0.5: {map50:.4f}")
print(f"mAP@0.5:0.95: {map50_95:.4f}")


Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolo11n.pt, data=bccd_yolo/data.yaml, epochs=5, time=None, patience=100, batch=4, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train14, exist_ok=False, pretrained=True, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None, format

[34m[1mtrain: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/train.cache... 291 images, 0 backgrounds, 0 corrupt: 100%|██████████| 291/291 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]

Plotting labels to runs/detect/train14/labels.jpg... 





[34m[1moptimizer:[0m SGD(lr=0.01, momentum=0.937) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.0005), 87 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train14[0m
Starting training for 5 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/5         0G      1.335      2.829      1.406        124        640: 100%|██████████| 73/73 [01:06<00:00,  1.09it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.20it/s]

                   all         73        987      0.917      0.183      0.334      0.215

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        2/5         0G      1.254      1.643      1.262         51        640: 100%|██████████| 73/73 [01:03<00:00,  1.16it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.22it/s]

                   all         73        987      0.489      0.169      0.211      0.128

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        3/5         0G      1.214      1.331      1.228         42        640: 100%|██████████| 73/73 [01:00<00:00,  1.20it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.45it/s]

                   all         73        987      0.814      0.881      0.892      0.586

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        4/5         0G      1.146       1.16      1.187         48        640: 100%|██████████| 73/73 [00:56<00:00,  1.30it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.39it/s]

                   all         73        987      0.845      0.881        0.9      0.595

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        5/5         0G      1.117      1.069      1.163        104        640: 100%|██████████| 73/73 [00:55<00:00,  1.33it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.43it/s]

                   all         73        987      0.829      0.904      0.904      0.599

5 epochs completed in 0.090 hours.
Optimizer stripped from runs/detect/train14/weights/last.pt, 5.5MB





Optimizer stripped from runs/detect/train14/weights/best.pt, 5.5MB

Validating runs/detect/train14/weights/best.pt...
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
YOLO11n summary (fused): 100 layers, 2,582,737 parameters, 0 gradients, 6.3 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.50it/s]


                   all         73        987      0.829      0.904      0.904      0.599
                   RBC         68        827      0.666      0.862      0.826      0.558
                   WBC         71         73      0.974          1      0.991      0.779
             Platelets         48         87      0.847      0.851      0.896       0.46
Speed: 1.0ms preprocess, 42.0ms inference, 0.0ms loss, 9.3ms postprocess per image
Results saved to [1mruns/detect/train14[0m
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
YOLO11n summary (fused): 100 layers, 2,582,737 parameters, 0 gradients, 6.3 GFLOPs


[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 19/19 [00:04<00:00,  4.67it/s]


                   all         73        987      0.829      0.904      0.904      0.599
                   RBC         68        827      0.666      0.862      0.826      0.558
                   WBC         71         73      0.974          1      0.991      0.779
             Platelets         48         87      0.847      0.851      0.896       0.46
Speed: 0.8ms preprocess, 42.9ms inference, 0.0ms loss, 9.1ms postprocess per image
Results saved to [1mruns/detect/train142[0m
Метрики (yolo11n):
Precision: 0.8291
Recall: 0.9042
F1-score: 0.8651
mAP@0.5: 0.9043
mAP@0.5:0.95: 0.5991


##### Трансформерная модель rtdetr-l

In [None]:
from ultralytics import RTDETR

BATCH_SIZE = 4
EPOCHS = 2

model = RTDETR('rtdetr-l.pt')

results = model.train(
    data='bccd_yolo/data.yaml',
    epochs=EPOCHS,
    imgsz=640,
    device='cpu',
    optimizer = 'SGD',
    batch=BATCH_SIZE,
    verbose=True
)

metrics = model.val()
precision = metrics.box.p.mean()
recall = metrics.box.r.mean()
f1_score = 2 * (precision * recall) / (precision + recall + 1e-9)
map50 = metrics.box.map50.mean()
map50_95 = metrics.box.map.mean()

print(f"Метрики (rtdetr-l):")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1_score:.4f}")
print(f"mAP@0.5: {map50:.4f}")
print(f"mAP@0.5:0.95: {map50_95:.4f}")


Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=rtdetr-l.pt, data=bccd_yolo/data.yaml, epochs=2, time=None, patience=100, batch=4, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train15, exist_ok=False, pretrained=True, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None, forma

[34m[1mtrain: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/train.cache... 291 images, 0 backgrounds, 0 corrupt: 100%|██████████| 291/291 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]

Plotting labels to runs/detect/train15/labels.jpg... 





[34m[1moptimizer:[0m SGD(lr=0.01, momentum=0.937) with parameter groups 143 weight(decay=0.0), 206 weight(decay=0.0005), 226 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train15[0m
Starting training for 2 epochs...

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        1/2         0G      1.222      1.705     0.7222        112        640: 100%|██████████| 73/73 [06:49<00:00,  5.61s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:29<00:00,  2.99s/it]

                   all         73        987     0.0171      0.243     0.0202    0.00642






      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        2/2         0G     0.9802      0.742     0.4727         46        640: 100%|██████████| 73/73 [06:49<00:00,  5.61s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:31<00:00,  3.12s/it]

                   all         73        987      0.369      0.155     0.0283    0.00803






2 epochs completed in 0.245 hours.
Optimizer stripped from runs/detect/train15/weights/last.pt, 66.1MB
Optimizer stripped from runs/detect/train15/weights/best.pt, 66.1MB

Validating runs/detect/train15/weights/best.pt...
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
rt-detr-l summary: 302 layers, 31,989,905 parameters, 0 gradients, 103.4 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:30<00:00,  3.06s/it]


                   all         73        987      0.368      0.155      0.029    0.00827
                   RBC         68        827     0.0711       0.45     0.0798     0.0226
                   WBC         71         73      0.034     0.0137    0.00706    0.00216
             Platelets         48         87          1          0          0          0
Speed: 1.0ms preprocess, 413.7ms inference, 0.0ms loss, 0.1ms postprocess per image
Results saved to [1mruns/detect/train15[0m
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
rt-detr-l summary: 302 layers, 31,989,905 parameters, 0 gradients, 103.4 GFLOPs


[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 19/19 [00:31<00:00,  1.68s/it]


                   all         73        987      0.368      0.155      0.029    0.00827
                   RBC         68        827     0.0711       0.45     0.0798     0.0226
                   WBC         71         73      0.034     0.0137    0.00706    0.00216
             Platelets         48         87          1          0          0          0
Speed: 0.7ms preprocess, 432.5ms inference, 0.0ms loss, 0.1ms postprocess per image
Results saved to [1mruns/detect/train152[0m
Метрики (rtdetr-l):
Precision: 0.3684
Recall: 0.1545
F1-score: 0.2177
mAP@0.5: 0.0290
mAP@0.5:0.95: 0.0083


### Окончательный улучшенный бейзлайн

#### Сверточная модель yolo11n

Используем улучшения из обоих гипотез - обе гипотезы показали лучше результат по сравнению с изначальным вариантом

In [17]:
from ultralytics import YOLO

BATCH_SIZE = 4
EPOCHS = 7

model = YOLO('yolo11n.pt')

results = model.train(
    data='bccd_yolo/data.yaml',
    epochs=EPOCHS,
    imgsz=640,
    device='cpu',
    batch=BATCH_SIZE,
    optimizer='SGD',
    verbose=True
)

metrics = model.val()
precision = metrics.box.p.mean()
recall = metrics.box.r.mean()
f1_score = 2 * (precision * recall) / (precision + recall + 1e-9)
map50 = metrics.box.map50.mean()
map50_95 = metrics.box.map.mean()

print(f"Метрики (yolo11n):")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1_score:.4f}")
print(f"mAP@0.5: {map50:.4f}")
print(f"mAP@0.5:0.95: {map50_95:.4f}")


Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolo11n.pt, data=bccd_yolo/data.yaml, epochs=7, time=None, patience=100, batch=4, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train19, exist_ok=False, pretrained=True, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None, format

[34m[1mtrain: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/train.cache... 291 images, 0 backgrounds, 0 corrupt: 100%|██████████| 291/291 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]

Plotting labels to runs/detect/train19/labels.jpg... 





[34m[1moptimizer:[0m SGD(lr=0.01, momentum=0.937) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.0005), 87 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train19[0m
Starting training for 7 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/7         0G      1.335      2.829      1.406        124        640: 100%|██████████| 73/73 [01:00<00:00,  1.21it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:03<00:00,  2.52it/s]

                   all         73        987      0.917      0.183      0.334      0.215

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        2/7         0G      1.254      1.641      1.262         51        640: 100%|██████████| 73/73 [00:57<00:00,  1.26it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.41it/s]

                   all         73        987      0.172      0.713       0.54      0.318






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        3/7         0G      1.234      1.323      1.232         42        640: 100%|██████████| 73/73 [00:57<00:00,  1.26it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.43it/s]

                   all         73        987      0.658      0.256      0.418      0.231

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        4/7         0G      1.178      1.171      1.197         48        640: 100%|██████████| 73/73 [00:57<00:00,  1.27it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.31it/s]

                   all         73        987      0.803      0.442      0.648      0.358

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        5/7         0G      1.138      1.064      1.174        104        640: 100%|██████████| 73/73 [00:57<00:00,  1.27it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.45it/s]

                   all         73        987      0.792      0.851       0.87      0.577

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        6/7         0G      1.085      1.031      1.167         81        640: 100%|██████████| 73/73 [00:56<00:00,  1.29it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.36it/s]

                   all         73        987      0.851      0.857      0.895      0.588

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size



        7/7         0G      1.088      1.019      1.171         74        640: 100%|██████████| 73/73 [00:57<00:00,  1.27it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.38it/s]

                   all         73        987      0.846       0.87      0.908      0.607

7 epochs completed in 0.121 hours.
Optimizer stripped from runs/detect/train19/weights/last.pt, 5.5MB





Optimizer stripped from runs/detect/train19/weights/best.pt, 5.5MB

Validating runs/detect/train19/weights/best.pt...
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
YOLO11n summary (fused): 100 layers, 2,582,737 parameters, 0 gradients, 6.3 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:04<00:00,  2.50it/s]


                   all         73        987      0.846      0.869      0.908      0.607
                   RBC         68        827      0.752      0.765      0.817      0.568
                   WBC         71         73      0.966          1      0.987      0.794
             Platelets         48         87      0.821      0.841      0.921      0.459
Speed: 0.5ms preprocess, 42.1ms inference, 0.0ms loss, 9.5ms postprocess per image
Results saved to [1mruns/detect/train19[0m
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
YOLO11n summary (fused): 100 layers, 2,582,737 parameters, 0 gradients, 6.3 GFLOPs


[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 19/19 [00:04<00:00,  4.63it/s]


                   all         73        987      0.846      0.869      0.908      0.607
                   RBC         68        827      0.752      0.765      0.817      0.568
                   WBC         71         73      0.966          1      0.987      0.794
             Platelets         48         87      0.821      0.841      0.921      0.459
Speed: 1.1ms preprocess, 42.4ms inference, 0.0ms loss, 9.7ms postprocess per image
Results saved to [1mruns/detect/train192[0m
Метрики (yolo11n):
Precision: 0.8460
Recall: 0.8687
F1-score: 0.8572
mAP@0.5: 0.9083
mAP@0.5:0.95: 0.6073


#### Трансформерная модель rtdetr-l

Используем улучшение из 1 гипотезы - увеличим количество эпох

In [None]:
from ultralytics import RTDETR

BATCH_SIZE = 4
EPOCHS = 4

model = RTDETR('rtdetr-l.pt')

results = model.train(
    data='bccd_yolo/data.yaml',
    epochs=EPOCHS,
    imgsz=640,
    device='cpu',
    batch=BATCH_SIZE,
    verbose=True
)

metrics = model.val()
precision = metrics.box.p.mean()
recall = metrics.box.r.mean()
f1_score = 2 * (precision * recall) / (precision + recall + 1e-9)
map50 = metrics.box.map50.mean()
map50_95 = metrics.box.map.mean()

print(f"Метрики (rtdetr-l):")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1_score:.4f}")
print(f"mAP@0.5: {map50:.4f}")
print(f"mAP@0.5:0.95: {map50_95:.4f}")


Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=rtdetr-l.pt, data=bccd_yolo/data.yaml, epochs=4, time=None, patience=100, batch=4, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train16, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None, form

[34m[1mtrain: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/train.cache... 291 images, 0 backgrounds, 0 corrupt: 100%|██████████| 291/291 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]

Plotting labels to runs/detect/train16/labels.jpg... 





[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.001429, momentum=0.9) with parameter groups 143 weight(decay=0.0), 206 weight(decay=0.0005), 226 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train16[0m
Starting training for 4 epochs...

      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        1/4         0G     0.8144      2.808     0.4955        112        640: 100%|██████████| 73/73 [07:27<00:00,  6.14s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:29<00:00,  2.95s/it]

                   all         73        987       0.86      0.234      0.214      0.137






      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        2/4         0G     0.4833     0.7467     0.2358         46        640: 100%|██████████| 73/73 [07:14<00:00,  5.96s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:29<00:00,  2.98s/it]

                   all         73        987      0.669      0.794       0.75      0.492






      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        3/4         0G     0.4105     0.6784     0.1966         36        640: 100%|██████████| 73/73 [07:11<00:00,  5.91s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:29<00:00,  2.98s/it]

                   all         73        987      0.713      0.845       0.79      0.531






      Epoch    GPU_mem  giou_loss   cls_loss    l1_loss  Instances       Size


        4/4         0G       0.38     0.6145     0.1774         42        640: 100%|██████████| 73/73 [07:16<00:00,  5.98s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:29<00:00,  2.93s/it]

                   all         73        987      0.751      0.835      0.831      0.564






4 epochs completed in 0.520 hours.
Optimizer stripped from runs/detect/train16/weights/last.pt, 66.1MB
Optimizer stripped from runs/detect/train16/weights/best.pt, 66.1MB

Validating runs/detect/train16/weights/best.pt...
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
rt-detr-l summary: 302 layers, 31,989,905 parameters, 0 gradients, 103.4 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 10/10 [00:31<00:00,  3.15s/it]


                   all         73        987      0.751      0.837      0.831      0.565
                   RBC         68        827      0.585      0.698      0.681      0.472
                   WBC         71         73       0.93          1      0.987      0.783
             Platelets         48         87      0.739      0.812      0.825      0.439
Speed: 0.6ms preprocess, 426.4ms inference, 0.0ms loss, 0.1ms postprocess per image
Results saved to [1mruns/detect/train16[0m
Ultralytics 8.3.107 🚀 Python-3.12.6 torch-2.6.0 CPU (Apple M1 Pro)
rt-detr-l summary: 302 layers, 31,989,905 parameters, 0 gradients, 103.4 GFLOPs


[34m[1mval: [0mScanning /Users/daustinov/study/multimedia/bccd_yolo/labels/val.cache... 73 images, 0 backgrounds, 0 corrupt: 100%|██████████| 73/73 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 19/19 [00:30<00:00,  1.61s/it]


                   all         73        987      0.751      0.837      0.831      0.565
                   RBC         68        827      0.585      0.698      0.681      0.472
                   WBC         71         73       0.93          1      0.987      0.783
             Platelets         48         87      0.739      0.812      0.825      0.439
Speed: 1.0ms preprocess, 412.6ms inference, 0.0ms loss, 0.1ms postprocess per image
Results saved to [1mruns/detect/train162[0m
Метрики (rtdetr-l):
Precision: 0.7514
Recall: 0.8365
F1-score: 0.7917
mAP@0.5: 0.8310
mAP@0.5:0.95: 0.5646


### Выводы

После проведения базового обучения моделей yolo11n и rtdetr-l было решено проверить гипотезы, направленные на улучшение точности и общей производительности моделей. В качестве основы для улучшения использовались два подхода: оптимизация гиперпараметров (в том числе настройка learning rate, batch size и количества эпох) и замена стандартного оптимизатора на SGD с моментумом.

Для сверточной модели yolo11n оба подхода оказались полезными: подбор гиперпараметров позволил более точно настроить процесс обучения под особенности датасета BCCD, а использование оптимизатора SGD дало прирост в качестве обобщения. После внесения этих изменений точность (Precision) модели увеличилась с 0.8232 до 0.8460, а mAP@0.5 вырос с 0.8955 до 0.9083. Также наблюдается улучшение F1-меры и других метрик, что указывает на то, что модель стала не только точнее, но и более устойчивой к ошибкам.

Для трансформерной модели rtdetr-l использование SGD не дало положительного эффекта, зато увеличение количества эпох позволило существенно улучшить качество. Особенно заметен прирост по метрике mAP@0.5: с 0.7582 до 0.8310. Это подтверждает, что модель нуждается в большем количестве итераций для эффективного обучения на данном датасете.

Таким образом, гипотезы о влиянии гиперпараметров и выборе оптимизатора подтвердились частично. Для сверточной модели полезными оказались оба подхода, а для трансформерной - только увеличение количества эпох

## 4. Имплементация алгоритма машинного обучения 

### Имплементация сверточной модели. Обучение и подсчет метрик

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from torchvision.ops import box_iou
from PIL import Image
import glob
import numpy as np
from tqdm import tqdm

class YOLOLike(nn.Module):
    def __init__(self, grid_size=7, num_boxes=2, num_classes=3):
        super().__init__()
        self.grid_size = grid_size
        self.num_boxes = num_boxes
        self.num_classes = num_classes
        
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, 3, 1, 1), nn.LeakyReLU(0.1), nn.MaxPool2d(2, 2),
            nn.Conv2d(16, 32, 3, 1, 1), nn.LeakyReLU(0.1), nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 3, 1, 1), nn.LeakyReLU(0.1), nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, 1, 1), nn.LeakyReLU(0.1), nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.AdaptiveAvgPool2d((grid_size, grid_size))
        )
        
        self.detection = nn.Sequential(
            nn.Conv2d(512, (5 * num_boxes + num_classes), 1, 1, 0),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        x = self.features(x)
        x = self.detection(x)
        return x.permute(0, 2, 3, 1)

class YOLODataset(Dataset):
    def __init__(self, img_dir, label_dir, img_size=416, grid_size=7):
        self.img_paths = sorted(glob.glob(f"{img_dir}/*.jpg"))
        self.label_paths = [os.path.join(label_dir, os.path.basename(p).replace('.jpg', '.txt')) 
                          for p in self.img_paths]
        self.img_size = img_size
        self.grid_size = grid_size
        self.transform = T.Compose([
            T.Resize((img_size, img_size)),
            T.ToTensor()
        ])

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        image = Image.open(self.img_paths[idx]).convert('RGB')
        orig_w, orig_h = image.size
        image = self.transform(image)
        
        label_tensor = torch.zeros((self.grid_size, self.grid_size, 5 + 3))
        
        try:
            with open(self.label_paths[idx], 'r') as f:
                for line in f:
                    cls, x, y, w, h = map(float, line.strip().split())
                    
                    grid_x = int(x * self.grid_size)
                    grid_y = int(y * self.grid_size)
                    
                    cell_x = x * self.grid_size - grid_x
                    cell_y = y * self.grid_size - grid_y
                    
                    label_tensor[grid_y, grid_x, 0] = 1.0
                    label_tensor[grid_y, grid_x, 1:5] = torch.tensor([cell_x, cell_y, w, h])
                    label_tensor[grid_y, grid_x, 5 + int(cls)] = 1.0 
        except:
            pass
            
        return image, label_tensor

def compute_metrics(detections, annotations, iou_thresholds):
    metrics = {
        'tp': defaultdict(int),
        'fp': defaultdict(int),
        'fn': defaultdict(int)
    }
    
    for iou_thresh in iou_thresholds:
        for img_dets, img_annots in zip(detections, annotations):
            for cls_id in range(3):
                gt_boxes = [b[:4] for b in img_annots.get(cls_id, [])]
                det_boxes = [b[:5] for b in img_dets.get(cls_id, [])]
                
                matched_gt = np.zeros(len(gt_boxes), dtype=bool)
                for det in det_boxes:
                    if len(gt_boxes) == 0:
                        metrics['fp'][iou_thresh] += 1
                        continue
                    
                    ious = box_iou(torch.tensor([det[:4]]), torch.tensor(gt_boxes))
                    max_iou, max_idx = torch.max(ious, dim=1)
                    max_iou = max_iou.item()
                    max_idx = max_idx.item()

                    if max_iou >= iou_thresh and not matched_gt[max_idx]:
                        metrics['tp'][iou_thresh] += 1
                        matched_gt[max_idx] = True
                    else:
                        metrics['fp'][iou_thresh] += 1
                
                metrics['fn'][iou_thresh] += np.sum(~matched_gt)
    
    results = {}
    iou_5095 = np.arange(0.5, 1.0, 0.05)
    
    aps = []
    for thresh in iou_5095:
        tp = metrics['tp'][thresh]
        fp = metrics['fp'][thresh]
        fn = metrics['fn'][thresh]
        ap = tp / (tp + fp + 1e-6)
        aps.append(ap)
    results['map5095'] = np.mean(aps)
    
    results['map50'] = metrics['tp'][0.5] / (metrics['tp'][0.5] + metrics['fp'][0.5] + 1e-6)
    
    total_tp = sum(metrics['tp'].values())
    total_fp = sum(metrics['fp'].values())
    total_fn = sum(metrics['fn'].values())
    
    results['precision'] = total_tp / (total_tp + total_fp + 1e-6)
    results['recall'] = total_tp / (total_tp + total_fn + 1e-6)
    results['f1'] = 2 * (results['precision'] * results['recall']) / (results['precision'] + results['recall'] + 1e-6)
    
    return results

def compute_map(model, dataloader, grid_size=7, num_classes=3):
    model.eval()
    all_detections = []
    all_annotations = []
    
    with torch.no_grad():
        for images, targets in dataloader:
            annotations = convert_targets_to_annotations(targets, grid_size)
            all_annotations.extend(annotations)
            
            preds = model(images)
            detections = process_predictions(preds, grid_size, num_classes)
            all_detections.extend(detections)
    
    return compute_metrics(all_detections, all_annotations, iou_thresholds=[0.5] + list(np.arange(0.5, 1.0, 0.05)))

def convert_targets_to_annotations(targets, grid_size):
    annotations = []
    for batch_idx in range(targets.size(0)):
        image_annotations = defaultdict(list)
        for i in range(grid_size):
            for j in range(grid_size):
                if targets[batch_idx, i, j, 0] == 1:
                    x_center = (j + targets[batch_idx, i, j, 1]) / grid_size
                    y_center = (i + targets[batch_idx, i, j, 2]) / grid_size
                    width = targets[batch_idx, i, j, 3]
                    height = targets[batch_idx, i, j, 4]
                    class_id = torch.argmax(targets[batch_idx, i, j, 5:])
                    
                    box = [
                        x_center - width/2,
                        y_center - height/2,
                        x_center + width/2,
                        y_center + height/2,
                        1.0,
                        class_id.item()
                    ]
                    image_annotations[class_id.item()].append(box)
        annotations.append(image_annotations)
    return annotations

def process_predictions(preds, grid_size, num_classes, confidence_threshold=0.5):
    detections = []
    for batch_idx in range(preds.size(0)):
        image_detections = defaultdict(list)
        
        for i in range(grid_size):
            for j in range(grid_size):
                confidence = preds[batch_idx, i, j, 0]
                if confidence < confidence_threshold:
                    continue
                
                x_center = (j + preds[batch_idx, i, j, 1]) / grid_size
                y_center = (i + preds[batch_idx, i, j, 2]) / grid_size
                width = preds[batch_idx, i, j, 3]
                height = preds[batch_idx, i, j, 4]
                
                class_probs = preds[batch_idx, i, j, 5:5+num_classes]
                class_id = torch.argmax(class_probs)
                class_confidence = class_probs[class_id]
                
                box = [
                    x_center - width/2,
                    y_center - height/2,
                    x_center + width/2,
                    y_center + height/2,
                    confidence * class_confidence,
                    class_id.item()
                ]
                image_detections[class_id.item()].append(box)
        
        for class_id, boxes in image_detections.items():
            boxes = sorted(boxes, key=lambda x: x[4], reverse=True)
            filtered_boxes = []
            while boxes:
                best = boxes.pop(0)
                filtered_boxes.append(best)
                boxes = [box for box in boxes if 
                        box_iou(torch.tensor([best[:4]]), 
                                 torch.tensor([box[:4]]))[0][0] < 0.5]
            image_detections[class_id] = filtered_boxes
        
        detections.append(image_detections)
    return detections

def compute_ap(detections, annotations, iou_threshold):
    aps = {}
    for class_id in range(3):
        class_detections = []
        class_annotations = []
        
        for img_idx, (img_dets, img_annots) in enumerate(zip(detections, annotations)):
            gt_boxes = [box[:4] for box in img_annots.get(class_id, [])]
            det_boxes = [box[:5] for box in img_dets.get(class_id, [])]
            
            tp = np.zeros(len(det_boxes))
            fp = np.zeros(len(det_boxes))
            
            if len(gt_boxes) == 0:
                fp = np.ones(len(det_boxes))
            else:
                gt_tensor = torch.tensor(gt_boxes).view(-1, 4)
                
                for i, det in enumerate(det_boxes):
                    det_tensor = torch.tensor([det[:4]]).view(-1, 4)
                    
                    ious = box_iou(det_tensor, gt_tensor)
                    max_iou = torch.max(ious).item() if gt_tensor.size(0) > 0 else 0.0
                    
                    if max_iou >= iou_threshold:
                        tp[i] = 1
                        gt_tensor = gt_tensor[torch.argmax(ious) != torch.arange(gt_tensor.size(0))]
                    else:
                        fp[i] = 1

            scores = np.array([det[4] for det in det_boxes])
            sort_idx = np.argsort(-scores)
            tp = tp[sort_idx]
            fp = fp[sort_idx]
            
            class_detections.extend(zip(tp, fp))
            class_annotations.extend([1]*len(img_annots.get(class_id, [])))
        
        tp_fp = np.array(class_detections)
        if tp_fp.size == 0:
            ap = 0
        else:
            tp_cumsum = np.cumsum(tp_fp[:, 0])
            fp_cumsum = np.cumsum(tp_fp[:, 1])
            
            recalls = tp_cumsum / (len(class_annotations) + 1e-6)
            precisions = tp_cumsum / (tp_cumsum + fp_cumsum + 1e-6)
            
            ap = 0
            for t in np.linspace(0, 1, 11):
                mask = recalls >= t
                if np.any(mask):
                    ap += np.max(precisions[mask]) / 11
        
        aps[class_id] = ap
    return aps

def compute_iou(box1, box2):
    box1 = torch.tensor([box1[0] - box1[2]/2, box1[1] - box1[3]/2,
                         box1[0] + box1[2]/2, box1[1] + box1[3]/2])
    box2 = torch.tensor([box2[0] - box2[2]/2, box2[1] - box2[3]/2,
                         box2[0] + box2[2]/2, box2[1] + box2[3]/2])
    return box_iou(box1.unsqueeze(0), box2.unsqueeze(0)).item()

def collate_fn(batch):
    images = torch.stack([item[0] for item in batch])
    labels = torch.stack([item[1] for item in batch])
    return images, labels

def train(model, train_dataloader, val_dataloader, optimizer, epochs=10, grid_size=7, num_classes=3):
    best_map = 0.0
    history = {'precision': [], 'recall': [], 'f1': [], 'map50': [], 'map5095': []}
    
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        total_objects = 0
        correct_boxes = 0
        
        for imgs, targets in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            preds = model(imgs)
            
            batch_size = preds.size(0)
            preds = preds.view(batch_size, grid_size * grid_size, -1)
            targets = targets.view(batch_size, grid_size * grid_size, -1)
            
            pred_obj = preds[..., 0]
            pred_box = preds[..., 1:5]
            pred_cls = preds[..., 5:5+num_classes]
            
            target_obj = targets[..., 0]
            target_box = targets[..., 1:5]
            target_cls = targets[..., 5:5+num_classes]
            
            obj_mask = target_obj == 1
            no_obj_mask = target_obj == 0
            
            obj_loss = nn.BCELoss()(pred_obj[obj_mask], target_obj[obj_mask])
            no_obj_loss = nn.BCELoss()(pred_obj[no_obj_mask], target_obj[no_obj_mask])
            coord_loss = nn.MSELoss()(pred_box[obj_mask], target_box[obj_mask])
            class_loss = nn.BCELoss()(pred_cls[obj_mask], target_cls[obj_mask])
            
            total_loss = 5*coord_loss + obj_loss + 0.5*no_obj_loss + class_loss
            total_loss.backward()
            optimizer.step()
            
            epoch_loss += total_loss.item()
            total_objects += obj_mask.sum().item()
            
            with torch.no_grad():
                for i in range(batch_size):
                    for j in range(grid_size * grid_size):
                        if obj_mask[i, j]:
                            pred_b = pred_box[i, j]
                            true_b = target_box[i, j]
                            if compute_iou(pred_b, true_b) > 0.5:
                                correct_boxes += 1
        
        model.eval()
        metrics = compute_map(model, val_dataloader, grid_size, num_classes)
                
        for key in history.keys():
            history[key].append(metrics[key])
        
        print(f"\nEpoch {epoch+1}")
        print(f"Precision: {metrics['precision']:.4f} | Recall: {metrics['recall']:.4f} | F1: {metrics['f1']:.4f}")
        print(f"mAP@0.5: {metrics['map50']:.4f} | mAP@0.5:0.95: {metrics['map5095']:.4f}")
        
        if metrics['map5095'] > best_map:
            best_map = metrics['map5095']
    
    return history

grid_size = 7
num_classes = 3
img_size = 416
batch_size = 8
lr = 0.001
epochs = 10

train_ds = YOLODataset('bccd_yolo/images/train', 'bccd_yolo/labels/train', img_size, grid_size)
val_ds = YOLODataset('bccd_yolo/images/val', 'bccd_yolo/labels/val', img_size, grid_size)

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dl = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_fn)

model = YOLOLike(grid_size=grid_size, num_classes=num_classes)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

train(model, train_dl, val_dl, optimizer, epochs, grid_size, num_classes)

Epoch 1/10: 100%|██████████| 37/37 [00:15<00:00,  2.37it/s]



Epoch 1
Precision: 0.0042 | Recall: 0.0155 | F1: 0.0066
mAP@0.5: 0.0156 | mAP@0.5:0.95: 0.0031


Epoch 2/10: 100%|██████████| 37/37 [00:14<00:00,  2.58it/s]



Epoch 2
Precision: 0.0291 | Recall: 0.1082 | F1: 0.0458
mAP@0.5: 0.0826 | mAP@0.5:0.95: 0.0237


Epoch 3/10: 100%|██████████| 37/37 [00:15<00:00,  2.39it/s]



Epoch 3
Precision: 0.0261 | Recall: 0.0970 | F1: 0.0411
mAP@0.5: 0.0777 | mAP@0.5:0.95: 0.0209


Epoch 4/10: 100%|██████████| 37/37 [00:13<00:00,  2.66it/s]



Epoch 4
Precision: 0.0380 | Recall: 0.1407 | F1: 0.0599
mAP@0.5: 0.1107 | mAP@0.5:0.95: 0.0307


Epoch 5/10: 100%|██████████| 37/37 [00:14<00:00,  2.55it/s]



Epoch 5
Precision: 0.0610 | Recall: 0.1589 | F1: 0.0882
mAP@0.5: 0.1638 | mAP@0.5:0.95: 0.0507


Epoch 6/10: 100%|██████████| 37/37 [00:14<00:00,  2.55it/s]



Epoch 6
Precision: 0.0342 | Recall: 0.1212 | F1: 0.0534
mAP@0.5: 0.1004 | mAP@0.5:0.95: 0.0276


Epoch 7/10: 100%|██████████| 37/37 [00:13<00:00,  2.65it/s]



Epoch 7
Precision: 0.0558 | Recall: 0.1953 | F1: 0.0868
mAP@0.5: 0.1484 | mAP@0.5:0.95: 0.0466


Epoch 8/10: 100%|██████████| 37/37 [00:15<00:00,  2.37it/s]



Epoch 8
Precision: 0.0699 | Recall: 0.2032 | F1: 0.1040
mAP@0.5: 0.1807 | mAP@0.5:0.95: 0.0588


Epoch 9/10: 100%|██████████| 37/37 [00:14<00:00,  2.56it/s]



Epoch 9
Precision: 0.0224 | Recall: 0.0701 | F1: 0.0339
mAP@0.5: 0.0724 | mAP@0.5:0.95: 0.0173


Epoch 10/10: 100%|██████████| 37/37 [00:15<00:00,  2.36it/s]



Epoch 10
Precision: 0.0790 | Recall: 0.2073 | F1: 0.1144
mAP@0.5: 0.2062 | mAP@0.5:0.95: 0.0663


{'precision': [0.004195408215685282,
  0.029055258466283287,
  0.0260869565210739,
  0.038013268781525826,
  0.061018676955582385,
  0.034217989190587564,
  0.05581899471465061,
  0.0698849375771738,
  0.022357170910832436,
  0.07899013977393758],
 'recall': [0.015451701581623688,
  0.10816191107136582,
  0.09697601667485296,
  0.14067684139343284,
  0.1588776187165724,
  0.12124371977237237,
  0.19527917336285153,
  0.20324201344172507,
  0.07014882926626706,
  0.20731822919638654],
 'f1': [0.006598720836824717,
  0.0458054832384776,
  0.0411137242669093,
  0.059852855332184714,
  0.08817299969338657,
  0.053372494539331954,
  0.08682057554613612,
  0.10400660473926575,
  0.033907258090701414,
  0.1143944116170727],
 'map50': [0.015571913927580418,
  0.08263305320971526,
  0.07769985973664799,
  0.11073541841212518,
  0.16379655583424177,
  0.10035314889635769,
  0.14843517136386958,
  0.1807099318428272,
  0.07244931869849629,
  0.20619785454783512],
 'map5095': [0.003057757643748822

### Имплементация трансформерной модели модели. Обучение модели и подсчет метрик

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
from torchvision.models import resnet18
from torchvision.ops import box_iou
from scipy.optimize import linear_sum_assignment
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import numpy as np

device = torch.device("cpu")
CLASSES = ['RBC', 'WBC', 'Platelets']
NUM_CLASSES = len(CLASSES)
IMG_SIZE = 256
MAX_DETECTIONS = 20

class YOLODetectionDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, label_dir, transform=None):
        self.img_dir = Path(img_dir)
        self.label_dir = Path(label_dir)
        self.transform = transform or T.Compose([
            T.Resize((IMG_SIZE, IMG_SIZE)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self.images = sorted(self.img_dir.glob('*.jpg'))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        label_path = self.label_dir / f"{img_path.stem}.txt"

        img = Image.open(img_path).convert("RGB")
        orig_w, orig_h = img.size
        img_tensor = self.transform(img)

        boxes = []
        labels = []
        if label_path.exists():
            with open(label_path) as f:
                for line in f:
                    cls, cx, cy, bw, bh = map(float, line.strip().split())
                    
                    scale_x = IMG_SIZE / orig_w
                    scale_y = IMG_SIZE / orig_h
                    
                    x1 = (cx - bw/2) * scale_x
                    y1 = (cy - bh/2) * scale_y
                    x2 = (cx + bw/2) * scale_x
                    y2 = (cy + bh/2) * scale_y
                    
                    boxes.append([x1, y1, x2, y2])
                    labels.append(int(cls))

        return {
            'image': img_tensor,
            'boxes': torch.tensor(boxes, dtype=torch.float32) if boxes else torch.zeros((0, 4)),
            'labels': torch.tensor(labels, dtype=torch.long) if labels else torch.zeros(0, dtype=torch.long)
        }

def collate_fn(batch):
    return {
        'image': torch.stack([x['image'] for x in batch]),
        'boxes': [x['boxes'] for x in batch],
        'labels': [x['labels'] for x in batch]
    }

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        layers = []
        for _ in range(num_layers-1):
            layers.extend([
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU()
            ])
            input_dim = hidden_dim
        layers.append(nn.Linear(hidden_dim, output_dim))
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

class DETRLike(nn.Module):
    def __init__(self, num_classes, num_queries=MAX_DETECTIONS, hidden_dim=256):
        super().__init__()
        
        self.backbone = nn.Sequential(*list(resnet18(pretrained=False).children())[:-2])
        self.conv = nn.Conv2d(512, hidden_dim, 1)
        
        self.encoder_pos = nn.Parameter(torch.randn(1, hidden_dim, IMG_SIZE//32, IMG_SIZE//32))
        self.decoder_pos = nn.Parameter(torch.randn(num_queries, hidden_dim))
        
        self.transformer = nn.Transformer(
            d_model=hidden_dim,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=2048,
            dropout=0.1
        )
        
        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)

    def forward(self, x):
        features = self.backbone(x)
        src = self.conv(features)
        
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_enc = self.encoder_pos.expand(bs, -1, h, w).flatten(2).permute(2, 0, 1)
        src = src + pos_enc
        
        query_embed = self.decoder_pos.unsqueeze(1).repeat(1, bs, 1)
        
        hs = self.transformer(
            src=src,
            tgt=query_embed,
            src_key_padding_mask=None,
            tgt_key_padding_mask=None,
            memory_key_padding_mask=None
        )
        
        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        
        return outputs_class.permute(1, 0, 2), outputs_coord.permute(1, 0, 2)

class HungarianMatcher(nn.Module):
    def __init__(self, cost_class=1, cost_bbox=5, cost_giou=2):
        super().__init__()
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou

    @torch.no_grad()
    def forward(self, outputs, targets):
        bs, num_queries = outputs["pred_logits"].shape[:2]
        
        indices = []
        for i in range(bs):
            out_prob = outputs["pred_logits"][i].softmax(-1)
            out_bbox = outputs["pred_boxes"][i]

            tgt_bbox = targets[i]["boxes"]
            tgt_ids = targets[i]["labels"]
            
            cost_class = -out_prob[:, tgt_ids]
            
            cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
            
            C = self.cost_bbox * cost_bbox + self.cost_class * cost_class
            C = C.reshape(num_queries, -1).cpu()
            
            indices.append(linear_sum_assignment(C))
        
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]

def loss_fn(outputs, targets):
    pred_logits, pred_boxes = outputs
    
    matcher = HungarianMatcher()
    indices = matcher({"pred_logits": pred_logits, "pred_boxes": pred_boxes}, targets)
    
    src_logits = torch.cat([pred_logits[i][idx] for i, (idx, _) in enumerate(indices)])
    target_classes = torch.cat([t["labels"][j] for t, (_, j) in zip(targets, indices)])
    loss_cls = F.cross_entropy(src_logits, target_classes)
    
    src_boxes = torch.cat([pred_boxes[i][idx] for i, (idx, _) in enumerate(indices)])
    target_boxes = torch.cat([t["boxes"][j] for t, (_, j) in zip(targets, indices)])
    loss_bbox = F.l1_loss(src_boxes, target_boxes)
    
    return loss_cls + 5 * loss_bbox

def train(model, dataloader, optimizer, epochs=50):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
            images = batch['image'].to(device)
            targets = [
                {"boxes": b.to(device), "labels": l.to(device)}
                for b, l in zip(batch['boxes'], batch['labels'])
            ]
            
            outputs = model(images)
            loss = loss_fn(outputs, targets)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")

def evaluate(model, dataloader, conf_thresh=0.5):
    model.eval()
    results = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            images = batch['image'].to(device)
            outputs = model(images)
            pred_logits, pred_boxes = outputs
            
            probs = F.softmax(pred_logits, dim=-1)
            scores, labels = torch.max(probs, dim=-1)
            
            for i in range(images.size(0)):
                keep = scores[i] > conf_thresh
                pred = {
                    'boxes': pred_boxes[i][keep].cpu(),
                    'scores': scores[i][keep].cpu(),
                    'labels': labels[i][keep].cpu()
                }
                results.append((pred, batch['boxes'][i], batch['labels'][i]))
    
    aps = []
    tp_total = fp_total = fn_total = 0
    
    for iou_threshold in np.linspace(0.5, 0.95, 10):
        tp = fp = fn = 0
        
        for pred, gt_boxes, gt_labels in results:
            pred_boxes = pred['boxes']
            pred_labels = pred['labels']
            
            if len(pred_boxes) == 0:
                fn += len(gt_labels)
                continue
                
            ious = box_iou(pred_boxes, gt_boxes)
            
            matches = (ious > iou_threshold) & (pred_labels.unsqueeze(1) == gt_labels)
            matched_gt = set()
            matched_pred = set()
            
            for pred_idx, gt_idx in zip(*torch.where(matches)):
                if gt_idx not in matched_gt and pred_idx not in matched_pred:
                    matched_gt.add(gt_idx.item())
                    matched_pred.add(pred_idx.item())
            
            cur_tp = len(matched_gt)
            cur_fp = len(pred_boxes) - len(matched_pred)
            cur_fn = len(gt_labels) - len(matched_gt)
            
            if iou_threshold == 0.5:
                tp_total += cur_tp
                fp_total += cur_fp
                fn_total += cur_fn
            
            tp += cur_tp
            fp += cur_fp
            fn += cur_fn
        
        precision = tp / (tp + fp + 1e-6)
        recall = tp / (tp + fn + 1e-6)
        aps.append(precision)
    
    precision = tp_total / (tp_total + fp_total + 1e-6)
    recall = tp_total / (tp_total + fn_total + 1e-6)
    f1 = 2 * precision * recall / (precision + recall + 1e-6)
    
    map50 = np.mean(aps[:1])
    map5095 = np.mean(aps)
    
    print(f"Precision@0.5: {precision:.4f}")
    print(f"Recall@0.5: {recall:.4f}")
    print(f"F1@0.5: {f1:.4f}")
    print(f"mAP@0.5: {map50:.4f}, mAP@0.5:0.95: {map5095:.4f}")

def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")

model = DETRLike(NUM_CLASSES).to(device)
model.apply(init_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

train_dataset = YOLODetectionDataset("bccd_yolo/images/train", "bccd_yolo/labels/train")
val_dataset = YOLODetectionDataset("bccd_yolo/images/val", "bccd_yolo/labels/val")

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=4, collate_fn=collate_fn
)

train(model, train_loader, optimizer, epochs=20)
evaluate(model, val_loader)

Epoch 1: 100%|██████████| 37/37 [00:21<00:00,  1.73it/s]


Epoch 1, Loss: 1.0942


Epoch 2: 100%|██████████| 37/37 [00:17<00:00,  2.11it/s]


Epoch 2, Loss: 0.7241


Epoch 3: 100%|██████████| 37/37 [00:16<00:00,  2.27it/s]


Epoch 3, Loss: 0.6747


Epoch 4: 100%|██████████| 37/37 [00:17<00:00,  2.08it/s]


Epoch 4, Loss: 0.6466


Epoch 5: 100%|██████████| 37/37 [00:20<00:00,  1.80it/s]


Epoch 5, Loss: 0.6138


Epoch 6: 100%|██████████| 37/37 [00:18<00:00,  1.98it/s]


Epoch 6, Loss: 0.5966


Epoch 7: 100%|██████████| 37/37 [00:18<00:00,  2.04it/s]


Epoch 7, Loss: 0.5924


Epoch 8: 100%|██████████| 37/37 [00:19<00:00,  1.85it/s]


Epoch 8, Loss: 0.5412


Epoch 9: 100%|██████████| 37/37 [00:19<00:00,  1.88it/s]


Epoch 9, Loss: 0.5099


Epoch 10: 100%|██████████| 37/37 [00:17<00:00,  2.14it/s]


Epoch 10, Loss: 0.4865


Epoch 11: 100%|██████████| 37/37 [00:19<00:00,  1.93it/s]


Epoch 11, Loss: 0.4419


Epoch 12: 100%|██████████| 37/37 [00:19<00:00,  1.91it/s]


Epoch 12, Loss: 0.4302


Epoch 13: 100%|██████████| 37/37 [00:18<00:00,  2.00it/s]


Epoch 13, Loss: 0.4230


Epoch 14: 100%|██████████| 37/37 [00:18<00:00,  1.99it/s]


Epoch 14, Loss: 0.4223


Epoch 15: 100%|██████████| 37/37 [00:17<00:00,  2.10it/s]


Epoch 15, Loss: 0.4188


Epoch 16: 100%|██████████| 37/37 [00:19<00:00,  1.87it/s]


Epoch 16, Loss: 0.3849


Epoch 17: 100%|██████████| 37/37 [00:17<00:00,  2.13it/s]


Epoch 17, Loss: 0.3896


Epoch 18: 100%|██████████| 37/37 [00:20<00:00,  1.85it/s]


Epoch 18, Loss: 0.3985


Epoch 19: 100%|██████████| 37/37 [00:21<00:00,  1.73it/s]


Epoch 19, Loss: 0.4073


Epoch 20: 100%|██████████| 37/37 [00:20<00:00,  1.82it/s]


Epoch 20, Loss: 0.3709


Evaluating: 100%|██████████| 19/19 [00:01<00:00, 10.22it/s]

Precision@0.5: 0.0892
Recall@0.5: 0.1317
F1@0.5: 0.1064
mAP@0.5: 0.0892, mAP@0.5:0.95: 0.0232





### Сравнение результатов с п.2. Выводы

В процессе работы были реализованы две модели детекции объектов: сверточная модель, построенная по мотивам yolo, и трансформерная модель, упрощённая версия rtdetr. Обе модели были обучены на датасете BCCD с использованием yolo-формата разметки и протестированы на реальных данных. Полученные результаты были сопоставлены с результатами аналогичных моделей из библиотеки ultralytics, таких как yolo11n и rtdetr-l.

Сравнение метрик демонстрирует ощутимую разницу в качестве детекции между готовыми и самописными моделями. Например, сверточная модель из ultralytics показала высокие значения метрик: точность (Precision) составила 0.8232, полнота (Recall) - 0.8832, F1-мера - 0.8521, а значение mAP@0.5 достигло 0.8955. В то время как собственная сверточная модель значительно уступает по всем показателям, выдав Precision 0.0790, Recall 0.2073, F1-score 0.1144 и mAP@0.5 равный 0.2062. Аналогичная ситуация наблюдается и с трансформерной моделью: rtdetr-l демонстрирует достаточно высокий уровень Recall (0.8813) и mAP@0.5 (0.7582), в то время как простая самописная модель на базе resnet18 и трансформера показала всего 0.0892 по Precision и 0.0892 по mAP@0.5.

Таким образом, можно сделать вывод, что несмотря на корректную реализацию архитектур и успешное обучение моделей, результаты самописных версий существенно отстают от производительных и оптимизированных решений, представленных в библиотеке ultralytics. Это связано, в первую очередь, с упрощенной архитектурой, а также недостаточной глубиной и количеством параметров моделей.

### Улучшение бейзлайна. Добавлений техник для каждой из моделей из пункта 3c

#### Сверточная модель

Используем улучшения из обоих гипотез - обе гипотезы показали лучше результат по сравнению с изначальным вариантом

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from torchvision.ops import box_iou
from PIL import Image
import glob
from collections import defaultdict
import numpy as np
from tqdm import tqdm

class YOLOLike(nn.Module):
    def __init__(self, grid_size=7, num_boxes=2, num_classes=3):
        super().__init__()
        self.grid_size = grid_size
        self.num_boxes = num_boxes
        self.num_classes = num_classes
        
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, 3, 1, 1), nn.LeakyReLU(0.1), nn.MaxPool2d(2, 2),
            nn.Conv2d(16, 32, 3, 1, 1), nn.LeakyReLU(0.1), nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 3, 1, 1), nn.LeakyReLU(0.1), nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, 1, 1), nn.LeakyReLU(0.1), nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.Conv2d(256, 512, 3, 1, 1), nn.LeakyReLU(0.1),
            nn.AdaptiveAvgPool2d((grid_size, grid_size))
        )
        
        self.detection = nn.Sequential(
            nn.Conv2d(512, (5 * num_boxes + num_classes), 1, 1, 0),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        x = self.features(x)
        x = self.detection(x)
        return x.permute(0, 2, 3, 1)

class YOLODataset(Dataset):
    def __init__(self, img_dir, label_dir, img_size=416, grid_size=7):
        self.img_paths = sorted(glob.glob(f"{img_dir}/*.jpg"))
        self.label_paths = [os.path.join(label_dir, os.path.basename(p).replace('.jpg', '.txt')) 
                          for p in self.img_paths]
        self.img_size = img_size
        self.grid_size = grid_size
        self.transform = T.Compose([
            T.Resize((img_size, img_size)),
            T.ToTensor()
        ])

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        image = Image.open(self.img_paths[idx]).convert('RGB')
        orig_w, orig_h = image.size
        image = self.transform(image)
        
        label_tensor = torch.zeros((self.grid_size, self.grid_size, 5 + 3))
        
        try:
            with open(self.label_paths[idx], 'r') as f:
                for line in f:
                    cls, x, y, w, h = map(float, line.strip().split())
                    
                    grid_x = int(x * self.grid_size)
                    grid_y = int(y * self.grid_size)
                    
                    cell_x = x * self.grid_size - grid_x
                    cell_y = y * self.grid_size - grid_y
                    
                    label_tensor[grid_y, grid_x, 0] = 1.0
                    label_tensor[grid_y, grid_x, 1:5] = torch.tensor([cell_x, cell_y, w, h])
                    label_tensor[grid_y, grid_x, 5 + int(cls)] = 1.0 
        except:
            pass
            
        return image, label_tensor

def compute_metrics(detections, annotations, iou_thresholds):
    metrics = {
        'tp': defaultdict(int),
        'fp': defaultdict(int),
        'fn': defaultdict(int)
    }
    
    for iou_thresh in iou_thresholds:
        for img_dets, img_annots in zip(detections, annotations):
            for cls_id in range(3):
                gt_boxes = [b[:4] for b in img_annots.get(cls_id, [])]
                det_boxes = [b[:5] for b in img_dets.get(cls_id, [])]
                
                matched_gt = np.zeros(len(gt_boxes), dtype=bool)
                for det in det_boxes:
                    if len(gt_boxes) == 0:
                        metrics['fp'][iou_thresh] += 1
                        continue
                    
                    ious = box_iou(torch.tensor([det[:4]]), torch.tensor(gt_boxes))
                    max_iou, max_idx = torch.max(ious, dim=1)
                    max_iou = max_iou.item()
                    max_idx = max_idx.item()

                    if max_iou >= iou_thresh and not matched_gt[max_idx]:
                        metrics['tp'][iou_thresh] += 1
                        matched_gt[max_idx] = True
                    else:
                        metrics['fp'][iou_thresh] += 1
                
                metrics['fn'][iou_thresh] += np.sum(~matched_gt)
    
    results = {}
    iou_5095 = np.arange(0.5, 1.0, 0.05)
    
    aps = []
    for thresh in iou_5095:
        tp = metrics['tp'][thresh]
        fp = metrics['fp'][thresh]
        fn = metrics['fn'][thresh]
        ap = tp / (tp + fp + 1e-6)
        aps.append(ap)
    results['map5095'] = np.mean(aps)
    
    results['map50'] = metrics['tp'][0.5] / (metrics['tp'][0.5] + metrics['fp'][0.5] + 1e-6)
    
    total_tp = sum(metrics['tp'].values())
    total_fp = sum(metrics['fp'].values())
    total_fn = sum(metrics['fn'].values())
    
    results['precision'] = total_tp / (total_tp + total_fp + 1e-6)
    results['recall'] = total_tp / (total_tp + total_fn + 1e-6)
    results['f1'] = 2 * (results['precision'] * results['recall']) / (results['precision'] + results['recall'] + 1e-6)
    
    return results

def compute_map(model, dataloader, grid_size=7, num_classes=3):
    model.eval()
    all_detections = []
    all_annotations = []
    
    with torch.no_grad():
        for images, targets in dataloader:
            annotations = convert_targets_to_annotations(targets, grid_size)
            all_annotations.extend(annotations)
            
            preds = model(images)
            detections = process_predictions(preds, grid_size, num_classes)
            all_detections.extend(detections)
    
    return compute_metrics(all_detections, all_annotations, iou_thresholds=[0.5] + list(np.arange(0.5, 1.0, 0.05)))

def convert_targets_to_annotations(targets, grid_size):
    annotations = []
    for batch_idx in range(targets.size(0)):
        image_annotations = defaultdict(list)
        for i in range(grid_size):
            for j in range(grid_size):
                if targets[batch_idx, i, j, 0] == 1:
                    x_center = (j + targets[batch_idx, i, j, 1]) / grid_size
                    y_center = (i + targets[batch_idx, i, j, 2]) / grid_size
                    width = targets[batch_idx, i, j, 3]
                    height = targets[batch_idx, i, j, 4]
                    class_id = torch.argmax(targets[batch_idx, i, j, 5:])
                    
                    box = [
                        x_center - width/2,
                        y_center - height/2,
                        x_center + width/2,
                        y_center + height/2,
                        1.0,
                        class_id.item()
                    ]
                    image_annotations[class_id.item()].append(box)
        annotations.append(image_annotations)
    return annotations

def process_predictions(preds, grid_size, num_classes, confidence_threshold=0.5):
    detections = []
    for batch_idx in range(preds.size(0)):
        image_detections = defaultdict(list)
        
        for i in range(grid_size):
            for j in range(grid_size):
                confidence = preds[batch_idx, i, j, 0]
                if confidence < confidence_threshold:
                    continue
                
                x_center = (j + preds[batch_idx, i, j, 1]) / grid_size
                y_center = (i + preds[batch_idx, i, j, 2]) / grid_size
                width = preds[batch_idx, i, j, 3]
                height = preds[batch_idx, i, j, 4]
                
                class_probs = preds[batch_idx, i, j, 5:5+num_classes]
                class_id = torch.argmax(class_probs)
                class_confidence = class_probs[class_id]
                
                box = [
                    x_center - width/2,
                    y_center - height/2,
                    x_center + width/2,
                    y_center + height/2,
                    confidence * class_confidence,
                    class_id.item()
                ]
                image_detections[class_id.item()].append(box)
        
        for class_id, boxes in image_detections.items():
            boxes = sorted(boxes, key=lambda x: x[4], reverse=True)
            filtered_boxes = []
            while boxes:
                best = boxes.pop(0)
                filtered_boxes.append(best)
                boxes = [box for box in boxes if 
                        box_iou(torch.tensor([best[:4]]), 
                                 torch.tensor([box[:4]]))[0][0] < 0.5]
            image_detections[class_id] = filtered_boxes
        
        detections.append(image_detections)
    return detections

def compute_ap(detections, annotations, iou_threshold):
    aps = {}
    for class_id in range(3):
        class_detections = []
        class_annotations = []
        
        for img_idx, (img_dets, img_annots) in enumerate(zip(detections, annotations)):
            gt_boxes = [box[:4] for box in img_annots.get(class_id, [])]
            det_boxes = [box[:5] for box in img_dets.get(class_id, [])]
            
            tp = np.zeros(len(det_boxes))
            fp = np.zeros(len(det_boxes))
            
            if len(gt_boxes) == 0:
                fp = np.ones(len(det_boxes))
            else:
                gt_tensor = torch.tensor(gt_boxes).view(-1, 4)
                
                for i, det in enumerate(det_boxes):
                    det_tensor = torch.tensor([det[:4]]).view(-1, 4)
                    
                    ious = box_iou(det_tensor, gt_tensor)
                    max_iou = torch.max(ious).item() if gt_tensor.size(0) > 0 else 0.0
                    
                    if max_iou >= iou_threshold:
                        tp[i] = 1
                        gt_tensor = gt_tensor[torch.argmax(ious) != torch.arange(gt_tensor.size(0))]
                    else:
                        fp[i] = 1

            scores = np.array([det[4] for det in det_boxes])
            sort_idx = np.argsort(-scores)
            tp = tp[sort_idx]
            fp = fp[sort_idx]
            
            class_detections.extend(zip(tp, fp))
            class_annotations.extend([1]*len(img_annots.get(class_id, [])))
        
        tp_fp = np.array(class_detections)
        if tp_fp.size == 0:
            ap = 0
        else:
            tp_cumsum = np.cumsum(tp_fp[:, 0])
            fp_cumsum = np.cumsum(tp_fp[:, 1])
            
            recalls = tp_cumsum / (len(class_annotations) + 1e-6)
            precisions = tp_cumsum / (tp_cumsum + fp_cumsum + 1e-6)
            
            ap = 0
            for t in np.linspace(0, 1, 11):
                mask = recalls >= t
                if np.any(mask):
                    ap += np.max(precisions[mask]) / 11
        
        aps[class_id] = ap
    return aps

def compute_iou(box1, box2):
    box1 = torch.tensor([box1[0] - box1[2]/2, box1[1] - box1[3]/2,
                         box1[0] + box1[2]/2, box1[1] + box1[3]/2])
    box2 = torch.tensor([box2[0] - box2[2]/2, box2[1] - box2[3]/2,
                         box2[0] + box2[2]/2, box2[1] + box2[3]/2])
    return box_iou(box1.unsqueeze(0), box2.unsqueeze(0)).item()

def collate_fn(batch):
    images = torch.stack([item[0] for item in batch])
    labels = torch.stack([item[1] for item in batch])
    return images, labels

def train(model, train_dataloader, val_dataloader, optimizer, epochs=10, grid_size=7, num_classes=3):
    best_map = 0.0
    history = {'precision': [], 'recall': [], 'f1': [], 'map50': [], 'map5095': []}
    
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        total_objects = 0
        correct_boxes = 0
        
        for imgs, targets in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            preds = model(imgs)
            
            batch_size = preds.size(0)
            preds = preds.view(batch_size, grid_size * grid_size, -1)
            targets = targets.view(batch_size, grid_size * grid_size, -1)
            
            pred_obj = preds[..., 0]
            pred_box = preds[..., 1:5]
            pred_cls = preds[..., 5:5+num_classes]
            
            target_obj = targets[..., 0]
            target_box = targets[..., 1:5]
            target_cls = targets[..., 5:5+num_classes]
            
            obj_mask = target_obj == 1
            no_obj_mask = target_obj == 0
            
            obj_loss = nn.BCELoss()(pred_obj[obj_mask], target_obj[obj_mask])
            no_obj_loss = nn.BCELoss()(pred_obj[no_obj_mask], target_obj[no_obj_mask])
            coord_loss = nn.MSELoss()(pred_box[obj_mask], target_box[obj_mask])
            class_loss = nn.BCELoss()(pred_cls[obj_mask], target_cls[obj_mask])
            
            total_loss = 5*coord_loss + obj_loss + 0.5*no_obj_loss + class_loss
            total_loss.backward()
            optimizer.step()
            
            epoch_loss += total_loss.item()
            total_objects += obj_mask.sum().item()
            
            with torch.no_grad():
                for i in range(batch_size):
                    for j in range(grid_size * grid_size):
                        if obj_mask[i, j]:
                            pred_b = pred_box[i, j]
                            true_b = target_box[i, j]
                            if compute_iou(pred_b, true_b) > 0.5:
                                correct_boxes += 1
        
        model.eval()
        metrics = compute_map(model, val_dataloader, grid_size, num_classes)
                
        for key in history.keys():
            history[key].append(metrics[key])
        
        print(f"\nEpoch {epoch+1}")
        print(f"Precision: {metrics['precision']:.4f} | Recall: {metrics['recall']:.4f} | F1: {metrics['f1']:.4f}")
        print(f"mAP@0.5: {metrics['map50']:.4f} | mAP@0.5:0.95: {metrics['map5095']:.4f}")
        
        if metrics['map5095'] > best_map:
            best_map = metrics['map5095']
    
    return history

grid_size = 7
num_classes = 3
img_size = 416
batch_size = 8
lr = 0.001
epochs = 20

train_ds = YOLODataset('bccd_yolo/images/train', 'bccd_yolo/labels/train', img_size, grid_size)
val_ds = YOLODataset('bccd_yolo/images/val', 'bccd_yolo/labels/val', img_size, grid_size)

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dl = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_fn)

model = YOLOLike(grid_size=grid_size, num_classes=num_classes)
optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=1e-4)

train(model, train_dl, val_dl, optimizer, epochs, grid_size, num_classes)

Epoch 1/20: 100%|██████████| 37/37 [00:15<00:00,  2.37it/s]



Epoch 1
Precision: 0.0204 | Recall: 0.0759 | F1: 0.0321
mAP@0.5: 0.0654 | mAP@0.5:0.95: 0.0159


Epoch 2/20: 100%|██████████| 37/37 [00:13<00:00,  2.68it/s]



Epoch 2
Precision: 0.0188 | Recall: 0.0700 | F1: 0.0296
mAP@0.5: 0.0604 | mAP@0.5:0.95: 0.0146


Epoch 3/20: 100%|██████████| 37/37 [00:14<00:00,  2.52it/s]



Epoch 3
Precision: 0.0441 | Recall: 0.1480 | F1: 0.0679
mAP@0.5: 0.1267 | mAP@0.5:0.95: 0.0358


Epoch 4/20: 100%|██████████| 37/37 [00:13<00:00,  2.65it/s]



Epoch 4
Precision: 0.0629 | Recall: 0.1941 | F1: 0.0950
mAP@0.5: 0.1685 | mAP@0.5:0.95: 0.0523


Epoch 5/20: 100%|██████████| 37/37 [00:14<00:00,  2.61it/s]



Epoch 5
Precision: 0.0455 | Recall: 0.1680 | F1: 0.0716
mAP@0.5: 0.1238 | mAP@0.5:0.95: 0.0377


Epoch 6/20: 100%|██████████| 37/37 [00:14<00:00,  2.62it/s]



Epoch 6
Precision: 0.0463 | Recall: 0.1685 | F1: 0.0726
mAP@0.5: 0.1286 | mAP@0.5:0.95: 0.0380


Epoch 7/20: 100%|██████████| 37/37 [00:13<00:00,  2.68it/s]



Epoch 7
Precision: 0.0490 | Recall: 0.1400 | F1: 0.0725
mAP@0.5: 0.1458 | mAP@0.5:0.95: 0.0393


Epoch 8/20: 100%|██████████| 37/37 [00:14<00:00,  2.60it/s]



Epoch 8
Precision: 0.0364 | Recall: 0.1142 | F1: 0.0552
mAP@0.5: 0.1047 | mAP@0.5:0.95: 0.0296


Epoch 9/20: 100%|██████████| 37/37 [00:13<00:00,  2.68it/s]



Epoch 9
Precision: 0.0373 | Recall: 0.1025 | F1: 0.0546
mAP@0.5: 0.1058 | mAP@0.5:0.95: 0.0304


Epoch 10/20: 100%|██████████| 37/37 [00:15<00:00,  2.37it/s]



Epoch 10
Precision: 0.0387 | Recall: 0.1121 | F1: 0.0575
mAP@0.5: 0.1082 | mAP@0.5:0.95: 0.0317


Epoch 11/20: 100%|██████████| 37/37 [00:16<00:00,  2.20it/s]



Epoch 11
Precision: 0.0954 | Recall: 0.2054 | F1: 0.1303
mAP@0.5: 0.2437 | mAP@0.5:0.95: 0.0806


Epoch 12/20: 100%|██████████| 37/37 [00:15<00:00,  2.45it/s]



Epoch 12
Precision: 0.1076 | Recall: 0.1965 | F1: 0.1391
mAP@0.5: 0.2678 | mAP@0.5:0.95: 0.0916


Epoch 13/20: 100%|██████████| 37/37 [00:16<00:00,  2.27it/s]



Epoch 13
Precision: 0.1174 | Recall: 0.2595 | F1: 0.1617
mAP@0.5: 0.2836 | mAP@0.5:0.95: 0.1008


Epoch 14/20: 100%|██████████| 37/37 [00:16<00:00,  2.29it/s]



Epoch 14
Precision: 0.1244 | Recall: 0.2758 | F1: 0.1715
mAP@0.5: 0.2819 | mAP@0.5:0.95: 0.1087


Epoch 15/20: 100%|██████████| 37/37 [00:14<00:00,  2.60it/s]



Epoch 15
Precision: 0.1396 | Recall: 0.2581 | F1: 0.1812
mAP@0.5: 0.3215 | mAP@0.5:0.95: 0.1214


Epoch 16/20: 100%|██████████| 37/37 [00:14<00:00,  2.61it/s]



Epoch 16
Precision: 0.1235 | Recall: 0.2584 | F1: 0.1672
mAP@0.5: 0.2941 | mAP@0.5:0.95: 0.1065


Epoch 17/20: 100%|██████████| 37/37 [00:14<00:00,  2.58it/s]



Epoch 17
Precision: 0.1306 | Recall: 0.2725 | F1: 0.1766
mAP@0.5: 0.3048 | mAP@0.5:0.95: 0.1132


Epoch 18/20: 100%|██████████| 37/37 [00:15<00:00,  2.34it/s]



Epoch 18
Precision: 0.1487 | Recall: 0.2489 | F1: 0.1862
mAP@0.5: 0.3421 | mAP@0.5:0.95: 0.1294


Epoch 19/20: 100%|██████████| 37/37 [00:15<00:00,  2.42it/s]



Epoch 19
Precision: 0.1240 | Recall: 0.2768 | F1: 0.1713
mAP@0.5: 0.2938 | mAP@0.5:0.95: 0.1070


Epoch 20/20: 100%|██████████| 37/37 [00:16<00:00,  2.29it/s]



Epoch 20
Precision: 0.1414 | Recall: 0.2160 | F1: 0.1709
mAP@0.5: 0.3406 | mAP@0.5:0.95: 0.1215


{'precision': [0.020357333468362077,
  0.018756194880962813,
  0.04408483718704045,
  0.06287802032289834,
  0.045518764929859006,
  0.04626158492017438,
  0.048951048949426604,
  0.03641803674938292,
  0.03725273967753626,
  0.03865760407690159,
  0.09544573642990462,
  0.10762681064806465,
  0.11742245484073008,
  0.12444919785563853,
  0.1396195457037574,
  0.12353847547704438,
  0.1306165099209206,
  0.14873973377803795,
  0.12398624261712937,
  0.14142103629280664],
 'recall': [0.07593136789497285,
  0.06995923783581769,
  0.1479761114657336,
  0.19414162478015531,
  0.1679780073781422,
  0.1684519859542656,
  0.14001327138686004,
  0.11422883684574567,
  0.10247416815788471,
  0.1121433311108026,
  0.20542231489189283,
  0.19651151766077243,
  0.2594558725699634,
  0.27576073558860925,
  0.25812873255681784,
  0.25841311970249187,
  0.27253768127096994,
  0.24893354818002336,
  0.2768034884560808,
  0.21603943499705758],
 'f1': [0.03210644866607587,
  0.029581196141220182,
  0.06

#### Трансформерная модель

Используем улучшение из 1 гипотезы - увеличим количество эпох

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
from torchvision.models import resnet18
from torchvision.ops import box_iou
from scipy.optimize import linear_sum_assignment
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import numpy as np

device = torch.device("cpu")
CLASSES = ['RBC', 'WBC', 'Platelets']
NUM_CLASSES = len(CLASSES)
IMG_SIZE = 256
MAX_DETECTIONS = 20

class YOLODetectionDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, label_dir, transform=None):
        self.img_dir = Path(img_dir)
        self.label_dir = Path(label_dir)
        self.transform = transform or T.Compose([
            T.Resize((IMG_SIZE, IMG_SIZE)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self.images = sorted(self.img_dir.glob('*.jpg'))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        label_path = self.label_dir / f"{img_path.stem}.txt"

        img = Image.open(img_path).convert("RGB")
        orig_w, orig_h = img.size
        img_tensor = self.transform(img)

        boxes = []
        labels = []
        if label_path.exists():
            with open(label_path) as f:
                for line in f:
                    cls, cx, cy, bw, bh = map(float, line.strip().split())
                    
                    scale_x = IMG_SIZE / orig_w
                    scale_y = IMG_SIZE / orig_h
                    
                    x1 = (cx - bw/2) * scale_x
                    y1 = (cy - bh/2) * scale_y
                    x2 = (cx + bw/2) * scale_x
                    y2 = (cy + bh/2) * scale_y
                    
                    boxes.append([x1, y1, x2, y2])
                    labels.append(int(cls))

        return {
            'image': img_tensor,
            'boxes': torch.tensor(boxes, dtype=torch.float32) if boxes else torch.zeros((0, 4)),
            'labels': torch.tensor(labels, dtype=torch.long) if labels else torch.zeros(0, dtype=torch.long)
        }

def collate_fn(batch):
    return {
        'image': torch.stack([x['image'] for x in batch]),
        'boxes': [x['boxes'] for x in batch],
        'labels': [x['labels'] for x in batch]
    }

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        layers = []
        for _ in range(num_layers-1):
            layers.extend([
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU()
            ])
            input_dim = hidden_dim
        layers.append(nn.Linear(hidden_dim, output_dim))
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

class DETRLike(nn.Module):
    def __init__(self, num_classes, num_queries=MAX_DETECTIONS, hidden_dim=256):
        super().__init__()
        
        self.backbone = nn.Sequential(*list(resnet18(pretrained=False).children())[:-2])
        self.conv = nn.Conv2d(512, hidden_dim, 1)
        
        self.encoder_pos = nn.Parameter(torch.randn(1, hidden_dim, IMG_SIZE//32, IMG_SIZE//32))
        self.decoder_pos = nn.Parameter(torch.randn(num_queries, hidden_dim))
        
        self.transformer = nn.Transformer(
            d_model=hidden_dim,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=2048,
            dropout=0.1
        )
        
        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)

    def forward(self, x):
        features = self.backbone(x)
        src = self.conv(features)
        
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_enc = self.encoder_pos.expand(bs, -1, h, w).flatten(2).permute(2, 0, 1)
        src = src + pos_enc
        
        query_embed = self.decoder_pos.unsqueeze(1).repeat(1, bs, 1)
        
        hs = self.transformer(
            src=src,
            tgt=query_embed,
            src_key_padding_mask=None,
            tgt_key_padding_mask=None,
            memory_key_padding_mask=None
        )
        
        outputs_class = self.class_embed(hs)
        outputs_coord = self.bbox_embed(hs).sigmoid()
        
        return outputs_class.permute(1, 0, 2), outputs_coord.permute(1, 0, 2)

class HungarianMatcher(nn.Module):
    def __init__(self, cost_class=1, cost_bbox=5, cost_giou=2):
        super().__init__()
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou

    @torch.no_grad()
    def forward(self, outputs, targets):
        bs, num_queries = outputs["pred_logits"].shape[:2]
        
        indices = []
        for i in range(bs):
            out_prob = outputs["pred_logits"][i].softmax(-1)
            out_bbox = outputs["pred_boxes"][i]

            tgt_bbox = targets[i]["boxes"]
            tgt_ids = targets[i]["labels"]
            
            cost_class = -out_prob[:, tgt_ids]
            
            cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
            
            C = self.cost_bbox * cost_bbox + self.cost_class * cost_class
            C = C.reshape(num_queries, -1).cpu()
            
            indices.append(linear_sum_assignment(C))
        
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]

def loss_fn(outputs, targets):
    pred_logits, pred_boxes = outputs
    
    matcher = HungarianMatcher()
    indices = matcher({"pred_logits": pred_logits, "pred_boxes": pred_boxes}, targets)
    
    src_logits = torch.cat([pred_logits[i][idx] for i, (idx, _) in enumerate(indices)])
    target_classes = torch.cat([t["labels"][j] for t, (_, j) in zip(targets, indices)])
    loss_cls = F.cross_entropy(src_logits, target_classes)
    
    src_boxes = torch.cat([pred_boxes[i][idx] for i, (idx, _) in enumerate(indices)])
    target_boxes = torch.cat([t["boxes"][j] for t, (_, j) in zip(targets, indices)])
    loss_bbox = F.l1_loss(src_boxes, target_boxes)
    
    return loss_cls + 5 * loss_bbox

def train(model, dataloader, optimizer, epochs=50):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
            images = batch['image'].to(device)
            targets = [
                {"boxes": b.to(device), "labels": l.to(device)}
                for b, l in zip(batch['boxes'], batch['labels'])
            ]
            
            outputs = model(images)
            loss = loss_fn(outputs, targets)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")

def evaluate(model, dataloader, conf_thresh=0.5):
    model.eval()
    results = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            images = batch['image'].to(device)
            outputs = model(images)
            pred_logits, pred_boxes = outputs
            
            probs = F.softmax(pred_logits, dim=-1)
            scores, labels = torch.max(probs, dim=-1)
            
            for i in range(images.size(0)):
                keep = scores[i] > conf_thresh
                pred = {
                    'boxes': pred_boxes[i][keep].cpu(),
                    'scores': scores[i][keep].cpu(),
                    'labels': labels[i][keep].cpu()
                }
                results.append((pred, batch['boxes'][i], batch['labels'][i]))
    
    aps = []
    tp_total = fp_total = fn_total = 0
    
    for iou_threshold in np.linspace(0.5, 0.95, 10):
        tp = fp = fn = 0
        
        for pred, gt_boxes, gt_labels in results:
            pred_boxes = pred['boxes']
            pred_labels = pred['labels']
            
            if len(pred_boxes) == 0:
                fn += len(gt_labels)
                continue
                
            ious = box_iou(pred_boxes, gt_boxes)
            
            matches = (ious > iou_threshold) & (pred_labels.unsqueeze(1) == gt_labels)
            matched_gt = set()
            matched_pred = set()
            
            for pred_idx, gt_idx in zip(*torch.where(matches)):
                if gt_idx not in matched_gt and pred_idx not in matched_pred:
                    matched_gt.add(gt_idx.item())
                    matched_pred.add(pred_idx.item())
            
            cur_tp = len(matched_gt)
            cur_fp = len(pred_boxes) - len(matched_pred)
            cur_fn = len(gt_labels) - len(matched_gt)
            
            if iou_threshold == 0.5:
                tp_total += cur_tp
                fp_total += cur_fp
                fn_total += cur_fn
            
            tp += cur_tp
            fp += cur_fp
            fn += cur_fn
        
        precision = tp / (tp + fp + 1e-6)
        recall = tp / (tp + fn + 1e-6)
        aps.append(precision)
    
    precision = tp_total / (tp_total + fp_total + 1e-6)
    recall = tp_total / (tp_total + fn_total + 1e-6)
    f1 = 2 * precision * recall / (precision + recall + 1e-6)
    
    map50 = np.mean(aps[:1])
    map5095 = np.mean(aps)
    
    print(f"Precision@0.5: {precision:.4f}")
    print(f"Recall@0.5: {recall:.4f}")
    print(f"F1@0.5: {f1:.4f}")
    print(f"mAP@0.5: {map50:.4f}, mAP@0.5:0.95: {map5095:.4f}")

def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")

model = DETRLike(NUM_CLASSES).to(device)
model.apply(init_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

train_dataset = YOLODetectionDataset("bccd_yolo/images/train", "bccd_yolo/labels/train")
val_dataset = YOLODetectionDataset("bccd_yolo/images/val", "bccd_yolo/labels/val")

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=4, collate_fn=collate_fn
)

train(model, train_loader, optimizer, epochs=40)
evaluate(model, val_loader)

Epoch 1: 100%|██████████| 37/37 [00:20<00:00,  1.82it/s]


Epoch 1, Loss: 1.0418


Epoch 2: 100%|██████████| 37/37 [00:19<00:00,  1.90it/s]


Epoch 2, Loss: 0.7015


Epoch 3: 100%|██████████| 37/37 [00:20<00:00,  1.84it/s]


Epoch 3, Loss: 0.6439


Epoch 4: 100%|██████████| 37/37 [00:19<00:00,  1.90it/s]


Epoch 4, Loss: 0.6309


Epoch 5: 100%|██████████| 37/37 [00:18<00:00,  1.96it/s]


Epoch 5, Loss: 0.5579


Epoch 6: 100%|██████████| 37/37 [00:18<00:00,  1.99it/s]


Epoch 6, Loss: 0.5485


Epoch 7: 100%|██████████| 37/37 [00:18<00:00,  1.97it/s]


Epoch 7, Loss: 0.5133


Epoch 8: 100%|██████████| 37/37 [00:19<00:00,  1.88it/s]


Epoch 8, Loss: 0.4926


Epoch 9: 100%|██████████| 37/37 [00:19<00:00,  1.89it/s]


Epoch 9, Loss: 0.4747


Epoch 10: 100%|██████████| 37/37 [00:19<00:00,  1.85it/s]


Epoch 10, Loss: 0.4699


Epoch 11: 100%|██████████| 37/37 [00:19<00:00,  1.91it/s]


Epoch 11, Loss: 0.4424


Epoch 12: 100%|██████████| 37/37 [00:20<00:00,  1.80it/s]


Epoch 12, Loss: 0.4510


Epoch 13: 100%|██████████| 37/37 [00:18<00:00,  1.96it/s]


Epoch 13, Loss: 0.4054


Epoch 14: 100%|██████████| 37/37 [00:19<00:00,  1.91it/s]


Epoch 14, Loss: 0.4107


Epoch 15: 100%|██████████| 37/37 [00:19<00:00,  1.94it/s]


Epoch 15, Loss: 0.3830


Epoch 16: 100%|██████████| 37/37 [00:19<00:00,  1.89it/s]


Epoch 16, Loss: 0.3795


Epoch 17: 100%|██████████| 37/37 [00:18<00:00,  2.03it/s]


Epoch 17, Loss: 0.3672


Epoch 18: 100%|██████████| 37/37 [00:19<00:00,  1.88it/s]


Epoch 18, Loss: 0.3592


Epoch 19: 100%|██████████| 37/37 [00:19<00:00,  1.86it/s]


Epoch 19, Loss: 0.3653


Epoch 20: 100%|██████████| 37/37 [00:18<00:00,  1.99it/s]


Epoch 20, Loss: 0.3534


Epoch 21: 100%|██████████| 37/37 [00:18<00:00,  2.00it/s]


Epoch 21, Loss: 0.3368


Epoch 22: 100%|██████████| 37/37 [00:19<00:00,  1.94it/s]


Epoch 22, Loss: 0.3414


Epoch 23: 100%|██████████| 37/37 [00:18<00:00,  2.01it/s]


Epoch 23, Loss: 0.3520


Epoch 24: 100%|██████████| 37/37 [00:19<00:00,  1.89it/s]


Epoch 24, Loss: 0.3177


Epoch 25: 100%|██████████| 37/37 [00:18<00:00,  1.97it/s]


Epoch 25, Loss: 0.3405


Epoch 26: 100%|██████████| 37/37 [00:19<00:00,  1.86it/s]


Epoch 26, Loss: 0.3132


Epoch 27: 100%|██████████| 37/37 [00:20<00:00,  1.85it/s]


Epoch 27, Loss: 0.2952


Epoch 28: 100%|██████████| 37/37 [00:19<00:00,  1.89it/s]


Epoch 28, Loss: 0.2721


Epoch 29: 100%|██████████| 37/37 [00:21<00:00,  1.76it/s]


Epoch 29, Loss: 0.2689


Epoch 30: 100%|██████████| 37/37 [00:20<00:00,  1.80it/s]


Epoch 30, Loss: 0.2624


Epoch 31: 100%|██████████| 37/37 [00:17<00:00,  2.06it/s]


Epoch 31, Loss: 0.2661


Epoch 32: 100%|██████████| 37/37 [00:19<00:00,  1.91it/s]


Epoch 32, Loss: 0.2702


Epoch 33: 100%|██████████| 37/37 [00:19<00:00,  1.93it/s]


Epoch 33, Loss: 0.2662


Epoch 34: 100%|██████████| 37/37 [00:18<00:00,  2.04it/s]


Epoch 34, Loss: 0.2644


Epoch 35: 100%|██████████| 37/37 [00:18<00:00,  2.00it/s]


Epoch 35, Loss: 0.2595


Epoch 36: 100%|██████████| 37/37 [00:20<00:00,  1.82it/s]


Epoch 36, Loss: 0.2310


Epoch 37: 100%|██████████| 37/37 [00:20<00:00,  1.85it/s]


Epoch 37, Loss: 0.2460


Epoch 38: 100%|██████████| 37/37 [00:18<00:00,  1.99it/s]


Epoch 38, Loss: 0.2423


Epoch 39: 100%|██████████| 37/37 [00:19<00:00,  1.86it/s]


Epoch 39, Loss: 0.2321


Epoch 40: 100%|██████████| 37/37 [00:20<00:00,  1.82it/s]


Epoch 40, Loss: 0.2215


Evaluating: 100%|██████████| 19/19 [00:01<00:00, 10.76it/s]

Precision@0.5: 0.1272
Recall@0.5: 0.1884
F1@0.5: 0.1519
mAP@0.5: 0.1272, mAP@0.5:0.95: 0.0358





### Выводы

После применения улучшений, аналогичных тем, что использовались для финального бейзлайна моделей из библиотеки ultralytics, удалось добиться заметного прироста качества у самописных моделей детекции объектов. Улучшения включали увеличение числа эпох обучения, а также оптимизацию модели на уровне данных и архитектуры, в зависимости от типа модели.

Сверточная модель, построенная по аналогии с yolo, продемонстрировала наилучший прирост после применения обеих гипотез улучшения. По сравнению с изначальными результатами (Precision: 0.0790, Recall: 0.2073, F1: 0.1144, mAP@0.5: 0.2062, mAP@0.5:0.95: 0.0663), улучшенная версия модели показала значительный рост по всем ключевым метрикам: Precision увеличился до 0.1414, Recall - до 0.2160, F1 - до 0.1709, а значение mAP@0.5 достигло 0.3406, что почти в два раза превышает исходный результат. Также заметно выросла и сложная метрика mAP@0.5:0.95 - с 0.0663 до 0.1215.

Аналогичная динамика наблюдается и в трансформерной модели. Изначальные значения метрик (Precision: 0.0892, Recall: 0.1317, F1: 0.1064, mAP@0.5: 0.0892, mAP@0.5:0.95: 0.0232) выросли до Precision: 0.1272, Recall: 0.1884, F1: 0.1519, mAP@0.5: 0.1272 и mAP@0.5:0.95: 0.0358 после увеличения количества эпох. Хотя абсолютные значения остаются ниже, чем у сверточной модели, прирост у трансформерной модели также заметен и подтверждает эффективность гипотезы.

Тем не менее, по сравнению с финальными улучшенными моделями из ultralytics (где yolo11n достигал mAP@0.5: 0.9083, а rtdetr-l - mAP@0.5: 0.8310), самописные версии всё ещё существенно отстают. Однако полученные результаты показывают, что применяемые техники улучшения действительно работают, а модели реагируют на гиперпараметры и настройки предсказуемо.

Таким образом, пусть абсолютные значения пока остаются низкими, заметный рост метрик после применения улучшений подтверждает работоспособность реализованных моделей.