# Fine-tuning YOLO11m

In [11]:
from ultralytics import YOLO
import os

# Load pretrained YOLO11 weights as the starting point for fine-tuning.
# NOTE(review): the notebook title refers to the medium (m) variant, but the
# checkpoint loaded here is "yolo11l.pt" — confirm which one is intended.
model = YOLO("yolo11l.pt")

# The dataset YAML is referenced by its absolute path.
yaml_path = "/home/joey/Projects/chatbrain/backend/vision_model/chat_dataset.yaml"

# Fail fast if the YAML, or the image folders that sit beside it, are missing.
assert os.path.exists(yaml_path), "YAML file not found!"
dataset_root = os.path.dirname(yaml_path)
for images_subdir, missing_msg in (
    ("dataset/train/images", "Train images missing!"),
    ("dataset/val/images", "Val images missing!"),
):
    assert os.path.exists(os.path.join(dataset_root, images_subdir)), missing_msg

# Training settings gathered in one place so they are easy to review and tweak.
train_settings = dict(
    data=yaml_path,
    epochs=100,
    time=9,  # maximum training time in hours
    device="0",
    patience=10,  # early-stopping patience
    batch=4,  # batch size had to be reduced dramatically
    multi_scale=False,
    freeze=15,  # freeze the first 15 layers
    mosaic=0.5,  # lowered to reduce augmentation memory usage
    imgsz=640,  # image size
    cache=True,  # cache images for faster training
)
results = model.train(**train_settings)

Ultralytics 8.3.67 🚀 Python-3.10.12 torch-2.5.1+cu124 CUDA:0 (NVIDIA GeForce GTX 1060 3GB, 3004MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolo11l.pt, data=/home/joey/Projects/chatbrain/backend/vision_model/chat_dataset.yaml, epochs=100, time=9, patience=10, batch=4, imgsz=640, save=True, save_period=-1, cache=True, device=0, workers=8, project=None, name=train17, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=15, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=Fal

[34m[1mtrain: [0mScanning /home/joey/Projects/chatbrain/backend/vision_model/dataset/train/labels.cache... 309 images, 46 backgrounds, 0 corrupt: 100%|██████████| 309/309 [00:00<?, ?it/s]








[34m[1mtrain: [0mCaching images (0.2GB RAM): 100%|██████████| 309/309 [00:01<00:00, 187.46it/s]
[34m[1mval: [0mScanning /home/joey/Projects/chatbrain/backend/vision_model/dataset/val/labels.cache... 78 images, 16 backgrounds, 0 corrupt: 100%|██████████| 78/78 [00:00<?, ?it/s]








[34m[1mval: [0mCaching images (0.1GB RAM): 100%|██████████| 78/78 [00:00<00:00, 157.18it/s]


Plotting labels to /home/joey/Projects/chatbrain/runs/detect/train17/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.001429, momentum=0.9) with parameter groups 167 weight(decay=0.0), 174 weight(decay=0.0005), 173 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 6 dataloader workers
Logging results to [1m/home/joey/Projects/chatbrain/runs/detect/train17[0m
Starting training for 9 hours...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      1/100      1.75G      2.075      2.609      1.497         70        640:  26%|██▌       | 20/78 [00:18<00:54,  1.07it/s]


KeyboardInterrupt: 

#### Sort detected boxes vertically to get message order

In [6]:
def process_results(results, contact_cls=0):
    """Turn the first detection result into ordered ``[ax, ay, bx, by, type, num]`` rows.

    Only ``results[0]`` is inspected. Each box contributes its integer class
    id (``int(box.cls)``) and its corner coordinates (``box.xyxy[0].tolist()``).

    Boxes whose class equals ``contact_cls`` are emitted first, in detection
    order, with ``num = 0``. All remaining boxes are treated as messages:
    they are sorted by their top Y-coordinate (``ay``) and numbered
    ``1, 2, 3, ...`` from top to bottom.

    Args:
        results: Sequence of result objects exposing a ``boxes`` attribute.
        contact_cls: Class id treated as a contact box. Defaults to ``0``.

    Returns:
        A list of ``[ax, ay, bx, by, type, num]`` lists. The list is empty
        when ``results`` is ``None``/empty or the first result has no boxes.
    """
    # Nothing to process: avoid IndexError on results[0].
    if results is None or len(results) == 0:
        return []

    detections = results[0].boxes
    # A result without any box container would otherwise raise on iteration.
    if detections is None:
        return []

    # Split boxes into contacts and messages in a single pass.
    contacts = []
    messages = []
    for box in detections:
        cls = int(box.cls)
        coords = box.xyxy[0].tolist()  # [ax, ay, bx, by]
        if cls == contact_cls:
            contacts.append((cls, coords))
        else:
            messages.append((cls, coords))

    # Order messages top-to-bottom by ay; the sort is stable, so boxes with
    # an equal ay keep their detection order.
    messages.sort(key=lambda item: item[1][1])

    # Contacts: type = contact_cls, num = 0.
    rows = [[ax, ay, bx, by, contact_cls, 0] for _, (ax, ay, bx, by) in contacts]

    # Messages: type = class id, num = 1-based vertical position.
    for num, (cls, (ax, ay, bx, by)) in enumerate(messages, 1):
        rows.append([ax, ay, bx, by, cls, num])

    return rows