In [1]:
# PASCAL VOC 2007数据集下载和转换
import os
import requests
import tarfile
from lxml import etree
import shutil

# ----------------------
# 配置参数
# ----------------------
VOC_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar"
VOC_TEST_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar"
DATA_DIR = "./dataset/"

# ----------------------
# 下载并解压数据集
# ----------------------
def download_and_extract(url, dest_dir):
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir, exist_ok=True)

    filename = os.path.join(dest_dir, url.split("/")[-1])

    # 下载文件
    if not os.path.exists(filename):
        print(f"Downloading {url}...")
        response = requests.get(url, stream=True)
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)

    # 解压文件
    print(f"Extracting {filename}...")
    with tarfile.open(filename) as tar:
        tar.extractall(path=dest_dir)



# 转换VOC格式到YOLO格式
# ----------------------
def convert_voc_to_yolo(voc_dir, output_dir):
    print(f"转换VOC格式到YOLO格式:{output_dir}")
    classes = [
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
    ]

    # 创建输出目录
    os.makedirs(os.path.join(output_dir, "labels"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "images"), exist_ok=True)

    # 处理所有标注文件
    for split in ["trainval", "test"]:
        with open(os.path.join(voc_dir, f"VOCdevkit/VOC2007/ImageSets/Main/{split}.txt")) as f:
            ids = [line.strip() for line in f.readlines()]

        for img_id in ids:
            # 解析XML标注
            ann_path = os.path.join(voc_dir, f"VOCdevkit/VOC2007/Annotations/{img_id}.xml")
            tree = etree.parse(ann_path)
            root = tree.getroot()

            # 获取图像尺寸
            size = root.find("size")
            width = int(size.find("width").text)
            height = int(size.find("height").text)

            # 转换每个对象
            yolo_ann = []
            for obj in root.iter("object"):
                cls = obj.find("name").text
                cls_id = classes.index(cls)

                bbox = obj.find("bndbox")
                xmin = float(bbox.find("xmin").text)
                ymin = float(bbox.find("ymin").text)
                xmax = float(bbox.find("xmax").text)
                ymax = float(bbox.find("ymax").text)

                # 转换为YOLO格式
                x_center = (xmin + xmax) / 2 / width
                y_center = (ymin + ymax) / 2 / height
                w = (xmax - xmin) / width
                h = (ymax - ymin) / height

                yolo_ann.append(f"{cls_id} {x_center} {y_center} {w} {h}")

            # 保存YOLO标注
            with open(os.path.join(output_dir, f"labels/{img_id}.txt"), "w") as f:
                f.write("\n".join(yolo_ann))

            # 复制图像（这里直接创建符号链接节省空间）
            src = os.path.join(voc_dir, f"VOCdevkit/VOC2007/JPEGImages/{img_id}.jpg")
            dst = os.path.join(output_dir, f"images/{img_id}.jpg")
            if not os.path.exists(dst):
                try:
                    shutil.copy2(src, dst)  # 如果失败则复制文件
                    # os.symlink(src, dst)  # 尝试创建符号链接
                except:
                    print(f"创建符号链接失败，复制文件: {src} -> {dst}")
    # 创建数据集配置文件
    with open("voc.yaml", "w") as f:
        names_str = '\n'.join([f"  {i}: {name}" for i, name in enumerate(classes)])
        f.write(
f"""path: {os.path.abspath(output_dir)}
train: images
val: images
test: images
names:
{names_str}
"""
                )

    print("转换完成！输出voc.yaml")
# 下载训练集和测试集
download_and_extract(VOC_URL, DATA_DIR)
download_and_extract(VOC_TEST_URL, DATA_DIR)
# 执行格式转换 voc->yolo
convert_voc_to_yolo(DATA_DIR, DATA_DIR+"/output")


Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar...
Extracting ./dataset/VOCtrainval_06-Nov-2007.tar...
Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar...
Extracting ./dataset/VOCtest_06-Nov-2007.tar...
转换VOC格式到YOLO格式:./dataset//output
转换完成！输出voc.yaml


In [2]:
"""
基于YOLO V8的PASCAL VOC 2007的目标检测任务
PASCAL VOC 2007 数据集是计算机视觉领域中一个著名的标准数据集，主要用于目标检测、图像分类和语义分割等任务。
该数据集包含 9963 张图片，分为训练集（5011 张）和测试集（4952 张），
涵盖 20 个类别，如飞机、自行车、鸟、船、瓶子、公共汽车、汽车、猫、椅子、牛、餐桌、狗、马、摩托车、人、盆栽、羊、沙发、火车和电视显示器。
其标注信息以 XML 格式存储，包含目标的边界框、类别标签等。该数据集是许多经典计算机视觉模型的训练和评估基准。

"""
!pip install ultralytics
# 使用预训练模型
from ultralytics import YOLO
from PIL import Image, ImageDraw, ImageFont
device = 'cuda'  # 使用GPU训练,可选cuda或cpu
epochs=15  # 训练轮数
batch_size=64  # 批处理大小
imgsize=640  # 输入图像大小
lr=0.001    # 学习率,可选0.001、0.01、0.1等
augmentation=True  # 是否使用数据增强,可选True或False,对数据集进行旋转、缩放、翻转等操作,防止过拟合
optimizer='AdamW'  # 优化器,可选SGD、Adam、AdamW等,区别:SGD是随机梯度下降,Adam是自适应学习率优化器,AdamW是Adam的改进版,具有更好的收敛性和泛化能力

Collecting ultralytics
  Downloading ultralytics-8.3.108-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [3]:
# 加载数据集

train_path = 'voc.yaml'  # 训练集路径
test_path = ''  # 测试集路径


In [4]:
!nvidia-smi
# 加载预训练模型
model = YOLO("baseModel/yolov8m.pt").to(device)  # 使用预训练模型
print("模型加载完成") if model else print("模型加载失败")

/bin/bash: line 1: nvidia-smi: command not found
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'baseModel/yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 92.9MB/s]

模型加载完成





In [None]:
# 训练模型
# 训练参数配置
model.train(
    data=train_path,
    epochs=20,              # 增加训练轮次
    batch=32,                # 根据GPU内存调整
    imgsz=640,
    device=0,
    optimizer='AdamW',
    lr0=0.01,
    lrf=0.1,
    momentum=0.937,
    weight_decay=0.0005,
    box=7.5,
    cls=0.5,
    dfl=1.5,
    warmup_epochs=5,
    warmup_momentum=0.8,
    warmup_bias_lr=0.1,
    augment=True,
    hsv_h=0.02,
    hsv_s=0.8,
    hsv_v=0.5,
    degrees=10.0,
    translate=0.2,
    scale=0.9,
    shear=5.0,
    perspective=0.001,
    flipud=0.2,
    fliplr=0.5,
    mosaic=1.0,
    mixup=0.2,
    copy_paste=0.2,
    conf=0.25,
    iou=0.7,
    max_det=300,
    save=True,
    exist_ok=True
)

Ultralytics 8.3.108 🚀 Python-3.11.12 torch-2.6.0+cu124 CPU (AMD EPYC 7B12)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=baseModel/yolov8n.pt, data=voc.yaml, epochs=20, time=None, patience=100, batch=32, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train, exist_ok=True, pretrained=True, optimizer=AdamW, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=0.25, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=True, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None,

100%|██████████| 755k/755k [00:00<00:00, 19.5MB/s]


Overriding model.yaml nc=80 with nc=20

                   from  n    params  module                                       arguments                     
  0                  -1  1       464  ultralytics.nn.modules.conv.Conv             [3, 16, 3, 2]                 
  1                  -1  1      4672  ultralytics.nn.modules.conv.Conv             [16, 32, 3, 2]                
  2                  -1  1      7360  ultralytics.nn.modules.block.C2f             [32, 32, 1, True]             
  3                  -1  1     18560  ultralytics.nn.modules.conv.Conv             [32, 64, 3, 2]                
  4                  -1  2     49664  ultralytics.nn.modules.block.C2f             [64, 64, 2, True]             
  5                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  6                  -1  2    197632  ultralytics.nn.modules.block.C2f             [128, 128, 2, True]           
  7                  -1  1    295424  ultralytic

[34m[1mtrain: [0mScanning /content/dataset/output/labels... 9963 images, 0 backgrounds, 0 corrupt: 100%|██████████| 9963/9963 [00:29<00:00, 333.02it/s]


[34m[1mtrain: [0mNew cache created: /content/dataset/output/labels.cache
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))


[34m[1mval: [0mScanning /content/dataset/output/labels.cache... 9963 images, 0 backgrounds, 0 corrupt: 100%|██████████| 9963/9963 [00:00<?, ?it/s]


Plotting labels to runs/detect/train/labels.jpg... 
[34m[1moptimizer:[0m AdamW(lr=0.01, momentum=0.937) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train[0m
Starting training for 20 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/20         0G      1.724      3.932      1.693        237        640:   6%|▌         | 18/312 [07:21<1:56:41, 23.81s/it]

In [None]:
import os
from PIL import Image, ImageDraw, ImageFont
import torch
from ultralytics import YOLO

# 加载模型
model = YOLO("runs/detect/train/weights/best.pt")  # 加载你训练好的模型

# 预测函数
def predict_and_visualize(image_path, save_path=None):
    # 加载图像
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)

    # 设置字体
    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except:
        font = ImageFont.load_default()

    # 进行预测
    results = model.predict(image_path, imgsz=640, device="cpu")  # 根据你的设备选择合适的设备

    # 绘制预测结果
    for box in results[0].boxes:
        # 获取坐标和类别信息
        x1, y1, x2, y2 = box.xyxy[0].tolist()
        cls_id = int(box.cls)
        conf = float(box.conf)

        # 绘制矩形框
        draw.rectangle([x1, y1, x2, y2], outline="red", width=2)

        # 添加标签文本
        label = f"{model.names[cls_id]} {conf:.2f}"
        text_bbox = font.getbbox(label)  # 使用 getbbox 获取文本的边界框
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        draw.rectangle([x1, y1 - text_height, x1 + text_width, y1], fill="red")
        draw.text((x1, y1 - text_height), label, fill="white", font=font)

    # 保存或显示图像
    if save_path:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)  # 确保输出目录存在
        image.save(save_path)
        print(f"预测结果已保存到 {save_path}")
    else:
        image.show()

# 递归处理文件夹中的所有图片
def process_folder(input_folder, output_folder):
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
                image_path = os.path.join(root, file)
                relative_path = os.path.relpath(image_path, input_folder)
                save_path = os.path.join(output_folder, relative_path)
                predict_and_visualize(image_path, save_path)

# 调用函数
input_folder = "dataset/output/images"  # 输入文件夹路径
output_folder = "predict"  # 输出文件夹路径
process_folder(input_folder, output_folder)