In [1]:
"""
下载 PASCAL VOC 2007 数据集并转换为 YOLO 格式,生成 voc.yaml 文件
PASCAL VOC 2007 数据集是计算机视觉领域中一个著名的标准数据集，主要用于目标检测、图像分类和语义分割等任务。
该数据集包含 9963 张图片，分为训练集（5011 张）和测试集（4952 张），
涵盖 20 个类别，如飞机、自行车、鸟、船、瓶子、公共汽车、汽车、猫、椅子、牛、餐桌、狗、马、摩托车、人、盆栽、羊、沙发、火车和电视显示器。
其标注信息以 XML 格式存储，包含目标的边界框、类别标签等。该数据集是许多经典计算机视觉模型的训练和评估基准。
"""
import os
import requests
import tarfile
from lxml import etree
import shutil

# ----------------------
# 配置参数
# ----------------------
VOC_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar"
VOC_TEST_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar"
DATA_DIR = "./dataset/"

# ----------------------
# 下载并解压数据集
# ----------------------
def download_and_extract(url, dest_dir):
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir, exist_ok=True)

    filename = os.path.join(dest_dir, url.split("/")[-1])

    # 下载文件
    if not os.path.exists(filename):
        print(f"Downloading {url}...")
        response = requests.get(url, stream=True)
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)

    # 解压文件
    print(f"Extracting {filename}...")
    with tarfile.open(filename) as tar:
        tar.extractall(path=dest_dir)



# 转换VOC格式到YOLO格式
# ----------------------
def convert_voc_to_yolo(voc_dir, output_dir):
    print(f"转换VOC格式到YOLO格式: {output_dir}")

    classes = [
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
    ]

    # 创建训练和测试子目录
    for split in ["train", "test"]:
        os.makedirs(os.path.join(output_dir, f"images/{split}"), exist_ok=True)
        os.makedirs(os.path.join(output_dir, f"labels/{split}"), exist_ok=True)

    # 对 trainval 和 test 分别处理
    for split in ["trainval", "test"]:
        split_type = "train" if split == "trainval" else "test"
        list_path = os.path.join(voc_dir, f"VOCdevkit/VOC2007/ImageSets/Main/{split}.txt")
        if not os.path.exists(list_path):
            print(f"划分文件不存在: {list_path}")
            continue

        with open(list_path) as f:
            ids = [line.strip() for line in f.readlines()]
        print(f"开始处理 {split_type} 集，共 {len(ids)} 张图片")

        for img_id in ids:
            ann_path = os.path.join(voc_dir, f"VOCdevkit/VOC2007/Annotations/{img_id}.xml")
            if not os.path.exists(ann_path):
                print(f"标注文件不存在: {ann_path}")
                continue

            tree = etree.parse(ann_path)
            root = tree.getroot()

            size = root.find("size")
            width = int(size.find("width").text)
            height = int(size.find("height").text)

            yolo_ann = []
            for obj in root.iter("object"):
                cls = obj.find("name").text
                if cls not in classes:
                    continue
                cls_id = classes.index(cls)

                bbox = obj.find("bndbox")
                xmin = float(bbox.find("xmin").text)
                ymin = float(bbox.find("ymin").text)
                xmax = float(bbox.find("xmax").text)
                ymax = float(bbox.find("ymax").text)

                x_center = (xmin + xmax) / 2 / width
                y_center = (ymin + ymax) / 2 / height
                w = (xmax - xmin) / width
                h = (ymax - ymin) / height

                yolo_ann.append(f"{cls_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}")

            # 保存 YOLO 标签
            label_out_path = os.path.join(output_dir, f"labels/{split_type}/{img_id}.txt")
            with open(label_out_path, "w") as f:
                f.write("\n".join(yolo_ann))

            # 拷贝图像
            src = os.path.join(voc_dir, f"VOCdevkit/VOC2007/JPEGImages/{img_id}.jpg")
            dst = os.path.join(output_dir, f"images/{split_type}/{img_id}.jpg")
            if not os.path.exists(dst):
                try:
                    shutil.copy2(src, dst)
                    # os.symlink(src, dst)  # 尝试创建符号链接
                except:
                    print(f"复制图像失败: {src}")

    # 创建 voc.yaml 文件
    with open("voc.yaml", "w") as f:
        names_str = '\n'.join([f"  {i}: {name}" for i, name in enumerate(classes)])
        f.write(
f"""path: {os.path.abspath(output_dir)}
train: images/train
val: images/train
test: images/test
names:
{names_str}
"""
        )
    print("✅ VOC 转 YOLO 完成，生成 voc.yaml")


# # 下载并解压训练集和测试集
download_and_extract(VOC_URL, DATA_DIR)
download_and_extract(VOC_TEST_URL, DATA_DIR)
# # 执行格式转换 voc->yolo
convert_voc_to_yolo(DATA_DIR, DATA_DIR+"/output")

Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar...
Extracting ./dataset/VOCtrainval_06-Nov-2007.tar...
Downloading http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar...
Extracting ./dataset/VOCtest_06-Nov-2007.tar...
转换VOC格式到YOLO格式: ./dataset//output
开始处理 train 集，共 5011 张图片
开始处理 test 集，共 4952 张图片
✅ VOC 转 YOLO 完成，生成 voc.yaml


In [None]:
"""
基于YOLO V8的PASCAL VOC 2007的目标检测任务
"""
# !pip install ultralytics
# 使用预训练模型
from ultralytics import YOLO
from PIL import Image, ImageDraw, ImageFont
device = 'cuda'  # 使用GPU训练,可选cuda或cpu

!nvidia-smi
# 加载预训练模型
model = YOLO("baseModel/yolov8n.pt")  # 使用预训练模型
print("模型加载完成") if model else print("模型加载失败")
# 训练模型
# 训练参数配置
model.train(
    data='voc.yaml',
    epochs=100,  # 训练轮数
    batch=64,   # 批处理大小
    imgsz=800,  # 输入图像大小
    device=0 if device == "cuda" else "cpu",
    optimizer='AdamW',  # 优化器
    lr0=0.0001, # 初始学习率,可选0.001、0.01、0.1等
    lrf=0.005,
    warmup_epochs=3,   # 新增学习率预热
    weight_decay=0.05,           # 添加权重衰减防止过拟合

    augment=True,
    hsv_h=0.3,                   # 增强色调扰动
    hsv_s=0.6,                   # 增强饱和度扰动
    translate=0.2,               # 增大平移幅度
    scale=0.5,                   # 扩大缩放范围
    shear=0.3,                   # 增大剪切幅度
    mosaic=1.0,                  # 全程开启mosaic
    close_mosaic=15,             # 最后15个epoch关闭mosaic稳定训练
    # 损失函数调整
    cls=2.5,                     # 增大分类损失权重
    box=1.5,                     # 增大框回归损失权重
    dfl=1.5,                     # 增大点框损失权重
    mixup=0.2,       # 新增MixUp增强（默认未启用）

    patience=15,                  # 延长早停观察期
    dropout=0.3,                 # 添加Dropout正则化
    amp=True,                    # 保持混合精度训练
    pretrained=True,
    save=True,
    exist_ok=True,
)

Tue Apr 29 12:34:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   72C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

[34m[1mtrain: [0mScanning /content/dataset/output/labels/train.cache... 5011 images, 0 backgrounds, 0 corrupt: 100%|██████████| 5011/5011 [00:00<?, ?it/s]


[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1432.2±588.4 MB/s, size: 73.4 KB)


[34m[1mval: [0mScanning /content/dataset/output/labels/train.cache... 5011 images, 0 backgrounds, 0 corrupt: 100%|██████████| 5011/5011 [00:00<?, ?it/s]


Plotting labels to runs/detect/train/labels.jpg... 
[34m[1moptimizer:[0m AdamW(lr=0.0001, momentum=0.937) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.05), 63 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 2 dataloader workers
Logging results to [1mruns/detect/train[0m
Starting training for 100 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      1/100      8.07G     0.2537      19.65       1.39        362        640:  22%|██▏       | 17/79 [00:19<01:14,  1.20s/it]

In [None]:
# 预测输出（使用OpenCV统一绘图）
import os
import cv2
import torch
from ultralytics import YOLO
import matplotlib.pyplot as plt

# ------------ 全局配置 ------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_PATH = "runs/detect/train/weights/best.pt"
model = YOLO(MODEL_PATH)
# INPUT_PATH = "dataset/output/video/test.mp4"  # 输入路径，可以是图片/视频/文件夹/摄像头
# INPUT_PATH = "dataset/output/test/"
INPUT_PATH = 0
SAVE = False  # 是否保存预测结果
OUTPUT_PATH = "predict/"  # 预测结果保存路径

# ------------ 工具函数 ------------

# 使用OpenCV绘制识别框和文字
# 使用 OpenCV 绘制检测结果（根据类别动态换颜色）
def draw_boxes_cv2(image, results):
    for box in results[0].boxes:
        # 1. 取出检测框的坐标，并四舍五入为整数
        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
        cls_id = int(box.cls)   # 获取类别ID
        conf = float(box.conf)  # 获取置信度
        label_name = model.names[cls_id]  # 获取类别名称
        label = f"{label_name} {conf:.2f}"  # 生成显示标签文本

        # 2. 决定绘制颜色
        # if label_name == "with_mask":
        #     color = (0, 255, 0)      # 绿色 (BGR)
        # elif label_name == "without_mask":
        #     color = (0, 0, 255)      # 红色 (BGR)
        # else:
        #     color = (0, 165, 255)    # 橙色 (BGR)
        color = (0, 0, 255)      # 红色 (BGR)

        # 3. 绘制矩形框（框颜色根据类别变化）
        cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)

        # 4. 绘制文字背景矩形（填充背景色）
        (text_w, text_h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
        cv2.rectangle(image, (x1, y1 - text_h - 4), (x1 + text_w, y1), color, -1)

        # 5. 绘制白色文字（始终为白色）
        cv2.putText(image, label, (x1, y1 - 4), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

    return image


# 保存图片
def save_image_cv2(image, save_path, origin_path=None):
    if os.path.isdir(save_path):
        filename = os.path.basename(origin_path)
        save_path = os.path.join(save_path, filename)
    else:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
    cv2.imwrite(save_path, image)
    print(f"✅ 已保存图片: {save_path}")

# ------------ 单张图片预测 ------------
def predict_image(image_path, save=False, save_path=None):
    image = cv2.imread(image_path)
    results = model.predict(image_path, imgsz=640, device=DEVICE)
    image = draw_boxes_cv2(image, results)

    # 显示结果
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(image_rgb)
    plt.axis("off")
    plt.title("预测结果")
    plt.show()

    # 保存结果
    if save and save_path:
        save_image_cv2(image, save_path, origin_path=image_path)

# ------------ 视频文件预测 ------------
def predict_video(video_path, save=False, save_path=None):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("❌ 视频文件无法打开")
        return

    if save:
        if os.path.isdir(save_path):
            filename = os.path.basename(video_path)
            save_path = os.path.join(save_path, f"{os.path.splitext(filename)[0]}.mp4")
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        fps, w, h = cap.get(5), int(cap.get(3)), int(cap.get(4))
        out = cv2.VideoWriter(save_path, fourcc, fps, (w, h))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        results = model.predict(frame, imgsz=640, device=DEVICE,verbose=False)
        frame = draw_boxes_cv2(frame, results)

        cv2.imshow("预测中 - 按 Q 退出", frame)
        if save:
            out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    if save:
        out.release()
        print(f"✅ 已保存视频: {save_path}")
    cv2.destroyAllWindows()

# ------------ 文件夹批量图片预测 ------------
# 文件夹批量预测 + 实时进度条
def predict_folder(folder_path, save=False, output_dir=None):
    # 获取全部图片文件列表
    all_files = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".tiff")):
                all_files.append(os.path.join(root, file))

    total_files = len(all_files)  # 总文件数
    if total_files == 0:
        print("❌ 文件夹中没有找到图片")
        return

    # 遍历每一张图片进行预测
    for idx, img_path in enumerate(all_files, start=1):
        image = cv2.imread(img_path)
        results = model.predict(img_path, imgsz=640, device=DEVICE,verbose=False)
        image = draw_boxes_cv2(image, results)

        if save and output_dir:
            rel_path = os.path.relpath(img_path, folder_path)
            save_path = os.path.join(output_dir, rel_path)
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            cv2.imwrite(save_path, image)

        # --- 绘制进度条 ---
        progress = idx / total_files
        bar_len = 30  # 进度条长度（字符数）
        filled_len = int(bar_len * progress)
        bar = "█" * filled_len + "-" * (bar_len - filled_len)
        print(f"\r🔄 预测进度: [{bar}] {progress*100:.1f}% ({idx}/{total_files})", end="")

    # 最后换行
    print()

    if save:
        print(f"✅ 文件夹预测完成，结果已保存至: {output_dir}")


# ------------ 摄像头实时预测并可保存录像 ------------
def predict_camera(index=0, save=False, output_dir="predict/"):
    cap = cv2.VideoCapture(index)
    if not cap.isOpened():
        print(f"❌ 无法打开摄像头 {index}")
        return

    # 如果需要保存视频，初始化 VideoWriter
    if save:
        import datetime
        now = datetime.datetime.now()
        timestamp = now.strftime("%Y%m%d%H%M%S")  # 获取当前时间：年月日时分秒，纯数字
        os.makedirs(output_dir, exist_ok=True)   # 确保输出目录存在
        save_path = os.path.join(output_dir, f"camera_{index}_{timestamp}.mp4")

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps == 0 or fps is None:  # 有些摄像头可能取不到帧率
            fps = 30  # 默认设为30帧
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        out = cv2.VideoWriter(save_path, fourcc, fps, (width, height))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        results = model.predict(frame, imgsz=640, device=DEVICE)
        frame = draw_boxes_cv2(frame, results)

        # 显示预测画面
        cv2.imshow("摄像头预测 - 按 Q 退出", frame)

        # 保存预测画面
        if save:
            out.write(frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cap.release()
    if save:
        out.release()
        print(f"✅ 摄像头视频已保存到: {save_path}")
    cv2.destroyAllWindows()


# ------------ 总入口函数 ------------
def run_predict(path, save=False, save_path=None):
    if isinstance(path, int):
        predict_camera(index=path,save=save,output_dir=save_path)
    elif os.path.isfile(path):
        ext = os.path.splitext(path)[1].lower()
        if ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
            predict_image(path, save, save_path)
        elif ext in [".mp4", ".avi", ".mov", ".mkv"]:
            predict_video(path, save, save_path)
    elif os.path.isdir(path):
        predict_folder(path, save, save_path)
    else:
        print("❌ 无效路径，请确认输入正确的图片/视频/文件夹/摄像头编号")

# ------------ 示例调用 ------------
run_predict(INPUT_PATH, SAVE, OUTPUT_PATH)



0: 480x640 1 bird, 3 persons, 20.8ms
Speed: 28.6ms preprocess, 20.8ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 18.4ms
Speed: 3.1ms preprocess, 18.4ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 birds, 3 persons, 17.7ms
Speed: 2.5ms preprocess, 17.7ms inference, 1.7ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 bird, 3 persons, 17.8ms
Speed: 2.7ms preprocess, 17.8ms inference, 2.2ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 birds, 2 persons, 18.6ms
Speed: 3.4ms preprocess, 18.6ms inference, 3.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 bird, 1 person, 19.8ms
Speed: 3.3ms preprocess, 19.8ms inference, 3.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 birds, 2 persons, 17.8ms
Speed: 2.2ms preprocess, 17.8ms inference, 2.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 bird, 3 persons, 17.7ms
Speed: 2.0ms preproces