In [1]:
import os

In [2]:
# Root directory under which this notebook stores the COCO annotations and images.
cocoDir = "coco"

In [3]:
# Directory that is expected to contain the COCO annotation JSON files.
cocoAnnotationsDir = f"{cocoDir}/annotations"

# Download and unpack the annotations only when they are not present yet.
if not os.path.exists(cocoAnnotationsDir):
    import requests
    import zipfile

    url = 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip'
    # Local path the archive is saved to before extraction.
    filename = 'annotations_trainval2017.zip'

    downloaded = False
    try:
        # Stream the archive to disk in chunks (the server does not support
        # ranged downloads, so this can take a while). The context manager
        # releases the connection; the timeout is (connect, per-read) seconds,
        # not a limit on the total download time.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            if response.status_code == 200:
                with open(filename, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                downloaded = True
                print(f"文件下载完成，保存为：{filename}")
            else:
                print(f"文件下载失败，状态码：{response.status_code}")

        if downloaded:
            # The official archive keeps its files under a top-level
            # `annotations/` folder, so extracting into cocoDir makes them land
            # in cocoAnnotationsDir. Extracting into cocoAnnotationsDir itself
            # would nest them one level too deep. The target directory is not
            # created up front, so a failed extraction does not make the next
            # run believe the annotations are already there.
            os.makedirs(cocoDir, exist_ok=True)
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall(cocoDir)
            print("文件解压缩完成。")
    finally:
        # Always remove the (possibly partial) archive, even after a failure.
        if os.path.exists(filename):
            os.remove(filename)
            print("压缩文件已删除。")

0000
1000
2000


NameError: name 'filename' is not defined

In [None]:
!pip install pycocotools

In [2]:
from pycocotools.coco import COCO

In [None]:
# Set up the COCO API on the instance-annotation file of the selected split.
dataType = "train2017"
annFile = f"{cocoAnnotationsDir}/instances_{dataType}.json"
coco = COCO(annFile)

In [None]:
import numpy as np

In [6]:
# According to the documentation, 500-1000 images is a good amount for quantization.
imagesPerCategory = 10

# Fixed seed so that a fresh "Restart & Run All" selects the same images.
samplingSeed = 0
rng = np.random.default_rng(samplingSeed)

# A set is used because one image can belong to several categories.
sampledIds = set()

for catId in coco.getCatIds():
    catimgIds = coco.getImgIds(catIds=catId)

    # Shuffle in place, then keep at most imagesPerCategory ids for this category.
    rng.shuffle(catimgIds)
    sampledIds.update(catimgIds[:imagesPerCategory])

# Sort the de-duplicated ids so the order of `imgs` is deterministic as well.
imgIds = sorted(sampledIds)

# Load the image records (file name, URL, id, ...) for the selected ids.
imgs = coco.loadImgs(imgIds)

In [None]:
import shutil

In [13]:
# Directory the sampled images are downloaded into.
# NOTE(review): the original used `dataDir`, which is not defined anywhere in
# the visible notebook; `cocoDir` is assumed to be the intended root - confirm.
downloadDir = f"{cocoDir}/download/{dataType}"

# Start from an empty directory: drop any previous download, then always
# (re)create it. The original only created the directory when it did not
# exist, so after removing an existing one no directory was left to write to.
if os.path.isdir(downloadDir):
    shutil.rmtree(downloadDir)
os.makedirs(downloadDir)

In [None]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import time

In [None]:
# Retry policy for the image downloads.
retries = Retry(
    total=5,  # maximum number of retries per request
    backoff_factor=1,  # exponential back-off factor between attempts
    status_forcelist=[500, 502, 503, 504],  # status codes that trigger a retry
    allowed_methods=frozenset(['GET']),  # HTTP methods that may be retried
)

# HTTP adapter that applies the retry policy.
adapter = HTTPAdapter(max_retries=retries)

imgsLen = len(imgs)

# Make sure the target directory exists before any file is written into it.
os.makedirs(downloadDir, exist_ok=True)

# The context manager closes the session (and its pooled connections) once
# the loop is done or if it is interrupted.
with requests.Session() as session:
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    # Download every selected image that is not on disk yet.
    for i, img in enumerate(imgs):
        tic = time.time()
        fname = os.path.join(downloadDir, img['file_name'])
        if os.path.exists(fname):
            print(f"{fname} already exists")
            continue

        try:
            # Fetch the image with a timeout so a stalled request cannot hang the loop.
            response = session.get(img['coco_url'], timeout=5)

            # Only write the file when the request succeeded.
            if response.status_code == 200:
                with open(fname, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded {i+1}/{imgsLen} {img['file_name']} in {time.time() - tic:.3f}s")
            else:
                print(f"Failed to retrieve {i+1}/{imgsLen} {img['coco_url']}. Status code: {response.status_code}")
        except requests.exceptions.Timeout as e:
            print(f"Request timed out: {e}")
        except requests.exceptions.RequestException as e:
            # Any other request error is reported and the loop moves on.
            print(f"Request failed: {e}")

In [26]:
# Collect the path of every file found (recursively) under downloadDir.
# The paths are built from the roots os.walk yields for downloadDir.
file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(downloadDir)
    for name in names
]

In [None]:
# Directory that receives the preprocessed images.
preprocessedDir = "coco/preprocessed"

# exist_ok=True makes the cell safe to re-run without a separate
# check-then-create step.
os.makedirs(preprocessedDir, exist_ok=True)

In [15]:
import cv2
from letter_box import LetterBox

In [16]:
# Instantiate the project-local LetterBox helper with no arguments; the
# instance is called on each image in the preprocessing loop.
letterBox = LetterBox()

In [19]:
# Preprocess every downloaded image with letterBox and save the result under
# the same file name in preprocessedDir.
for file_path in file_paths:
    img = cv2.imread(file_path)
    # cv2.imread returns None instead of raising when the file is missing or
    # cannot be decoded (e.g. a truncated download); skip such files.
    if img is None:
        print(f"Skipping unreadable image: {file_path}")
        continue

    img = letterBox(img)

    out_path = f'{preprocessedDir}/{os.path.basename(file_path)}'
    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(out_path, img):
        print(f"Failed to write preprocessed image: {out_path}")

In [28]:
# Generate the dataset declaration file: one "<image path> <image id>" line
# per selected image.
dataset_txt_path = "dataset.txt"

# Mode 'w' truncates an existing file, so it does not have to be removed first.
with open(dataset_txt_path, 'w') as dataset_file:
    for img in imgs:
        # NOTE(review): the original used `dataDir`, which is not defined in
        # the visible notebook; `cocoDir` is assumed to be the intended root.
        # The path also points at <root>/<dataType>/, not at the download or
        # preprocessed directories this notebook writes to - confirm.
        dataset_file.write(f"./{cocoDir}/{dataType}/{img['file_name']} {img['id']}\n")