In [7]:
# tools/check_dataset.py
import json
import os
from pathlib import Path

def check_dataset_annotations(json_path, image_dir):
    """Check that an annotation file and an image directory agree with each other.

    The annotation file is expected to be COCO-style JSON: only its ``images``
    list is read, and each entry must carry a ``file_name`` key. A short report
    is printed and the two discrepancy lists are returned.

    Args:
        json_path: Path to the annotation JSON file (read as UTF-8).
        image_dir: Directory holding the images. Only regular files directly
            inside it are considered; sub-directories are ignored so they are
            not reported as unlabeled "images".

    Returns:
        A ``(missing_files, unlabeled_files)`` tuple of lists of file names:
        ``missing_files`` are referenced by the annotations but absent from
        ``image_dir`` (in annotation order); ``unlabeled_files`` are present in
        ``image_dir`` but not referenced by the annotations (sorted by name).

    Raises:
        KeyError: If the JSON has no ``images`` list or an entry lacks
            ``file_name``.
        OSError: If the annotation file or the image directory cannot be read.
    """
    # JSON is UTF-8 by specification; the platform default encoding (notably on
    # Windows) may differ and would break on non-ASCII file names.
    with open(json_path, 'r', encoding='utf-8') as f:
        anno_data = json.load(f)

    images = anno_data['images']

    # Collect the names of regular files only; a plain listdir() would also
    # return sub-directories and inflate the "actual image files" count.
    with os.scandir(image_dir) as entries:
        image_files = {entry.name for entry in entries if entry.is_file()}

    # Images referenced by the annotations that are not on disk.
    missing_files = [
        image_info['file_name']
        for image_info in images
        if image_info['file_name'] not in image_files
    ]

    # Files on disk that no annotation entry references. Sorted so that the
    # report and the return value are reproducible (set order is not).
    annotated_files = {image_info['file_name'] for image_info in images}
    unlabeled_files = sorted(image_files - annotated_files)

    print(f"标注文件: {json_path}")
    print(f"图像目录: {image_dir}")
    print(f"标注文件中的图像数量: {len(images)}")
    print(f"实际图像文件数量: {len(image_files)}")
    print("\n缺失的文件:")
    for f in missing_files:
        print(f"  - {f}")
    print("\n未标注的文件:")
    for f in unlabeled_files:
        print(f"  - {f}")

    return missing_files, unlabeled_files

# Usage example: compare the fixed validation annotations with the image folder.
json_path = "E:/SUAL/SUAL/data/bamberg_coco1024/active_learning/annotations/instances_labeled_val_fixed.json"
image_dir = "E:/SUAL/SUAL/data/bamberg_coco1024/active_learning/images_labeled_val"
missing_files, unlabeled_files = check_dataset_annotations(
    json_path=json_path,
    image_dir=image_dir,
)

标注文件: E:/SUAL/SUAL/data/bamberg_coco1024/active_learning/annotations/instances_labeled_val_fixed.json
图像目录: E:/SUAL/SUAL/data/bamberg_coco1024/active_learning/images_labeled_val
标注文件中的图像数量: 13
实际图像文件数量: 13

缺失的文件:

未标注的文件:


In [6]:
def fix_annotation_file(json_path, image_dir, output_path=None):
    """Write a copy of an annotation file restricted to images that exist on disk.

    Entries of the ``images`` list whose ``file_name`` is not a regular file in
    ``image_dir`` are dropped, together with every entry of ``annotations``
    whose ``image_id`` refers to a dropped image. All other top-level keys of
    the JSON are written back unchanged. A short summary is printed.

    Args:
        json_path: Path (``str`` or ``os.PathLike``) of the annotation JSON
            file to read.
        image_dir: Directory holding the images; only regular files directly
            inside it count as existing.
        output_path: Where to write the filtered JSON. Defaults to
            ``json_path`` with ``_fixed`` inserted before the file extension,
            e.g. ``anno.json`` -> ``anno_fixed.json``.

    Returns:
        None.

    Raises:
        KeyError: If ``images``/``annotations`` or the per-entry keys
            ``file_name``, ``id`` or ``image_id`` are missing.
        OSError: If a file or the image directory cannot be read or written.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        anno_data = json.load(f)

    # Names of regular files only: a directory that happens to share a name
    # with an annotated image must not keep that image "valid".
    with os.scandir(image_dir) as entries:
        existing_files = {entry.name for entry in entries if entry.is_file()}

    # Keep the image records that are backed by a file, preserving their order.
    valid_images = [
        img for img in anno_data['images']
        if img['file_name'] in existing_files
    ]
    valid_image_ids = {img['id'] for img in valid_images}

    # Keep only annotations that still point at a surviving image.
    valid_annotations = [
        anno for anno in anno_data['annotations']
        if anno['image_id'] in valid_image_ids
    ]

    anno_data['images'] = valid_images
    anno_data['annotations'] = valid_annotations

    if output_path is None:
        # Insert the suffix before the extension of the file name only.
        # str.replace('.json', ...) would also rewrite '.json' inside directory
        # names, would return the input path unchanged (overwriting the source
        # file) when no '.json' is present, and does not work for Path objects.
        root, ext = os.path.splitext(os.fspath(json_path))
        output_path = f"{root}_fixed{ext}"

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(anno_data, f)

    print(f"修复后的标注文件已保存到: {output_path}")
    print(f"有效图像数量: {len(valid_images)}")
    print(f"有效标注数量: {len(valid_annotations)}")

# Usage example.
# NOTE(review): in document order `json_path` already names the *_fixed.json
# file, yet the output recorded for this cell shows *_fixed.json being written,
# so the input was presumably the un-fixed annotation file. The input is spelled
# out here so the cell does not depend on stale kernel state (re-running it with
# `json_path` would produce *_fixed_fixed.json). Confirm the file name.
raw_json_path = "E:/SUAL/SUAL/data/bamberg_coco1024/active_learning/annotations/instances_labeled_val.json"
fix_annotation_file(raw_json_path, image_dir)

修复后的标注文件已保存到: E:/SUAL/SUAL/data/bamberg_coco1024/active_learning/annotations/instances_labeled_val_fixed.json
有效图像数量: 13
有效标注数量: 167
