In [76]:
import cv2
import os
import numpy as np
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm
import datetime

def detect_orb_keypoints(image_path, nfeatures):
    """
    Extract ORB keypoints and descriptors from an image file.

    Parameters:
    - image_path: path of the image file; it is loaded as grayscale
    - nfeatures: feature count passed to cv2.ORB_create

    Returns:
    - keypoints: the detected keypoints (an empty list if the image cannot be read)
    - descriptors: the descriptors for those keypoints; None if the image
      cannot be read (callers must check for None before matching)
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # instead of raising; passing that on to detectAndCompute would crash.
        # Return the "nothing detected" shape that callers already handle.
        return [], None

    orb = cv2.ORB_create(nfeatures)
    kp, des = orb.detectAndCompute(img, None)
    return kp, des

def match_descriptors(des1, des2):
    """
    Match two descriptor sets with a brute-force matcher.

    The matcher is built with the Hamming norm and crossCheck=True.

    Parameters:
    - des1, des2: descriptor sets to match against each other

    Returns:
    - the result of BFMatcher.match(des1, des2)
    """
    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    return matcher.match(des1, des2)

def print_matches(matches):
    """
    Print match distances as a fixed-width grid and save the same grid to a log file.

    Each output row carries a zero-padded row number followed by up to ten
    distances, formatted with two decimals and right-aligned to a common width.
    The grid is also written to 'matches_log.txt' in the current working
    directory (the file is overwritten on every call).

    Parameters:
    - matches: sliceable sequence of objects exposing a numeric `distance`
      attribute; an empty sequence prints only the header

    Returns:
    - None
    """
    width = 120           # printed rows are left-justified to this many characters
    matches_per_row = 10  # number of distances shown on each row
    log_file_path = 'matches_log.txt'

    print("Matches:")

    formatted_rows = []
    total = len(matches)
    if total:
        # Ceiling division: number of rows needed to show every match.
        rows = (total - 1) // matches_per_row + 1
        # Pad row numbers to the width of the largest row number actually printed.
        line_width = len(str(rows))

        # Measure the column width with the same precision used when printing,
        # so the widest value fits exactly and every column lines up.
        max_distance = max(match.distance for match in matches)
        distance_width = len(f"{max_distance:.2f}")

        for i in range(rows):
            start_index = i * matches_per_row
            row_matches = matches[start_index:start_index + matches_per_row]
            formatted_distances = " ".join(
                f"{match.distance:>{distance_width}.2f}" for match in row_matches
            )
            line_number = f"[{str(i + 1).zfill(line_width)}]"
            formatted_rows.append(f"{line_number} {formatted_distances}")

    for row in formatted_rows:
        print(row.ljust(width))

    # Persist the same rows (without the right padding) for later inspection.
    with open(log_file_path, 'w', encoding='utf-8') as log_file:
        log_file.write("Matches (first N shown):\n")
        log_file.writelines(f"{row}\n" for row in formatted_rows)

def compare_images_by_orb_features(image1_path, image2_path, log_detail=False,
                                   nfeatures=500, num_threshold=0.7):
    """
    Decide whether two images are similar by comparing their ORB features.

    The images are reported as similar when the number of cross-checked
    descriptor matches is strictly greater than nfeatures * num_threshold.

    NOTE: the threshold is relative to the *requested* feature count
    (nfeatures), not to the number of keypoints actually detected, so an image
    pair that yields far fewer keypoints than nfeatures is never reported as
    similar.

    Parameters:
    - image1_path, image2_path: paths of the two image files to compare
    - log_detail: when True, print progress and diagnostic information
    - nfeatures: feature count requested from the ORB detector for each image
    - num_threshold: fraction of nfeatures that the match count must exceed

    Returns:
    - True if the images are judged similar; False otherwise, including when
      either image yields no descriptors or no matches are found
    """
    if log_detail:
        print(f"Comparing images by ORB features\nimg1: {image1_path}\nimg2: {image2_path}")

    kp1, des1 = detect_orb_keypoints(image1_path, nfeatures)
    kp2, des2 = detect_orb_keypoints(image2_path, nfeatures)
    if log_detail:
        print(f"KeyPoints Count: img1 -> {len(kp1)}, img2 -> {len(kp2)}")

    # Without descriptors on both sides there is nothing to match.
    if des1 is None or des2 is None:
        if log_detail:
            print("No keypoints detected")
        return False

    total_matches = len(match_descriptors(des1, des2))
    if log_detail:
        print(f"Total matches between img1 and img2: {total_matches}")

    if total_matches == 0:
        if log_detail:
            print("No matches found between img1 and img2")
        return False

    is_similar = total_matches > nfeatures * num_threshold
    if log_detail:
        print("img1 and img2 are similar." if is_similar else "img1 and img2 are different.")
    return is_similar

def remove_duplicates_by_orb(directory, remove_from_disk=False):
    """
    Group the images in a directory into ORB-similar duplicates and log the result.

    Files directly inside `directory` with a .jpg or .png suffix
    (case-insensitive) are visited in iteration order. Each image is compared
    against the images kept so far; the first image of every group is kept and
    later similar images are recorded as its duplicates. A timestamped log file
    is written to the current working directory.

    Parameters:
    - directory: directory containing the image files
    - remove_from_disk: when True, delete the files recorded as duplicates

    Returns:
    - unique_images: list of Path objects for the images that were kept
    """
    directory_path = Path(directory)
    total_files = sum(1 for item in directory_path.iterdir() if item.is_file())
    print(f"开始处理图像去重，总文件数：{total_files}")

    # The log contains non-ASCII text, so the encoding is fixed explicitly
    # instead of relying on the platform default.
    log_file_path = os.path.join(os.getcwd(), f"log_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.txt")
    with open(log_file_path, "w", encoding="utf-8") as f:
        f.write("开始处理图像去重...\n")

    unique_images = []
    duplicates = defaultdict(list)  # kept image path (str) -> list of duplicate Paths
    for img_path in tqdm(directory_path.glob("*"), total=total_files, desc='Processing'):
        if img_path.suffix.lower() not in (".jpg", ".png"):
            continue

        is_duplicate = False
        for unique_path in unique_images:
            # compare_images_by_orb_features returns a plain bool; it must not
            # be subscripted.
            if compare_images_by_orb_features(str(img_path), str(unique_path)):
                duplicates[str(unique_path)].append(img_path)
                is_duplicate = True
                break

        if not is_duplicate:
            unique_images.append(img_path)

    # Write the per-group report to the log file and echo it to the console.
    group_count = 1
    with open(log_file_path, "a", encoding="utf-8") as f:
        for unique_path, dup_group in duplicates.items():
            log_message = f"第 {group_count} 组重复图片，共 {len(dup_group) + 1} 张\n"
            print(log_message[:-1])  # strip the trailing newline for console output
            f.write(log_message)

            # NOTE(review): the message below labels the kept image as the
            # largest one, but no size comparison is performed here; the kept
            # image is simply the first one encountered in its group.
            w, h = get_image_dimensions(unique_path)
            log_message = f"{unique_path} w:{w} h:{h} - 保留（尺寸最大）\n"
            print(log_message[:-1])
            f.write(log_message)

            for duplicate in dup_group:
                w, h = get_image_dimensions(duplicate)
                log_message = f"{duplicate} w:{w} h:{h} - 标记删除\n"
                print(log_message[:-1])
                f.write(log_message)
            group_count += 1

        f.write(f"总重复组数：{group_count - 1}\n")
        print(f"总重复组数：{group_count - 1}")

    print(f"去重后数量: {len(unique_images)}, 总重复文件数: {sum(len(v) for v in duplicates.values())}")
    print(f"详细信息见 {log_file_path}")

    # Optionally delete the duplicates; a failure on one file is reported and
    # does not stop the remaining deletions.
    if remove_from_disk:
        for duplicates_list in duplicates.values():
            for duplicate in duplicates_list:
                try:
                    os.remove(str(duplicate))
                    print(f"已删除重复文件: {duplicate}")
                except Exception as e:
                    print(f"删除文件 {duplicate} 出错: {e}")

    return unique_images

def get_image_dimensions(image_path):
    """
    Return the width and height of an image file.

    Parameters:
    - image_path: path of the image file (str or path-like; converted with str())

    Returns:
    - (width, height) of the loaded image

    Raises:
    - OSError: if the image cannot be read
    """
    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread returns None rather than raising when the file is missing
        # or cannot be decoded; fail with a message that names the file.
        raise OSError(f"Unable to read image: {image_path}")
    # Only the first two shape entries are needed, so this does not depend on
    # how many channels the loaded array has.
    height, width = img.shape[:2]
    return width, height

In [77]:
# Compare two sample images with detailed logging enabled (third argument).
# NOTE: despite its name, detail_info receives the function's bool result
# (True when the images are judged similar), not a dictionary of details.
detail_info = compare_images_by_orb_features(
    "/Volumes/192.168.1.173/pic/test/hh.jpg",
    "/Volumes/192.168.1.173/pic/test/h.jpg",
    True)

Comparing images by ORB features
img1: /Volumes/192.168.1.173/pic/test/hh.jpg
img2: /Volumes/192.168.1.173/pic/test/h.jpg
KeyPoints Count: img1 -> 500, img2 -> 500
Total matches between img1 and img2: 370
img1 and img2 are similar.


In [78]:
# 调用函数并接收返回的去重后文件列表
# unique_images_list = remove_duplicates_by_orb("F:\\pic\\test", False)