In [29]:
from PIL import Image
import numpy as np
import cv2
import os
from pathlib import Path
from imagehash import phash
from itertools import combinations
from typing import List, Set, Tuple
from termcolor import colored
import datetime
from tqdm import tqdm

class ImageDescriptor:
    """Outcome of a de-duplication pass over a collection of image paths.

    The two collections are stored exactly as passed in (no copy is made):

    * ``unique_images``  -- paths that were not grouped with any other image.
    * ``similar_groups`` -- lists of paths that a detector reported as similar.
    """

    def __init__(self, unique_images: Set[Path], similar_groups: List[List[Path]]):
        self.unique_images = unique_images
        self.similar_groups = similar_groups

    def serialize(self, filepath: str):
        """Save the description as a plain-text report.

        Only the file name (not the full path) of each image is written.
        Layout: a ``Unique Images:`` section with one name per line, then a
        ``Similar Groups:`` section in which every group starts with a
        ``Group:`` line and ends with a blank line.

        Args:
            filepath: Destination of the report; an existing file is overwritten.

        Raises:
            OSError: If the file cannot be opened or written.
        """
        tqdm.write(colored(f"Saving description to {filepath}", "white"))
        # Explicit UTF-8: image names may contain non-ASCII characters, which
        # the platform default encoding is not guaranteed to be able to encode.
        with open(filepath, 'w', encoding="utf-8") as file:
            file.write("Unique Images:\n")
            for image in self.unique_images:
                file.write(f"{image.name}\n")
            file.write("\nSimilar Groups:\n")
            for group in self.similar_groups:
                file.write("Group:\n")
                for image in group:
                    file.write(f"{image.name}\n")
                file.write("\n")

class HashDetector:
    """Groups images whose perceptual hashes are identical."""

    def __init__(self, precision: int):
        # Side length (in pixels) each image is shrunk to before hashing.
        self.precision = precision

    def detect(self, images: List[Path]) -> ImageDescriptor:
        """Bucket ``images`` by perceptual hash.

        Every image is opened, converted to greyscale, resized to
        ``precision`` x ``precision`` and hashed with ``phash``. Images that
        end up with the same hash form one similar group; an image that is
        alone in its bucket is reported as unique. Images that fail to load
        or hash are reported in red and left out of the result entirely.

        Args:
            images: Paths of the images to compare.

        Returns:
            An ``ImageDescriptor`` with the unique images and the groups of
            equal-hash images.
        """
        side = self.precision
        buckets = {}

        # Progress bar over the hashing pass.
        for path in tqdm(images, desc="Hashing images"):
            try:
                with Image.open(path) as picture:
                    # NOTE(review): the image is shrunk *before* phash is
                    # applied -- confirm this pre-shrink is the intended way
                    # to control hash coarseness.
                    shrunk = picture.convert("L").resize((side, side))
                    digest = phash(shrunk)
                    buckets.setdefault(digest, []).append(path)
            except Exception as e:
                # Best effort: one unreadable image must not abort the run.
                tqdm.write(colored(f"Error processing {path}: {e}", "red"))

        tqdm.write(colored(f"Found {len(buckets)} unique hashes", "white"))
        singles = set()
        groups = []

        # Progress bar over the grouping pass.
        for members in tqdm(buckets.values(), desc="Grouping images"):
            if len(members) > 1:
                groups.append(members)
            else:
                singles.add(members[0])

        tqdm.write(colored(f"Found {len(singles)} unique images, {len(groups)} similar groups", "white"))
        return ImageDescriptor(singles, groups)

class ORBDetector:
    """Finds similar image pairs by brute-force matching ORB descriptors."""

    def __init__(self, nfeatures: int, threshold: float):
        # Feature count handed to cv2.ORB_create for every image.
        self.nfeatures = nfeatures
        # Fraction of ``nfeatures`` that the number of matches must exceed
        # for two images to be reported as similar.
        self.threshold = threshold

    def detect(self, images: List[Path]) -> ImageDescriptor:
        """Compare every pair of ``images`` and report the similar ones.

        Each similar pair is appended as its own two-element group, so an
        image that matches several others appears in several groups. An image
        stays in ``unique_images`` unless it takes part in at least one
        similar pair; images without descriptors are never paired.

        Args:
            images: Paths of the images to compare pairwise.

        Returns:
            An ``ImageDescriptor`` holding the unmatched images and the
            similar pairs.
        """
        features = {img: self._extract_features(img) for img in images}
        similar_groups = []
        unique_images = set(images)
        # Loop-invariant: number of matches a pair has to exceed.
        min_matches = self.nfeatures * self.threshold

        # combinations() is lazy, so the pair list is never materialised.
        for img1, img2 in combinations(images, 2):
            _, des1 = features[img1]
            _, des2 = features[img2]
            if des1 is None or des2 is None:
                continue
            if self._match_features(des1, des2) > min_matches:
                similar_groups.append([img1, img2])
                unique_images.discard(img1)
                unique_images.discard(img2)

        return ImageDescriptor(unique_images, similar_groups)

    def _extract_features(self, image_path: Path):
        """Return ``(keypoints, descriptors)`` for the image at ``image_path``.

        Returns ``(None, None)`` when the image cannot be read, which makes
        ``detect`` skip it instead of passing an empty image on to OpenCV.
        """
        img = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals an unreadable file with None, not an exception.
            tqdm.write(colored(f"Error processing {image_path}: cannot read image", "red"))
            return None, None
        orb = cv2.ORB_create(self.nfeatures)
        return orb.detectAndCompute(img, None)

    def _match_features(self, des1, des2):
        """Return the number of cross-checked Hamming matches between two descriptor sets."""
        bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        matches = bf.match(des1, des2)
        return len(matches)

class ImageDeduplicator:
    """Runs a chain of detectors over the thumbnails of one image directory.

    The images are read from ``<directory>/thumbnail``. Each detector only
    re-examines the similar groups produced by the previous one, and the
    intermediate result of every stage is written to a text report inside
    ``directory``.
    """

    def __init__(self, directory: str):
        """Validate ``directory`` and configure the detector chain.

        When ``directory`` or its ``thumbnail`` sub-directory is missing, an
        error is printed and ``self.directory`` is left as ``None``, which
        makes ``deduplicate`` a no-op.
        """
        # Set both attributes up front so the instance is fully initialised
        # even when validation fails below.
        self.directory = None
        self.detectors = [
            HashDetector(8),
            ORBDetector(1000, 0.7)
        ]
        # isdir() is already False for a path that does not exist.
        if not os.path.isdir(directory):
            tqdm.write(colored(f"Directory {directory} not valid", "red"))
            return
        thumbnail = Path(f"{directory}/thumbnail")
        if not thumbnail.is_dir():
            tqdm.write(colored(f"Directory {thumbnail} not valid, create thumbnails first", "red"))
            return
        self.directory = directory

    def deduplicate(self):
        """Run all detectors in order and return the final descriptor.

        Only non-hidden ``.jpg`` / ``.png`` files (suffix compared
        case-insensitively) directly inside the thumbnail directory are
        considered. After every detector a report named
        ``descriptor_<DetectorName>_<timestamp>.txt`` is saved in the directory.

        Returns:
            The ``ImageDescriptor`` produced by the last detector, or ``None``
            when the directory was rejected by the constructor.
        """
        if self.directory is None:
            tqdm.write(colored("Directory not valid", "red"))
            return None

        thumbnail_dir = Path(f"{self.directory}/thumbnail")
        thumbnails = [
            file for file in thumbnail_dir.glob('*')
            if not file.name.startswith('.') and file.suffix.lower() in (".jpg", ".png")
        ]
        # Start with everything in one big candidate group and no known uniques.
        descriptor = ImageDescriptor(set(), [thumbnails])

        for detector in self.detectors:
            detector_name = type(detector).__name__
            tqdm.write(colored(f"Processed with {detector_name}[{id(detector)}], similar_groups count: {len(descriptor.similar_groups)}", "yellow"))
            # Copy the set so this stage never mutates the previous stage's result.
            refined = ImageDescriptor(set(descriptor.unique_images), [])

            groups = descriptor.similar_groups
            if isinstance(detector, ORBDetector):
                # ORBDetector shows no progress bar of its own, so wrap the
                # outer loop for it.
                groups = tqdm(groups)
            for group_of_img in groups:
                result = detector.detect(group_of_img)
                refined.unique_images.update(result.unique_images)
                refined.similar_groups.extend(result.similar_groups)
            descriptor = refined

            # Persist this stage's result.
            timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            filepath = f"{self.directory}/descriptor_{detector_name}_{timestamp}.txt"
            descriptor.serialize(filepath)
            tqdm.write(colored(f"After processed, similar_groups count: {len(descriptor.similar_groups)}, unique count: {len(descriptor.unique_images)}\n", "yellow"))

        return descriptor

# Example usage
# Alternative datasets -- keep exactly one assignment uncommented. An earlier
# instance would just be overwritten by the next assignment and never used.
# deduplicator = ImageDeduplicator("/Users/chenweichu/dev/data/test_副本")
# deduplicator = ImageDeduplicator("/Volumes/192.168.1.173/pic/陈都灵_503[167_MB]")
deduplicator = ImageDeduplicator("/Volumes/192.168.1.173/pic/鞠婧祎_4999[5_GB]")

# deduplicate() returns None when the directory failed validation.
final_descriptor = deduplicator.deduplicate()
if final_descriptor is None:
    tqdm.write(colored("Deduplication failed", "red"))
else:
    tqdm.write(colored(f"Final unique images count: {len(final_descriptor.unique_images)}", "white"))


[33mProcessed with HashDetector[6260114672], similar_groups count: 1[0m


Hashing images: 100%|██████████| 4995/4995 [02:24<00:00, 34.59it/s]


[37mFound 4330 unique hashes[0m


Grouping images: 100%|██████████| 4330/4330 [00:00<00:00, 388511.02it/s]


[37mFound 3683 unique images, 647 similar groups[0m
[37mSaving description to /Volumes/192.168.1.173/pic/鞠婧祎_4999[5_GB]/descriptor_HashDetector_20240512140338.txt[0m
[33mAfter processed, similar_groups count: 647, unique count: 3683
[0m
[33mProcessed with ORBDetector[6260116928], similar_groups count: 647[0m


100%|██████████| 647/647 [00:28<00:00, 22.40it/s]


[37mSaving description to /Volumes/192.168.1.173/pic/鞠婧祎_4999[5_GB]/descriptor_ORBDetector_20240512140407.txt[0m
[33mAfter processed, similar_groups count: 377, unique count: 4255
[0m
[37mFinal unique images count: 4255[0m
