##Patch 1.0.0##

In [13]:
import os
import cv2
import numpy as np
from collections import defaultdict

def are_images_identical(image1, image2):
    # Check if both images have the same shape
    if image1.shape != image2.shape:
        return False
    
    # Compare pixel values
    difference = cv2.subtract(image1, image2)
    if np.any(difference):
        return False
    return True

def find_duplicate_images(folder_path):
    images = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(('.png', '.jpg', '.jpeg')):
            img_path = os.path.join(folder_path, filename)
            images[filename] = cv2.imread(img_path)

    duplicates = defaultdict(list)
    checked_images = set()
    group_id = 1
    
    for image1_name, image1 in images.items():
        if image1_name in checked_images:
            continue
        duplicates[f'dup{group_id}'].append(image1_name)
        for image2_name, image2 in images.items():
            if image1_name != image2_name and image2_name not in checked_images:
                if are_images_identical(image1, image2):
                    duplicates[f'dup{group_id}'].append(image2_name)
                    checked_images.add(image2_name)
        checked_images.add(image1_name)
        group_id += 1

    # Remove groups with only one image (no duplicates)
    duplicates = {k: v for k, v in duplicates.items() if len(v) > 1}

    return duplicates

# def save_report(duplicates, report_path):
#     with open(report_path, 'w') as file:
#         for group, images in duplicates.items():
#             file.write(f"{group}: {', '.join(images)}\n")

def save_report(duplicates, report_path):
    total_dup = len(duplicates)
    total_duplicates = 0
    with open(report_path, 'w') as file:
        for group, images in duplicates.items():
            file.write(f"{group}: {', '.join(images)}\n")
            file.write(f"จำนวนภาพใน {group}: {len(images)}\n")
            
            total_duplicates += len(images)
        file.write(f"ภาพ dup : {total_dup}\n")
        file.write(f"จำนวนภาพซ้ำทั้งหมด: {total_duplicates}")

# folder_path = './66F_Healthy-20240719T132910Z-001/66F_Healthy'
# report_path = '66F_Healthy.txt'
# Example usage:
folder_path = './dataset_sib_391-780'
report_path = 'sib_391-780.txt'
duplicate_images = find_duplicate_images(folder_path)
save_report(duplicate_images, report_path)
print(f"Duplicate images report saved to {report_path}")


Duplicate images report saved to sib_391-780.txt


In [11]:
len(duplicate_images)

79

In [None]:
import cv2
import matplotlib.pyplot as plt

# Use raw string to avoid escape sequence issues
image1 = cv2.imread(r'C:\Teamsean\dataset_sib_391-780\66F_Healthy_391.jpg')
image2 = cv2.imread(r'C:\Teamsean\dataset_sib_391-780\66F_Healthy_392.jpg')
# Convert the image to RGB (OpenCV uses BGR)
# if image is not None:
#     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#     # Display the image
#     plt.imshow(image)
#     plt.axis('off')  # Hide axis
#     plt.show()
# else:
#     print("Error: Image not found.")

# Check if the images are identical
if image1.shape != image2.shape:
    print("Images have different dimensions")
else:
    difference = cv2.subtract(image1, image2)
    print(difference)
    if not np.any(difference):
        print("Images are identical")
    else:
        print("Images are different")

[[[ 15  79  95]
  [  3  67  83]
  [  0  51  64]
  ...
  [  0   0   0]
  [  0   0   0]
  [  0   0   0]]

 [[ 20  90 104]
  [ 16  83  96]
  [ 14  79  92]
  ...
  [  0   0   0]
  [  0   0   0]
  [  0   0   0]]

 [[ 19  95 109]
  [  0  66  78]
  [  4  77  87]
  ...
  [  0   0   0]
  [  0   0   0]
  [  0   0   0]]

 ...

 [[ 35  38  53]
  [ 41  47  59]
  [ 27  33  45]
  ...
  [  0  30  56]
  [  0  31  56]
  [  0  16  41]]

 [[ 39  44  59]
  [ 51  57  70]
  [ 60  63  78]
  ...
  [  0  21  47]
  [  0  22  47]
  [  0  26  51]]

 [[ 43  48  63]
  [ 29  34  49]
  [ 47  52  67]
  ...
  [  0  22  48]
  [  0  23  48]
  [  0  26  51]]]
Images are different


##Patch 1.0.1##

In [15]:
import os
import cv2
import numpy as np
from collections import defaultdict
import shutil

def are_images_identical(image1, image2):
    if image1.shape != image2.shape:
        return False
    difference = cv2.subtract(image1, image2)
    return not np.any(difference)

def find_duplicate_images(folder_path):
    images = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(('.png', '.jpg', '.jpeg')):
            img_path = os.path.join(folder_path, filename)
            images[filename] = cv2.imread(img_path)

    duplicates = defaultdict(list)
    checked_images = set()
    group_id = 1
    
    for image1_name, image1 in images.items():
        if image1_name in checked_images:
            continue
        duplicates[f'dup{group_id}'].append(image1_name)
        for image2_name, image2 in images.items():
            if image1_name != image2_name and image2_name not in checked_images:
                if are_images_identical(image1, image2):
                    duplicates[f'dup{group_id}'].append(image2_name)
                    checked_images.add(image2_name)
        checked_images.add(image1_name)
        group_id += 1

    duplicates = {k: v for k, v in duplicates.items() if len(v) > 1}
    return duplicates

def save_report(duplicates, report_path):
    total_dup = len(duplicates)
    total_duplicates = 0
    with open(report_path, 'w') as file:
        for group, images in duplicates.items():
            file.write(f"{group}: {', '.join(images)}\n")
            file.write(f"จำนวนภาพใน {group}: {len(images)}\n")
            
            total_duplicates += len(images)
        file.write(f"ภาพ dup : {total_dup}\n")
        file.write(f"จำนวนภาพซ้ำทั้งหมด: {total_duplicates}")

def move_duplicate_images(duplicates, source_folder, destination_folder):
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
    
    for group, images in duplicates.items():
        for image in images[1:]:  # Skip the first image
            source_path = os.path.join(source_folder, image)
            destination_path = os.path.join(destination_folder, image)
            shutil.move(source_path, destination_path)
    
    print(f"Moved duplicate images to {destination_folder}")


folder_path = './dataset_sib_391-780'
report_path = 'dataset_sib_391-780.txt'
duplicate_images = find_duplicate_images(folder_path)
save_report(duplicate_images, report_path)
print(f"Duplicate images report saved to {report_path}")


Name_dir_duplicate = 'Duplicate_Images_dataset_sib_391-780'
destination_folder = os.path.join(os.path.dirname(folder_path), Name_dir_duplicate)
move_duplicate_images(duplicate_images, folder_path, destination_folder)


Duplicate images report saved to dataset_sib_391-780.txt
Moved duplicate images to .\Duplicate_Images_dataset_sib_391-780
