In [None]:
import os
import re
import sys
import hashlib
from tqdm import tqdm
from PIL import Image, ImageChops
from dotenv import load_dotenv

sys.path.append(os.path.join("..", ".."))
from utils.s3_bucket import S3Bucket

In [None]:
# Name of the S3 bucket handed to S3Bucket when the `bucket` handle is created.
bucket_name = "ava-cv-raw-photo-bucket"
# Key prefix passed to bucket.filter(); only objects under it are processed.
prefix = "GoogleImages/plants/"

In [None]:
# Populate os.environ from a local .env file (if one exists) before reading
# REGION_NAME below. Without this call a fresh kernel raises KeyError unless
# the variable was exported in the shell that launched Jupyter. By default
# load_dotenv() does not override variables that are already set.
load_dotenv()

# Handle used by the following cells to list, read, write and delete objects.
bucket = S3Bucket(
    bucket_name=bucket_name,
    region_name=os.environ["REGION_NAME"]
)

In [None]:
%%time

# Convert every PNG/WebP/GIF object under `prefix` to a ".jpeg" object and
# then remove the source object. If the conversion raises, the source object
# is still removed and counted as a failed image.
# NOTE(review): presumably S3Bucket decodes on read and re-encodes according to
# the target key's extension on write — confirm against utils.s3_bucket.
failed_images_count = 0
for obj in bucket.filter(prefix=prefix):
    if not obj.key.endswith((".png", ".webp", ".gif")):
        continue

    # Anchor the pattern at the end of the key so only the trailing extension
    # is rewritten. Unanchored, a key such as "a.png.d/b.gif" would become
    # "a.jpeg.d/b.jpeg" and the converted object would land under a different
    # path.
    new_key = re.sub(r"\.(png|webp|gif)$", ".jpeg", obj.key)
    try:
        bucket[new_key] = bucket[obj.key]
    except Exception:
        # KeyboardInterrupt does not derive from Exception, so interrupting
        # the cell still propagates without a dedicated re-raise clause.
        print(f"Exception occured when converting image to JPEG. Deleting {obj.key}...")
        failed_images_count += 1
    # Runs on success (source replaced by the .jpeg copy) and on failure
    # (unconvertible source discarded).
    bucket.delete(key=obj.key)

print(f"Deleted {failed_images_count} failed images.")

In [None]:
%%time

# Group the .jpg/.jpeg objects under `prefix` by the MD5 digest of whatever
# `.tobytes()` returns for each one. Objects that share a digest are treated
# as duplicates: the first key seen in each group is kept, the rest deleted.
duplicate_images_count = 0
hash_to_keys = {}
for obj in bucket.filter(prefix=prefix):
    is_jpeg = obj.key.endswith((".jpg", ".jpeg"))
    if is_jpeg:
        image_bytes = bucket[obj.key].tobytes()
        md5_hash = hashlib.md5(image_bytes).hexdigest()
        hash_to_keys.setdefault(md5_hash, []).append(obj.key)

for keys in hash_to_keys.values():
    # keys[1:] is empty for digests seen only once, so unique images are
    # skipped without an explicit length check.
    for key in keys[1:]:
        print(f"Found duplicate. Deleting {key}...")
        duplicate_images_count += 1
        bucket.delete(key=key)

print(f"Deleted {duplicate_images_count} duplicate images.")