# In [None]:
import os
import cv2
import shutil
import numpy as np
from tqdm import tqdm

# Folder the candidate images are read from, and folder the selected
# images are written to.
SOURCE_DIR = "data/processed/preproc_backup_31396"
DEST_DIR = "data/processed/preproc_best10k"

# Create the destination up front; exist_ok avoids an error on re-runs.
os.makedirs(DEST_DIR, exist_ok=True)

# ---------------------------------------------------
# 1) Gather all JPG image paths
# ---------------------------------------------------
# os.listdir() returns entries in arbitrary order. Sorting the names makes
# the list (and any later tie-breaking between equally scored images)
# reproducible across runs and machines.
image_paths = [
    os.path.join(SOURCE_DIR, f)
    for f in sorted(os.listdir(SOURCE_DIR))
    if f.lower().endswith(".jpg")  # case-insensitive: matches .jpg and .JPG
]

print(f"Found {len(image_paths):,} processed images.")


# ---------------------------------------------------
# 2) Compute quality metrics for each image
# ---------------------------------------------------
def compute_sharpness(img):
    """Return the variance of the Laplacian of *img* after grayscale conversion.

    The image is converted with ``cv2.COLOR_BGR2GRAY``, so a BGR colour image
    is expected. The Laplacian is computed in 64-bit floating point
    (``cv2.CV_64F``) so negative responses are kept before taking the variance.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(grayscale, cv2.CV_64F)
    return laplacian.var()


def compute_resolution(img):
    """Return the product of the first two entries of ``img.shape``.

    For an image array this is rows * columns, i.e. the pixel count; any
    further shape entries (such as a channel axis) are ignored.
    """
    rows, cols, *_rest = img.shape
    return rows * cols


def _min_max_normalize(values):
    """Rescale a 1-D float array to [0, 1]; all zeros if every value is equal."""
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant metric: it cannot discriminate between images.
        return np.zeros_like(values)
    return (values - lowest) / span


# Final result consumed by the selection step: list of (path, quality).
scores = []

# Raw per-image metrics, collected first so they can be normalised together.
raw_metrics = []  # (path, sharpness, pixel_count)
skipped = 0

for path in tqdm(image_paths, desc="Scoring images"):
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None when the file cannot be read or decoded.
        skipped += 1
        continue

    raw_metrics.append((path, compute_sharpness(img), compute_resolution(img)))

if skipped:
    print(f"Skipped {skipped:,} unreadable images.")

if raw_metrics:
    sharp_values = np.array([m[1] for m in raw_metrics], dtype=np.float64)
    res_values = np.array([m[2] for m in raw_metrics], dtype=np.float64)

    # Sharpness (Laplacian variance) and resolution (pixel count) live on
    # very different scales. Adding them raw lets the pixel count swamp the
    # sharpness term, so the 0.5 / 0.5 weights would have no real effect.
    # Normalise each metric to [0, 1] first so the weights mean what they say.
    quality = (
        0.5 * _min_max_normalize(sharp_values)
        + 0.5 * _min_max_normalize(res_values)
    )

    scores = [(m[0], float(q)) for m, q in zip(raw_metrics, quality)]

# ---------------------------------------------------
# 3) Sort by quality descending & keep top 10,000
# ---------------------------------------------------
TOP_K = 10_000  # maximum number of images to keep

# sorted() is stable (also with reverse=True), so images with equal quality
# keep their relative order from `scores`.
scores_sorted = sorted(scores, key=lambda x: x[1], reverse=True)
best10k = scores_sorted[:TOP_K]

if len(best10k) < TOP_K:
    print(f"Only {len(best10k):,} scored images available; keeping all of them.")

# Report the real count rather than assuming TOP_K images were available.
print(f"Saving best {len(best10k):,} images...")

for path, _ in tqdm(best10k, desc="Copying images"):
    fname = os.path.basename(path)
    # copy2 also preserves file metadata (e.g. modification time).
    shutil.copy2(path, os.path.join(DEST_DIR, fname))

# Derive the message from DEST_DIR so it cannot drift from the real target.
print(f"DONE. Check: {DEST_DIR}/")
