## !!! This is the old preprocessing file
- It uses cv2 to preprocess the images and apply filters to them.
- The preprocessing step now uses only the removebg API.

In [1]:
# Relative filesystem locations used by this notebook.
# CSV handed to batch_preprocess_listing_images() as the list of item IDs.
ITEMS_LIST = "../yjpa_scraper/items-select.csv"
# The next three paths are not referenced by the code visible in this notebook.
ITEMS_URL = "../yjpa_scraper/items-url.csv"
ITEMS_LISTING_MENU = "../yjpa_scraper/items_listing_menu/"
ITEMS_LISTING_JSON = "../yjpa_scraper/items_listing/json"
# Input root: one sub-folder per item ID is looked up underneath it.
ITEMS_LISTING_IMG = "../yjpa_scraper/items_listing/raw"
# Output root: processed images are written to a matching per-ID sub-folder.
PREPROCESSED = "../yjpa_scraper/items_listing/preprocessed"

In [12]:
import cv2
import numpy as np
from pathlib import Path
import os
import pandas as pd

# -------------------------------
# Individual preprocessing steps
# -------------------------------

def apply_clahe(img):
    """Equalize local contrast on the lightness channel of a BGR image.

    The image is converted to LAB, CLAHE (clip limit 4.0, 8x8 tile grid) is
    applied to the L channel only, and the result is converted back to BGR.
    The A and B channels are passed through unchanged.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2LAB))
    equalizer = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8, 8))
    merged = cv2.merge((equalizer.apply(lightness), chan_a, chan_b))
    return cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)

def adjust_gamma(image, gamma=1.2):
    """Apply gamma correction to an image through a 256-entry lookup table.

    Each input level ``i`` maps to ``255 * (i / 255) ** (1 / gamma)``,
    truncated to ``uint8``. With ``gamma > 1`` the exponent is below 1, so
    mid-tones are brightened; with ``gamma < 1`` they are darkened.

    Args:
        image: Image passed to ``cv2.LUT`` together with the table.
        gamma: Positive gamma value.

    Returns:
        The result of ``cv2.LUT(image, table)``.

    Raises:
        ValueError: If ``gamma`` is zero or negative. Zero would divide by
            zero and a negative value would map level 0 to infinity.
    """
    if gamma <= 0:
        raise ValueError(f"gamma must be positive, got {gamma!r}")

    inv_gamma = 1.0 / gamma
    # Build the whole table in one vectorized expression instead of a
    # Python-level loop over the 256 levels.
    levels = np.arange(256) / 255.0
    table = (np.power(levels, inv_gamma) * 255).astype("uint8")
    return cv2.LUT(image, table)

def unsharp_mask(image):
    """Sharpen an image by subtracting a Gaussian-blurred copy of itself.

    The output is the weighted sum ``1.5 * image - 0.5 * blurred``, where
    ``blurred`` is the image blurred with sigma 1 (kernel size derived from
    sigma because ``(0, 0)`` is passed as the size).
    """
    soft = cv2.GaussianBlur(image, (0, 0), sigmaX=1)
    sharp_weight, blur_weight = 1.5, -0.5
    return cv2.addWeighted(image, sharp_weight, soft, blur_weight, 0)

def emphasize_edges_sobel(image, strength=0.6):
    """Overlay the Sobel gradient magnitude of a BGR image onto itself.

    The gradient is computed on the grayscale version with 3x3 Sobel kernels
    in x and y, converted to 8-bit, expanded back to three channels and added
    to the original image with weight ``strength``.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    grad_x, grad_y = (
        cv2.Sobel(gray, cv2.CV_64F, dx, dy, ksize=3)
        for dx, dy in ((1, 0), (0, 1))
    )
    edges = cv2.convertScaleAbs(cv2.magnitude(grad_x, grad_y))
    overlay = cv2.cvtColor(edges, cv2.COLOR_GRAY2BGR)
    return cv2.addWeighted(image, 1.0, overlay, strength, 0)

def boost_contrast(image, alpha=1.5, beta=0):
    """Scale pixel values linearly via ``cv2.convertScaleAbs``.

    Each pixel becomes ``alpha * pixel + beta``; ``alpha`` above 1.0 raises
    contrast and ``beta`` shifts brightness.
    """
    scaled = cv2.convertScaleAbs(image, alpha=alpha, beta=beta)
    return scaled

# -------------------------------
# Full preprocessing pipeline
# -------------------------------

def preprocess_image(img_path):
    """Load an image and run the full enhancement pipeline on its colour data.

    The pipeline is, in order: CLAHE, gamma correction (1.2), unsharp mask,
    Sobel edge emphasis (strength 1.2) and a linear contrast boost (1.2).
    An alpha channel, when present, is set aside before filtering and
    re-attached untouched afterwards.

    Args:
        img_path: Path of the image file to read.

    Returns:
        The processed image as a 3-channel (BGR) or 4-channel (BGRA) array,
        or ``None`` when the file could not be read.
    """
    img = cv2.imread(str(img_path), cv2.IMREAD_UNCHANGED)
    if img is None:
        print(f"Warning: Could not read image {img_path}")
        return None

    # NOTE(review): the filters below presumably expect 8-bit data (cv2.LUT
    # takes an 8-bit source) - confirm if 16-bit inputs can occur.
    alpha = None
    if img.ndim == 2:
        # IMREAD_UNCHANGED returns grayscale files as a 2-D array, which has
        # no shape[2]; expand to BGR so the colour filters can run.
        bgr = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    elif img.shape[2] == 4:
        # Split off the alpha channel so it is not altered by the filters.
        bgr = img[:, :, :3]
        alpha = img[:, :, 3]
    else:
        bgr = img

    # Apply preprocessing steps to the BGR part only.
    bgr = apply_clahe(bgr)
    bgr = adjust_gamma(bgr, gamma=1.2)
    bgr = unsharp_mask(bgr)
    bgr = emphasize_edges_sobel(bgr, strength=1.2)
    bgr = boost_contrast(bgr, alpha=1.2)

    # Re-attach the alpha channel if it existed.
    if alpha is not None:
        return cv2.merge((bgr, alpha))
    return bgr

# -------------------------------
# Patch cropper (optional)
# -------------------------------

def crop_to_patches(image, patch_size=512, stride=256):
    """Cut an image into square patches on a regular grid.

    Patches are taken row by row (y outer, x inner) starting at the top-left
    corner and stepping by ``stride``. Only patches that fit completely
    inside the image are produced, so an image smaller than ``patch_size``
    in either dimension yields an empty list.

    Args:
        image: Array whose first two axes are height and width. Both
            single-channel (2-D) and multi-channel (3-D) arrays are accepted.
        patch_size: Side length of each square patch, in pixels.
        stride: Step between the origins of neighbouring patches, in pixels.

    Returns:
        A list of ``(patch, x, y)`` tuples, where ``patch`` is sliced from
        ``image`` and ``(x, y)`` is its top-left corner.
    """
    # Only the first two axes matter, so grayscale input works as well.
    height, width = image.shape[:2]
    return [
        (image[y:y + patch_size, x:x + patch_size], x, y)
        for y in range(0, height - patch_size + 1, stride)
        for x in range(0, width - patch_size + 1, stride)
    ]

# -------------------------------
# Batch preprocessing with optional patch saving
# -------------------------------

def batch_preprocess_images(input_dir, output_dir, patching=False, patch_size=512, stride=256):
    """Preprocess every image directly inside ``input_dir`` into ``output_dir``.

    Files whose suffix (case-insensitive) is ``.jpg``, ``.jpeg`` or ``.png``
    are run through ``preprocess_image``. Results with an alpha channel are
    always written as ``.png``; all others keep their original file name.
    Unreadable images are skipped, and a warning is printed when a result
    cannot be written.

    Args:
        input_dir: Folder to scan (non-recursive).
        output_dir: Destination folder; created if missing.
        patching: Accepted for interface compatibility. NOTE: currently not
            acted upon - no patches are produced or saved.
        patch_size: Unused for the same reason as ``patching``.
        stride: Unused for the same reason as ``patching``.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    image_extensions = ('.jpg', '.jpeg', '.png')
    for img_path in input_dir.glob("*"):
        if img_path.suffix.lower() not in image_extensions:
            continue

        processed = preprocess_image(img_path)
        if processed is None:
            continue

        # Check the rank first so a 2-D result cannot raise on shape[2].
        has_alpha = processed.ndim == 3 and processed.shape[2] == 4
        if has_alpha:
            # Force .png when there is an alpha channel to keep.
            out_path = output_dir / (img_path.stem + '.png')
            written = cv2.imwrite(str(out_path), processed, [cv2.IMWRITE_PNG_COMPRESSION, 3])
        else:
            out_path = output_dir / img_path.name
            written = cv2.imwrite(str(out_path), processed)

        # cv2.imwrite reports failure through its return value, not an
        # exception, so surface it instead of dropping the file silently.
        if not written:
            print(f"Warning: Could not write image {out_path}")


#             print(f"Processed: {img_path}")



def batch_preprocess_listing_images(items_list_csv,items_listing_img_dir,preprocessed_dir,patching=False,patch_size=512,stride=256):
    """Preprocess the image folder of every item listed in a CSV file.

    For each value in the ``id`` column of ``items_list_csv``, all images in
    ``items_listing_img_dir/{id}`` are processed with
    ``batch_preprocess_images`` and written to ``preprocessed_dir/{id}``.
    IDs whose input folder does not exist are reported and skipped.

    Args:
        items_list_csv: CSV file containing an ``id`` column.
        items_listing_img_dir: Root folder holding one sub-folder per ID.
        preprocessed_dir: Root folder for the per-ID output sub-folders.
        patching: Forwarded to ``batch_preprocess_images``.
        patch_size: Forwarded to ``batch_preprocess_images``.
        stride: Forwarded to ``batch_preprocess_images``.
    """
    # Read IDs as text. Letting pandas infer the dtype turns the whole column
    # into floats as soon as one cell is blank ("108" -> "108.0") and drops
    # leading zeros, so the folder names would no longer match.
    df = pd.read_csv(items_list_csv, usecols=['id'], dtype={'id': str})
    stripped_ids = (raw.strip() for raw in df['id'].dropna().tolist())
    # Skip blank IDs: an empty name would resolve to the root folder itself.
    ids = [item_id for item_id in stripped_ids if item_id]

    items_listing_img_dir = Path(items_listing_img_dir)
    preprocessed_dir = Path(preprocessed_dir)

    for item_id in ids:
        input_folder = items_listing_img_dir / item_id
        output_folder = preprocessed_dir / item_id

        if not input_folder.exists():
            print(f"Skipping ID {item_id}: folder {input_folder} does not exist.")
            continue

        # Delegate the per-folder work to batch_preprocess_images().
        batch_preprocess_images(
            input_dir=input_folder,
            output_dir=output_folder,
            patching=patching,
            patch_size=patch_size,
            stride=stride
        )
        print(f"Processed Item: {item_id}")

# -------------------------------
# Example usage
# -------------------------------


if __name__ == "__main__":
    # Process every listed item from the raw image root into the
    # preprocessed root, using the path constants defined for this notebook.
    items_list = ITEMS_LIST
    input_folder = ITEMS_LISTING_IMG
    output_folder = PREPROCESSED
    enable_patching = False
    # Pass the patching switch through so that changing it above actually
    # reaches the call (it was previously assigned but never used).
    batch_preprocess_listing_images(
        items_list,
        input_folder,
        output_folder,
        patching=enable_patching,
    )



Processed Item: 108
Processed Item: 37
Processed Item: 199
Processed Item: 193
Processed Item: 244
Processed Item: 100
Processed Item: 210
Processed Item: 154
Processed Item: 95
Processed Item: 99
