In [3]:

# Service-account key file handed to firebase_admin.credentials.Certificate below.
FIREBASE_CRED_PATH = "../misc/strustore-dev-firebase-adminsdk-emic5-85ed2690d9.json"

# Pipeline variant: one of sam / gdino / manual. It is baked into LENS and into
# every uploaded object name ("{MODEL}/{item}/{auction}/mask_{id}.png"), so set
# it to match the extraction cell you are about to run.
MODEL="gdino" #sam/gdino/manual
# JSON whose "test" entry maps item id -> list of auction image file names.
ANNOTATION_SPLIT = "../Open-GroundingDino/annotations/dataset_split.json"
# Output root: one "<item>/<auction>.json" file is written per processed auction.
LENS = f"../zm_scraper/auctions/{MODEL}/lens/"
# Label sources, one per pipeline variant.
MANUAL_POSTPROCESSED_LBL = "../Open-GroundingDino/annotations/test_coco.json" # manual: single dataset-wide COCO file
GDINO_POSTPROCESSED_LBL = "../zm_scraper/auctions/gdino/output_labels" # gdino: "<item>/<auction>.json" COCO files
SAM_POSTPROCESSED_LBL = "../zm_scraper/auctions/sam/postprocessed/labels" # sam: "<item>/<auction>.npy" label maps

# Image roots: the manual cell reads from RAW, the sam/gdino cells from PREPROCESSED.
RAW = "../zm_scraper/auctions/raw"
PREPROCESSED = "../zm_scraper/auctions/preprocessed"
# Firebase Storage bucket that receives the uploaded crops.
BUCKET_NAME = "strustore-dev.firebasestorage.app"
# CSV with an "id" column listing the item ids to iterate over.
TEXT_PROMPT_CSV = "../zm_scraper/items-prompt.csv"

# When False, the ScrapingDog cell skips masks whose "results" list is already non-empty.
OVERWRITE_RESULTS=False

## SAM: Extract masks from .npy files and upload the crops to Firebase Storage

In [None]:
# obtain masks and their public urls
import os
import numpy as np
import cv2
import pandas as pd
import json
import firebase_admin
from firebase_admin import credentials, storage

# Make sure the output root for the per-auction JSON files exists.
os.makedirs(LENS, exist_ok=True)

# Initialize Firebase only if not already initialized
# (firebase_admin._apps is empty until initialize_app has been called, so
# re-running this cell in the same kernel does not initialize twice).
if not firebase_admin._apps:
    cred = credentials.Certificate(FIREBASE_CRED_PATH)
    firebase_admin.initialize_app(cred, {
        'storageBucket': BUCKET_NAME
    })
# Module-level bucket handle used by upload_image_to_firebase.
bucket = storage.bucket()

# ────────────── Load Dataset Split ──────────────
with open(ANNOTATION_SPLIT, "r") as f:
    split_data = json.load(f)

# Item id -> list of allowed auction file names; empty if the file has no "test" key.
test_split = split_data.get("test", {})

# ────────────── FUNCTIONS ──────────────
def upload_image_to_firebase(image_array, remote_name):
    """Encode an image array as PNG in memory and upload it to the bucket.

    Args:
        image_array: Image (NumPy array) to encode with ``cv2.imencode``.
        remote_name: Object name to store the PNG under in the bucket.

    Returns:
        The public URL of the uploaded blob (the blob is made public first).

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding failed.
    """
    encoded_ok, png_buffer = cv2.imencode('.png', image_array)
    if encoded_ok:
        target_blob = bucket.blob(remote_name)
        target_blob.upload_from_string(png_buffer.tobytes(), content_type='image/png')
        # The returned URL is only usable once the object is publicly readable.
        target_blob.make_public()
        return target_blob.public_url

    raise ValueError("Failed to encode image to PNG.")

def extract_and_upload_masks(image_path, label_path, item_id, auction_id):
    """Crop every labelled region out of an image and upload the crops.

    The label map at ``label_path`` is loaded with ``np.load``. Each distinct
    value greater than 0 is treated as one region; values <= 0 are skipped as
    background. For every region the tight bounding box of its pixels is
    grown by 5% of its width/height on each side, clamped to the image, and
    the corresponding crop of the original image is uploaded as a PNG.

    Args:
        image_path: Path of the image to crop from.
        label_path: Path of the ``.npy`` label map.
            NOTE(review): assumed to be 2-D with the same height/width as the
            image; confirm against the SAM post-processing step.
        item_id: Item id, used in the uploaded object name.
        auction_id: Auction id, used in the uploaded object name.

    Returns:
        dict mapping ``str(region_id)`` to
        ``{"url", "bbox", "results", "tokens"}``. ``bbox`` is
        ``[x_min, y_min, x_max, y_max]`` with *inclusive* maxima (unlike the
        end-exclusive boxes written by the COCO-based cells). Returns an empty
        dict when the image cannot be read.
    """
    image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    if image is None:
        print(f"[ERROR] Image not found: {image_path}")
        return {}

    # With IMREAD_UNCHANGED a single-channel file comes back as a 2-D array,
    # so the rank must be checked before shape[2] is read.
    if image.ndim == 3 and image.shape[2] == 4:  # Drop alpha if present
        image = image[:, :, :3]

    # Image size is loop-invariant; compute it once.
    height, width = image.shape[:2]

    label_map = np.load(label_path)
    region_ids = np.unique(label_map)
    region_ids = region_ids[region_ids > 0]  # Exclude background

    mask_data = {}

    for region_id in region_ids:
        mask = (label_map == region_id).astype(np.uint8)
        ys, xs = np.where(mask == 1)
        if ys.size == 0 or xs.size == 0:
            continue

        # Tight bounding box (inclusive pixel coordinates)
        y_min, y_max = int(ys.min()), int(ys.max())
        x_min, x_max = int(xs.min()), int(xs.max())

        # Expand by 5% of the box size on each side
        delta_x = int((x_max - x_min) * 0.05)
        delta_y = int((y_max - y_min) * 0.05)

        x_min = max(0, x_min - delta_x)
        y_min = max(0, y_min - delta_y)
        x_max = min(width - 1, x_max + delta_x)
        y_max = min(height - 1, y_max + delta_y)

        # Crop original image using expanded bounding box (+1: maxima are inclusive)
        original_crop = image[y_min:y_max+1, x_min:x_max+1]

        # Encode the crop as PNG and upload it
        firebase_crop_name = f"{MODEL}/{item_id}/{auction_id}/mask_{region_id}.png"
        url = upload_image_to_firebase(original_crop, firebase_crop_name)

        # "results" and "tokens" start empty; they are filled in by later steps
        mask_data[str(region_id)] = {
            "url": url,
            "bbox": [x_min, y_min, x_max, y_max],
            "results": [],
            "tokens": []
        }

    return mask_data


def process_items():
    """Upload SAM region crops for every test-split auction and write lens JSONs.

    For each item id in TEXT_PROMPT_CSV that appears in ``test_split``, every
    ``<auction>.npy`` label map under SAM_POSTPROCESSED_LBL/<item> whose
    auction id is in the split is paired with PREPROCESSED/<item>/<auction>.png,
    passed to ``extract_and_upload_masks``, and the returned dict is written
    to LENS/<item>/<auction>.json (overwriting any existing file).
    """
    df = pd.read_csv(TEXT_PROMPT_CSV)
    item_ids = df['id'].astype(str).tolist()

    for item in item_ids:
        if item not in test_split:
            print(f"[SKIP] Item {item} not in test split.")
            continue

        allowed_files = test_split[item]  # e.g., ["auction1.jpg", "auction2.jpg"]
        allowed_ids = {os.path.splitext(fname)[0] for fname in allowed_files}

        print(f"\nProcessing Item: {item} (Allowed auctions: {len(allowed_ids)})")
        label_dir = os.path.join(SAM_POSTPROCESSED_LBL, item)
        img_dir = os.path.join(PREPROCESSED, item)

        if not os.path.exists(label_dir):
            print(f"[WARNING] Label directory missing: {label_dir}")
            continue
        if not os.path.exists(img_dir):
            print(f"[WARNING] Image directory missing: {img_dir}")
            continue

        for npy_file in os.listdir(label_dir):
            if not npy_file.endswith(".npy"):
                continue

            # Strip only the trailing extension; str.replace would also remove
            # any ".npy" that happens to occur inside the auction id.
            auction_id = os.path.splitext(npy_file)[0]
            if auction_id not in allowed_ids:
                continue  # Skip auctions not in allowed list

            label_path = os.path.join(label_dir, npy_file)
            image_path = os.path.join(img_dir, f"{auction_id}.png")

            if not os.path.exists(image_path):
                print(f"[WARNING] Missing original image for auction {auction_id}")
                continue

            # Extract, upload, and build JSON
            auction_mask_data = extract_and_upload_masks(image_path, label_path, item, auction_id)

            # Save JSON for this auction
            auction_output_dir = os.path.join(LENS, item)
            os.makedirs(auction_output_dir, exist_ok=True)
            json_path = os.path.join(auction_output_dir, f"{auction_id}.json")

            with open(json_path, "w") as f:
                json.dump(auction_mask_data, f, indent=4)

            print(f"[DONE] JSON saved: {json_path}")

# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# this presumably runs (and uploads) every time the cell is executed.
if __name__ == "__main__":
    process_items()
    print("\n✅ Filtered auctions processed and uploaded to Firebase!")


## GDino: Crop detected boxes and upload them to Firebase Storage

In [4]:
# obtain crops and their public urls (bboxes are read from per-auction COCO JSON files)
import os
import numpy as np
import cv2
import pandas as pd
import json
import firebase_admin
import math
from firebase_admin import credentials, storage

# Make sure the output root for the per-auction JSON files exists.
os.makedirs(LENS, exist_ok=True)

# Initialize Firebase only if not already initialized
# (firebase_admin._apps is empty until initialize_app has been called, so
# re-running this cell in the same kernel does not initialize twice).
if not firebase_admin._apps:
    cred = credentials.Certificate(FIREBASE_CRED_PATH)
    firebase_admin.initialize_app(cred, {
        'storageBucket': BUCKET_NAME
    })
# Module-level bucket handle used by upload_image_to_firebase.
bucket = storage.bucket()

# ────────────── Load Dataset Split ──────────────
with open(ANNOTATION_SPLIT, "r") as f:
    split_data = json.load(f)

# Item id -> list of allowed auction file names; empty if the file has no "test" key.
test_split = split_data.get("test", {})

# ────────────── HELPERS ──────────────
def clamp_bbox_xyxy_exclusive(x1, y1, x2, y2, width, height):
    """
    Fit an (x1, y1, x2, y2) box with end-exclusive x2/y2 inside an image.

    Inputs are truncated with int(). The start corner is limited to
    [0, size - 1]; the end corner is limited to at most `size` and is always
    at least one pixel past the start, so the box is never empty.
    Returns the four clamped integers as a tuple.
    """
    last_col = max(0, width - 1)
    last_row = max(0, height - 1)

    left = int(x1)
    if left > last_col:
        left = last_col
    if left < 0:
        left = 0

    top = int(y1)
    if top > last_row:
        top = last_row
    if top < 0:
        top = 0

    right = int(x2)
    if right > width:
        right = width
    if right < left + 1:
        right = left + 1

    bottom = int(y2)
    if bottom > height:
        bottom = height
    if bottom < top + 1:
        bottom = top + 1

    return left, top, right, bottom


def load_bboxes_from_coco_json(coco_path, image_shape, expected_filename=None):
    """Read the COCO boxes of one image and return padded, clamped crop boxes.

    The image entry is selected by comparing each ``file_name`` in "images"
    with ``expected_filename`` (exact match, or equal once the extension is
    stripped). If nothing matches, or no filename is given, the first image
    entry is used instead.
    NOTE(review): with a multi-image JSON that fallback silently returns
    another image's boxes; confirm callers only pass per-image files.

    Each COCO ``[x, y, w, h]`` box is converted to integer xyxy coordinates
    with end-exclusive x2/y2, grown by 5% of its width/height in total (2.5%
    per side, rounded outward on every side), and clamped to the image.

    Args:
        coco_path: Path of the COCO JSON file.
        image_shape: Shape of the image the boxes refer to; only the first
            two entries (height, width) are used.
        expected_filename: File name used to pick the image entry.

    Returns:
        List of ``(region_id, (x1, y1, x2, y2))`` tuples. ``region_id`` is
        ``str(ann["id"])``, or a 1-based running count when the annotation
        has no "id". Empty list when no image entry with an id is found.
    """
    with open(coco_path, "r") as f:
        coco = json.load(f)

    height, width = image_shape[:2]
    images = coco.get("images", [])
    anns = coco.get("annotations", [])

    image_id = None
    if expected_filename:
        for img in images:
            if img.get("file_name") == expected_filename or \
               os.path.splitext(img.get("file_name", ""))[0] == os.path.splitext(expected_filename)[0]:
                image_id = img.get("id")
                break
    if image_id is None and images:
        image_id = images[0].get("id")
    if image_id is None:
        return []

    results = []
    for ann in anns:
        if ann.get("image_id") != image_id:
            continue
        bbox = ann.get("bbox")
        if not bbox or len(bbox) != 4:
            continue

        x, y, w, h = map(float, bbox)

        # Half of the 5% growth goes on each side of the box.
        pad_x = 0.05 * w / 2
        pad_y = 0.05 * h / 2

        # Round every edge outward. Passing the padded floats straight to the
        # clamp helper would truncate them with int(), which drops the
        # right/bottom padding for boxes narrower than 40 px while the
        # left/top edges still move out.
        x1 = math.floor(math.floor(x) - pad_x)
        y1 = math.floor(math.floor(y) - pad_y)
        x2 = math.ceil(math.ceil(x + w) + pad_x)
        y2 = math.ceil(math.ceil(y + h) + pad_y)

        # Clamp to image boundaries
        x1, y1, x2, y2 = clamp_bbox_xyxy_exclusive(x1, y1, x2, y2, width, height)

        region_id = str(ann.get("id", len(results) + 1))
        results.append((region_id, (x1, y1, x2, y2)))

    return results


# ────────────── FUNCTIONS ──────────────
def upload_image_to_firebase(image_array, remote_name):
    """PNG-encode ``image_array`` in memory and store it in the bucket as ``remote_name``.

    The blob is made public and its public URL is returned. A ValueError is
    raised when ``cv2.imencode`` signals failure; nothing is written locally.
    """
    status, buffer = cv2.imencode('.png', image_array)
    if status:
        uploaded = bucket.blob(remote_name)
        uploaded.upload_from_string(buffer.tobytes(), content_type='image/png')
        uploaded.make_public()  # required for the public URL to be readable
        return uploaded.public_url

    raise ValueError("Failed to encode image to PNG.")

def extract_and_upload_crops_from_coco(image_path, coco_json_path, item_id, auction_id):
    """Crop the COCO boxes of one image, upload each crop, and describe them.

    Boxes come from ``load_bboxes_from_coco_json`` (matched on the image's
    base file name) and are end-exclusive, so they are used directly as slice
    bounds. Returns a dict keyed by region id with "url", "bbox", and empty
    "results"/"tokens" lists; returns ``{}`` when the image cannot be read.
    """
    image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    if image is None:
        print(f"[ERROR] Image not found: {image_path}")
        return {}

    # A fourth channel is alpha: keep only the first three channels.
    has_alpha = image.ndim == 3 and image.shape[2] == 4
    if has_alpha:
        image = image[:, :, :3]

    regions = load_bboxes_from_coco_json(
        coco_json_path,
        image.shape,
        expected_filename=os.path.basename(image_path),
    )

    uploaded = {}
    for region_id, (left, top, right, bottom) in regions:
        patch = image[top:bottom, left:right]
        if patch.size == 0:
            continue

        remote_name = f"{MODEL}/{item_id}/{auction_id}/mask_{region_id}.png"
        uploaded[str(region_id)] = {
            "url": upload_image_to_firebase(patch, remote_name),
            "bbox": [int(left), int(top), int(right), int(bottom)],
            "results": [],
            "tokens": []
        }

    return uploaded


def process_items():
    """Upload GDino box crops for every test-split auction and write lens JSONs.

    Walks the item ids of TEXT_PROMPT_CSV. For each item present in
    ``test_split``, every ``<auction>.json`` under GDINO_POSTPROCESSED_LBL/<item>
    whose auction id is in the split is matched to an image in
    PREPROCESSED/<item>, cropped and uploaded, and the resulting dict is
    written to LENS/<item>/<auction>.json.
    """
    known_extensions = [".png", ".jpg", ".jpeg", ".JPG", ".PNG", ".JPEG"]
    prompts = pd.read_csv(TEXT_PROMPT_CSV)

    for item in prompts['id'].astype(str).tolist():
        if item not in test_split:
            print(f"[SKIP] Item {item} not in test split.")
            continue

        # Split entries are file names; compare on the name without extension.
        allowed_ids = {os.path.splitext(name)[0] for name in test_split[item]}

        print(f"\nProcessing Item: {item} (Allowed auctions: {len(allowed_ids)})")
        label_dir = os.path.join(GDINO_POSTPROCESSED_LBL, item)
        img_dir = os.path.join(PREPROCESSED, item)

        if not os.path.exists(label_dir):
            print(f"[WARNING] Label directory missing: {label_dir}")
            continue
        if not os.path.exists(img_dir):
            print(f"[WARNING] Image directory missing: {img_dir}")
            continue

        for label_file in os.listdir(label_dir):
            if not label_file.endswith(".json"):
                continue

            auction_id = os.path.splitext(label_file)[0]
            if auction_id not in allowed_ids:
                continue

            # First existing "<auction><ext>" wins, in the order listed above.
            image_path = next(
                (
                    path
                    for path in (os.path.join(img_dir, f"{auction_id}{ext}") for ext in known_extensions)
                    if os.path.exists(path)
                ),
                None,
            )
            if image_path is None:
                print(f"[WARNING] Missing original image for auction {auction_id}")
                continue

            auction_mask_data = extract_and_upload_crops_from_coco(
                image_path=image_path,
                coco_json_path=os.path.join(label_dir, label_file),
                item_id=item,
                auction_id=auction_id
            )

            # One JSON per auction, grouped in a folder per item.
            auction_output_dir = os.path.join(LENS, item)
            os.makedirs(auction_output_dir, exist_ok=True)
            json_path = os.path.join(auction_output_dir, f"{auction_id}.json")

            with open(json_path, "w") as out_file:
                json.dump(auction_mask_data, out_file, indent=4)

            print(f"[DONE] JSON saved: {json_path}")


# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# this presumably runs (and uploads) every time the cell is executed.
if __name__ == "__main__":
    process_items()
    print("\n✅ Filtered auctions processed and uploaded to Firebase!")



Processing Item: 1 (Allowed auctions: 60)
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/w1190871465.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/q1177627244.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/t1144496280.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/x1121603563.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/l1180181252.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/r1140105858.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/n1181963058.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/w1191415731.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/l1190559645.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/c1174455551.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/c1186222082.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/1190947766.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/1/q1185787171.json
[DONE] JSON saved: ../zm_scraper/li

[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/3/d1181561615.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/3/b1187441783.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/3/l1180842849.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/3/d1190904638.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/3/m1190952251.json

Processing Item: 4 (Allowed auctions: 36)
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/4/n1191790097.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/4/m1191610582.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/4/t1175705831.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/4/p1191561874.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/4/f1191683408.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/4/j1191552346.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/4/f1191723507.json
[DONE] JSON saved: ../zm_scraper/listing/gdino/lens/4/t1191616766.json
[DONE] JSON saved: ../zm_scraper/l

## Manual: Crop annotated boxes and upload them to Firebase Storage

In [32]:
# obtain crops and their public urls (bboxes are read from the dataset-wide COCO JSON)
import os
import numpy as np
import cv2
import pandas as pd
import json
import firebase_admin
import math
from firebase_admin import credentials, storage

# Make sure the output root for the per-auction JSON files exists.
os.makedirs(LENS, exist_ok=True)

# Initialize Firebase only if not already initialized
# (firebase_admin._apps is empty until initialize_app has been called, so
# re-running this cell in the same kernel does not initialize twice).
if not firebase_admin._apps:
    cred = credentials.Certificate(FIREBASE_CRED_PATH)
    firebase_admin.initialize_app(cred, {
        'storageBucket': BUCKET_NAME
    })
# Module-level bucket handle used by upload_image_to_firebase.
bucket = storage.bucket()

# ────────────── Load Dataset Split ──────────────
with open(ANNOTATION_SPLIT, "r") as f:
    split_data = json.load(f)

# Item id -> list of allowed auction file names; empty if the file has no "test" key.
test_split = split_data.get("test", {})

# ────────────── HELPERS ──────────────
def clamp_bbox_xyxy_exclusive(x1, y1, x2, y2, width, height):
    """
    Clamp an xyxy box whose x2/y2 are end-exclusive to the image extent.

    Values are truncated with int(). Start coordinates end up in
    [0, extent - 1]; stop coordinates are capped at the extent and kept at
    least one past the start, so slicing with the result is never empty.
    """
    def _start(value, extent):
        # Lowest valid index is 0, highest is the last pixel.
        return max(0, min(int(value), max(0, extent - 1)))

    def _stop(value, start, extent):
        # End-exclusive: may equal the extent, must exceed the start.
        return max(start + 1, min(int(value), extent))

    left = _start(x1, width)
    top = _start(y1, height)
    return left, top, _stop(x2, left, width), _stop(y2, top, height)


def load_bboxes_from_coco_json(coco_path, image_shape, expected_filename=None):
    """
    Collect the boxes of one image from a COCO JSON file.

    Image selection:
      - If `expected_filename` is given, the first "images" entry whose
        file_name equals it (exactly, or after dropping the extension on both
        sides) supplies the image id.
      - Otherwise, or if that yields no id, the first "images" entry is used.
      - With no usable image id the result is an empty list.

    Box conversion:
      COCO boxes are [x, y, w, h] floats. They become
        x1 = floor(x), y1 = floor(y), x2 = ceil(x + w), y2 = ceil(y + h)
      and are then clamped to the image, with (x2, y2) end-exclusive.

    Returns a list of (region_id, (x1, y1, x2, y2)) tuples, where region_id
    is str(ann["id"]) or a 1-based running count when "id" is absent.
    """
    with open(coco_path, "r") as handle:
        payload = json.load(handle)

    img_h, img_w = image_shape[:2]
    image_entries = payload.get("images", [])
    annotation_entries = payload.get("annotations", [])

    target_id = None
    if expected_filename:
        matched = next(
            (
                entry
                for entry in image_entries
                if entry.get("file_name") == expected_filename
                or os.path.splitext(entry.get("file_name", ""))[0] == os.path.splitext(expected_filename)[0]
            ),
            None,
        )
        if matched is not None:
            target_id = matched.get("id")

    # Per-image files carry a single entry, so the first one is the fallback.
    if target_id is None and image_entries:
        target_id = image_entries[0].get("id")

    if target_id is None:
        return []

    boxes = []
    for ann in annotation_entries:
        raw_box = ann.get("bbox")
        if ann.get("image_id") != target_id or not raw_box or len(raw_box) != 4:
            continue

        x, y, w, h = (float(value) for value in raw_box)
        clamped = clamp_bbox_xyxy_exclusive(
            math.floor(x),
            math.floor(y),
            math.ceil(x + w),
            math.ceil(y + h),
            img_w,
            img_h,
        )
        boxes.append((str(ann.get("id", len(boxes) + 1)), clamped))

    return boxes

# ────────────── FUNCTIONS ──────────────
def upload_image_to_firebase(image_array, remote_name):
    """Upload a NumPy image as an in-memory PNG and return its public URL.

    ``remote_name`` is the object name inside the bucket. The blob is made
    public before its URL is returned. Raises ValueError if the PNG encoding
    step fails.
    """
    encoded, payload = cv2.imencode('.png', image_array)
    if encoded:
        destination = bucket.blob(remote_name)
        destination.upload_from_string(payload.tobytes(), content_type='image/png')
        destination.make_public()
        return destination.public_url

    raise ValueError("Failed to encode image to PNG.")

def extract_and_upload_crops_from_coco(image_path, coco_json_path, item_id, auction_id):
    """Upload one crop per COCO box of an image and return their metadata.

    The image is read unchanged (alpha dropped if present), its boxes are
    looked up in ``coco_json_path`` by base file name, and each non-empty
    crop is uploaded under "{MODEL}/{item_id}/{auction_id}/mask_{id}.png".
    The returned dict maps region id to url, bbox (end-exclusive xyxy) and
    empty "results"/"tokens" lists. An unreadable image yields ``{}``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    if image is None:
        print(f"[ERROR] Image not found: {image_path}")
        return {}

    if image.ndim == 3:
        if image.shape[2] == 4:
            # Keep the first three channels, discard alpha.
            image = image[:, :, :3]

    file_name = os.path.basename(image_path)
    boxes = load_bboxes_from_coco_json(coco_json_path, image.shape, expected_filename=file_name)

    crops = {}
    for region_id, box in boxes:
        bx1, by1, bx2, by2 = box
        region = image[by1:by2, bx1:bx2]
        if region.size == 0:
            continue

        object_name = f"{MODEL}/{item_id}/{auction_id}/mask_{region_id}.png"
        public_url = upload_image_to_firebase(region, object_name)

        crops[str(region_id)] = {
            "url": public_url,
            "bbox": [int(bx1), int(by1), int(bx2), int(by2)],
            "results": [],
            "tokens": []
        }

    return crops


def process_items():
    """Upload crops of manually annotated boxes for every test-split auction.

    The dataset-wide COCO file MANUAL_POSTPROCESSED_LBL is loaded once. For
    each item id in TEXT_PROMPT_CSV that appears in ``test_split``, the COCO
    images whose ``file_name`` starts with "<item>/" and whose auction id is
    in the split are read from RAW, their boxes are cropped and uploaded, and
    one JSON per auction is written to LENS/<item>/<auction>.json.

    Region ids are the 1-based position of the annotation within its image,
    not the COCO annotation id.
    """
    # Load item ids from the prompt CSV (used to locate per-item folders)
    df = pd.read_csv(TEXT_PROMPT_CSV)
    item_ids = df['id'].astype(str).tolist()

    # Load the single dataset-wide COCO file once
    with open(MANUAL_POSTPROCESSED_LBL, "r") as f:
        coco = json.load(f)

    images = coco.get("images", [])
    annotations = coco.get("annotations", [])

    # Index annotations by image_id for fast lookup
    anns_by_image = {}
    for ann in annotations:
        anns_by_image.setdefault(ann.get("image_id"), []).append(ann)

    for item in item_ids:
        if item not in test_split:
            print(f"[SKIP] Item {item} not in test split.")
            continue

        allowed_files = test_split[item]  # e.g., ["1183014126.jpg", "1187040966.jpg"]
        allowed_ids = set(os.path.splitext(fname)[0] for fname in allowed_files)

        print(f"\nProcessing Item: {item} (Allowed auctions: {len(allowed_ids)})")
        img_dir = os.path.join(RAW, item)

        if not os.path.exists(img_dir):
            print(f"[WARNING] Image directory missing: {img_dir}")
            continue

        # Find all COCO images that belong to this item and are in the allowed split
        prefix = f"{item}/"
        candidate_imgs = []
        for img in images:
            file_name = img.get("file_name", "")
            if not file_name:
                continue
            if not file_name.startswith(prefix):
                continue
            auction_id = os.path.splitext(os.path.basename(file_name))[0]
            if auction_id in allowed_ids:
                candidate_imgs.append(img)

        if not candidate_imgs:
            print(f"[INFO] No matching COCO images for item {item} and given split.")
            continue

        for img in candidate_imgs:
            image_id = img.get("id")
            file_name = img.get("file_name")  # e.g., "1/1183014126.jpg"
            auction_id = os.path.splitext(os.path.basename(file_name))[0]

            # Resolve the image path under RAW: RAW/<item>/<auction>.<ext>
            image_path = os.path.join(RAW, file_name)
            if not os.path.exists(image_path):
                # The COCO file name did not resolve; try other common extensions.
                base_no_ext = os.path.join(RAW, item, auction_id)
                tried = []
                for ext in [".png", ".jpg", ".jpeg", ".JPG", ".PNG", ".JPEG"]:
                    cand = base_no_ext + ext
                    tried.append(cand)
                    if os.path.exists(cand):
                        image_path = cand
                        break
                if not os.path.exists(image_path):
                    print(f"[WARNING] Missing original image for auction {auction_id}. Tried: {tried}")
                    continue

            # Load image (for cropping)
            image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
            if image is None:
                print(f"[ERROR] Failed to read image: {image_path}")
                continue
            if image.ndim == 3 and image.shape[2] == 4:  # drop alpha if present
                image = image[:, :, :3]

            H, W = image.shape[:2]

            # Collect and convert this image's bboxes from COCO
            id_bbox_list = []
            # IMPORTANT: region ids are the 1-based position within this image
            # (not ann["id"]). An annotation with an invalid bbox is skipped
            # but still consumes its position, so ids can have gaps.
            for idx, ann in enumerate(anns_by_image.get(image_id, []), start=1):
                bbox = ann.get("bbox")
                if not bbox or len(bbox) != 4:
                    continue
                x, y, w, h = map(float, bbox)
                # xyxy (end-exclusive)
                x1 = math.floor(x)
                y1 = math.floor(y)
                x2 = math.ceil(x + w)
                y2 = math.ceil(y + h)
                # clamp
                x1, y1, x2, y2 = clamp_bbox_xyxy_exclusive(x1, y1, x2, y2, W, H)
                region_id = str(idx)
                id_bbox_list.append((region_id, (x1, y1, x2, y2)))

            # Crop, upload, and build JSON
            mask_data = {}
            for region_id, (x1, y1, x2, y2) in id_bbox_list:
                crop = image[y1:y2, x1:x2]
                if crop.size == 0:
                    continue
                firebase_crop_name = f"{MODEL}/{item}/{auction_id}/mask_{region_id}.png"
                url = upload_image_to_firebase(crop, firebase_crop_name)
                mask_data[str(region_id)] = {
                    "url": url,
                    "bbox": [int(x1), int(y1), int(x2), int(y2)],
                    "results": [],
                    "tokens": []
                }

            # Save JSON for this auction
            auction_output_dir = os.path.join(LENS, item)
            os.makedirs(auction_output_dir, exist_ok=True)
            json_path = os.path.join(auction_output_dir, f"{auction_id}.json")
            with open(json_path, "w") as f:
                json.dump(mask_data, f, indent=4)

            print(f"[DONE] {item}/{auction_id} -> {json_path}")


# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# this presumably runs (and uploads) every time the cell is executed.
if __name__ == "__main__":
    process_items()
    print("\n✅ Filtered auctions processed and uploaded to Firebase!")



Processing Item: 1 (Allowed auctions: 60)
[DONE] 1/1183014126 -> ../zm_scraper/listing/manual/lens/1/1183014126.json
[DONE] 1/1187040966 -> ../zm_scraper/listing/manual/lens/1/1187040966.json
[DONE] 1/1188971238 -> ../zm_scraper/listing/manual/lens/1/1188971238.json
[DONE] 1/1190947766 -> ../zm_scraper/listing/manual/lens/1/1190947766.json
[DONE] 1/b1187681880 -> ../zm_scraper/listing/manual/lens/1/b1187681880.json
[DONE] 1/b1191454294 -> ../zm_scraper/listing/manual/lens/1/b1191454294.json
[DONE] 1/c1174455551 -> ../zm_scraper/listing/manual/lens/1/c1174455551.json
[DONE] 1/c1186222082 -> ../zm_scraper/listing/manual/lens/1/c1186222082.json
[DONE] 1/c1191011512 -> ../zm_scraper/listing/manual/lens/1/c1191011512.json
[DONE] 1/c1191323690 -> ../zm_scraper/listing/manual/lens/1/c1191323690.json
[DONE] 1/d1187227651 -> ../zm_scraper/listing/manual/lens/1/d1187227651.json
[DONE] 1/d1191065516 -> ../zm_scraper/listing/manual/lens/1/d1191065516.json
[DONE] 1/f1152247645 -> ../zm_scraper/lis

[DONE] 3/m1190952251 -> ../zm_scraper/listing/manual/lens/3/m1190952251.json
[DONE] 3/m1190985464 -> ../zm_scraper/listing/manual/lens/3/m1190985464.json
[DONE] 3/n1173190176 -> ../zm_scraper/listing/manual/lens/3/n1173190176.json
[DONE] 3/n1191554586 -> ../zm_scraper/listing/manual/lens/3/n1191554586.json
[DONE] 3/n1191611788 -> ../zm_scraper/listing/manual/lens/3/n1191611788.json
[DONE] 3/o1186960589 -> ../zm_scraper/listing/manual/lens/3/o1186960589.json
[DONE] 3/p1171340896 -> ../zm_scraper/listing/manual/lens/3/p1171340896.json
[DONE] 3/p1188443975 -> ../zm_scraper/listing/manual/lens/3/p1188443975.json
[DONE] 3/q1191056336 -> ../zm_scraper/listing/manual/lens/3/q1191056336.json
[DONE] 3/q1191566819 -> ../zm_scraper/listing/manual/lens/3/q1191566819.json
[DONE] 3/s1167344625 -> ../zm_scraper/listing/manual/lens/3/s1167344625.json
[DONE] 3/t1191103807 -> ../zm_scraper/listing/manual/lens/3/t1191103807.json
[DONE] 3/v1187033255 -> ../zm_scraper/listing/manual/lens/3/v1187033255.json

## Call ScrapingDog (Lens API)

In [5]:
# For each per-auction JSON, read every mask's public URL and send it to ScrapingDog;
# save the response into that mask's "results" array.
import os
import json
import pandas as pd
import requests
from dotenv import load_dotenv

# Load variables from a .env file into the environment so the API key below
# can be read with os.getenv (keeps the key out of the notebook).
load_dotenv()

# ScrapingDog API settings (SCRAPINGDOG_API_KEY is None if the variable is not set)
SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")
SCRAPINGDOG_ENDPOINT = "https://api.scrapingdog.com/google_lens"

def call_scrapingdog(image_url):
    """Query the Scrapingdog Google Lens endpoint for one image URL.

    Returns the decoded JSON body on HTTP 200. Any other status code, or any
    exception raised while requesting or decoding, is printed and results in
    ``None``.
    """
    query = {
        "api_key": SCRAPINGDOG_API_KEY,
        "url": f"https://lens.google.com/uploadbyurl?url={image_url}",
    }
    try:
        reply = requests.get(SCRAPINGDOG_ENDPOINT, params=query, timeout=60)
        if reply.status_code != 200:
            print(f"[ERROR] Scrapingdog failed for {image_url} - Status: {reply.status_code}")
            return None
        return reply.json()
    except Exception as e:
        # Best effort: one failed lookup must not stop the whole batch.
        print(f"[EXCEPTION] Error calling Scrapingdog: {e}")
        return None

def _extract_lens_results(result):
    """Return the Lens matches from a Scrapingdog response, or None if it has none to read."""
    if not isinstance(result, dict):
        return None
    lens_results = result.get("lens_results")
    if lens_results is None:
        return None
    # A one-element list holding this message is treated as "no matches".
    if lens_results == ["Google Lens didn't return any results."]:
        return []
    return lens_results


def process_jsons():
    """Query Scrapingdog for every mask URL and store the matches in the JSON files.

    Item IDs are read from the ``id`` column of ``TEXT_PROMPT_CSV``. For each
    item, every ``*.json`` file under ``LENS/<item_id>`` is loaded; each entry
    that has a ``"url"`` is sent to ``call_scrapingdog`` and the returned
    ``lens_results`` are written to the entry's ``"results"`` key. Entries that
    already have non-empty results are skipped while ``OVERWRITE_RESULTS`` is
    ``False``. A file is rewritten only if at least one entry changed.

    Returns:
        None. Progress and failures are reported via ``print``.
    """
    # Read item IDs from CSV
    df = pd.read_csv(TEXT_PROMPT_CSV)
    item_ids = df['id'].astype(str).tolist()

    for item_id in item_ids:
        item_folder = os.path.join(LENS, item_id)
        if not os.path.exists(item_folder):
            print(f"[SKIP] No JSON folder for item: {item_id}")
            continue

        # Iterate over JSON files for this item
        for json_file in os.listdir(item_folder):
            if not json_file.endswith(".json"):
                continue

            json_path = os.path.join(item_folder, json_file)
            print(f"\n[PROCESSING] {json_path}")

            # Explicit UTF-8: the file is written with ensure_ascii=False, so
            # relying on the platform default encoding can fail on non-ASCII text.
            with open(json_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            updated = False
            try:
                for mask_id, mask_info in data.items():
                    image_url = mask_info.get("url")
                    if not image_url:
                        continue

                    # Skip if results already exist
                    if OVERWRITE_RESULTS is False and mask_info.get("results"):
                        print(f"  [SKIP] Mask {mask_id} already has results.")
                        continue

                    # Call scraping dog here
                    print(f"  [CALLING] Lens for mask {mask_id}")
                    result = call_scrapingdog(image_url)
                    if not result:
                        print("FAILED")
                        continue

                    lens_results = _extract_lens_results(result)
                    if lens_results is None:
                        # A 200 response without 'lens_results' must not abort the
                        # whole run; leave this mask untouched so it can be retried.
                        print(f"  [ERROR] Mask {mask_id}: response has no 'lens_results'.")
                        continue

                    mask_info["results"] = lens_results
                    updated = True
            finally:
                # Persist whatever was collected even if the loop is interrupted,
                # so results already fetched for this file are not lost.
                if updated:
                    with open(json_path, "w", encoding="utf-8") as f:
                        json.dump(data, f, indent=4, ensure_ascii=False)
                    print(f"  [UPDATED] {json_path}")

# Entry point: run the Lens lookup over every item's JSON files.
# NOTE: the success message below is printed unconditionally; failed
# Scrapingdog calls are only reported by the log lines printed during the run.
if __name__ == "__main__":
    process_jsons()
    print("\n✅ All JSON files updated with Scrapingdog results!")



[PROCESSING] ../zm_scraper/listing/gdino/lens/1/w1190871465.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [CALLING] Lens for mask 4
  [CALLING] Lens for mask 5
  [CALLING] Lens for mask 6
  [CALLING] Lens for mask 7
  [CALLING] Lens for mask 8
  [CALLING] Lens for mask 9
  [CALLING] Lens for mask 10
  [CALLING] Lens for mask 11
  [CALLING] Lens for mask 12
  [UPDATED] ../zm_scraper/listing/gdino/lens/1/w1190871465.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/1/q1177627244.json
  [CALLING] Lens for mask 1
  [UPDATED] ../zm_scraper/listing/gdino/lens/1/q1177627244.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/1/t1144496280.json
  [CALLING] Lens for mask 1
  [UPDATED] ../zm_scraper/listing/gdino/lens/1/t1144496280.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/1/x1121603563.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [CALLING] Lens for mask 4
  [UPDATED] ../zm_scraper/listing/gdi

  [CALLING] Lens for mask 2
  [UPDATED] ../zm_scraper/listing/gdino/lens/1/b1187681880.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/1/o1191069257.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [UPDATED] ../zm_scraper/listing/gdino/lens/1/o1191069257.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/1/x1191588773.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [CALLING] Lens for mask 4
  [CALLING] Lens for mask 5
  [CALLING] Lens for mask 6
  [CALLING] Lens for mask 7
  [CALLING] Lens for mask 8
  [CALLING] Lens for mask 9
  [CALLING] Lens for mask 10
  [CALLING] Lens for mask 11
  [CALLING] Lens for mask 12
  [CALLING] Lens for mask 13
  [CALLING] Lens for mask 14
  [CALLING] Lens for mask 15
  [CALLING] Lens for mask 16
  [CALLING] Lens for mask 17
  [CALLING] Lens for mask 18
  [CALLING] Lens for mask 19
  [CALLING] Lens for mask 20
  [CALLING] Lens for mask 21
  [CALLING] Lens for mask 22

  [CALLING] Lens for mask 10
  [UPDATED] ../zm_scraper/listing/gdino/lens/2/e1191332130.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/2/b1174068183.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [UPDATED] ../zm_scraper/listing/gdino/lens/2/b1174068183.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/2/k1184414767.json
  [CALLING] Lens for mask 1
  [UPDATED] ../zm_scraper/listing/gdino/lens/2/k1184414767.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/2/c1187483831.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [CALLING] Lens for mask 4
  [CALLING] Lens for mask 5
  [UPDATED] ../zm_scraper/listing/gdino/lens/2/c1187483831.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/2/m1175679082.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [CALLING] Lens for mask 4
  [UPDATED] ../zm_scraper/listing/gdino/lens/2/m1175679082.json

[PROCESSING] ../zm_scr

  [CALLING] Lens for mask 15
  [CALLING] Lens for mask 16
  [CALLING] Lens for mask 17
  [CALLING] Lens for mask 18
  [CALLING] Lens for mask 19
  [CALLING] Lens for mask 20
  [CALLING] Lens for mask 21
  [CALLING] Lens for mask 22
  [UPDATED] ../zm_scraper/listing/gdino/lens/3/e1191338151.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/3/q1191056336.json
  [CALLING] Lens for mask 1
  [UPDATED] ../zm_scraper/listing/gdino/lens/3/q1191056336.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/3/m1148753453.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [UPDATED] ../zm_scraper/listing/gdino/lens/3/m1148753453.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/3/n1191611788.json
  [CALLING] Lens for mask 1
  [UPDATED] ../zm_scraper/listing/gdino/lens/3/n1191611788.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/3/l1191174227.json
  [CALLING] Lens for mask 1
  [UPDATED] ../zm_scraper/listing/gdino/lens/3/l1191174227.json

[PROCESSING] ../zm_scraper/listing/gdino/le

  [CALLING] Lens for mask 7
  [CALLING] Lens for mask 8
  [CALLING] Lens for mask 9
  [UPDATED] ../zm_scraper/listing/gdino/lens/4/k1191592318.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/4/s1191626724.json
  [CALLING] Lens for mask 1
  [UPDATED] ../zm_scraper/listing/gdino/lens/4/s1191626724.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/4/j1132611060.json
  [CALLING] Lens for mask 1
  [UPDATED] ../zm_scraper/listing/gdino/lens/4/j1132611060.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/4/v1190982790.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [CALLING] Lens for mask 4
  [UPDATED] ../zm_scraper/listing/gdino/lens/4/v1190982790.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/4/x1191763053.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [CALLING] Lens for mask 4
  [CALLING] Lens for mask 5
  [CALLING] Lens for mask 6
  [CALLING] Lens for mask 7
  [CALLING] Lens for mask 8
  [C

  [UPDATED] ../zm_scraper/listing/gdino/lens/5/l1174471422.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/5/m1165318477.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [UPDATED] ../zm_scraper/listing/gdino/lens/5/m1165318477.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/5/o1191160640.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [CALLING] Lens for mask 4
  [UPDATED] ../zm_scraper/listing/gdino/lens/5/o1191160640.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/5/t1139631929.json
  [CALLING] Lens for mask 1
  [UPDATED] ../zm_scraper/listing/gdino/lens/5/t1139631929.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/5/n1191341403.json
  [CALLING] Lens for mask 1
  [CALLING] Lens for mask 2
  [CALLING] Lens for mask 3
  [UPDATED] ../zm_scraper/listing/gdino/lens/5/n1191341403.json

[PROCESSING] ../zm_scraper/listing/gdino/lens/5/e1178664397.json
  [CALLING] Lens for mask 1
  [CALLING] Le

## Test scripts

In [None]:
# Scrapingdog
import requests
# Manual smoke test: send one known crop to Scrapingdog and print the Lens matches.
# Relies on SCRAPINGDOG_API_KEY having been defined by the Scrapingdog cell above.
url = "https://api.scrapingdog.com/google_lens"
image = "https://storage.googleapis.com/strustore-dev.firebasestorage.app/crops/1/x1121603563/crop_6.png"
params = {
    "api_key": SCRAPINGDOG_API_KEY,
    "url": f"https://lens.google.com/uploadbyurl?url={image}",
}

# requests has no default timeout; without one a stalled connection would
# block the cell indefinitely. 60 s matches call_scrapingdog.
response = requests.get(url, params=params, timeout=60)

if response.status_code == 200:
    data = response.json()
    print(data['lens_results'])
else:
    print(f"Request failed with status code: {response.status_code}")