# Upload, check and update images on Supabase storage

In [9]:
import csv
import requests
from pathlib import Path
from collections import defaultdict
import time
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import os
from supabase import create_client
from httpx import RemoteProtocolError, ReadTimeout, WriteError
from dotenv import load_dotenv

## Initial image upload (verify local files and public URLs)

In [7]:
# CONFIG
# Constants shared by the checking cells below.

# Public URL prefix of the storage bucket; an image's relative path is
# appended to it to form the URL that is requested.
SUPABASE_PUBLIC_BASE = (
    "https://utwhgfveotpusdjopcnl.supabase.co"
    "/storage/v1/object/public/prolific_images/"
)

# Local directory that the relative image paths are resolved against.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
LOCAL_IMAGE_ROOT = Path(
    "/mnt/lustre/work/eickhoff/esx061/color-concept-entanglement/data"
)

# Local directory containing the stimulus tables listed in CSV_FILES.
LOCAL_TABLE_ROOT = Path(
    "/mnt/lustre/work/eickhoff/esx061/color-concept-entanglement/data/prolific_stimuli"
)

# Stimulus tables to scan; each is read for its "image_path" column.
CSV_FILES = [
    "stimulus_table_image_priors.csv",
    "stimulus_table_shapes.csv",
    "stimulus_table_counterfact.csv",
]

# Passed as `timeout=` to requests.get (seconds).
TIMEOUT = 10
# Pause passed to time.sleep between consecutive URL checks (seconds).
SLEEP_BETWEEN_REQUESTS = 0.05

In [3]:
# COLLECT IMAGE PATHS
#
# Reads every stimulus table in CSV_FILES and gathers the values of its
# "image_path" column.
#   image_paths : set of whitespace-stripped image paths (deduplicated)
#   csv_sources : image path -> list of the CSV files that reference it,
#                 each file listed at most once

image_paths = set()
csv_sources = defaultdict(list)

for csv_file in CSV_FILES:
    path = LOCAL_TABLE_ROOT / csv_file
    if not path.exists():
        # A missing table is reported but does not abort the collection.
        print(f" CSV not found: {csv_file}")
        continue

    with open(path, newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            img = row["image_path"].strip()
            image_paths.add(img)
            # An image can occur in many rows of the same table; record the
            # table only once so the later report stays readable.
            if csv_file not in csv_sources[img]:
                csv_sources[img].append(csv_file)

print(f"\nFound {len(image_paths)} unique image paths\n")


Found 16614 unique image paths



In [None]:
# CHECK IMAGES
# For every collected path: confirm the file exists locally, then confirm the
# public URL answers with HTTP 200 and an image content type.

missing_local = []
http_errors = []
ok_images = []

for i, rel_path in enumerate(sorted(image_paths), 1):
    print(f"[{i}/{len(image_paths)}] Checking {rel_path}")

    # Local existence comes first; a missing file skips the URL check
    # (and the pause) entirely.
    if not (LOCAL_IMAGE_ROOT / rel_path).exists():
        missing_local.append(rel_path)
        print("   Missing locally")
        continue

    # Public URL
    try:
        r = requests.get(SUPABASE_PUBLIC_BASE + rel_path, timeout=TIMEOUT)
    except requests.RequestException as e:
        http_errors.append((rel_path, str(e)))
        print(f"   Request error: {e}")
    else:
        ct = r.headers.get("Content-Type", "")
        if r.status_code != 200:
            http_errors.append((rel_path, r.status_code))
            print(f"   HTTP {r.status_code}")
        elif not ct.startswith("image"):
            http_errors.append((rel_path, f"bad content-type: {ct}"))
            print(f"   Bad content-type: {ct}")
        else:
            ok_images.append(rel_path)
            print("   OK")

    # Pause between requests.
    time.sleep(SLEEP_BETWEEN_REQUESTS)

# SUMMARY
# Print the totals of the check above, followed by the individual problems.

banner = "=" * 60

print("\n" + banner)
print("SUMMARY")
print(banner)

print(f"Total images checked: {len(image_paths)}")
print(f"OK: {len(ok_images)}")
print(f"Missing locally: {len(missing_local)}")
print(f"HTTP / access errors: {len(http_errors)}")

all_clear = not (missing_local or http_errors)

if missing_local:
    print("\n MISSING LOCALLY:")
    for p in missing_local:
        print(f"  - {p} (referenced in {csv_sources[p]})")

if http_errors:
    print("\n PUBLIC ACCESS ERRORS:")
    for p, err in http_errors:
        print(f"  - {p}: {err} (from {csv_sources[p]})")

if all_clear:
    print("\n All images are present and publicly accessible.")
    print("Likely cause of failures: transient network / CDN issues.")

print(banner)


## Check images on Supabase storage

In [17]:
# Check if all images listed in the stimulus tables can be loaded and decoded from Supabase
#
# Rebuilds `image_paths` (a set of strings) from the "image_path" column of
# every table in CSV_FILES. Values are stripped of surrounding whitespace,
# matching the collection step of the initial check, so both sections build
# identical URLs for the same row.
image_paths = set()

for csv_file in CSV_FILES:
    with open(LOCAL_TABLE_ROOT / csv_file, newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            image_paths.add(row["image_path"].strip())

print(f"Checking {len(image_paths)} images...\n")


# CHECK IMAGES
# Download every image from its public URL and make sure it really decodes.
# `errors` collects (path, reason) tuples for every image that fails a step.
errors = []

# A single Session reuses the underlying HTTPS connection across the many
# requests instead of opening a new one for every image.
with requests.Session() as session:
    for path in tqdm(sorted(image_paths)):
        url = SUPABASE_PUBLIC_BASE + path

        try:
            r = session.get(url, timeout=TIMEOUT)
            if r.status_code != 200:
                errors.append((path, f"HTTP {r.status_code}"))
                continue

            # Bodies this short are treated as broken without decoding them.
            if len(r.content) < 100:
                errors.append((path, "File too small"))
                continue

            # verify() checks file integrity but does not decode the pixel
            # data, and the image object cannot be used afterwards.
            img = Image.open(BytesIO(r.content))
            img.verify()

            # Reopen and fully decode so truncated or corrupt pixel data is
            # caught as well.
            img = Image.open(BytesIO(r.content))
            img.load()

        except Exception as e:
            # Best effort: any network or decoding failure is recorded and the
            # scan continues with the next image.
            errors.append((path, str(e)))


# REPORT
# Print the outcome; when there are failures, also save them as tab-separated
# "path<TAB>reason" lines in corrupt_images.txt.
print("\n--- RESULT ---")
if errors:
    print(f"{len(errors)} problematic images:\n")
    for p, err in errors:
        print(f"{p} → {err}")

    with open("corrupt_images.txt", "w") as f:
        f.writelines(f"{p}\t{err}\n" for p, err in errors)

    print("\nSaved list to corrupt_images.txt")
else:
    print("All images loaded and decoded successfully.")


Checking 16614 images...



  0%|          | 0/16614 [00:00<?, ?it/s]

100%|██████████| 16614/16614 [29:41<00:00,  9.32it/s] 


--- RESULT ---
All images loaded and decoded successfully.





## Update Supabase (adjust as necessary)

In [10]:
# CONFIG
# Must run before SUPABASE_KEY is read below.
# presumably loads variables from a local .env file into os.environ — confirm
# against the python-dotenv documentation.
load_dotenv()

# Project URL passed to create_client().
SUPABASE_URL = "https://utwhgfveotpusdjopcnl.supabase.co/"
# Key read from the environment; raises KeyError if SUPABASE_SERVICE_ROLE is
# not set. Never hardcode this value in the notebook.
SUPABASE_KEY = os.environ["SUPABASE_SERVICE_ROLE"]
# Storage bucket whose objects are overwritten by the upload cell.
BUCKET_NAME = "prolific_images"

In [11]:
# Objects to update, adjust as needed
# Names are written in their human-readable form; collect_object_folders()
# lower-cases them and replaces spaces with underscores before comparing them
# with folder names.

# Objects looked up under IMAGE_PRIORS_ROOT.
IMAGE_PRIORS_OBJECTS = [
    "Rottweiler",
    "Sealyham terrier",
    "curly-coated retriever",
    "dalmatian",
    "espresso maker",
    "flat-coated retriever",
    "flute",
    "radio",
    "screw",
    "strainer",
    "typewriter",
    "van",
    "waffle iron",
]

# Objects looked up under COUNTERFACT_ROOT.
COUNTERFACT_OBJECTS = [
    "Band Aid",
    "French horn",
    "Pomeranian",
    "car wheel",
    "faucet",
    "fridge",
    "limousine",
    "padlock",
    "plastic bag",
    "saxophone",
    "thimble",
    "truck",
    "trumpet",
    "wagon",
]

In [12]:
# Paths
# Directories (below LOCAL_IMAGE_ROOT) that are searched for object folders.
IMAGE_PRIORS_ROOT = LOCAL_IMAGE_ROOT / "color_images/gpt-4o/image_priors"
COUNTERFACT_ROOT = LOCAL_IMAGE_ROOT / "color_images/gpt-4o/counterfact"

# Upload attempts per image before the upload loop gives up on it.
MAX_RETRIES = 6
# Pause after each image (seconds, passed to time.sleep).
SLEEP_BETWEEN_UPLOADS = 0.15
# Pause after a RemoteProtocolError / ReadTimeout before retrying (seconds).
RETRY_SLEEP = 3.0
# Pause after a WriteError, once the client has been recreated (seconds).
WRITE_ERROR_SLEEP = 5.0


# SUPABASE CLIENT
def make_supabase():
    """Create and return a new Supabase client from SUPABASE_URL and SUPABASE_KEY."""
    client = create_client(SUPABASE_URL, SUPABASE_KEY)
    return client


# Module-level client; the upload loop replaces it with a fresh one after a
# WriteError.
supabase = make_supabase()


# HELPERS
def normalize_object_name(name: str) -> str:
    """Return ``name`` lower-cased with spaces replaced by underscores."""
    return name.lower().replace(" ", "_")

def collect_object_folders(root: Path, object_names):
    """Return the sub-directories of ``root`` that belong to the given objects.

    A directory is selected when its lower-cased name is exactly a normalized
    object name or starts with that name followed by an underscore
    (e.g. ``screw_2_f5051efb_resized_grey`` for ``"screw"``). Requiring the
    underscore boundary keeps a short name such as ``"screw"`` from also
    selecting ``screwdriver_...`` folders.

    NOTE(review): objects that share a leading word (``"radio"`` vs. a
    ``radio_telescope_...`` folder) are still both selected — confirm whether
    such folders exist under ``root``.

    Args:
        root: Directory whose immediate children are inspected.
        object_names: Iterable of human-readable object names.

    Returns:
        List of matching directory paths, sorted so the result does not
        depend on the file system's listing order.

    Raises:
        OSError: Propagated from ``root.iterdir()``, e.g. when ``root`` does
            not exist.
    """
    wanted = {normalize_object_name(o) for o in object_names}

    folders = []
    for d in sorted(root.iterdir()):
        if not d.is_dir():
            continue

        folder_name = d.name.lower()
        if any(
            folder_name == obj or folder_name.startswith(obj + "_")
            for obj in wanted
        ):
            folders.append(d)

    return folders

In [None]:
# COLLECT FILES
# Gather the object folders from both roots, then every PNG inside them as a
# path relative to LOCAL_IMAGE_ROOT.
folders_to_update = [
    *collect_object_folders(IMAGE_PRIORS_ROOT, IMAGE_PRIORS_OBJECTS),
    *collect_object_folders(COUNTERFACT_ROOT, COUNTERFACT_OBJECTS),
]

if not folders_to_update:
    raise RuntimeError("No folders found to update — check object names.")

# PNGs are listed per folder in sorted order.
image_paths = [
    png.relative_to(LOCAL_IMAGE_ROOT)
    for folder in folders_to_update
    for png in sorted(folder.glob("*.png"))
]

print(f"Overwriting {len(image_paths)} images")
print("Folders:")
for folder in folders_to_update:
    print(" -", folder)


# UPLOAD (OVERWRITE)
# Overwrite each collected image in the bucket, retrying on transient network
# errors. Images that still fail after MAX_RETRIES attempts are collected in
# `failed_uploads` and listed at the end instead of being skipped silently.
failed_uploads = []

for i, rel_path in enumerate(image_paths, 1):
    local_path = LOCAL_IMAGE_ROOT / rel_path
    # Use forward slashes for the object key regardless of the local OS.
    object_key = Path(rel_path).as_posix()

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # The file is reopened on every attempt so a retry always sends
            # the content from the beginning.
            with open(local_path, "rb") as f:
                supabase.storage.from_(BUCKET_NAME).update(
                    path=object_key,
                    file=f,
                    file_options={"content-type": "image/png"},
                )

            print(f"[{i}/{len(image_paths)}] Updated {rel_path}")
            break

        except WriteError:
            # Recreate the client before the next attempt.
            print(f"WriteError ({attempt}/{MAX_RETRIES}) → resetting client")
            supabase = make_supabase()
            time.sleep(WRITE_ERROR_SLEEP)

        except (RemoteProtocolError, ReadTimeout):
            print(f"Network error ({attempt}/{MAX_RETRIES}) on {rel_path}")
            time.sleep(RETRY_SLEEP)

        # Any other exception propagates and stops the cell.

    else:
        # Reached only when no attempt hit `break`, i.e. every retry failed.
        failed_uploads.append(rel_path)
        print(
            f"[{i}/{len(image_paths)}] FAILED after {MAX_RETRIES} attempts: {rel_path}"
        )

    time.sleep(SLEEP_BETWEEN_UPLOADS)

print("Done.")

if failed_uploads:
    print(f"\n{len(failed_uploads)} of {len(image_paths)} images were NOT updated:")
    for rel_path in failed_uploads:
        print(" -", rel_path)

Overwriting 63 images
Folders:
 - /mnt/lustre/work/eickhoff/esx061/color-concept-entanglement/data/color_images/gpt-4o/image_priors/radio_3_4d87b11a_resized_red
 - /mnt/lustre/work/eickhoff/esx061/color-concept-entanglement/data/color_images/gpt-4o/image_priors/Rottweiler_4_131e8b4b_resized_brown
 - /mnt/lustre/work/eickhoff/esx061/color-concept-entanglement/data/color_images/gpt-4o/image_priors/typewriter_4_0a946944_resized_grey
 - /mnt/lustre/work/eickhoff/esx061/color-concept-entanglement/data/color_images/gpt-4o/image_priors/flat-coated_retriever_3_52e2a611_resized_brown
 - /mnt/lustre/work/eickhoff/esx061/color-concept-entanglement/data/color_images/gpt-4o/image_priors/espresso_maker_2_fd91d5b6_resized_red
 - /mnt/lustre/work/eickhoff/esx061/color-concept-entanglement/data/color_images/gpt-4o/image_priors/dalmatian_2_266c20b5_resized_grey
 - /mnt/lustre/work/eickhoff/esx061/color-concept-entanglement/data/color_images/gpt-4o/image_priors/screw_2_f5051efb_resized_grey
 - /mnt/lustr