# Following the creation of our databank, we will query and store the triplet image cutouts available from Lasair

In [1]:
# All imports
import os
from pathlib import Path
import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm
import lasair


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Lasair token
# SECURITY: the API token is a credential and must not live in the notebook.
# It is read from the LASAIR_TOKEN environment variable only; any token that
# was previously committed alongside this code should be revoked and replaced.
TOKEN = os.getenv("LASAIR_TOKEN")
if not TOKEN:
    # Fail early with an actionable message instead of sending unauthenticated
    # requests later in the (long-running) download loop.
    raise RuntimeError(
        "LASAIR_TOKEN is not set. Export your Lasair API token as the "
        "LASAIR_TOKEN environment variable before running this notebook."
    )

# Client handle used for the object queries further down.
L = lasair.lasair_client(TOKEN)

In [None]:
# Configure
# Input table and the name of the column holding the object identifiers.
ID_COLUMN = "objectId"
TRAIN_CSV = "training_data.csv"

# Output locations: one directory for the cutout files, one CSV manifest.
MANIFEST_CSV = "ztf_training_triplets_maxmag_manifest.csv"
CUTOUT_DIR = Path("ztf_training_triplets_maxmag")
CUTOUT_DIR.mkdir(parents=True, exist_ok=True)

# Read the training table and make sure the identifier column is present.
df_train = pd.read_csv(TRAIN_CSV)
id_column_present = ID_COLUMN in df_train.columns
if not id_column_present:
    raise KeyError(f"Column '{ID_COLUMN}' not found in {TRAIN_CSV}")

# Unique identifiers as plain strings, with missing values discarded first.
non_null_ids = df_train[ID_COLUMN].dropna()
unique_ids = non_null_ids.astype(str).unique()
object_ids = unique_ids.tolist()
total_objects = len(object_ids)
print(f"Found {total_objects} unique objectIds in {TRAIN_CSV}")


# Helper to try a candidate and download its triplet
def download_triplet_for_candidate(obj_id: str, cand: dict, out_dir: Path):
    """
    Download the Science/Template/Difference cutouts for a single candidate.

    Each cutout is stored in ``out_dir`` as
    ``<obj_id>_<candid>_<science|template|difference><ext>``, where ``ext`` is
    the suffix of the URL *path* (query string and fragment are ignored) or
    ``.fits`` when the path has no suffix. A non-empty file that already
    exists is reused without a new request.

    Args:
    obj_id: object identifier, used as the file-name prefix.
    cand: candidate dict; the keys read are "candid", "fid", "jd" and
        "image_urls" (a mapping with "Science", "Template", "Difference").
    out_dir: existing directory that receives the cutout files.

    Returns:
    (success, rows)

    success: bool, True if all three cutouts are available on disk.
    rows: list of manifest-row dicts for this candidate (0 or 3 entries).

    This function is quiet: HTTP/network and file-system errors are not
    printed; they make the candidate count as failed.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    img_urls = cand.get("image_urls") or {}
    if not img_urls:
        return False, []

    candid = cand.get("candid")
    fid = cand.get("fid")
    jd = cand.get("jd")

    manifest_rows = []
    for label in ("Science", "Template", "Difference"):
        url = img_urls.get(label)
        if not url:
            # missing one of the triplet images = treat as failure
            return False, []

        label_norm = label.lower()  # science/template/difference

        # Take the extension from the URL path only, so "?query" or
        # "#fragment" parts never end up inside the file name.
        ext = Path(urlsplit(url).path).suffix or ".fits"
        fname = out_dir / f"{obj_id}_{candid}_{label_norm}{ext}"

        # An empty file is the residue of an interrupted write, not a cutout.
        already_cached = fname.exists() and fname.stat().st_size > 0
        if not already_cached:
            # Write to a sibling temp file and rename it into place, so an
            # interrupted run cannot leave a truncated file under the final
            # name that a later run would mistake for a finished download.
            tmp_path = fname.with_name(fname.name + ".part")
            try:
                response = requests.get(url, timeout=60)
                response.raise_for_status()
                tmp_path.write_bytes(response.content)
                tmp_path.replace(fname)
            except (requests.RequestException, OSError):
                # e.g. 404 if cutout has been deleted, other HTTP/network
                # issue, or a disk error. Stay silent, clean up, and treat
                # this candidate as failed.
                try:
                    tmp_path.unlink()
                except OSError:
                    pass
                return False, []

        manifest_rows.append(
            {
                "objectId": obj_id,
                "candid": candid,
                "fid": fid,
                "jd": jd,
                "cutout_type": label_norm,  # science/template/difference
                "url": url,
                "file_path": str(fname),
            }
        )
    return True, manifest_rows


# Main loop: One brightest triplet per object

# isdiffpos values that mark a positive (science minus reference) subtraction.
# ZTF alerts encode this flag as "t" or "1"; the remaining entries cover the
# same flag after it has been parsed into a bool/int or re-serialised as text.
POSITIVE_DIFF_FLAGS = ("t", "T", "1", True, "True", 1)


def _positive_candidates_brightest_first(candidates):
    """Return the usable candidates of one object, brightest first.

    A candidate is usable when it has a "candid", a non-empty "image_urls"
    mapping, a finite numeric "magpsf" and a positive-subtraction "isdiffpos".
    The result is ordered by ascending magpsf (smaller magnitude = brighter);
    candidates with equal magpsf keep their input order.
    """
    usable = []
    for cand in candidates:
        if cand.get("candid") is None or not cand.get("image_urls"):
            continue

        try:
            mag_val = float(cand.get("magpsf"))
        except (TypeError, ValueError):
            # magpsf missing or not numeric
            continue
        if not np.isfinite(mag_val):
            continue

        if cand.get("isdiffpos") not in POSITIVE_DIFF_FLAGS:
            continue

        usable.append((mag_val, cand))

    # Sort on the magnitude only; the candidate dicts are never compared.
    usable.sort(key=lambda pair: pair[0])
    return [cand for _, cand in usable]


all_manifest_rows = []
n_with_triplet = 0
n_no_triplet = 0
n_error = 0
# (objectId, error description) for every failed query, kept so failures can
# be inspected or retried afterwards instead of only being counted.
failed_object_ids = []

for obj_id in tqdm(object_ids, desc="Downloading brightest triplets per training object"):
    try:
        result = L.object(obj_id, lasair_added=True)
    except Exception as exc:
        # e.g. object not found / bad ID / API error: one bad object must not
        # abort the whole batch, so record it and move on.
        n_error += 1
        failed_object_ids.append((obj_id, repr(exc)))
        continue

    candidates = result.get("candidates") or []

    # Try candidates from brightest to fainter until one triplet downloads
    # fully. The else branch runs when no candidate succeeded, which includes
    # the case of an object with no usable candidates at all.
    for cand in _positive_candidates_brightest_first(candidates):
        ok, rows = download_triplet_for_candidate(obj_id, cand, CUTOUT_DIR)
        if ok:
            all_manifest_rows.extend(rows)
            n_with_triplet += 1
            break
    else:
        n_no_triplet += 1

# Write the manifest (when there is anything to write) and report the totals.
if not all_manifest_rows:
    print("\nNo triplets were downloaded for any training_data objects.")
else:
    # One manifest row per downloaded cutout file.
    df_manifest = pd.DataFrame(all_manifest_rows)
    df_manifest.to_csv(MANIFEST_CSV, index=False)

    print(f"\nSaved {len(df_manifest)} cutout entries to {MANIFEST_CSV}")
    print(f"Cutout FITS files are in: {CUTOUT_DIR.resolve()}")

# Final per-object counters, printed one line at a time.
summary_lines = (
    "\nSummary:",
    f"  Total unique objectIds in training_data: {total_objects}",
    f"  Objects with at least one triplet:       {n_with_triplet}",
    f"  Objects with no usable triplet:          {n_no_triplet}",
    f"  Objects with API/other errors:           {n_error}",
)
for summary_line in summary_lines:
    print(summary_line)

Found 5479 unique objectIds in training_data.csv


Downloading brightest triplets per training object:  11%| | 628/5479 [1:00:49<16