In [1]:
# 02_download_clean — Target A (PDCSAP-first, TESSCut-FFI fallback) — robust & batchable
import json, warnings, pathlib, re
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

# --- Config (single-target defaults) ---
TARGET_TOI = "TOI 1801.01"
TARGET_TIC = 119584412
QUALITY    = 175
CUTOUT_SZ  = 15
WIN_DAYS   = 1.0

# --- Folders ---
pathlib.Path("figures").mkdir(exist_ok=True)
pathlib.Path("results").mkdir(exist_ok=True)
pathlib.Path("data_raw_fresh").mkdir(exist_ok=True)

# --- Helpers ---
def window_len_for(t_days, window_days=1.0):
    dt = np.nanmedian(np.diff(t_days))
    wl = 401 if (not np.isfinite(dt) or dt<=0) else max(5, int(round(window_days/dt)))
    return wl if wl % 2 == 1 else wl + 1

def savgol_trend(t, y, window_len, polyorder=2):
    half = window_len//2
    trend = np.full_like(y, np.nan, dtype=float)
    for i in range(len(y)):
        lo, hi = max(0, i-half), min(len(y), i+half+1)
        tt = t[lo:hi] - t[i]; yy = y[lo:hi]
        m  = np.isfinite(tt) & np.isfinite(yy)
        if np.count_nonzero(m) >= polyorder+1:
            c = np.polyfit(tt[m], yy[m], polyorder)
            trend[i] = np.polyval(c, 0.0)
    trend[~np.isfinite(trend)] = np.nanmedian(y)
    return trend

def custom_flatten(lc, window_days=1.0, polyorder=2):
    """Return a *new* LightCurve with flattened flux (no .replace() calls)."""
    t = lc.time.value
    f = lc.flux.value.astype(float)
    wl = window_len_for(t, window_days)
    tr = savgol_trend(t, f, wl, polyorder=polyorder)
    flat = f/np.where(np.isfinite(tr), tr, np.nanmedian(f))
    flat /= np.nanmedian(flat)
    # carry flux_err if present
    try:
        ferr = lc.flux_err.value if getattr(lc, "flux_err", None) is not None else None
    except Exception:
        ferr = None
    from lightkurve import LightCurve
    return LightCurve(time=lc.time, flux=flat, flux_err=ferr, meta=dict(getattr(lc, "meta", {})))

def approx_cdpp_ppm(lc, hours=1.0):
    t = lc.time.value; f = lc.flux.value
    dt_hr = np.nanmedian(np.diff(t))*24.0
    if not np.isfinite(dt_hr) or dt_hr<=0: return float("nan")
    n = max(1, int(round(hours/dt_hr)))
    run = np.convolve(f - np.nanmedian(f), np.ones(n)/n, mode="valid")
    return float(np.nanstd(run)*1e6)

def get_header_sector(header):
    # Try common sector keys; then regex fallback
    for k in ("SECTOR","sector","SEC1"):
        try:
            v = header.get(k)
            if v is not None:
                return int(v)
        except Exception:
            pass
    txt = " ".join([f"{k}={header.get(k)}" for k in header.keys()])
    m = re.search(r"[Ss]ector[^0-9]*([0-9]{1,3})", txt)
    return int(m.group(1)) if m else None

# --- Lightkurve ---
import lightkurve as lk
from lightkurve import LightCurveCollection

def run_download_clean(target_toi: str, target_tic: int, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS):
    TARGET = f"TIC {target_tic}"
    out_fig  = f"figures/TIC{target_tic}_download_clean.png"
    out_json = f"results/TIC{target_tic}_download_clean_summary.json"

    print(f"Target: {target_toi} ({TARGET})")

    # -------------------------
    # 1) Download SPOC PDCSAP first; read sector from files (robust to missing 'sector' column)
    # -------------------------
    pdcsap_list, got_spoc_sectors = [], set()
    sr_spoc = lk.search_lightcurvefile(TARGET, mission="TESS", author="SPOC")

    if len(sr_spoc) == 0:
        print("No SPOC PDCSAP products found.")
    else:
        for i in range(len(sr_spoc)):
            try:
                lcf = sr_spoc[i].download(download_dir="data_raw_fresh")
                lc  = lcf.PDCSAP_FLUX.remove_nans().normalize()
                # sector from object or header
                s = getattr(lcf, "sector", None)
                if s is None:
                    try:
                        hdr = lcf.header() if hasattr(lcf, "header") else lcf.get_header()
                    except Exception:
                        hdr = None
                    if hdr is not None:
                        s = get_header_sector(hdr)
                if s is None: s = -1
                lc.meta["source"] = "PDCSAP"; lc.meta["sector"] = int(s)
                pdcsap_list.append(lc)
                if s != -1: got_spoc_sectors.add(int(s))
                print(f" PDCSAP S{int(s) if s!=-1 else '?'}: ok (N={len(lc)})")
            except Exception as e:
                print(f" PDCSAP entry {i}: failed -> {e}")

    # -------------------------
    # 2) Query TESSCut and add FFI for sectors not covered by PDCSAP
    # -------------------------
    ffi_list = []
    try:
        sr_cut = lk.search_tesscut(TARGET)
    except Exception as e:
        sr_cut = []
        print("TESSCut query failed:", e)

    if len(sr_cut) == 0:
        print("No TESSCut entries found.")
    else:
        for i in range(len(sr_cut)):
            try:
                tpf = sr_cut[i].download(cutout_size=cutout_sz, download_dir="data_raw_fresh")
                # sector from object or header
                s = getattr(tpf, "sector", None)
                if s is None:
                    try:
                        hdr = tpf.hdu[0].header
                    except Exception:
                        hdr = None
                    if hdr is not None:
                        s = get_header_sector(hdr)
                if s is None: s = -1
                # skip if PDCSAP already covers
                if (s != -1) and (int(s) in got_spoc_sectors):
                    print(f"  FFI S{int(s)}: skip (already have PDCSAP)")
                    continue
                # aperture photometry
                mask = tpf.create_threshold_mask(threshold=3, reference_pixel=None)
                if not np.any(mask):
                    m = np.zeros_like(mask, dtype=bool)
                    yy, xx = np.unravel_index(np.nanargmax(np.nanmedian(tpf.flux.value, axis=0)), mask.shape)
                    m[yy, xx] = True
                    mask = m
                lc = tpf.to_lightcurve(aperture_mask=mask).remove_nans().normalize()
                lc.meta["source"] = "FFI"; lc.meta["sector"] = int(s)
                ffi_list.append(lc)
                print(f"  FFI S{int(s) if s!=-1 else '?'}: ok (N={len(lc)})")
            except Exception as e:
                print(f"  FFI entry {i}: failed -> {e}")

    if not (pdcsap_list or ffi_list):
        raise RuntimeError("No light curves obtained from SPOC or TESSCut.")

    # -------------------------
    # 3) Stitch, flatten, QC
    # -------------------------
    all_lcs_raw   = pdcsap_list + ffi_list
    stitched_raw  = LightCurveCollection(all_lcs_raw).stitch().remove_nans()
    stitched_flat = custom_flatten(stitched_raw, window_days=win_days)

    def rms_ppm(x): return float(np.nanstd(x)*1e6)
    qc = {
        "target_toi": target_toi,
        "target_tic": target_tic,
        "n_sectors_pdcsap": len(pdcsap_list),
        "n_sectors_ffi": len(ffi_list),
        "sectors_pdcsap": [int(lc.meta.get("sector", -1)) for lc in pdcsap_list],
        "sectors_ffi": [int(lc.meta.get("sector", -1)) for lc in ffi_list],
        "n_points_raw": int(len(stitched_raw)),
        "n_points_flat": int(len(stitched_flat)),
        "rms_raw_ppm": rms_ppm(stitched_raw.flux.value),
        "rms_flat_ppm": rms_ppm(stitched_flat.flux.value),
        "cdpp1h_flat_ppm": approx_cdpp_ppm(stitched_flat, hours=1.0),
        "flatten_window_days": float(win_days),
        "notes": "PDCSAP preferred; FFI(TESSCut) used where SPOC PDCSAP missing."
    }

    # -------------------------
    # 4) Figure & JSON
    # -------------------------
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12,6), dpi=140, sharex=True)
    stitched_raw.plot(ax=ax1, marker=".", lw=0, alpha=0.35)
    ax1.set_ylabel("Flux (norm)")
    ax1.set_title(f"{target_toi} (TIC {target_tic}) — Stitched RAW")
    stitched_flat.plot(ax=ax2, marker=".", lw=0, alpha=0.45, label="flattened")
    ax2.set_xlabel("Time [BTJD]")
    ax2.set_ylabel("Flux (norm)")
    ax2.set_title(f"{target_toi} — Stitched FLAT (window={win_days:.2f} d)")
    ax2.legend(loc="best")
    fig.tight_layout()
    fig.savefig(out_fig); plt.close(fig)

    with open(out_json, "w") as f:
        json.dump(qc, f, indent=2)

    print("\n== DONE ==")
    print(f"Saved figure: {out_fig}")
    print(f"Saved JSON  : {out_json}")
    print("Summary:", qc)

    return qc, out_fig, out_json

# ---------- Run on Target A (deliverable) ----------
_ = run_download_clean(TARGET_TOI, TARGET_TIC, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS)

# ---------- OPTIONAL: batch-run first 4 priority targets ----------
# Set RUN_BATCH=True to run on the first 4 rows in results/priority_targets.csv
RUN_BATCH = False
if RUN_BATCH:
    import pandas as pd
    dfp = pd.read_csv("results/priority_targets.csv")
    # Expect columns: TIC_ID_norm, toi (string/number). Adjust if your headers differ.
    for _, row in dfp.head(4).iterrows():
        tic = int(row["TIC_ID_norm"])
        toi = f"TOI {row['toi']}"
        try:
            run_download_clean(toi, tic, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS)
        except Exception as e:
            print(f"[Batch] {toi} (TIC {tic}) failed: {e}")

Target: TOI 1801.01 (TIC 119584412)
 PDCSAP S22: ok (N=16102)
 PDCSAP S49: ok (N=13272)
  FFI S22: skip (already have PDCSAP)
  FFI S49: skip (already have PDCSAP)

== DONE ==
Saved figure: figures/TIC119584412_download_clean.png
Saved JSON  : results/TIC119584412_download_clean_summary.json
Summary: {'target_toi': 'TOI 1801.01', 'target_tic': 119584412, 'n_sectors_pdcsap': 2, 'n_sectors_ffi': 0, 'sectors_pdcsap': [22, 49], 'sectors_ffi': [], 'n_points_raw': 29374, 'n_points_flat': 29374, 'rms_raw_ppm': 1359.3999901786447, 'rms_flat_ppm': 1099.746714683546, 'cdpp1h_flat_ppm': 251.16073602817906, 'flatten_window_days': 1.0, 'notes': 'PDCSAP preferred; FFI(TESSCut) used where SPOC PDCSAP missing.'}


In [2]:
# 02_download_clean — Target A (PDCSAP-first, TESSCut-FFI fallback) — robust & batchable
import json, warnings, pathlib, re
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

# --- Config (single-target defaults) ---
TARGET_TOI= "TOI 260.01"
TARGET_TIC= 37749396
QUALITY    = 175
CUTOUT_SZ  = 15
WIN_DAYS   = 1.0

# --- Folders ---
pathlib.Path("figures").mkdir(exist_ok=True)
pathlib.Path("results").mkdir(exist_ok=True)
pathlib.Path("data_raw_fresh").mkdir(exist_ok=True)

# --- Helpers ---
def window_len_for(t_days, window_days=1.0):
    dt = np.nanmedian(np.diff(t_days))
    wl = 401 if (not np.isfinite(dt) or dt<=0) else max(5, int(round(window_days/dt)))
    return wl if wl % 2 == 1 else wl + 1

def savgol_trend(t, y, window_len, polyorder=2):
    half = window_len//2
    trend = np.full_like(y, np.nan, dtype=float)
    for i in range(len(y)):
        lo, hi = max(0, i-half), min(len(y), i+half+1)
        tt = t[lo:hi] - t[i]; yy = y[lo:hi]
        m  = np.isfinite(tt) & np.isfinite(yy)
        if np.count_nonzero(m) >= polyorder+1:
            c = np.polyfit(tt[m], yy[m], polyorder)
            trend[i] = np.polyval(c, 0.0)
    trend[~np.isfinite(trend)] = np.nanmedian(y)
    return trend

def custom_flatten(lc, window_days=1.0, polyorder=2):
    """Return a *new* LightCurve with flattened flux (no .replace() calls)."""
    t = lc.time.value
    f = lc.flux.value.astype(float)
    wl = window_len_for(t, window_days)
    tr = savgol_trend(t, f, wl, polyorder=polyorder)
    flat = f/np.where(np.isfinite(tr), tr, np.nanmedian(f))
    flat /= np.nanmedian(flat)
    # carry flux_err if present
    try:
        ferr = lc.flux_err.value if getattr(lc, "flux_err", None) is not None else None
    except Exception:
        ferr = None
    from lightkurve import LightCurve
    return LightCurve(time=lc.time, flux=flat, flux_err=ferr, meta=dict(getattr(lc, "meta", {})))

def approx_cdpp_ppm(lc, hours=1.0):
    t = lc.time.value; f = lc.flux.value
    dt_hr = np.nanmedian(np.diff(t))*24.0
    if not np.isfinite(dt_hr) or dt_hr<=0: return float("nan")
    n = max(1, int(round(hours/dt_hr)))
    run = np.convolve(f - np.nanmedian(f), np.ones(n)/n, mode="valid")
    return float(np.nanstd(run)*1e6)

def get_header_sector(header):
    # Try common sector keys; then regex fallback
    for k in ("SECTOR","sector","SEC1"):
        try:
            v = header.get(k)
            if v is not None:
                return int(v)
        except Exception:
            pass
    txt = " ".join([f"{k}={header.get(k)}" for k in header.keys()])
    m = re.search(r"[Ss]ector[^0-9]*([0-9]{1,3})", txt)
    return int(m.group(1)) if m else None

# --- Lightkurve ---
import lightkurve as lk
from lightkurve import LightCurveCollection

def run_download_clean(target_toi: str, target_tic: int, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS):
    TARGET = f"TIC {target_tic}"
    out_fig  = f"figures/TIC{target_tic}_download_clean.png"
    out_json = f"results/TIC{target_tic}_download_clean_summary.json"

    print(f"Target: {target_toi} ({TARGET})")

    # -------------------------
    # 1) Download SPOC PDCSAP first; read sector from files (robust to missing 'sector' column)
    # -------------------------
    pdcsap_list, got_spoc_sectors = [], set()
    sr_spoc = lk.search_lightcurvefile(TARGET, mission="TESS", author="SPOC")

    if len(sr_spoc) == 0:
        print("No SPOC PDCSAP products found.")
    else:
        for i in range(len(sr_spoc)):
            try:
                lcf = sr_spoc[i].download(download_dir="data_raw_fresh")
                lc  = lcf.PDCSAP_FLUX.remove_nans().normalize()
                # sector from object or header
                s = getattr(lcf, "sector", None)
                if s is None:
                    try:
                        hdr = lcf.header() if hasattr(lcf, "header") else lcf.get_header()
                    except Exception:
                        hdr = None
                    if hdr is not None:
                        s = get_header_sector(hdr)
                if s is None: s = -1
                lc.meta["source"] = "PDCSAP"; lc.meta["sector"] = int(s)
                pdcsap_list.append(lc)
                if s != -1: got_spoc_sectors.add(int(s))
                print(f" PDCSAP S{int(s) if s!=-1 else '?'}: ok (N={len(lc)})")
            except Exception as e:
                print(f" PDCSAP entry {i}: failed -> {e}")

    # -------------------------
    # 2) Query TESSCut and add FFI for sectors not covered by PDCSAP
    # -------------------------
    ffi_list = []
    try:
        sr_cut = lk.search_tesscut(TARGET)
    except Exception as e:
        sr_cut = []
        print("TESSCut query failed:", e)

    if len(sr_cut) == 0:
        print("No TESSCut entries found.")
    else:
        for i in range(len(sr_cut)):
            try:
                tpf = sr_cut[i].download(cutout_size=cutout_sz, download_dir="data_raw_fresh")
                # sector from object or header
                s = getattr(tpf, "sector", None)
                if s is None:
                    try:
                        hdr = tpf.hdu[0].header
                    except Exception:
                        hdr = None
                    if hdr is not None:
                        s = get_header_sector(hdr)
                if s is None: s = -1
                # skip if PDCSAP already covers
                if (s != -1) and (int(s) in got_spoc_sectors):
                    print(f"  FFI S{int(s)}: skip (already have PDCSAP)")
                    continue
                # aperture photometry
                mask = tpf.create_threshold_mask(threshold=3, reference_pixel=None)
                if not np.any(mask):
                    m = np.zeros_like(mask, dtype=bool)
                    yy, xx = np.unravel_index(np.nanargmax(np.nanmedian(tpf.flux.value, axis=0)), mask.shape)
                    m[yy, xx] = True
                    mask = m
                lc = tpf.to_lightcurve(aperture_mask=mask).remove_nans().normalize()
                lc.meta["source"] = "FFI"; lc.meta["sector"] = int(s)
                ffi_list.append(lc)
                print(f"  FFI S{int(s) if s!=-1 else '?'}: ok (N={len(lc)})")
            except Exception as e:
                print(f"  FFI entry {i}: failed -> {e}")

    if not (pdcsap_list or ffi_list):
        raise RuntimeError("No light curves obtained from SPOC or TESSCut.")

    # -------------------------
    # 3) Stitch, flatten, QC
    # -------------------------
    all_lcs_raw   = pdcsap_list + ffi_list
    stitched_raw  = LightCurveCollection(all_lcs_raw).stitch().remove_nans()
    stitched_flat = custom_flatten(stitched_raw, window_days=win_days)

    def rms_ppm(x): return float(np.nanstd(x)*1e6)
    qc = {
        "target_toi": target_toi,
        "target_tic": target_tic,
        "n_sectors_pdcsap": len(pdcsap_list),
        "n_sectors_ffi": len(ffi_list),
        "sectors_pdcsap": [int(lc.meta.get("sector", -1)) for lc in pdcsap_list],
        "sectors_ffi": [int(lc.meta.get("sector", -1)) for lc in ffi_list],
        "n_points_raw": int(len(stitched_raw)),
        "n_points_flat": int(len(stitched_flat)),
        "rms_raw_ppm": rms_ppm(stitched_raw.flux.value),
        "rms_flat_ppm": rms_ppm(stitched_flat.flux.value),
        "cdpp1h_flat_ppm": approx_cdpp_ppm(stitched_flat, hours=1.0),
        "flatten_window_days": float(win_days),
        "notes": "PDCSAP preferred; FFI(TESSCut) used where SPOC PDCSAP missing."
    }

    # -------------------------
    # 4) Figure & JSON
    # -------------------------
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12,6), dpi=140, sharex=True)
    stitched_raw.plot(ax=ax1, marker=".", lw=0, alpha=0.35)
    ax1.set_ylabel("Flux (norm)")
    ax1.set_title(f"{target_toi} (TIC {target_tic}) — Stitched RAW")
    stitched_flat.plot(ax=ax2, marker=".", lw=0, alpha=0.45, label="flattened")
    ax2.set_xlabel("Time [BTJD]")
    ax2.set_ylabel("Flux (norm)")
    ax2.set_title(f"{target_toi} — Stitched FLAT (window={win_days:.2f} d)")
    ax2.legend(loc="best")
    fig.tight_layout()
    fig.savefig(out_fig); plt.close(fig)

    with open(out_json, "w") as f:
        json.dump(qc, f, indent=2)

    print("\n== DONE ==")
    print(f"Saved figure: {out_fig}")
    print(f"Saved JSON  : {out_json}")
    print("Summary:", qc)

    return qc, out_fig, out_json

# ---------- Run on Target A (deliverable) ----------
_ = run_download_clean(TARGET_TOI, TARGET_TIC, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS)

# ---------- OPTIONAL: batch-run first 4 priority targets ----------
# Set RUN_BATCH=True to run on the first 4 rows in results/priority_targets.csv
RUN_BATCH = False
if RUN_BATCH:
    import pandas as pd
    dfp = pd.read_csv("results/priority_targets.csv")
    # Expect columns: TIC_ID_norm, toi (string/number). Adjust if your headers differ.
    for _, row in dfp.head(4).iterrows():
        tic = int(row["TIC_ID_norm"])
        toi = f"TOI {row['toi']}"
        try:
            run_download_clean(toi, tic, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS)
        except Exception as e:
            print(f"[Batch] {toi} (TIC {tic}) failed: {e}")

Target: TOI 260.01 (TIC 37749396)




 PDCSAP S3: ok (N=12978)
 PDCSAP S42: ok (N=11473)
 PDCSAP S70: ok (N=86180)
 PDCSAP S70: ok (N=14424)
  FFI S3: skip (already have PDCSAP)
  FFI S42: skip (already have PDCSAP)
  FFI S70: skip (already have PDCSAP)

== DONE ==
Saved figure: figures/TIC37749396_download_clean.png
Saved JSON  : results/TIC37749396_download_clean_summary.json
Summary: {'target_toi': 'TOI 260.01', 'target_tic': 37749396, 'n_sectors_pdcsap': 4, 'n_sectors_ffi': 0, 'sectors_pdcsap': [3, 42, 70, 70], 'sectors_ffi': [], 'n_points_raw': 125055, 'n_points_flat': 125055, 'rms_raw_ppm': 1043.9811740070581, 'rms_flat_ppm': 1035.729219446299, 'cdpp1h_flat_ppm': 112.01310543832174, 'flatten_window_days': 1.0, 'notes': 'PDCSAP preferred; FFI(TESSCut) used where SPOC PDCSAP missing.'}


In [3]:
# 02_download_clean — Target A (PDCSAP-first, TESSCut-FFI fallback) — robust & batchable
import json, warnings, pathlib, re
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

# --- Config (single-target defaults) ---
TARGET_TOI="TOI 550.02"
TARGET_TIC=311183180
QUALITY    = 175
CUTOUT_SZ  = 15
WIN_DAYS   = 1.0

# --- Folders ---
pathlib.Path("figures").mkdir(exist_ok=True)
pathlib.Path("results").mkdir(exist_ok=True)
pathlib.Path("data_raw_fresh").mkdir(exist_ok=True)

# --- Helpers ---
def window_len_for(t_days, window_days=1.0):
    dt = np.nanmedian(np.diff(t_days))
    wl = 401 if (not np.isfinite(dt) or dt<=0) else max(5, int(round(window_days/dt)))
    return wl if wl % 2 == 1 else wl + 1

def savgol_trend(t, y, window_len, polyorder=2):
    half = window_len//2
    trend = np.full_like(y, np.nan, dtype=float)
    for i in range(len(y)):
        lo, hi = max(0, i-half), min(len(y), i+half+1)
        tt = t[lo:hi] - t[i]; yy = y[lo:hi]
        m  = np.isfinite(tt) & np.isfinite(yy)
        if np.count_nonzero(m) >= polyorder+1:
            c = np.polyfit(tt[m], yy[m], polyorder)
            trend[i] = np.polyval(c, 0.0)
    trend[~np.isfinite(trend)] = np.nanmedian(y)
    return trend

def custom_flatten(lc, window_days=1.0, polyorder=2):
    """Return a *new* LightCurve with flattened flux (no .replace() calls)."""
    t = lc.time.value
    f = lc.flux.value.astype(float)
    wl = window_len_for(t, window_days)
    tr = savgol_trend(t, f, wl, polyorder=polyorder)
    flat = f/np.where(np.isfinite(tr), tr, np.nanmedian(f))
    flat /= np.nanmedian(flat)
    # carry flux_err if present
    try:
        ferr = lc.flux_err.value if getattr(lc, "flux_err", None) is not None else None
    except Exception:
        ferr = None
    from lightkurve import LightCurve
    return LightCurve(time=lc.time, flux=flat, flux_err=ferr, meta=dict(getattr(lc, "meta", {})))

def approx_cdpp_ppm(lc, hours=1.0):
    t = lc.time.value; f = lc.flux.value
    dt_hr = np.nanmedian(np.diff(t))*24.0
    if not np.isfinite(dt_hr) or dt_hr<=0: return float("nan")
    n = max(1, int(round(hours/dt_hr)))
    run = np.convolve(f - np.nanmedian(f), np.ones(n)/n, mode="valid")
    return float(np.nanstd(run)*1e6)

def get_header_sector(header):
    # Try common sector keys; then regex fallback
    for k in ("SECTOR","sector","SEC1"):
        try:
            v = header.get(k)
            if v is not None:
                return int(v)
        except Exception:
            pass
    txt = " ".join([f"{k}={header.get(k)}" for k in header.keys()])
    m = re.search(r"[Ss]ector[^0-9]*([0-9]{1,3})", txt)
    return int(m.group(1)) if m else None

# --- Lightkurve ---
import lightkurve as lk
from lightkurve import LightCurveCollection

def run_download_clean(target_toi: str, target_tic: int, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS):
    TARGET = f"TIC {target_tic}"
    out_fig  = f"figures/TIC{target_tic}_download_clean.png"
    out_json = f"results/TIC{target_tic}_download_clean_summary.json"

    print(f"Target: {target_toi} ({TARGET})")

    # -------------------------
    # 1) Download SPOC PDCSAP first; read sector from files (robust to missing 'sector' column)
    # -------------------------
    pdcsap_list, got_spoc_sectors = [], set()
    sr_spoc = lk.search_lightcurvefile(TARGET, mission="TESS", author="SPOC")

    if len(sr_spoc) == 0:
        print("No SPOC PDCSAP products found.")
    else:
        for i in range(len(sr_spoc)):
            try:
                lcf = sr_spoc[i].download(download_dir="data_raw_fresh")
                lc  = lcf.PDCSAP_FLUX.remove_nans().normalize()
                # sector from object or header
                s = getattr(lcf, "sector", None)
                if s is None:
                    try:
                        hdr = lcf.header() if hasattr(lcf, "header") else lcf.get_header()
                    except Exception:
                        hdr = None
                    if hdr is not None:
                        s = get_header_sector(hdr)
                if s is None: s = -1
                lc.meta["source"] = "PDCSAP"; lc.meta["sector"] = int(s)
                pdcsap_list.append(lc)
                if s != -1: got_spoc_sectors.add(int(s))
                print(f" PDCSAP S{int(s) if s!=-1 else '?'}: ok (N={len(lc)})")
            except Exception as e:
                print(f" PDCSAP entry {i}: failed -> {e}")

    # -------------------------
    # 2) Query TESSCut and add FFI for sectors not covered by PDCSAP
    # -------------------------
    ffi_list = []
    try:
        sr_cut = lk.search_tesscut(TARGET)
    except Exception as e:
        sr_cut = []
        print("TESSCut query failed:", e)

    if len(sr_cut) == 0:
        print("No TESSCut entries found.")
    else:
        for i in range(len(sr_cut)):
            try:
                tpf = sr_cut[i].download(cutout_size=cutout_sz, download_dir="data_raw_fresh")
                # sector from object or header
                s = getattr(tpf, "sector", None)
                if s is None:
                    try:
                        hdr = tpf.hdu[0].header
                    except Exception:
                        hdr = None
                    if hdr is not None:
                        s = get_header_sector(hdr)
                if s is None: s = -1
                # skip if PDCSAP already covers
                if (s != -1) and (int(s) in got_spoc_sectors):
                    print(f"  FFI S{int(s)}: skip (already have PDCSAP)")
                    continue
                # aperture photometry
                mask = tpf.create_threshold_mask(threshold=3, reference_pixel=None)
                if not np.any(mask):
                    m = np.zeros_like(mask, dtype=bool)
                    yy, xx = np.unravel_index(np.nanargmax(np.nanmedian(tpf.flux.value, axis=0)), mask.shape)
                    m[yy, xx] = True
                    mask = m
                lc = tpf.to_lightcurve(aperture_mask=mask).remove_nans().normalize()
                lc.meta["source"] = "FFI"; lc.meta["sector"] = int(s)
                ffi_list.append(lc)
                print(f"  FFI S{int(s) if s!=-1 else '?'}: ok (N={len(lc)})")
            except Exception as e:
                print(f"  FFI entry {i}: failed -> {e}")

    if not (pdcsap_list or ffi_list):
        raise RuntimeError("No light curves obtained from SPOC or TESSCut.")

    # -------------------------
    # 3) Stitch, flatten, QC
    # -------------------------
    all_lcs_raw   = pdcsap_list + ffi_list
    stitched_raw  = LightCurveCollection(all_lcs_raw).stitch().remove_nans()
    stitched_flat = custom_flatten(stitched_raw, window_days=win_days)

    def rms_ppm(x): return float(np.nanstd(x)*1e6)
    qc = {
        "target_toi": target_toi,
        "target_tic": target_tic,
        "n_sectors_pdcsap": len(pdcsap_list),
        "n_sectors_ffi": len(ffi_list),
        "sectors_pdcsap": [int(lc.meta.get("sector", -1)) for lc in pdcsap_list],
        "sectors_ffi": [int(lc.meta.get("sector", -1)) for lc in ffi_list],
        "n_points_raw": int(len(stitched_raw)),
        "n_points_flat": int(len(stitched_flat)),
        "rms_raw_ppm": rms_ppm(stitched_raw.flux.value),
        "rms_flat_ppm": rms_ppm(stitched_flat.flux.value),
        "cdpp1h_flat_ppm": approx_cdpp_ppm(stitched_flat, hours=1.0),
        "flatten_window_days": float(win_days),
        "notes": "PDCSAP preferred; FFI(TESSCut) used where SPOC PDCSAP missing."
    }

    # -------------------------
    # 4) Figure & JSON
    # -------------------------
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12,6), dpi=140, sharex=True)
    stitched_raw.plot(ax=ax1, marker=".", lw=0, alpha=0.35)
    ax1.set_ylabel("Flux (norm)")
    ax1.set_title(f"{target_toi} (TIC {target_tic}) — Stitched RAW")
    stitched_flat.plot(ax=ax2, marker=".", lw=0, alpha=0.45, label="flattened")
    ax2.set_xlabel("Time [BTJD]")
    ax2.set_ylabel("Flux (norm)")
    ax2.set_title(f"{target_toi} — Stitched FLAT (window={win_days:.2f} d)")
    ax2.legend(loc="best")
    fig.tight_layout()
    fig.savefig(out_fig); plt.close(fig)

    with open(out_json, "w") as f:
        json.dump(qc, f, indent=2)

    print("\n== DONE ==")
    print(f"Saved figure: {out_fig}")
    print(f"Saved JSON  : {out_json}")
    print("Summary:", qc)

    return qc, out_fig, out_json

# ---------- Run on Target A (deliverable) ----------
_ = run_download_clean(TARGET_TOI, TARGET_TIC, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS)

# ---------- OPTIONAL: batch-run first 4 priority targets ----------
# Set RUN_BATCH=True to run on the first 4 rows in results/priority_targets.csv
RUN_BATCH = False
if RUN_BATCH:
    import pandas as pd
    dfp = pd.read_csv("results/priority_targets.csv")
    # Expect columns: TIC_ID_norm, toi (string/number). Adjust if your headers differ.
    for _, row in dfp.head(4).iterrows():
        tic = int(row["TIC_ID_norm"])
        toi = f"TOI {row['toi']}"
        try:
            run_download_clean(toi, tic, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS)
        except Exception as e:
            print(f"[Batch] {toi} (TIC {tic}) failed: {e}")

Target: TOI 550.02 (TIC 311183180)
 PDCSAP S5: ok (N=17286)
 PDCSAP S31: ok (N=16250)
  FFI S5: skip (already have PDCSAP)
  FFI S31: skip (already have PDCSAP)

== DONE ==
Saved figure: figures/TIC311183180_download_clean.png
Saved JSON  : results/TIC311183180_download_clean_summary.json
Summary: {'target_toi': 'TOI 550.02', 'target_tic': 311183180, 'n_sectors_pdcsap': 2, 'n_sectors_ffi': 0, 'sectors_pdcsap': [5, 31], 'sectors_ffi': [], 'n_points_raw': 33536, 'n_points_flat': 33536, 'rms_raw_ppm': 3489.643335342407, 'rms_flat_ppm': 2277.229196109564, 'cdpp1h_flat_ppm': 2077.4277749925795, 'flatten_window_days': 1.0, 'notes': 'PDCSAP preferred; FFI(TESSCut) used where SPOC PDCSAP missing.'}


In [4]:
# 02_download_clean — Target A (PDCSAP-first, TESSCut-FFI fallback) — robust & batchable
import json, warnings, pathlib, re
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

# --- Config (single-target defaults) ---
TARGET_TOI="TOI 139.01"
TARGET_TIC=62483237
QUALITY    = 175
CUTOUT_SZ  = 15
WIN_DAYS   = 1.0

# --- Folders ---
pathlib.Path("figures").mkdir(exist_ok=True)
pathlib.Path("results").mkdir(exist_ok=True)
pathlib.Path("data_raw_fresh").mkdir(exist_ok=True)

# --- Helpers ---
def window_len_for(t_days, window_days=1.0):
    dt = np.nanmedian(np.diff(t_days))
    wl = 401 if (not np.isfinite(dt) or dt<=0) else max(5, int(round(window_days/dt)))
    return wl if wl % 2 == 1 else wl + 1

def savgol_trend(t, y, window_len, polyorder=2):
    half = window_len//2
    trend = np.full_like(y, np.nan, dtype=float)
    for i in range(len(y)):
        lo, hi = max(0, i-half), min(len(y), i+half+1)
        tt = t[lo:hi] - t[i]; yy = y[lo:hi]
        m  = np.isfinite(tt) & np.isfinite(yy)
        if np.count_nonzero(m) >= polyorder+1:
            c = np.polyfit(tt[m], yy[m], polyorder)
            trend[i] = np.polyval(c, 0.0)
    trend[~np.isfinite(trend)] = np.nanmedian(y)
    return trend

def custom_flatten(lc, window_days=1.0, polyorder=2):
    """Return a *new* LightCurve with flattened flux (no .replace() calls)."""
    t = lc.time.value
    f = lc.flux.value.astype(float)
    wl = window_len_for(t, window_days)
    tr = savgol_trend(t, f, wl, polyorder=polyorder)
    flat = f/np.where(np.isfinite(tr), tr, np.nanmedian(f))
    flat /= np.nanmedian(flat)
    # carry flux_err if present
    try:
        ferr = lc.flux_err.value if getattr(lc, "flux_err", None) is not None else None
    except Exception:
        ferr = None
    from lightkurve import LightCurve
    return LightCurve(time=lc.time, flux=flat, flux_err=ferr, meta=dict(getattr(lc, "meta", {})))

def approx_cdpp_ppm(lc, hours=1.0):
    t = lc.time.value; f = lc.flux.value
    dt_hr = np.nanmedian(np.diff(t))*24.0
    if not np.isfinite(dt_hr) or dt_hr<=0: return float("nan")
    n = max(1, int(round(hours/dt_hr)))
    run = np.convolve(f - np.nanmedian(f), np.ones(n)/n, mode="valid")
    return float(np.nanstd(run)*1e6)

def get_header_sector(header):
    # Try common sector keys; then regex fallback
    for k in ("SECTOR","sector","SEC1"):
        try:
            v = header.get(k)
            if v is not None:
                return int(v)
        except Exception:
            pass
    txt = " ".join([f"{k}={header.get(k)}" for k in header.keys()])
    m = re.search(r"[Ss]ector[^0-9]*([0-9]{1,3})", txt)
    return int(m.group(1)) if m else None

# --- Lightkurve ---
import lightkurve as lk
from lightkurve import LightCurveCollection

def run_download_clean(target_toi: str, target_tic: int, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS):
    TARGET = f"TIC {target_tic}"
    out_fig  = f"figures/TIC{target_tic}_download_clean.png"
    out_json = f"results/TIC{target_tic}_download_clean_summary.json"

    print(f"Target: {target_toi} ({TARGET})")

    # -------------------------
    # 1) Download SPOC PDCSAP first; read sector from files (robust to missing 'sector' column)
    # -------------------------
    pdcsap_list, got_spoc_sectors = [], set()
    sr_spoc = lk.search_lightcurvefile(TARGET, mission="TESS", author="SPOC")

    if len(sr_spoc) == 0:
        print("No SPOC PDCSAP products found.")
    else:
        for i in range(len(sr_spoc)):
            try:
                lcf = sr_spoc[i].download(download_dir="data_raw_fresh")
                lc  = lcf.PDCSAP_FLUX.remove_nans().normalize()
                # sector from object or header
                s = getattr(lcf, "sector", None)
                if s is None:
                    try:
                        hdr = lcf.header() if hasattr(lcf, "header") else lcf.get_header()
                    except Exception:
                        hdr = None
                    if hdr is not None:
                        s = get_header_sector(hdr)
                if s is None: s = -1
                lc.meta["source"] = "PDCSAP"; lc.meta["sector"] = int(s)
                pdcsap_list.append(lc)
                if s != -1: got_spoc_sectors.add(int(s))
                print(f" PDCSAP S{int(s) if s!=-1 else '?'}: ok (N={len(lc)})")
            except Exception as e:
                print(f" PDCSAP entry {i}: failed -> {e}")

    # -------------------------
    # 2) Query TESSCut and add FFI for sectors not covered by PDCSAP
    # -------------------------
    ffi_list = []
    try:
        sr_cut = lk.search_tesscut(TARGET)
    except Exception as e:
        sr_cut = []
        print("TESSCut query failed:", e)

    if len(sr_cut) == 0:
        print("No TESSCut entries found.")
    else:
        for i in range(len(sr_cut)):
            try:
                tpf = sr_cut[i].download(cutout_size=cutout_sz, download_dir="data_raw_fresh")
                # sector from object or header
                s = getattr(tpf, "sector", None)
                if s is None:
                    try:
                        hdr = tpf.hdu[0].header
                    except Exception:
                        hdr = None
                    if hdr is not None:
                        s = get_header_sector(hdr)
                if s is None: s = -1
                # skip if PDCSAP already covers
                if (s != -1) and (int(s) in got_spoc_sectors):
                    print(f"  FFI S{int(s)}: skip (already have PDCSAP)")
                    continue
                # aperture photometry
                mask = tpf.create_threshold_mask(threshold=3, reference_pixel=None)
                if not np.any(mask):
                    m = np.zeros_like(mask, dtype=bool)
                    yy, xx = np.unravel_index(np.nanargmax(np.nanmedian(tpf.flux.value, axis=0)), mask.shape)
                    m[yy, xx] = True
                    mask = m
                lc = tpf.to_lightcurve(aperture_mask=mask).remove_nans().normalize()
                lc.meta["source"] = "FFI"; lc.meta["sector"] = int(s)
                ffi_list.append(lc)
                print(f"  FFI S{int(s) if s!=-1 else '?'}: ok (N={len(lc)})")
            except Exception as e:
                print(f"  FFI entry {i}: failed -> {e}")

    if not (pdcsap_list or ffi_list):
        raise RuntimeError("No light curves obtained from SPOC or TESSCut.")

    # -------------------------
    # 3) Stitch, flatten, QC
    # -------------------------
    all_lcs_raw   = pdcsap_list + ffi_list
    stitched_raw  = LightCurveCollection(all_lcs_raw).stitch().remove_nans()
    stitched_flat = custom_flatten(stitched_raw, window_days=win_days)

    def rms_ppm(x): return float(np.nanstd(x)*1e6)
    qc = {
        "target_toi": target_toi,
        "target_tic": target_tic,
        "n_sectors_pdcsap": len(pdcsap_list),
        "n_sectors_ffi": len(ffi_list),
        "sectors_pdcsap": [int(lc.meta.get("sector", -1)) for lc in pdcsap_list],
        "sectors_ffi": [int(lc.meta.get("sector", -1)) for lc in ffi_list],
        "n_points_raw": int(len(stitched_raw)),
        "n_points_flat": int(len(stitched_flat)),
        "rms_raw_ppm": rms_ppm(stitched_raw.flux.value),
        "rms_flat_ppm": rms_ppm(stitched_flat.flux.value),
        "cdpp1h_flat_ppm": approx_cdpp_ppm(stitched_flat, hours=1.0),
        "flatten_window_days": float(win_days),
        "notes": "PDCSAP preferred; FFI(TESSCut) used where SPOC PDCSAP missing."
    }

    # -------------------------
    # 4) Figure & JSON
    # -------------------------
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12,6), dpi=140, sharex=True)
    stitched_raw.plot(ax=ax1, marker=".", lw=0, alpha=0.35)
    ax1.set_ylabel("Flux (norm)")
    ax1.set_title(f"{target_toi} (TIC {target_tic}) — Stitched RAW")
    stitched_flat.plot(ax=ax2, marker=".", lw=0, alpha=0.45, label="flattened")
    ax2.set_xlabel("Time [BTJD]")
    ax2.set_ylabel("Flux (norm)")
    ax2.set_title(f"{target_toi} — Stitched FLAT (window={win_days:.2f} d)")
    ax2.legend(loc="best")
    fig.tight_layout()
    fig.savefig(out_fig); plt.close(fig)

    with open(out_json, "w") as f:
        json.dump(qc, f, indent=2)

    print("\n== DONE ==")
    print(f"Saved figure: {out_fig}")
    print(f"Saved JSON  : {out_json}")
    print("Summary:", qc)

    return qc, out_fig, out_json

# ---------- Run on Target A (deliverable) ----------
_ = run_download_clean(TARGET_TOI, TARGET_TIC, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS)

# ---------- OPTIONAL: batch-run first 4 priority targets ----------
# Set RUN_BATCH=True to run on the first 4 rows in results/priority_targets.csv
RUN_BATCH = False
if RUN_BATCH:
    import pandas as pd
    dfp = pd.read_csv("results/priority_targets.csv")
    # Expect columns: TIC_ID_norm, toi (string/number). Adjust if your headers differ.
    for _, row in dfp.head(4).iterrows():
        tic = int(row["TIC_ID_norm"])
        toi = f"TOI {row['toi']}"
        try:
            run_download_clean(toi, tic, quality=QUALITY, cutout_sz=CUTOUT_SZ, win_days=WIN_DAYS)
        except Exception as e:
            print(f"[Batch] {toi} (TIC {tic}) failed: {e}")

Target: TOI 139.01 (TIC 62483237)
 PDCSAP S1: ok (N=18094)
 PDCSAP S28: ok (N=77241)
 PDCSAP S28: ok (N=12584)
 PDCSAP S68: ok (N=87527)
 PDCSAP S68: ok (N=14710)
  FFI S1: skip (already have PDCSAP)
  FFI S28: skip (already have PDCSAP)
  FFI S68: skip (already have PDCSAP)
  FFI S95: ok (N=10114)

== DONE ==
Saved figure: figures/TIC62483237_download_clean.png
Saved JSON  : results/TIC62483237_download_clean_summary.json
Summary: {'target_toi': 'TOI 139.01', 'target_tic': 62483237, 'n_sectors_pdcsap': 5, 'n_sectors_ffi': 1, 'sectors_pdcsap': [1, 28, 28, 68, 68], 'sectors_ffi': [95], 'n_points_raw': 220270, 'n_points_flat': 220270, 'rms_raw_ppm': 368080.16896247864, 'rms_flat_ppm': 167808.4942875445, 'cdpp1h_flat_ppm': 84620.20798996135, 'flatten_window_days': 1.0, 'notes': 'PDCSAP preferred; FFI(TESSCut) used where SPOC PDCSAP missing.'}


In [7]:
# 02_download_clean — PDCSAP-first, TESSCut-FFI fallback (ONE-CELL VERSION)
# “PDCSAP first; FFI fallback; stitched + flattened; saves figure + QC.”
# ---- EDIT THESE TWO LINES ONLY ----
TARGET_TOI = "TOI 1801.01"   # label for plots
TARGET_TIC = 119584412       # integer TIC ID
# -----------------------------------

import json, warnings, pathlib, re
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

# --- Config ---
QUALITY    = 175         # standard TESS quality mask (matches Lightkurve’s default)
CUTOUT_SZ  = 15          # TESSCut postage-stamp size (pixels)
WIN_DAYS   = 1.0         # flattening window (days) – gentle, won’t erase hour-long dips
OUT_FIG    = f"figures/TIC{TARGET_TIC}_download_clean.png"
OUT_JSON   = f"results/TIC{TARGET_TIC}_download_clean_summary.json"
TARGET     = f"TIC {TARGET_TIC}"

# --- Make folders ---
for d in ["figures", "results", "data_raw_fresh"]:
    pathlib.Path(d).mkdir(exist_ok=True)

# --- Helpers ---
def window_len_for(t_days, window_days=1.0):
    dt = np.nanmedian(np.diff(t_days))
    wl = 401 if (not np.isfinite(dt) or dt<=0) else max(5, int(round(window_days/dt)))
    return wl if wl % 2 == 1 else wl + 1

def savgol_trend(t, y, window_len, polyorder=2):
    half = window_len//2
    trend = np.full_like(y, np.nan, dtype=float)
    for i in range(len(y)):
        lo, hi = max(0, i-half), min(len(y), i+half+1)
        tt = t[lo:hi] - t[i]; yy = y[lo:hi]
        m = np.isfinite(tt) & np.isfinite(yy)
        if np.count_nonzero(m) >= polyorder+1:
            c = np.polyfit(tt[m], yy[m], polyorder)
            trend[i] = np.polyval(c, 0.0)
    trend[~np.isfinite(trend)] = np.nanmedian(y)
    return trend

def custom_flatten(lc, window_days=1.0, polyorder=2):
    """Return a NEW LightCurve with same time, flattened flux (median=1)."""
    from lightkurve import LightCurve
    t = lc.time.value
    f = lc.flux.value.astype(float)
    wl = window_len_for(t, window_days)
    tr = savgol_trend(t, f, wl, polyorder=polyorder)
    flat = f/np.where(np.isfinite(tr), tr, np.nanmedian(f))
    flat /= np.nanmedian(flat)
    return LightCurve(time=lc.time, flux=flat, flux_err=getattr(lc, "flux_err", None), meta=lc.meta)

def approx_cdpp_ppm(lc, hours=1.0):
    t = lc.time.value; f = lc.flux.value
    dt_hr = np.nanmedian(np.diff(t))*24.0
    if not np.isfinite(dt_hr) or dt_hr<=0: return float("nan")
    n = max(1, int(round(hours/dt_hr)))
    run = np.convolve(f - np.nanmedian(f), np.ones(n)/n, mode="valid")
    return float(np.nanstd(run)*1e6)

def get_header_sector(header):
    for k in ("SECTOR","sector","SEC1"):
        try:
            v = header.get(k)
            if v is not None:
                return int(v)
        except Exception:
            pass
    txt = " ".join([f"{k}={header.get(k)}" for k in header.keys()])
    m = re.search(r"[Ss]ector[^0-9]*([0-9]{1,3})", txt)
    return int(m.group(1)) if m else None

def robust_ylim(flux, lo=0.2, hi=99.8):
    lo_, hi_ = np.nanpercentile(flux, [lo, hi])
    pad = 0.03*(hi_ - lo_)
    return lo_-pad, hi_+pad

def apply_quality_mask(lc, bitmask=QUALITY):
    """Keep cadences where (quality & ~bitmask)==0, i.e., standard good cadences."""
    q = getattr(lc, "quality", None)
    if q is None:
        return lc
    good = (q & ~bitmask) == 0
    return lc[good]

# --- Lightkurve imports ---
import lightkurve as lk
from lightkurve import LightCurveCollection

print(f"Target: {TARGET_TOI} ({TARGET})")

# =========================
# 1) PDCSAP (SPOC) download
# =========================
pdcsap_list, got_spoc_sectors = [], set()
sr_spoc = lk.search_lightcurvefile(TARGET, mission="TESS", author="SPOC")

if len(sr_spoc) == 0:
    print("No SPOC PDCSAP products found.")
else:
    for i in range(len(sr_spoc)):
        try:
            lcf = sr_spoc[i].download(download_dir="data_raw_fresh")
            lc  = lcf.PDCSAP_FLUX.remove_nans().normalize()
            lc  = apply_quality_mask(lc, QUALITY)

            # sector from object or header
            s = getattr(lcf, "sector", None)
            if s is None:
                hdr = None
                try:
                    hdr = lcf.header() if hasattr(lcf, "header") else lcf.get_header()
                except Exception:
                    pass
                if hdr is not None:
                    s = get_header_sector(hdr)
            if s is None: s = -1

            lc.meta["source"] = "PDCSAP"; lc.meta["sector"] = int(s)
            pdcsap_list.append(lc)
            if s != -1: got_spoc_sectors.add(int(s))
            print(f" PDCSAP S{int(s) if s!=-1 else '?'}: ok (N={len(lc)})")
        except Exception as e:
            print(f" PDCSAP entry {i}: failed -> {e}")

# ==============================
# 2) TESSCut (FFI) for gaps only
# ==============================
ffi_list = []
try:
    sr_cut = lk.search_tesscut(TARGET)
except Exception as e:
    sr_cut = []
    print("TESSCut query failed:", e)

if len(sr_cut) == 0:
    print("No TESSCut entries found.")
else:
    for i in range(len(sr_cut)):
        try:
            tpf = sr_cut[i].download(cutout_size=CUTOUT_SZ, download_dir="data_raw_fresh")
            s = getattr(tpf, "sector", None)
            if s is None:
                hdr = None
                try:
                    hdr = tpf.hdu[0].header
                except Exception:
                    pass
                if hdr is not None:
                    s = get_header_sector(hdr)
            if s is None: s = -1

            if (s != -1) and (int(s) in got_spoc_sectors):
                print(f"  FFI S{int(s)}: skip (already have PDCSAP)")
                continue

            mask = tpf.create_threshold_mask(threshold=3, reference_pixel=None)
            if not np.any(mask):
                m = np.zeros_like(mask, dtype=bool)
                yy, xx = np.unravel_index(np.nanargmax(np.nanmedian(tpf.flux.value, axis=0)), mask.shape)
                m[yy, xx] = True
                mask = m

            lc = tpf.to_lightcurve(aperture_mask=mask).remove_nans().normalize()
            lc = apply_quality_mask(lc, QUALITY)
            lc.meta["source"] = "FFI"; lc.meta["sector"] = int(s)
            ffi_list.append(lc)
            print(f"  FFI S{int(s) if s!=-1 else '?'}: ok (N={len(lc)})")
        except Exception as e:
            print(f"  FFI entry {i}: failed -> {e}")

if not (pdcsap_list or ffi_list):
    raise RuntimeError("No light curves obtained from SPOC or TESSCut.")

# =======================
# 3) Stitch, clean, QC
# =======================
all_lcs_raw   = pdcsap_list + ffi_list
stitched_raw  = LightCurveCollection(all_lcs_raw).stitch().remove_nans()

# very light outlier clip (protects plots; safe for downloader stage)
f = stitched_raw.flux.value
med, sig = np.nanmedian(f), np.nanstd(f)
m = (f > med - 5*sig) & (f < med + 5*sig)
stitched_raw = stitched_raw[m]

stitched_flat = custom_flatten(stitched_raw, window_days=WIN_DAYS)

def rms_ppm(x): return float(np.nanstd(x)*1e6)
qc = {
    "target_toi": TARGET_TOI,
    "target_tic": int(TARGET_TIC),
    "n_sectors_pdcsap": len(pdcsap_list),
    "n_sectors_ffi": len(ffi_list),
    "sectors_pdcsap": [int(lc.meta.get("sector", -1)) for lc in pdcsap_list],
    "sectors_ffi": [int(lc.meta.get("sector", -1)) for lc in ffi_list],
    "n_points_raw": int(len(stitched_raw)),
    "n_points_flat": int(len(stitched_flat)),
    "rms_raw_ppm": rms_ppm(stitched_raw.flux.value),
    "rms_flat_ppm": rms_ppm(stitched_flat.flux.value),
    "cdpp1h_flat_ppm": approx_cdpp_ppm(stitched_flat, hours=1.0),
    "flatten_window_days": WIN_DAYS,
    "notes": "PDCSAP preferred; FFI(TESSCut) only when SPOC PDCSAP is missing."
}

# =======================
# 4) Figure + JSON output
# =======================
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12,6), dpi=140, sharex=True)
t0 = 2457000.0

ax1.scatter(stitched_raw.time.value, stitched_raw.flux.value, s=2, alpha=0.35)
ax1.set_ylabel("Flux (norm)")
ax1.set_title(f"{TARGET_TOI} ({TARGET}) — Stitched RAW")

ax2.scatter(stitched_flat.time.value, stitched_flat.flux.value, s=2, alpha=0.45, label="flattened")
ax2.set_xlabel("Time [BTJD]")
ax2.set_ylabel("Flux (norm)")
ax2.set_title(f"{TARGET_TOI} — Stitched FLAT (window={WIN_DAYS:.2f} d)")
ax2.legend(loc="best")

# keep y-axes sensible even if an FFI chunk is wild
y1_lo, y1_hi = robust_ylim(stitched_raw.flux.value)
y2_lo, y2_hi = robust_ylim(stitched_flat.flux.value)
ax1.set_ylim(y1_lo, y1_hi)
ax2.set_ylim(y2_lo, y2_hi)

fig.tight_layout()
fig.savefig(OUT_FIG); plt.close(fig)

with open(OUT_JSON, "w") as f:
    json.dump(qc, f, indent=2)

print("\n== DONE ==")
print(f"Saved figure: {OUT_FIG}")
print(f"Saved JSON  : {OUT_JSON}")
print("Summary:", qc)

Target: TOI 1801.01 (TIC 119584412)
 PDCSAP S22: ok (N=16101)
 PDCSAP S49: ok (N=13272)
  FFI S22: skip (already have PDCSAP)
  FFI S49: skip (already have PDCSAP)

== DONE ==
Saved figure: figures/TIC119584412_download_clean.png
Saved JSON  : results/TIC119584412_download_clean_summary.json
Summary: {'target_toi': 'TOI 1801.01', 'target_tic': 119584412, 'n_sectors_pdcsap': 2, 'n_sectors_ffi': 0, 'sectors_pdcsap': [22, 49], 'sectors_ffi': [], 'n_points_raw': 29370, 'n_points_flat': 29370, 'rms_raw_ppm': 1356.3033426180482, 'rms_flat_ppm': 1096.2121625978075, 'cdpp1h_flat_ppm': 251.0249996146523, 'flatten_window_days': 1.0, 'notes': 'PDCSAP preferred; FFI(TESSCut) only when SPOC PDCSAP is missing.'}


In [10]:
# === Per-sector QA (robust: handles Time/Quantity types) ===
import numpy as np, matplotlib.pyplot as plt, pathlib, json
import lightkurve as lk

TIC = TARGET_TIC
BASE = f"TIC{TIC}"
pathlib.Path("figures").mkdir(exist_ok=True)
pathlib.Path("results").mkdir(exist_ok=True)

def as_float_array(x):
    """Return a float numpy array from Lightkurve/astropy containers."""
    # Try .value (Quantity/Time), else assume it's already an array-like
    try:
        x = x.value
    except Exception:
        pass
    return np.asarray(x, dtype=float)

def rms_ppm(y):
    y = as_float_array(y)
    return float(np.nanstd(y) * 1e6)

def cdpp_ppm(time, flux, hours=1.0):
    t = as_float_array(time)
    f = as_float_array(flux)
    dt = np.nanmedian(np.diff(t))
    if not np.isfinite(dt) or dt <= 0:
        return float("nan")
    n = max(1, int(round((hours/24.0)/dt)))
    run = np.convolve(f - np.nanmedian(f), np.ones(n)/n, mode="valid")
    return float(np.nanstd(run)*1e6)

def build_quality_mask(lc, quality_bitmask):
    q = getattr(lc, "quality", None)
    if q is not None and len(q) == len(lc.flux):
        try:
            return lk.utils.TessQualityFlags.create_quality_mask(q, bitmask=quality_bitmask)
        except Exception:
            return np.ones(len(lc.flux), dtype=bool)
    return np.ones(len(lc.flux), dtype=bool)

entries = []
for src_list in (pdcsap_list, ffi_list):
    for lc in src_list:
        # sector & source tags
        try:
            sector = int(lc.meta.get("sector", -1))
        except Exception:
            sector = -1
        src_lbl = "PDCSAP" if lc.meta.get("source") == "PDCSAP" else "FFI"

        # arrays
        t_all = as_float_array(lc.time)
        f_all = as_float_array(lc.flux)

        # masks
        qmask  = build_quality_mask(lc, QUALITY)
        finite = np.isfinite(t_all) & np.isfinite(f_all)
        m = qmask & finite

        N_all = int(len(f_all))
        N_use = int(np.count_nonzero(m))
        frac_bad = float(1 - (N_use / max(N_all, 1)))

        # plot with plain arrays
        fig, ax = plt.subplots(1, 1, figsize=(11, 3), dpi=140)
        ax.plot(t_all[m], f_all[m], ".", ms=2, alpha=0.7)
        ax.set_title(f"{TARGET_TOI} S{sector if sector!=-1 else '?'} — {src_lbl}")
        ax.set_xlabel("Time [BTJD]"); ax.set_ylabel("Flux (norm)")
        out_png = f"figures/QA_{BASE}_S{sector if sector!=-1 else 'u'}.png"
        fig.tight_layout(); fig.savefig(out_png); plt.close(fig)

        # metrics
        entries.append({
            "sector": sector,
            "source": src_lbl,
            "N_all": N_all,
            "N_use": N_use,
            "frac_bad": round(frac_bad, 4),
            "rms_ppm": rms_ppm(f_all[m]),
            "cdpp1h_ppm": cdpp_ppm(t_all[m], f_all[m]),
        })

if not entries:
    raise RuntimeError("No sectors could be processed for QA (empty entries).")

# sort & save summary
entries = sorted(entries, key=lambda d: (d["sector"], d["source"]))
out_json = f"results/QA_{BASE}_summary.json"
with open(out_json, "w") as f:
    json.dump(entries, f, indent=2)

print(f"Per-sector QA complete: saved {len(entries)} plots to figures/ and summary to {out_json}")
for e in entries:
    cdpp_int = int(e['cdpp1h_ppm']) if np.isfinite(e['cdpp1h_ppm']) else 'nan'
    print(f" S{e['sector']}: {e['source']}, N_use={e['N_use']}/{e['N_all']}, "
          f"frac_bad={e['frac_bad']}, cdpp1h_ppm={cdpp_int}")

Per-sector QA complete: saved 2 plots to figures/ and summary to results/QA_TIC119584412_summary.json
 S22: PDCSAP, N_use=16101/16101, frac_bad=0.0, cdpp1h_ppm=740
 S49: PDCSAP, N_use=13272/13272, frac_bad=0.0, cdpp1h_ppm=937
