In [None]:
%load_ext autoreload

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv("../.env")

In [None]:
%autoreload 2

import json
from pathlib import Path

import tqdm
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import seaborn as sns
import yaml
from PIL import Image
import datetime

from estuary.util import false_color, masked_contrast_stretch, broad_band
from estuary.model.data import parse_dt_from_pth

In [None]:
# SkySat labels: derive the acquisition time from each GeoTIFF path, order the rows
# chronologically within every region, and add a binary label (1 when the label text
# contains "open", else 0).
ss_df = (
    pd.read_csv("/Volumes/x10pro/estuary/skysat/labels.csv")
    .assign(acquired=lambda d: d.source_tif.apply(lambda a: parse_dt_from_pth(Path(a))))
    .sort_values(["region", "acquired"])
    .reset_index(drop=True)
    .assign(simple_label=lambda d: d.label.apply(lambda a: int("open" in a)))
)
ss_df.head()

In [None]:
# Dove labels, prepared exactly like the SkySat labels: acquisition time from the
# GeoTIFF path, rows ordered by region then time, plus a binary "open" label.
dd_df = (
    pd.read_csv("/Volumes/x10pro/estuary/dove/labels.csv")
    .assign(acquired=lambda d: d.source_tif.apply(lambda a: parse_dt_from_pth(Path(a))))
    .sort_values(["region", "acquired"])
    .reset_index(drop=True)
    .assign(simple_label=lambda d: d.label.apply(lambda a: int("open" in a)))
)
dd_df.head()

In [None]:
# Maximum time gap allowed between a Dove and a SkySat acquisition of the same region.
tol = pd.Timedelta("14h")

# Every Dove label is joined with every SkySat label of its region; only pairs whose
# acquisition times are at most `tol` apart (in either direction) are kept.
tmp = pd.merge(dd_df, ss_df, on="region", suffixes=("_dd", "_ss"))
mask = (tmp["acquired_dd"] - tmp["acquired_ss"]).abs() <= tol
pairs = tmp[mask].sort_values(["region", "acquired_dd", "acquired_ss"])

# Agreement flags: full label text, and the binary open / not-open label.
pairs = pairs.assign(
    match=pairs.label_dd == pairs.label_ss,
    simple_match=pairs.simple_label_dd == pairs.simple_label_ss,
)

pairs.head()

In [None]:
# Share of Dove/SkySat pairs whose full label strings are identical.
pairs.match.mean()

In [None]:
# Share of Dove/SkySat pairs whose binary open / not-open labels agree.
pairs.simple_match.mean()

In [None]:
# Display the full table of matched pairs (the frame's head is already shown above).
pairs

In [None]:
# Per-folder DataFrames accumulated by the directory walk below, concatenated afterwards.
all_clear_dfs = []  # rows read from clear_images_to_download.csv
all_dfs = []  # rows of images_to_download.csv that are also listed as clear
all_to_download_dfs = []  # clear rows whose asset_id is missing from images_to_download.csv

def parse_dt_from_asset_id(asset_id: str) -> datetime.datetime:
    """Return the acquisition time encoded in an asset id.

    The first two underscore-separated fields of ``asset_id`` are read as
    ``YYYYMMDD`` and ``HHMMSS``; any further fields are ignored.

    Raises:
        ValueError: if the leading fields do not match ``%Y%m%d_%H%M%S``.
    """
    leading_fields = asset_id.split("_", 2)[:2]
    stamp = "_".join(leading_fields)
    return datetime.datetime.strptime(stamp, "%Y%m%d_%H%M%S")

# Walk every (region, year, month, sensor) results folder and collect three things:
#   * all_clear_dfs       - every row of clear_images_to_download.csv
#   * all_dfs             - rows of images_to_download.csv whose asset_id is also listed as clear
#   * all_to_download_dfs - clear rows whose asset_id is absent from images_to_download.csv
for region_p in Path("/Volumes/x10pro/estuary/ca_grids").iterdir():
    region = region_p.stem
    for year in range(2017, 2026):
        for month in range(1, 13):
            for dove in ["superdove", "dove"]:
                results_dir = (
                    Path(f"/Volumes/x10pro/estuary/ca_all/{dove}/results")
                    / str(year)
                    / str(month)
                    / str(region)
                )

                pth = results_dir / "clear_images_to_download.csv"
                if not pth.exists():
                    continue
                df = pd.read_csv(pth)
                df["dove"] = dove
                df["region"] = int(region)
                df["year"] = int(year)
                df["month"] = int(month)
                df["acquired"] = df.asset_id.apply(parse_dt_from_asset_id)

                all_clear_dfs.append(df)

                pth = results_dir / "images_to_download.csv"
                if not pth.exists():
                    # NOTE(review): when this file is missing, nothing is added to
                    # all_to_download_dfs for the folder even though every clear
                    # scene is then un-downloaded - confirm this is intended.
                    continue
                ddf = pd.read_csv(pth)
                ddf["dove"] = dove
                ddf["region"] = int(region)
                ddf["year"] = int(year)
                ddf["month"] = int(month)
                # Parse the timestamps from ddf's OWN asset ids. The previous code used
                # df.asset_id, so values were index-aligned from the other frame: rows got
                # the timestamp of an unrelated asset (or NaT when ddf was longer than df).
                ddf["acquired"] = ddf.asset_id.apply(parse_dt_from_asset_id)
                downloaded_asset_ids = ddf.asset_id.tolist()
                ddf = ddf[ddf.asset_id.isin(df.asset_id)].copy()

                all_dfs.append(ddf)

                all_to_download_dfs.append(df[~df.asset_id.isin(downloaded_asset_ids)].copy())

# Stack the per-folder frames; the two bookkeeping columns are dropped from the
# clear and available tables.
clear_df = pd.concat(all_clear_dfs).drop(columns=["ordered_idx", "capture_datetime"])
available_df = pd.concat(all_dfs).drop(columns=["ordered_idx", "capture_datetime"])
to_download_df = pd.concat(all_to_download_dfs)

print(len(clear_df), len(available_df))

clear_df.head(3)

In [None]:
# Reload the SkySat labels (this replaces the `ss_df` built earlier in the notebook):
# add acquisition time plus its month/year, drop the JPEG path column, and discard
# rows labelled "unsure".
ss_df = (
    pd.read_csv("/Volumes/x10pro/estuary/skysat/labels.csv")
    .assign(
        acquired=lambda d: d.source_tif.apply(lambda a: parse_dt_from_pth(Path(a))),
        month=lambda d: d.acquired.dt.month,
        year=lambda d: d.acquired.dt.year,
    )
    .drop(columns=["source_jpeg"])
    .loc[lambda d: d.label != "unsure"]
    .copy()
)
ss_df.head()

In [None]:
# Maximum time gap allowed between a clear Dove scene and a SkySat label.
tol = pd.Timedelta("10h")

# Join every clear Dove scene with every SkySat label of its region and record
# the absolute time gap between the two acquisitions.
tmp = clear_df.merge(ss_df, on="region", suffixes=("_dd", "_ss"))
tmp["acquired_diff"] = (tmp.acquired_dd - tmp.acquired_ss).abs()
mask = tmp["acquired_diff"] <= tol

# Keep the pairs inside the window; the Dove-side year/month become the plain
# `year` / `month` columns and the SkySat-side copies are dropped.
dove_pairs = (
    tmp[mask]
    .sort_values(["region", "acquired_dd", "acquired_ss"])
    .drop(columns=["year_ss", "month_ss"])
    .rename(columns={"year_dd": "year", "month_dd": "month"})
)

dove_pairs

In [None]:
# Reduce a file name to its asset id by cutting everything from "_3B_" onwards,
# e.g. 20230901_182511_53_2486_3B_udm2.tif -> 20230901_182511_53_2486
def cleaned_asset_id(filename: str) -> str:
    """Return the stem of ``filename`` up to (not including) the first ``_3B_``.

    Directory components and the file extension are discarded first. A stem
    without ``_3B_`` is returned unchanged.
    """
    stem = Path(filename).stem
    asset_id, _sep, _suffix = stem.partition("_3B_")
    return asset_id

# Dove labels keyed by asset id. `acquired` is parsed from the CSV's own column
# (unparseable values become NaT); year/month are derived from it.
dove_labels = pd.read_csv("/Volumes/x10pro/estuary/dove/labels.csv").assign(
    acquired=lambda d: pd.to_datetime(d["acquired"], errors="coerce"),
    asset_id=lambda d: d.source_tif.apply(cleaned_asset_id),
    year=lambda d: d.acquired.dt.year,
    month=lambda d: d.acquired.dt.month,
)
dove_labels.head(3)

In [None]:
# Maps (region, year, month, sensor) -> list of asset ids, filled only for folders
# that contain at least one matching scene.
cloudy_downloads = {}
clear_downloads = {}
for region_p in Path("/Volumes/x10pro/estuary/ca_grids").iterdir():
    region = region_p.stem
    for year in range(2017, 2026):
        for month in range(1, 13):
            for dove in ["superdove", "dove"]:
                pth = (
                    Path(f"/Volumes/x10pro/estuary/ca_all/{dove}/results")
                    / str(year)
                    / str(month)
                    / str(region)
                    / "filtered_search_results.json"
                )
                if not pth.exists():
                    continue
                with open(pth) as f:
                    data = json.load(f)

                cloudy, clear = [], []
                for d in data:
                    props = d['properties']
                    asset_id = d['id']
                    # All four properties are read up front for every scene.
                    visible_pct = props['visible_percent']
                    clear_pct = props['clear_percent']
                    anomalous_pixels = props["anomalous_pixels"]
                    visible_confidence_percent = props['visible_confidence_percent']

                    if visible_pct == 0:
                        # Nothing visible at all -> counted as cloudy.
                        cloudy.append(asset_id)
                        continue
                    is_pristine = (
                        clear_pct == 100
                        and visible_pct == 100
                        and anomalous_pixels < 0.009
                        and visible_confidence_percent > 80
                    )
                    if is_pristine:
                        clear.append(asset_id)

                group_key = (region, year, month, dove)
                if cloudy:
                    cloudy_downloads[group_key] = cloudy
                if clear:
                    clear_downloads[group_key] = clear

len(cloudy_downloads), len(clear_downloads)

In [None]:
# NOTE(review): `source_base` is not defined in any visible cell of this notebook, so this
# scratch expression raises NameError on a fresh kernel. Define it or delete the cell.
(source_base / "test.tif").parents[5]

In [None]:
# Index every surface-reflectance GeoTIFF on disk by the three directory names
# above its `files/` folder plus the cleaned asset id.
files = list(Path("/Volumes/x10pro/estuary/ca_all/").glob("*ove/results/*/*/*/files/*_SR_*.tif"))
ff = {}
for p in files:
    folders = p.parents
    ff[(folders[3].stem, folders[2].stem, folders[1].stem, cleaned_asset_id(p))] = p
len(ff)

In [None]:
# One record per clear asset id that has a matching GeoTIFF in `ff`;
# ids without a file on disk are skipped.
clears = []
for (region, year, month, dove), asset_ids in clear_downloads.items():
    key_prefix = (str(year), str(month), str(region))
    for asset_id in asset_ids:
        key = key_prefix + (asset_id,)
        filename = ff.get(key)
        if filename is not None:
            clears.append({
                "region": int(region),
                "year": int(year),
                "month": int(month),
                "instrument": dove,
                "source_tif": filename,
                "acquired": parse_dt_from_asset_id(asset_id),
                "asset_id": asset_id,
            })

clear_labels_df = pd.DataFrame(clears)
print(len(clear_labels_df))
clear_labels_df.head()

In [None]:
# Persist the list of clear scenes that have a GeoTIFF on disk (row index not written).
clear_labels_df.to_csv("/Volumes/x10pro/estuary/quality_dataset/possible_files.csv", index=False)

In [None]:
from estuary.util import masked_contrast_stretch
import cv2
import pywt

def denoise_mild(y: np.ndarray, method: str = "gaussian") -> np.ndarray:
    """Lightly smooth ``y``.

    ``method="bilateral"`` applies ``cv2.bilateralFilter`` (d=5, sigmaColor=0.1,
    sigmaSpace=3; the color sigma assumes values on a [0, 1] scale). Any other
    ``method`` falls back to a 5x5 Gaussian blur with sigma 0.8.
    """
    if method != "bilateral":
        # Default path: mild Gaussian blur.
        return cv2.GaussianBlur(y, (5,5), 0.8)
    # Edge-preserving alternative with mild parameters.
    return cv2.bilateralFilter(y, d=5, sigmaColor=0.1, sigmaSpace=3)

def robust_std(x: np.ndarray) -> float:
    """Return a MAD-based robust standard deviation of ``x``.

    Computes the median absolute deviation around the median (NaNs are ignored
    by both medians) and scales it by 1.4826.
    """
    center = np.nanmedian(x)
    abs_dev = np.abs(x - center)
    return float(np.nanmedian(abs_dev) * 1.4826)

def gradient_mag(y: np.ndarray) -> np.ndarray:
    """Return the per-pixel gradient magnitude of ``y``.

    Horizontal and vertical derivatives come from 3x3 Sobel filters with
    float32 output; the magnitude is sqrt(dx^2 + dy^2).
    """
    dx = cv2.Sobel(y, cv2.CV_32F, 1, 0, ksize=3)
    dy = cv2.Sobel(y, cv2.CV_32F, 0, 1, ksize=3)
    return np.sqrt(np.square(dx) + np.square(dy))

def sharpness_score(x: np.ndarray) -> float:
    """Return the variance of the Laplacian of a lightly blurred copy of ``x``.

    ``x`` is smoothed with a 5x5 Gaussian (sigma 1.0) before the float32
    Laplacian is taken; a larger variance means more high-frequency content.
    """
    smoothed = cv2.GaussianBlur(x, (5,5), 1.0)
    laplacian = cv2.Laplacian(smoothed, cv2.CV_32F)
    return float(laplacian.var())

def noise_score(x: np.ndarray) -> float:
    """Estimate the noise level of ``x`` from its high-frequency residual.

    The residual is ``x`` minus a 5x5 Gaussian-blurred copy (sigma 1.0). The
    score is the median absolute deviation of that residual scaled by 1.4826,
    i.e. the same MAD-based robust sigma estimate that ``robust_std`` uses.

    Returns:
        The estimate as a built-in ``float``, matching the annotation and the
        return type of ``sharpness_score`` / ``robust_std`` (the expression
        itself evaluates to a numpy scalar).
    """
    x_blur = cv2.GaussianBlur(x, (5,5), 1.0)
    residual = x - x_blur
    med = np.median(residual)
    mad = np.median(np.abs(residual - med))
    return float(1.4826 * mad)  # robust σ estimate

# Score columns start at 0.0 and are filled in row by row below.
clear_labels_df["sharp"] = 0.0
clear_labels_df["noise"] = 0.0


import pandas as pd, scipy.stats as st
from estuary.util.constants import FALSE_COLOR_4, FALSE_COLOR_8

for row in clear_labels_df.itertuples():
    with rasterio.open(row.source_tif) as src:
        data = src.read()
        nodata = src.read(1, masked=True).mask

    # 4-band files use FALSE_COLOR_4, everything else FALSE_COLOR_8.
    rgb_idx = FALSE_COLOR_4 if len(data) == 4 else FALSE_COLOR_8
    r, g, b = (data[i] for i in rgb_idx)

    # Weighted single-channel combination of the three selected bands,
    # contrast-stretched over the valid (non-nodata) pixels.
    luma = 0.2989 * r + 0.5870 * g + 0.1140 * b
    img = masked_contrast_stretch(luma, ~nodata)

    clear_labels_df.at[row.Index, "sharp"] = sharpness_score(img)
    clear_labels_df.at[row.Index, "noise"] = noise_score(img)


# Standardise both scores; clarity rewards sharpness and penalises noise.
clear_labels_df["sharp_z"] = st.zscore(clear_labels_df["sharp"])
clear_labels_df["noise_z"] = st.zscore(clear_labels_df["noise"])
clear_labels_df["clarity_score"] = clear_labels_df["sharp_z"].sub(clear_labels_df["noise_z"])


In [None]:
# Number of scenes kept per region.
N = 20

# Sort everything descending so the first row of each (region, year, month)
# group is its highest clarity_score, then keep only that first row.
deduped = clear_labels_df.sort_values(
    by=["region", "year", "month", "clarity_score"],
    ascending=[False, False, False, False],
).drop_duplicates(subset=["region", "year", "month"], keep="first")

# Within each region rank by clarity (best first) and keep the top N.
top_lap = deduped.sort_values(["region", "clarity_score"], ascending=[True, False])
top_lap = top_lap.groupby("region", group_keys=False).head(N)

top_lap

In [None]:
# Write the selected clear scenes. No visible cell creates a "lap" column (the score
# columns are sharp / noise / sharp_z / noise_z / clarity_score), so an unconditional
# drop raised KeyError; errors="ignore" drops it only when it happens to be present.
top_lap.drop(columns=['lap'], errors="ignore").to_csv("/Volumes/x10pro/estuary/quality_dataset/clear.csv", index=False)

In [None]:
# Eyeball one selected scene (positional row 197) as a false-color image.
a = top_lap.iloc[197].source_tif
with rasterio.open(a) as src:
    plt.figure()
    bands = src.read()
    nodata_mask = src.read(1, masked=True).mask
    plt.imshow(false_color(bands, nodata_mask))

In [None]:
# One record per cloudy asset id that has a matching GeoTIFF in `ff`;
# ids without a file on disk are skipped.
cloudys = []
for (region, year, month, dove), asset_ids in cloudy_downloads.items():
    key_prefix = (str(year), str(month), str(region))
    for asset_id in asset_ids:
        key = key_prefix + (asset_id,)
        filename = ff.get(key)
        if filename is not None:
            cloudys.append({
                "region": int(region),
                "year": int(year),
                "month": int(month),
                "instrument": dove,
                "source_tif": filename,
                "acquired": parse_dt_from_asset_id(asset_id),
                "asset_id": asset_id,
            })

cloudy_labels_df = pd.DataFrame(cloudys)
print(len(cloudy_labels_df))
cloudy_labels_df.head()

In [None]:
# Eyeball one cloudy scene (positional row 82) as a false-color image.
a = cloudy_labels_df.iloc[82].source_tif
with rasterio.open(a) as src:
    plt.figure()
    bands = src.read()
    nodata_mask = src.read(1, masked=True).mask
    plt.imshow(false_color(bands, nodata_mask))

In [None]:
# Quick look at the current SkySat label table.
ss_df.head()

In [None]:
# Histogram of how many cloudy asset ids each (region, year, month, sensor) group holds.
plt.hist([len(v) for v in cloudy_downloads.values()])

In [None]:

# Write one cloudy_images_to_download.csv per (sensor, year, month, region) folder,
# listing every cloudy asset id with include_image set to True.
for (region, year, month, dove), cloudy in cloudy_downloads.items():
    out_dir = (
        Path("/Volumes/x10pro/estuary/ca_cloudy/")
        / str(dove)
        / "results"
        / str(year)
        / str(month)
        / str(region)
    )
    save_pth = out_dir / "cloudy_images_to_download.csv"
    out_dir.mkdir(exist_ok=True, parents=True)
    df = pd.DataFrame({"asset_id": cloudy}).assign(include_image=True)
    df.to_csv(save_pth, index=False)

In [None]:
# NOTE(review): `new_keep` is not defined in any visible cell of this notebook; this
# cell raises NameError on a fresh kernel. Define it or delete the cell.
new_keep

In [None]:
# Labels for the full Dove archive; later cells attach cloud statistics to this frame.
labels = pd.read_csv("/Volumes/x10pro/estuary/ca_all/dove/labels.csv")
labels.head()

In [None]:
def ccc(pth):
    """Debug helper: print the search-result ``properties`` of the scene behind ``pth``.

    Reads ``filtered_search_results.json`` two directories above the file and
    picks the first entry whose ``id`` occurs in the file stem. Returns None;
    raises StopIteration when no entry matches.
    """
    pth = Path(pth)
    results_json = pth.parent.parent / "filtered_search_results.json"
    with open(results_json) as f:
        data = json.load(f)
    dd = next(filter(lambda d: d["id"] in pth.stem, data))
    print(dd['properties'])
    return


ccc(labels.source_tif.iloc[0])

In [None]:
def _scene_properties(pth):
    """Return the search-result ``properties`` dict of the scene behind ``pth``.

    Reads ``filtered_search_results.json`` two directories above the file and
    picks the first entry whose ``id`` occurs in the file stem.

    Raises:
        StopIteration: if no entry matches the file.
    """
    pth = Path(pth)
    with open(pth.parent.parent / "filtered_search_results.json") as f:
        data = json.load(f)
    dd = next(d for d in data if d["id"] in pth.stem)
    return dd["properties"]


def scene_clear_percent(pth):
    """Return the scene-level ``clear_percent`` for the file ``pth``.

    A leftover debug ``print`` followed by a bare ``return`` previously made
    this function always return None; both are removed.
    """
    return _scene_properties(pth)["clear_percent"]


def scene_visible_percent(pth):
    """Return the scene-level ``visible_percent`` for the file ``pth``."""
    return _scene_properties(pth)["visible_percent"]


labels["scene_visible_percent"] = labels.source_tif.apply(scene_visible_percent)
# Later cells filter on labels.scene_clear_percent, which was never created before.
labels["scene_clear_percent"] = labels.source_tif.apply(scene_clear_percent)

In [None]:
def add_clear_percent(pth):
    """Return the percentage of valid pixels flagged clear in the file's UDM2 mask.

    The mask is the first ``*udm2_clip.tif`` beside ``pth`` whose stem contains
    the first two underscore-separated fields of ``pth``'s stem. Bands 1 and 7
    are read; pixels where band 7 is below 1 are treated as nodata and excluded
    from both numerator and denominator.
    NOTE(review): presumably band 1 is the clear mask and band 7 the confidence
    band of a Planet UDM2 file - confirm against the product specification.

    Raises:
        StopIteration: if no matching mask file exists.
    """
    pth = Path(pth)

    scene_key = "_".join(pth.stem.split("_")[:2])
    udm_pth = next(p for p in pth.parent.glob("*udm2_clip.tif") if scene_key in p.stem)

    with rasterio.open(udm_pth) as src:
        clear, conf = src.read([1, 7])

    nodata = conf < 1
    clear[nodata] = 0  # never count nodata pixels as clear
    n_valid = (~nodata).sum()

    return 100 * clear.sum() / n_valid


labels["clear_percent"] = labels.source_tif.apply(add_clear_percent)

In [None]:
# 1 where the label is exactly "unsure", 0 otherwise.
labels["unsure"] = labels.label.eq("unsure").astype(int)
labels.head()

In [None]:
# Per-file clear_percent against the scene-level visible percent, colored by the
# "unsure" flag (0 = steelblue, 1 = orange).
sns.scatterplot(
    data=labels,
    x="clear_percent",
    y="scene_visible_percent",
    hue="unsure",
    palette={0: "steelblue", 1: "orange"},
    alpha=0.6,
)

In [None]:
# Distribution of the scene-level visible percent, one density-normalised
# histogram per "unsure" class.
sns.histplot(
    data=labels,
    x="scene_visible_percent",
    hue="unsure",
    bins=20,
    stat="density",  # or "probability"
    common_norm=False,  # don't normalize across classes
    palette={0: "steelblue", 1: "orange"},
    alpha=0.6,
)

# The x-axis label previously read "scene_clear_percent" although the plotted
# column (and the title) is scene_visible_percent.
plt.xlabel("scene_visible_percent")
plt.ylabel("Density (normalized per class)")
plt.title("Normalized distribution of scene_visible_percent by 'unsure' label")
plt.tight_layout()
plt.show()

In [None]:
# Distribution of the per-file clear_percent, one density-normalised histogram
# per "unsure" class.
sns.histplot(
    data=labels,
    x="clear_percent",
    hue="unsure",
    bins=20,
    stat="density",  # or "probability"
    common_norm=False,  # don't normalize across classes
    palette={0: "steelblue", 1: "orange"},
    alpha=0.6,
)

plt.xlabel("clear_percent")
plt.ylabel("Density (normalized per class)")
plt.title("Normalized distribution of clear_percent by 'unsure' label")
plt.tight_layout()
plt.show()

In [None]:
def rrr(lll):
    """Sweep a ``clear_percent`` threshold and count which rows it would flag.

    A row is flagged when ``scene_clear_percent < 10`` or ``clear_percent`` is
    below the threshold; thresholds run over 50, 60, ..., 100.

    Args:
        lll: DataFrame with ``unsure`` (0/1), ``scene_clear_percent`` and
            ``clear_percent`` columns.

    Returns:
        DataFrame with one row per threshold and columns ``clear_pct``,
        ``u0`` / ``u1`` (flagged rows with unsure == 0 / 1) and ``unsure_pct``
        (share of flagged rows that are unsure, in percent; 0 when nothing is
        flagged).
    """
    sure_rows = lll.unsure == 0
    unsure_rows = lll.unsure == 1
    low_scene_clear = lll.scene_clear_percent < 10

    records = []
    for clear_pct in range(50, 110, 10):
        flagged = low_scene_clear | (lll.clear_percent < clear_pct)
        n_sure = int((sure_rows & flagged).sum())
        n_unsure = int((unsure_rows & flagged).sum())
        records.append({
            "clear_pct": clear_pct,
            "u0": n_sure,
            "u1": n_unsure,
            # max(1, ...) guards the division when no row is flagged
            "unsure_pct": 100 * n_unsure / max(1, (n_sure + n_unsure)),
        })

    return pd.DataFrame(records)

# Run the threshold sweep on the labelled files and plot, for each threshold,
# the unsure share of flagged rows against the number of unsure rows flagged.
a = rrr(labels)
a.plot.scatter(y="unsure_pct", x="u1")

In [None]:
# Thresholds where more than 30% of flagged rows are unsure, most unsure rows first.
a[a.unsure_pct > 30].sort_values(["u1", "unsure_pct"], ascending=False).head(10)

In [None]:
# Same clear_percent histogram as before, restricted to rows whose
# scene_clear_percent is strictly greater than 10.
sns.histplot(
    data=labels[
    (labels.scene_clear_percent > 10)
    ],
    x="clear_percent",
    hue="unsure",
    bins=20,
    stat="density",  # or "probability"
    common_norm=False,  # don't normalize across classes
    palette={0: "steelblue", 1: "orange"},
    alpha=0.6,
)

plt.xlabel("clear_percent")
plt.ylabel("Density (normalized per class)")
plt.title("Normalized distribution of clear_percent by 'unsure' label")
plt.tight_layout()
plt.show()

In [None]:
# Scatter of the rows whose scene_clear_percent is at least 10.
# NOTE(review): the original cell was cut off inside the string literal (`x="`), which
# is a SyntaxError. The axes below mirror the earlier clear_percent vs
# scene_visible_percent scatter plot - confirm they are the intended ones.
labels[
    (labels.scene_clear_percent >= 10)
].plot.scatter(x="clear_percent", y="scene_visible_percent")

In [None]:
# Show the next row of an iterator as an image (false color for 4-band files,
# broad band otherwise).
# NOTE(review): `ii` is not defined in any visible cell; presumably a row iterator
# such as `labels.iterrows()` created interactively - define it before this cell.
_, row = next(ii)
print(Path(row.source_tif).stem)
print(row)
with rasterio.open(row.source_tif) as src:
    data = src.read(out_dtype=np.float32)
    # read_masks() returns a uint8 array (0 = nodata, 255 = valid). Bitwise-inverting
    # it gives another uint8 array rather than a boolean mask, so compare with 0 to get
    # the same kind of boolean nodata mask the other cells pass to these helpers.
    nodata = src.read_masks(1) == 0
    if len(data) == 4:
        img = false_color(data, nodata)
    else:
        img = broad_band(data, nodata)

plt.figure()
plt.imshow(img)

In [None]:
# Gather preds.csv from every 20251008* training run plus the last (by sorted
# path) 20251006* run, tagging each row with the run's hyper-parameters.
base = Path("/Users/kyledorman/data/results/estuary/train")
paths = list(base.glob("20251008*"))
paths.append(sorted(base.glob("20251006*"))[-1])

runs = []
for pth in paths:
    preds_csv = pth / "preds.csv"
    if not preds_csv.exists():
        continue  # run produced no predictions
    df = pd.read_csv(preds_csv)

    with open(pth / "cli_diff.yaml") as f:
        conf = yaml.safe_load(f)

    # Runs whose folder name contains 20251003 are pinned to smooth_factor 0.0;
    # all others take the value recorded in cli_diff.yaml (default 0.0).
    df["smooth_factor"] = 0.0 if "20251003" in pth.name else conf.get("smooth_factor", 0.0)
    df["perch_smooth_factor"] = conf.get("perch_smooth_factor", 0.0)
    df["epochs"] = conf.get("epochs", 0)
    df["model"] = pth.name
    runs.append(df)

runs_df = pd.concat(runs, ignore_index=True)

runs_df.head()

In [None]:
# Mean of `correct` (i.e. accuracy) per region, run and dataset split.
grouped_results = (
    runs_df.sort_values(by=["region", "epochs", "perch_smooth_factor", "smooth_factor"])
    .groupby(by=["region", "model", "dataset"], as_index=False)["correct"]
    .mean()
    .rename(columns=dict(correct="accuracy"))
)

grouped_results.head()

In [None]:
plt.figure(figsize=(8, 20))  # tall plot for 75 regions

df = grouped_results.copy()
df["region"] = df.region.apply(str)  # treat regions as categories, not numbers

# One horizontal boxplot layer per dataset split, drawn on the same axes.
# The test layer was previously labelled "Val", which duplicated the legend entry.
for dataset, color, label in [
    ("train", "lightblue", "Train"),
    ("val", "lightgreen", "Val"),
    ("test", "pink", "Test"),
]:
    sns.boxplot(
        data=df[df["dataset"] == dataset],
        y="region",
        x="accuracy",
        color=color,
        showfliers=False,
        orient="h",
        label=label,
    )

plt.title("Accuracy distribution")
plt.xlabel("Accuracy")
plt.ylabel("Region")
plt.legend()
plt.tight_layout()

plt.show()

In [None]:
# The ten regions with the lowest mean `correct` over all runs and splits.
a = runs_df.groupby("region")["correct"].mean()
worst_regions = a.sort_values().head(10).index.tolist()
worst_regions

In [None]:
# Accuracy per (model, region) over every non-train row.
df = (
    runs_df[runs_df.dataset != "train"]
    .groupby(["model", "region"], as_index=False)
    .correct.mean()
    .rename(columns={"correct": "accuracy"})
)
# Ten regions whose worst per-model accuracy is lowest.
a = df.groupby("region").accuracy.min()
worst_regions = a.sort_values().iloc[:10].index.tolist()

adf = df[df.region.isin(worst_regions)].copy().sort_values("model")
adf["region"] = adf.region.apply(str)

# Spread of accuracy across those regions, one box per model.
# NOTE(review): the legend label says "Val" but the data holds every non-train row.
sns.boxplot(
    data=adf,
    y="model",
    x="accuracy",
    color="lightblue",
    showfliers=False,
    orient="h",
    label="Val",
    # color="lightblue", marker="D", s=30, label="Train"
)

# sns.boxplot(
#     data=df[df["dataset"]=="test"],
#     y="model", x="accuracy",
#     color="pink", showfliers=False, orient="h", label="Test"
#     # color="lightblue", marker="D", s=30, label="Train"
# )

# sns.boxplot(
#     data=df[df["dataset"]=="train"],
#     y="model", x="accuracy",
#     color="green", showfliers=False, orient="h", label="Train"
#     # color="lightblue", marker="D", s=30, label="Train"
# )

In [None]:
# Site codes listed in the skip file.
skipped_regions = pd.read_csv("/Volumes/x10pro/estuary/geos/skipped_regions.csv")["Site code"].to_list()

# Site geometries without the skipped sites, indexed by site code.
gdf = (
    gpd.read_file("/Users/kyledorman/data/estuary/geos/ca_data_w_usgs.geojson")
    .loc[lambda g: ~g["Site code"].isin(skipped_regions)]
    .copy()
    .set_index("Site code")
)
gdf.head()

In [None]:
# 224x224 false-color thumbnail of the first pansharpened SkySat clip found for region 43.
plt.figure()
pth = list(
    Path("/Volumes/x10pro/estuary/skysat/results/").glob("*/43/files/*pansharpened_clip.tif")
)[0]
with rasterio.open(pth) as src:
    data = src.read()
    nodata = src.read(1, masked=True).mask
img = Image.fromarray(false_color(data, nodata)).resize((224, 224))
# .save("/Users/kyledorman/data/estuary/display/region_53.png")
plt.imshow(img)

In [None]:
# Rebuild the normalisation pipeline from stored statistics instead of fitting it:
# the fitted attributes of both transformers are assigned directly.
with open("/Volumes/x10pro/estuary/dataset/normalization/stats.json") as f:
    stats = json.load(f)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler

pt = PowerTransformer(standardize=False)
pt.lambdas_ = np.array(stats["lambdas"])

# NOTE: `st` is also the alias an earlier cell gives to scipy.stats; after this
# cell runs the name refers to the scaler until that cell is executed again.
st = StandardScaler()
st.mean_ = np.array(stats["means"])
st.scale_ = np.array(stats["stds"])

norm = Pipeline(steps=[("PowerTransformer", pt), ("StandardScaler", st)])

In [None]:
# First pansharpened SkySat clip found for region 43: show a contrast-stretched
# composite, then each band after normalisation with `norm`.
pth = list(
    Path("/Volumes/x10pro/estuary/skysat/results/").glob("*/43/files/*pansharpened_clip.tif")
)[0]
print(pth)
# `stats` is reloaded here but not referenced again in this cell.
with open("/Volumes/x10pro/estuary/dataset/normalization/stats.json") as f:
    stats = json.load(f)

plt.figure(figsize=(10, 10))

# Slots of the 8-band layout used for the 4 source bands (applied in reversed order).
EIGHT_TO_4 = (7, 5, 3, 1)
with rasterio.open(pth) as src:
    data = src.read(out_dtype=np.float32)
    nodata = src.read(1, masked=True).mask
    # Composite from source bands 2, 1, 0, stretched over the valid pixels.
    rgb = masked_contrast_stretch(data[[2, 1, 0]], ~nodata)
    plt.imshow(rgb.transpose((1, 2, 0)))

    # Scatter source band i into slot (1, 3, 5, 7)[i] of a zero-filled 8-band array
    # so the 8-band normaliser can be applied, then gather the same slots back.
    img = np.zeros((8, *data.shape[1:]), dtype=data.dtype)
    for i, b in enumerate(reversed(EIGHT_TO_4)):
        img[b] = data[i]
    shp = img.shape
    # norm works on (pixels, bands): flatten, transform, restore the band-first shape.
    img = norm.transform(img.reshape(len(img), -1).T).T.reshape(shp)
    img = np.array([img[i] for i in reversed(EIGHT_TO_4)])

# NOTE(review): titles assume the source band order is B, G, R, NIR - confirm.
bgrnir = ["B", "G", "R", "NIR"]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))

for i, ax in enumerate(axes.flatten()):
    ax.axis("off")
    ax.imshow(img[i])
    ax.set_title(bgrnir[i])
fig.tight_layout()
plt.show()

In [None]:
# First Dove surface-reflectance clip found for region 43: show each band after
# normalisation with `norm`.
pth = list(Path("/Volumes/x10pro/estuary/dove/results/").glob("*/*/43/files/*SR*clip.tif"))[0]
# `stats` is reloaded here but not referenced again in this cell.
with open("/Volumes/x10pro/estuary/dataset/normalization/stats.json") as f:
    stats = json.load(f)

print(pth)

# plt.figure()

# Slots of the 8-band layout used for the 4 source bands (applied in reversed order).
EIGHT_TO_4 = (7, 5, 3, 1)
with rasterio.open(pth) as src:
    data = src.read(out_dtype=np.float32)
    nodata = src.read(1, masked=True).mask
    # Computed but not displayed: the imshow call below is commented out.
    rgb = masked_contrast_stretch(data[[2, 1, 0]], ~nodata)
    # plt.imshow(rgb.transpose((1, 2, 0)))

    # Scatter source band i into slot (1, 3, 5, 7)[i] of a zero-filled 8-band array
    # so the 8-band normaliser can be applied, then gather the same slots back.
    img = np.zeros((8, *data.shape[1:]), dtype=data.dtype)
    for i, b in enumerate(reversed(EIGHT_TO_4)):
        img[b] = data[i]
    shp = img.shape
    # norm works on (pixels, bands): flatten, transform, restore the band-first shape.
    img = norm.transform(img.reshape(len(img), -1).T).T.reshape(shp)
    img = np.array([img[i] for i in reversed(EIGHT_TO_4)])

# NOTE(review): titles assume the source band order is B, G, R, NIR - confirm.
bgrnir = ["B", "G", "R", "NIR"]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))
for i, ax in enumerate(axes.flatten()):
    ax.axis("off")
    ax.imshow(img[i])
    ax.set_title(bgrnir[i])
fig.tight_layout()
plt.show()

In [None]:
# Second SuperDove surface-reflectance clip found for region 43: show a
# contrast-stretched composite, then every band after normalisation with `norm`.
pth = list(Path("/Volumes/x10pro/estuary/superdove/results/").glob("*/*/43/files/*SR*clip.tif"))[1]
# `stats` is reloaded here but not referenced again in this cell.
with open("/Volumes/x10pro/estuary/dataset/normalization/stats.json") as f:
    stats = json.load(f)

print(pth)

plt.figure()

with rasterio.open(pth) as src:
    img = src.read(out_dtype=np.float32)
    nodata = src.read(1, masked=True).mask
    # Composite from source bands 5, 3, 1, stretched over the valid pixels.
    rgb = masked_contrast_stretch(img[[5, 3, 1]], ~nodata)
    plt.imshow(rgb.transpose((1, 2, 0)))

    shp = img.shape
    # norm works on (pixels, bands): flatten, transform, restore the band-first shape.
    # The bands are passed straight through, with no re-slotting.
    img = norm.transform(img.reshape(len(img), -1).T).T.reshape(shp)

from estuary.util.constants import BAND_NAMES

fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(10, 20))

# One panel per band, titled with the matching BAND_NAMES entry.
for i, ax in enumerate(axes.flatten()):
    ax.axis("off")
    ax.imshow(img[i])
    ax.set_title(BAND_NAMES[i])
fig.tight_layout()
plt.show()

In [None]:
# Predictions over the full archive, ordered by region and acquisition time,
# with the acquisition year broken out for per-year summaries.
contig_preds = (
    pd.read_csv("/Volumes/x10pro/estuary/ca_all/preds.csv")
    .assign(acquired=lambda d: d.source_tif.apply(lambda a: parse_dt_from_pth(Path(a))))
    .sort_values(by=["region", "acquired"])
    .assign(year=lambda d: d.acquired.dt.year)
)
contig_preds.head()

In [None]:
# Mean of `correct` (accuracy) for every (year, region) pair.
acc_yr = (
    contig_preds.groupby(["year", "region"], as_index=False)["correct"]
    .mean()
    .rename(columns=dict(correct="accuracy"))
)

acc_yr.head()

In [None]:
# Largest region id present in the per-year accuracy table.
acc_yr.region.max()

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(9, 6))

# One line per region, years in ascending order.
for region, group in acc_yr.groupby("region"):
    ordered = group.sort_values("year")
    ax.plot(ordered["year"], ordered["accuracy"], marker="o", label=region)

ax.set(title="Accuracy Over Time by Region", xlabel="Year", ylabel="Accuracy")
# Legend sits outside the axes, to the right.
ax.legend(title="Region", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Spread of per-region accuracy within each year.
plt.figure(figsize=(8, 5))
ax = sns.boxplot(data=acc_yr, x="year", y="accuracy", color="skyblue", width=0.6)
ax.set_title("Accuracy per Year (All Regions Combined)")
ax.set_xlabel("Year")
ax.set_ylabel("Accuracy")
plt.tight_layout()
plt.show()