# Test

In [1]:
import pandas as pd
from pathlib import Path

QC_DIR = Path("datasets/processed/qc")

# 1) 看 daily 的列和基本情况
dfd = pd.read_parquet(QC_DIR / "greatlakes_daily_clean.parquet")
print("Daily columns:", dfd.columns.tolist())
print(dfd.head())

# 2) 每个 lake 的 CI 有多少非 NaN
print(dfd[["CI_mean", "CI_p90"]].isna().mean())        # NaN 比例
print(dfd.groupby("lake_id")[["CI_mean","CI_p90"]].count().head())

# 3) 看 monthly 的情况
dfm = pd.read_parquet(QC_DIR / "greatlakes_monthly_clean.parquet")
print("Monthly columns:", dfm.columns.tolist())
print(dfm[["CI_mean", "CI_p90"]].isna().mean())

Daily columns: ['lake_id', 'date', 'product', 'CI_mean', 'CI_p90', 'n_valid', 'src', 'engine', 'area_m2', 'expected_pixels_geom', 'lake_name', 'empiric_n_valid_max', 'pct_valid_geom', 'pct_valid_emp', 'qc_low_cov_geom', 'qc_low_cov_emp', 'qc_tiny_abs_pix', 'qc_is_valid']
  lake_id                date product  CI_mean  CI_p90  n_valid  \
0    GL-1 2024-01-01 13:52:31   daily      NaN     NaN      NaN   
1    GL-1 2024-01-02 13:36:49   daily      NaN     NaN      NaN   
2    GL-1 2024-01-03 14:02:06   daily      NaN     NaN      NaN   
3    GL-1 2024-01-04 13:40:26   daily      NaN     NaN      NaN   
4    GL-1 2024-01-05 13:48:41   daily      NaN     NaN      NaN   

                                                 src   engine       area_m2  \
0  S3M_OLCI_EFRNT.20240101.L3m.DAY.ILW_CONUS.V5.a...  netcdf4  2.488940e+10   
1  S3M_OLCI_EFRNT.20240102.L3m.DAY.ILW_CONUS.V5.a...  netcdf4  2.488940e+10   
2  S3M_OLCI_EFRNT.20240103.L3m.DAY.ILW_CONUS.V5.a...  netcdf4  2.488940e+10   
3  S3M_OL

确认原始 lake parquet 里是不是已经全 NaN

In [3]:
import pandas as pd
from pathlib import Path

P_LAKE_DAILY   = Path("/dkucc/home/zy166/HAB-forecasting/datasets/processed/lake_ci_daily.parquet")
P_LAKE_MONTHLY = Path("/dkucc/home/zy166/HAB-forecasting/datasets/processed/lake_ci_monthly.parquet")

for p in [P_LAKE_DAILY, P_LAKE_MONTHLY]:
    df = pd.read_parquet(p)
    print("====", p.name, "====")
    print(df.columns.tolist())
    print(df.head())
    for c in [col for col in df.columns if "CI" in col or "ci" in col]:
        v = pd.to_numeric(df[c], errors="coerce")
        print(c, "non-null", v.notna().sum(), "finite", (v.replace([float("inf"), -float("inf")], float("nan")).notna()).sum())

==== lake_ci_daily.parquet ====
['lake_id', 'date', 'product', 'CI_mean', 'CI_p90', 'n_valid', 'src', 'engine']
  lake_id                      date product  CI_mean  CI_p90  n_valid  \
0    GL-1 2024-01-01 13:52:31+00:00   daily      NaN     NaN        0   
1    GL-1 2024-01-02 13:36:49+00:00   daily      NaN     NaN        0   
2    GL-1 2024-01-03 14:02:06+00:00   daily      NaN     NaN        0   
3    GL-1 2024-01-04 13:40:26+00:00   daily      NaN     NaN        0   
4    GL-1 2024-01-05 13:48:41+00:00   daily      NaN     NaN        0   

                                                 src   engine  
0  S3M_OLCI_EFRNT.20240101.L3m.DAY.ILW_CONUS.V5.a...  netcdf4  
1  S3M_OLCI_EFRNT.20240102.L3m.DAY.ILW_CONUS.V5.a...  netcdf4  
2  S3M_OLCI_EFRNT.20240103.L3m.DAY.ILW_CONUS.V5.a...  netcdf4  
3  S3M_OLCI_EFRNT.20240104.L3m.DAY.ILW_CONUS.V5.a...  netcdf4  
4  S3M_OLCI_EFRNT.20240105.L3m.DAY.ILW_CONUS.V5.a...  netcdf4  
CI_mean non-null 0 finite 0
CI_p90 non-null 0 finite 0
==== lake_

Check original `.nc` file

In [4]:
from pathlib import Path
import xarray as xr
import numpy as np

# 1) 选一个 daily 文件
daily_dir = Path("/dkucc/home/zy166/HAB-forecasting/datasets/ILW/Merged/2024/CONUS_DAY")
nc_path = sorted(daily_dir.glob("S3M_OLCI_EFRNT.*.L3m.DAY.*.nc"))[0]
print("Using file:", nc_path)

ds = xr.open_dataset(nc_path)
print("\n=== DATASET SUMMARY ===")
print(ds)

print("\n=== DATA VARS ===")
for name, da in ds.data_vars.items():
    print(f"- {name}: dims={da.dims}, attrs keys={list(da.attrs.keys())}")

print("\n=== COORDS ===")
for name, da in ds.coords.items():
    print(f"- {name}: dims={da.dims}")

Using file: /dkucc/home/zy166/HAB-forecasting/datasets/ILW/Merged/2024/CONUS_DAY/S3M_OLCI_EFRNT.20240101.L3m.DAY.ILW_CONUS.V5.all.CONUS.300m.nc

=== DATASET SUMMARY ===
<xarray.Dataset> Size: 27GB
Dimensions:   (y: 15138, x: 26328, rgb: 3, eightbitcolor: 256)
Coordinates:
    lat       (y, x) float32 2GB ...
    lon       (y, x) float32 2GB ...
Dimensions without coordinates: y, x, rgb, eightbitcolor
Data variables: (12/16)
    rhos_400  (y, x) float32 2GB ...
    rhos_412  (y, x) float32 2GB ...
    rhos_443  (y, x) float32 2GB ...
    rhos_490  (y, x) float32 2GB ...
    rhos_510  (y, x) float32 2GB ...
    rhos_560  (y, x) float32 2GB ...
    ...        ...
    rhos_709  (y, x) float32 2GB ...
    rhos_754  (y, x) float32 2GB ...
    rhos_865  (y, x) float32 2GB ...
    rhos_884  (y, x) float32 2GB ...
    CI_cyano  (y, x) float32 2GB ...
    palette   (rgb, eightbitcolor) uint8 768B ...
Attributes: (12/63)
    product_name:                      S3M_OLCI_EFRNT.20240101.L3m.DAY.ILW_C

In [5]:
var_name = "CI_cyano"  # 这里把名字换成你实际看到的那个
da = ds[var_name]

print("\n=== CI overall stats ===")
print("shape:", da.shape)
arr = da.where(np.isfinite(da))  # 去掉 inf
print("non-NaN ratio:", float(arr.notnull().mean().values))
print("min:", float(arr.min().values), "max:", float(arr.max().values))


=== CI overall stats ===
shape: (15138, 26328)
non-NaN ratio: 0.01559749112981797
min: 4.9999991460936144e-05 max: 0.08100177347660065


In [6]:
import geopandas as gpd
import rioxarray
import numpy as np

from pathlib import Path
import xarray as xr

P_LAKES = Path("/dkucc/home/zy166/HAB-forecasting/datasets/Lakes/shapes/lakes_greatlakes_5poly.gpkg")
lakes = gpd.read_file(P_LAKES).to_crs(4326)

daily_dir = Path("/dkucc/home/zy166/HAB-forecasting/datasets/ILW/Merged/2024/CONUS_DAY")
nc_path = sorted(daily_dir.glob("S3M_OLCI_EFRNT.*.L3m.DAY.*.nc"))[0]
ds = xr.open_dataset(nc_path)

# 把 CI 变量名换成你真正看到的那个
var_name = "CI_cyano"
da = ds[var_name]

# 确保有 CRS 信息
da = da.rio.write_crs("EPSG:4326")

for _, row in lakes.iterrows():
    geom = [row.geometry]
    clipped = da.rio.clip(geom, lakes.crs, drop=False)
    arr = clipped.values
    finite = np.isfinite(arr)
    print(row["lake_id"], "finite pixels:", int(finite.sum()))

GL-1 finite pixels: 0
GL-2 finite pixels: 0
GL-3 finite pixels: 0
GL-4 finite pixels: 0
GL-5 finite pixels: 0


In [8]:
print(float(ds.lon.min().values), float(ds.lon.max().values))

-150.3340606689453 -41.66587448120117


把 geometry 的经度从 −180–180 转到 0–360

In [9]:
import numpy as np
import geopandas as gpd
from shapely.ops import transform as shp_transform

def shift_lon_0_360(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    返回一个新的 GeoDataFrame 拷贝：
    - 维持 EPSG:4326（纬度不变）
    - 但经度从 [-180, 180] 映射到 [0, 360]
    """
    def _shift_geom(geom):
        if geom is None or geom.is_empty:
            return geom
        def _func(x, y, z=None):
            x = np.array(x, dtype="float64")
            x = np.where(x < 0, x + 360.0, x)
            if z is None:
                return x, y
            else:
                return x, y, z
        return shp_transform(_func, geom)

    out = gdf.copy()
    out["geometry"] = out.geometry.apply(_shift_geom)
    return out

一个简化版的 extract_lakes_from_nc（核心逻辑）

In [10]:
import xarray as xr
import numpy as np
import pandas as pd
import rioxarray

def extract_lakes_from_nc(nc_path: str,
                          lakes_gdf: gpd.GeoDataFrame,
                          lake_id_col: str,
                          product: str):
    ds = xr.open_dataset(nc_path)

    # 1) CI 变量
    da = ds["CI_cyano"]

    # 2) 写 CRS
    da = da.rio.write_crs("EPSG:4326")

    # 3) 准备 lakes 几何（注意经度范围）
    lakes4326 = lakes_gdf.to_crs(4326)

    # 判断 nc 的经度范围
    lon = ds["lon"]
    lon_min = float(lon.min().values)
    lon_max = float(lon.max().values)

    if lon_min >= 0:  # 说明是 0–360
        lakes_for_clip = shift_lon_0_360(lakes4326)
    else:             # 已经是 -180–180
        lakes_for_clip = lakes4326

    # 4) 时间戳（简单写法：用属性/坐标，如果没有就从文件名解析）
    if "time" in ds.coords:
        # 如果 time 是长度 1 的坐标
        t = pd.to_datetime(ds["time"].values[0])
    else:
        # 例如文件名里有 20240101 这种，按你之前的规则 parse
        # 这里给一个简单示例：
        import re
        m = re.search(r"\.(\d{8})\.", nc_path)
        if m:
            t = pd.to_datetime(m.group(1), format="%Y%m%d")
        else:
            t = pd.NaT

    rows = []
    for _, row in lakes_for_clip.iterrows():
        lake_id = row[lake_id_col]
        geom = [row.geometry]

        # rioxarray clip
        sub = da.rio.clip(geom, lakes_for_clip.crs, drop=False)
        arr = sub.values

        finite = np.isfinite(arr)
        n_valid = int(finite.sum())

        if n_valid > 0:
            vals = arr[finite]
            ci_mean = float(vals.mean())
            ci_p90  = float(np.percentile(vals, 90))
        else:
            ci_mean = np.nan
            ci_p90  = np.nan

        rows.append({
            "lake_id": lake_id,
            "time": t,
            "product": product,
            "CI_mean": ci_mean,
            "CI_p90": ci_p90,
            "n_valid": n_valid,
        })

    return pd.DataFrame(rows)

使用单日尝试

In [12]:
from pathlib import Path
import re

import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
import rioxarray  # 一定要 import 一下，才能用 .rio
from shapely.ops import transform as shp_transform


# ---------------------------
# 1) 把经度从 [-180, 180] shift 到 [0, 360]
# ---------------------------
def shift_lon_0_360(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    返回一个新的 GeoDataFrame:
    - 坐标系仍然是 EPSG:4326
    - 但 geometry 中所有 x<0 的经度统一加 360
    """
    def _shift_geom(geom):
        if geom is None or geom.is_empty:
            return geom

        def _func(x, y, z=None):
            x = np.array(x, dtype="float64")
            x = np.where(x < 0, x + 360.0, x)
            if z is None:
                return x, y
            else:
                return x, y, z

        return shp_transform(_func, geom)

    out = gdf.copy()
    out["geometry"] = out.geometry.apply(_shift_geom)
    return out


# ---------------------------
# 2) 单日 lake 提取函数
# ---------------------------
def extract_lakes_from_nc(nc_path: str,
                          lakes_gdf: gpd.GeoDataFrame,
                          lake_id_col: str,
                          product: str = "daily") -> pd.DataFrame:
    """
    从单个 L3m nc 文件中，对每个 lake 计算:
      - CI_mean
      - CI_p90
      - n_valid (finite 像元数)

    注意：
    - 自动判断 lon 是否为 0–360，如是则对湖做 shift。
    - 时间戳从文件名中解析 8 位日期（YYYYMMDD）。
    """
    print(f"[INFO] open nc: {nc_path}")
    ds = xr.open_dataset(nc_path)

    # 1) 拿 CI 变量
    if "CI_cyano" not in ds.data_vars:
        raise KeyError("CI_cyano not found in dataset data_vars")
    da = ds["CI_cyano"]

    # 2) 写 CRS（经纬度）
    da = da.rio.write_crs("EPSG:4326")

    # 3) 准备 lakes 几何
    lakes4326 = lakes_gdf.to_crs(4326)

    # 判断 lon 范围
    lon = ds["lon"]
    lon_min = float(lon.min().values)
    lon_max = float(lon.max().values)
    print(f"[INFO] lon range in nc: [{lon_min:.3f}, {lon_max:.3f}]")

    if lon_min >= 0:
        print("[INFO] lon is 0–360 → shift lakes to 0–360")
        lakes_for_clip = shift_lon_0_360(lakes4326)
    else:
        print("[INFO] lon already in -180–180")
        lakes_for_clip = lakes4326

    # 4) 时间戳：从文件名 parse YYYYMMDD
    m = re.search(r"\.(\d{8})\.", Path(nc_path).name)
    if m:
        t = pd.to_datetime(m.group(1), format="%Y%m%d")
    else:
        t = pd.NaT
    print(f"[INFO] parsed date: {t}")

    # 5) 对每个 lake 做 clip
    rows = []
    for _, row in lakes_for_clip.iterrows():
        lake_id = row[lake_id_col]
        geom = [row.geometry]

        sub = da.rio.clip(geom, lakes_for_clip.crs, drop=False)
        arr = sub.values

        finite = np.isfinite(arr)
        n_valid = int(finite.sum())

        if n_valid > 0:
            vals = arr[finite]
            ci_mean = float(vals.mean())
            ci_p90 = float(np.percentile(vals, 90))
        else:
            ci_mean = np.nan
            ci_p90 = np.nan

        rows.append({
            "lake_id": lake_id,
            "date": t,
            "product": product,
            "CI_mean": ci_mean,
            "CI_p90": ci_p90,
            "n_valid": n_valid,
            "src": Path(nc_path).name,
        })

    df = pd.DataFrame(rows)
    return df


# ---------------------------
# 3) main：用 2024-08-01 这一天测试
# ---------------------------
if __name__ == "__main__":
    # 路径按你给的固定写死
    P_LAKES = Path("/dkucc/home/zy166/HAB-forecasting/datasets/Lakes/shapes/lakes_greatlakes_5poly.gpkg")
    NC_PATH = Path("/dkucc/home/zy166/HAB-forecasting/datasets/ILW/Merged/2024/CONUS_DAY/S3M_OLCI_EFRNT.20240801.L3m.DAY.ILW_CONUS.V5.all.CONUS.300m.nc")

    print("[INFO] read lakes:", P_LAKES)
    lakes = gpd.read_file(P_LAKES)
    print(lakes[["lake_id"]].head())

    df = extract_lakes_from_nc(str(NC_PATH), lakes, lake_id_col="lake_id", product="daily")

    print("\n=== RESULT DF ===")
    print(df)

    # 顺便算一下简单统计
    print("\n=== quick stats ===")
    for _, r in df.iterrows():
        print(
            f"{r['lake_id']}: n_valid={r['n_valid']}, "
            f"CI_mean={r['CI_mean']}, CI_p90={r['CI_p90']}"
        )

    # 可选：把结果存成一个小 parquet 方便后面对比
    out_pq = NC_PATH.with_name("debug_lake_ci_20240801.parquet")
    df.to_parquet(out_pq, index=False)
    print(f"\n[OK] saved debug parquet → {out_pq}")

[INFO] read lakes: /dkucc/home/zy166/HAB-forecasting/datasets/Lakes/shapes/lakes_greatlakes_5poly.gpkg
  lake_id
0    GL-1
1    GL-2
2    GL-3
3    GL-4
4    GL-5
[INFO] open nc: /dkucc/home/zy166/HAB-forecasting/datasets/ILW/Merged/2024/CONUS_DAY/S3M_OLCI_EFRNT.20240801.L3m.DAY.ILW_CONUS.V5.all.CONUS.300m.nc
[INFO] lon range in nc: [-150.334, -41.666]
[INFO] lon already in -180–180
[INFO] parsed date: 2024-08-01 00:00:00

=== RESULT DF ===
  lake_id       date product  CI_mean  CI_p90  n_valid  \
0    GL-1 2024-08-01   daily      NaN     NaN        0   
1    GL-2 2024-08-01   daily      NaN     NaN        0   
2    GL-3 2024-08-01   daily      NaN     NaN        0   
3    GL-4 2024-08-01   daily      NaN     NaN        0   
4    GL-5 2024-08-01   daily      NaN     NaN        0   

                                                 src  
0  S3M_OLCI_EFRNT.20240801.L3m.DAY.ILW_CONUS.V5.a...  
1  S3M_OLCI_EFRNT.20240801.L3m.DAY.ILW_CONUS.V5.a...  
2  S3M_OLCI_EFRNT.20240801.L3m.DAY.ILW_CO

debug

In [13]:
from pathlib import Path
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr

# 路径
P_LAKES = Path("/dkucc/home/zy166/HAB-forecasting/datasets/Lakes/shapes/lakes_greatlakes_5poly.gpkg")
NC_PATH = Path("/dkucc/home/zy166/HAB-forecasting/datasets/ILW/Merged/2024/CONUS_DAY/S3M_OLCI_EFRNT.20240801.L3m.DAY.ILW_CONUS.V5.all.CONUS.300m.nc")

if __name__ == "__main__":
    print("[INFO] read lakes:", P_LAKES)
    lakes = gpd.read_file(P_LAKES).to_crs(4326)
    print(lakes[["lake_id"]])

    print("[INFO] open nc:", NC_PATH)
    ds = xr.open_dataset(NC_PATH)

    lat = ds["lat"]
    lon = ds["lon"]
    ci  = ds["CI_cyano"]

    print("[INFO] lon range:", float(lon.min().values), float(lon.max().values))
    print("[INFO] lat range:", float(lat.min().values), float(lat.max().values))

    rows = []
    for _, row in lakes.iterrows():
        lake_id = row["lake_id"]
        minx, miny, maxx, maxy = row.geometry.bounds  # 经度, 纬度

        print(f"\n[INFO] {lake_id} bbox:", (minx, miny, maxx, maxy))

        # 只用 bbox 做粗略掩膜
        mask = (
            (lon >= minx) & (lon <= maxx) &
            (lat >= miny) & (lat <= maxy)
        )

        sub = ci.where(mask)
        arr = sub.values  # 注意：这里会把 bbox 范围的数据读进内存

        finite = np.isfinite(arr)
        n_valid = int(finite.sum())

        if n_valid > 0:
            vals = arr[finite]
            ci_mean = float(vals.mean())
            ci_p90  = float(np.percentile(vals, 90))
        else:
            ci_mean = np.nan
            ci_p90  = np.nan

        print(f"[RESULT] {lake_id}: n_valid={n_valid}, CI_mean={ci_mean}, CI_p90={ci_p90}")

        rows.append({
            "lake_id": lake_id,
            "n_valid": n_valid,
            "CI_mean": ci_mean,
            "CI_p90": ci_p90,
        })

    df = pd.DataFrame(rows)
    print("\n=== SUMMARY ===")
    print(df)

[INFO] read lakes: /dkucc/home/zy166/HAB-forecasting/datasets/Lakes/shapes/lakes_greatlakes_5poly.gpkg
  lake_id
0    GL-1
1    GL-2
2    GL-3
3    GL-4
4    GL-5
[INFO] open nc: /dkucc/home/zy166/HAB-forecasting/datasets/ILW/Merged/2024/CONUS_DAY/S3M_OLCI_EFRNT.20240801.L3m.DAY.ILW_CONUS.V5.all.CONUS.300m.nc
[INFO] lon range: -150.3340606689453 -41.66587448120117
[INFO] lat range: 9.170879364013672 57.43088150024414

[INFO] GL-1 bbox: (-83.46863511976093, 41.386089351877224, -78.86101258894625, 42.89921256238626)
[RESULT] GL-1: n_valid=89675, CI_mean=0.00011738413013517857, CI_p90=5.0000002374872565e-05

[INFO] GL-2 bbox: (-84.74594016316608, 43.008959363955704, -79.72171104277461, 46.54417264441878)
[RESULT] GL-2: n_valid=392418, CI_mean=5.083527503302321e-05, CI_p90=5.0000002374872565e-05

[INFO] GL-3 bbox: (-88.03564199558765, 41.62575045047001, -84.76135839606302, 46.09757222438803)
[RESULT] GL-3: n_valid=2916, CI_mean=0.00010147361899726093, CI_p90=5.0000002374872565e-05

[INFO] 