In [None]:
# Process IAP dissolved oxygen concentration data for spatiotemporal discretization - Argo
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
IAP Argo (Oxygen) netCDF -> one CSV per standard depth level (no seasonal split)

Final output fields (and ONLY these, fixed order):
  Date, Time, Pressure, Latitude, Longitude, Temperature, Salinity, Oxygen, Source, sigma_interp
  (sigma_interp is always empty in this script)

Formatting requirements (implemented in this script):
- Latitude / Longitude: keep 4 decimals
- Pressure: integer (rounded; you still treat m ≈ dbar)
- Temperature / Salinity: keep 2 decimals (but set to empty in this script)
- Oxygen: keep 2 decimals
- Date: YYYY-MM-DD
- Time: always empty (<NA>)
- Temperature: always empty (<NA>)
- Salinity: always empty (<NA>)
- Oxygen: DOXY_QCed_interpolated_Adjusted_IAP (no QC flags are used)

Output structure:
  ROOT_DIR/{depth1}dbar/depth{depth1}.csv

Depth matching (first-hit):
- For each observation with pressure=p, tolerance r(p):
    <=10:1, <=200:1.5, <=1000:2, <=2000:5, >2000:10
- Search DEPTH1_LIST (ascending) for the first depth1 satisfying:
    depth1 - r <= p <= depth1 + r
  If no hit, discard the observation.

Performance:
- Expand by profile blocks (CHUNK_PROFILES) to control memory usage.
"""

import os
import glob
import gc
import numpy as np
import pandas as pd
from netCDF4 import Dataset

# =========================
# Configuration
# =========================
# Directory scanned for "*.nc" input files.
INPUT_DIR = r"/data/wang/IAP/IAP_Oxygen_Argo_netCDF_202404"
# Root of the output tree; per-level CSVs go to ROOT_DIR/{depth1}dbar/depth{depth1}.csv.
ROOT_DIR  = r"/data/wang/Result_Data/alldoxy"

# Text encoding passed to every DataFrame.to_csv call in this script.
ENCODING = "utf-8-sig"
CHUNK_PROFILES = 2000  # number of profiles per block (tune for your machine / memory)

# Numeric formatting controls
LATLON_DECIMALS = 4
OXY_DECIMALS = 2

# Pressure: keep integer (round to int)
PRESSURE_ROUND_TO_INT = True

# Standard depth levels (treated as dbar here).
# Must stay sorted ascending: the depth matcher binary-searches DEPTH1_ARR.
DEPTH1_LIST = [
    1,10,20,30,40,50,60,70,80,90,100,
    110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,
    270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,
    430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,
    590,600,610,620,630,640,650,660,670,680,690,700,710,720,730,740,
    750,760,770,780,790,800,820,840,860,880,900,920,940,960,980,1000,
    1020,1040,1060,1080,1100,1120,1140,1160,1180,1200,1220,1240,1260,
    1280,1300,1320,1340,1360,1380,1400,1420,1440,1460,1480,1500,1520,
    1540,1560,1580,1600,1620,1640,1660,1680,1700,1720,1740,1760,1780,
    1800,1820,1840,1860,1880,1900,1920,1940,1960,1980,2000,2100,2200,
    2300,2400,2500,2600,2700,2800,2900,3000,3100,3200,3300,3400,3500,
    3600,3700,3800,3900,4000,4100,4200,4300,4400,4500,4600,4700,4800,4900,
    5000,5100,5200,5300,5400,5500
]
# float64 copy of the level table used by the vectorized matcher.
DEPTH1_ARR = np.asarray(DEPTH1_LIST, dtype=np.float64)

# Output fields (and ONLY these), in CSV column order.
# "sigma_interp" is part of the layout but is always left empty by this script.
OUTPUT_COLUMNS = [
    "Date", "Time", "Pressure", "Latitude", "Longitude",
    "Temperature", "Salinity", "Oxygen", "Source", "sigma_interp"
]

# =========================
# Depth tolerance & matching (vectorized)
# =========================
def depth_range_vec(p: np.ndarray) -> np.ndarray:
    """
    Map each pressure/depth value to its matching tolerance.

    The first band that contains the value wins:
      p <= 10    -> 1.0
      p <= 200   -> 1.5
      p <= 1000  -> 2.0
      p <= 2000  -> 5.0
      otherwise  -> 10.0  (this also covers NaN, which fails every comparison)

    Returns a float64 array with the same shape as ``p``.
    """
    values = p.astype(np.float64, copy=False)
    # Nested selection, evaluated from the shallowest band outwards.
    tolerance = np.where(
        values <= 10, 1.0,
        np.where(
            values <= 200, 1.5,
            np.where(
                values <= 1000, 2.0,
                np.where(values <= 2000, 5.0, 10.0),
            ),
        ),
    )
    return tolerance.astype(np.float64, copy=False)

def match_depth1_firsthit(p: np.ndarray, r: np.ndarray) -> np.ndarray:
    """
    Assign each observation to the first standard level whose window contains it.

    A level ``depth1`` hits when ``depth1 - r <= p <= depth1 + r``, which is the
    same as ``p - r <= depth1 <= p + r``. Because DEPTH1_ARR is ascending, the
    first hit (if any) is the smallest level >= ``p - r``; it only has to be
    checked against ``p + r``.

    Returns a float64 array shaped like ``p``; entries without a hit are NaN.
    """
    p = p.astype(np.float64, copy=False)
    r = r.astype(np.float64, copy=False)

    window_lo = p - r
    window_hi = p + r

    n_levels = DEPTH1_ARR.size
    pos = np.searchsorted(DEPTH1_ARR, window_lo, side="left")
    in_table = pos < n_levels

    # Clamp so the lookup is always legal; clamped entries are rejected by in_table.
    candidate = DEPTH1_ARR[np.minimum(pos, n_levels - 1)]

    hit = in_table & (candidate <= window_hi)
    return np.where(hit, candidate, np.nan)

# =========================
# Utility functions
# =========================
def ensure_depth_folders():
    """Create ROOT_DIR and one "{depth1}dbar" sub-folder per standard level (idempotent)."""
    level_dirs = [os.path.join(ROOT_DIR, f"{level}dbar") for level in DEPTH1_LIST]
    for folder in [ROOT_DIR, *level_dirs]:
        os.makedirs(folder, exist_ok=True)

def yyyymmdd_to_yyyy_mm_dd_obj(date_int: np.ndarray) -> np.ndarray:
    """
    Convert numeric YYYYMMDD values into "YYYY-MM-DD" strings.

    Input:  1D array of YYYYMMDD numbers (int or float). It may contain the
            -999 sentinel, NaN/inf, or masked entries (numpy masked array).
    Output: object array of the same shape holding "YYYY-MM-DD", or None for
            every entry that is masked, non-finite, or not a positive 8-digit
            number.

    Only the digit count is checked; month/day calendar validity is not.
    """
    # Work in float64 so masked entries can become NaN and no non-finite value
    # is ever cast to an integer (that cast is undefined and emits warnings).
    raw = np.ma.filled(np.ma.asarray(date_int).astype(np.float64), np.nan)

    out = np.full(raw.shape, None, dtype=object)

    # "Positive with exactly 8 digits after truncation" <=> 10_000_000 <= x < 100_000_000.
    valid = np.isfinite(raw) & (raw >= 10_000_000) & (raw < 100_000_000)
    if not valid.any():
        return out

    ymd = raw[valid].astype(np.int64)  # truncates any fractional part
    year, month_day = np.divmod(ymd, 10_000)
    month, day = np.divmod(month_day, 100)

    out[valid] = [
        f"{y:04d}-{m:02d}-{d:02d}"
        for y, m, d in zip(year.tolist(), month.tolist(), day.tolist())
    ]
    return out

def append_df_to_depth_csv(df: pd.DataFrame):
    """
    Append the rows of ``df`` to the per-level CSV files.

    ``df`` must contain an integer-like ``matched_depth1`` column plus every
    column in OUTPUT_COLUMNS. Rows are grouped by ``matched_depth1`` and each
    group is appended to ROOT_DIR/{depth1}dbar/depth{depth1}.csv, keeping only
    OUTPUT_COLUMNS in that order.

    The header row is written when the target file is missing OR empty, so a
    zero-byte file left behind by an interrupted run still gets its header.
    """
    for depth1, group in df.groupby("matched_depth1", sort=False):
        depth1 = int(depth1)
        out_dir = os.path.join(ROOT_DIR, f"{depth1}dbar")
        out_csv = os.path.join(out_dir, f"depth{depth1}.csv")

        # Do not depend on ensure_depth_folders() having been called first.
        os.makedirs(out_dir, exist_ok=True)

        write_header = (not os.path.isfile(out_csv)) or os.path.getsize(out_csv) == 0

        # Append mode also creates the file when it does not exist yet.
        group[OUTPUT_COLUMNS].to_csv(
            out_csv,
            index=False,
            mode="a",
            header=write_header,
            encoding=ENCODING,
        )

# =========================
# Process a single nc file
# =========================
def _filled_float64(values) -> np.ndarray:
    """Return ``values`` as a float64 ndarray with masked entries replaced by NaN."""
    return np.ma.filled(np.ma.asarray(values).astype(np.float64), np.nan)


def process_one_nc(nc_path: str) -> dict:
    """
    Export the oxygen observations of one netCDF file to the per-level CSVs.

    Reads the oxygen, depth, Date, Latitude and Longitude variables, drops
    profiles with missing position/date, expands the remaining profiles to
    point level in blocks of CHUNK_PROFILES, assigns every point to a standard
    level (points without a match are discarded) and appends the result via
    append_df_to_depth_csv().

    Missing data is recognised as: masked entries (converted to NaN here, so
    the variable's own fill value is honoured), non-finite values, and the
    -999 sentinel.

    Returns a summary dict with the keys
    nc_file, profiles_in, profiles_valid, points_in, points_written.
    """
    print(f"\n[NC] {nc_path}")

    with Dataset(nc_path, "r") as nc:
        oxy = nc.variables["DOXY_QCed_interpolated_Adjusted_IAP"][:]       # (N_PROF, N_LEVELS)
        dep = nc.variables["Depth_QCed_interpolated_Adjusted_IAP"][:]      # (N_PROF, N_LEVELS) units=meters
        date_prof = nc.variables["Date"][:]                                # (N_PROF,)
        lat_prof  = nc.variables["Latitude"][:]                            # (N_PROF,)
        lon_prof  = nc.variables["Longitude"][:]                           # (N_PROF,)

    m, n = oxy.shape

    # Profile-level validity (Date/Lat/Lon). np.asarray() would silently drop a
    # mask and expose the raw fill value, so masked entries are turned into NaN.
    latv = _filled_float64(lat_prof)
    lonv = _filled_float64(lon_prof)
    datev = _filled_float64(date_prof)

    prof_ok = (
        np.isfinite(latv) & np.isfinite(lonv) &
        (latv != -999) & (lonv != -999) &
        np.isfinite(datev) & (datev != -999)
    )
    prof_idx = np.flatnonzero(prof_ok)

    summary = {
        "nc_file": nc_path,
        "profiles_in": int(m),
        "profiles_valid": int(prof_idx.size),
        "points_in": int(m * n),
        "points_written": 0,
    }
    if prof_idx.size == 0:
        return summary

    total_points_written = 0

    # Process by profile blocks to bound the size of the expanded arrays.
    for start in range(0, prof_idx.size, CHUNK_PROFILES):
        block = prof_idx[start:start + CHUNK_PROFILES]

        # Convert block dates to YYYY-MM-DD strings; failures become None.
        date_str_prof = yyyymmdd_to_yyyy_mm_dd_obj(datev[block])
        prof_has_date = np.array([d is not None for d in date_str_prof], dtype=bool)
        if not prof_has_date.any():
            continue

        # Block data (dep is used as Pressure); converted per block so the
        # full float64 copy of the file is never held in memory.
        oxy_b = _filled_float64(oxy[block, :])
        p_b = _filled_float64(dep[block, :])

        # Point validity: pressure & oxygen present, and the profile date parsed.
        valid = (
            np.isfinite(p_b) & np.isfinite(oxy_b) &
            (p_b != -999) & (oxy_b != -999) &
            prof_has_date[:, None]
        )
        if not valid.any():
            continue

        # Row index (0..block.size-1) of every valid point. np.nonzero and
        # boolean indexing both walk the array in row-major order, so the
        # flattened arrays below stay aligned.
        prof_local = np.nonzero(valid)[0]
        p_flat = p_b[valid]
        oxy_flat = oxy_b[valid]

        # Expand profile-level values to point level (vectorized fancy indexing).
        lat_flat = np.round(latv[block][prof_local], LATLON_DECIMALS)
        lon_flat = np.round(lonv[block][prof_local], LATLON_DECIMALS)
        date_flat = date_str_prof[prof_local]

        # Pressure: integer
        if PRESSURE_ROUND_TO_INT:
            p_flat = np.rint(p_flat).astype(np.int32, copy=False)

        # Oxygen: 2 decimals
        oxy_flat = np.round(oxy_flat, OXY_DECIMALS)

        # Depth matching (first-hit) uses the pressure exactly as it will be
        # written (integer-rounded when PRESSURE_ROUND_TO_INT is set), so the
        # output value and its level assignment stay consistent.
        p_match = p_flat.astype(np.float64, copy=False)
        matched = match_depth1_firsthit(p_match, depth_range_vec(p_match))

        keep = np.isfinite(matched)
        if not keep.any():
            continue

        # Time / Temperature / Salinity / sigma_interp are always empty.
        na_col = np.full(int(np.count_nonzero(keep)), pd.NA, dtype=object)

        df_out = pd.DataFrame({
            "Date": date_flat[keep],
            "Time": na_col,
            "Pressure": p_flat[keep],
            "Latitude": lat_flat[keep],
            "Longitude": lon_flat[keep],
            "Temperature": na_col,
            "Salinity": na_col,
            "Oxygen": oxy_flat[keep],
            "Source": "Argo",
            "sigma_interp": na_col,
            "matched_depth1": matched[keep].astype(np.int32)
        })

        append_df_to_depth_csv(df_out)
        total_points_written += len(df_out)

        # Release the block arrays before the next block is expanded.
        del df_out, oxy_b, p_b
        gc.collect()

    summary["points_written"] = int(total_points_written)
    return summary

# =========================
# Main
# =========================
def main():
    """
    Convert every "*.nc" file in INPUT_DIR and write a per-file summary CSV.

    Output folders are created first; files are processed in sorted name
    order. If no input file is found an error is printed and nothing else
    happens.
    """
    ensure_depth_folders()

    search_pattern = os.path.join(INPUT_DIR, "*.nc")
    nc_files = sorted(glob.glob(search_pattern))
    if len(nc_files) == 0:
        print(f"[ERROR] No nc files found in: {INPUT_DIR}")
        return

    logs = []
    for nc_file in nc_files:
        res = process_one_nc(nc_file)
        logs.append(res)
        print(f"[OK] profiles_valid={res['profiles_valid']} | points_written={res['points_written']}")

    # One summary row per processed netCDF file.
    log_path = os.path.join(ROOT_DIR, "argo_iap_do_only_processing_summary.csv")
    summary_table = pd.DataFrame(logs)
    summary_table.to_csv(log_path, index=False, encoding=ENCODING)

    print("\n[DONE] IAP-Argo DO-only layered CSV writing completed.")
    print(f"[LOG] {log_path}")


if __name__ == "__main__":
    main()

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
IAP OSD/CTD (WOD18 source) netCDF -> one CSV per standard depth level (no seasonal split)

Final output fields (and ONLY these, fixed order):
  Date,Time,Pressure,Latitude,Longitude,Temperature,Salinity,Oxygen,Source,sigma_interp
  (sigma_interp is always empty in this script)

Formatting requirements (implemented here):
- Latitude / Longitude: keep 4 decimals
- Pressure: integer (rounded); still treated as m ≈ dbar
- Temperature / Salinity: keep 2 decimals (but set to empty in this script)
- Oxygen: keep 2 decimals
- Date: YYYY-MM-DD (parsed from netCDF yyyymmdd; drop profiles if parsing fails)
- Time / Temperature / Salinity are always empty (<NA>)
- Oxygen uses DOXY_QCed_interpolated_Adjusted_IAP; no QC is applied
- Source is fixed as "OSDCTD"

Output layout:
  ROOT_DIR/{depth1}dbar/depth{depth1}.csv

Depth matching (first-hit):
- For each observation with pressure p, tolerance r(p):
    <=10:1, <=200:1.5, <=1000:2, <=2000:5, >2000:10
- Scan DEPTH1_LIST in ascending order and take the first depth1 satisfying:
    depth1 - r <= p <= depth1 + r
  If no match, discard that observation.

Performance:
- Expand by profile blocks (CHUNK_PROFILES) to control memory
"""

import os
import glob
import gc
import numpy as np
import pandas as pd
from netCDF4 import Dataset

# =========================
# Configuration
# =========================
# Directory scanned for "*.nc" input files.
INPUT_DIR = r"/data/wang/IAP/IAP_Oxygen_OSDCTD_netCDF_202404"
# Root of the output tree; per-level CSVs go to ROOT_DIR/{depth1}dbar/depth{depth1}.csv.
ROOT_DIR  = r"/data/wang/Result_Data/alldoxy"

# Text encoding passed to every DataFrame.to_csv call in this script.
ENCODING = "utf-8-sig"
CHUNK_PROFILES = 2000  # number of profiles processed per block (tune for memory)

# Numeric formatting
LATLON_DECIMALS = 4
OXY_DECIMALS = 2

# Pressure: keep integer (round to int)
PRESSURE_ROUND_TO_INT = True

# Standard depth levels (treated as dbar here).
# Must stay sorted ascending: the depth matcher binary-searches DEPTH1_ARR.
DEPTH1_LIST = [
    1,10,20,30,40,50,60,70,80,90,100,
    110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,
    270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,
    430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,
    590,600,610,620,630,640,650,660,670,680,690,700,710,720,730,740,
    750,760,770,780,790,800,820,840,860,880,900,920,940,960,980,1000,
    1020,1040,1060,1080,1100,1120,1140,1160,1180,1200,1220,1240,1260,
    1280,1300,1320,1340,1360,1380,1400,1420,1440,1460,1480,1500,1520,
    1540,1560,1580,1600,1620,1640,1660,1680,1700,1720,1740,1760,1780,
    1800,1820,1840,1860,1880,1900,1920,1940,1960,1980,2000,2100,2200,
    2300,2400,2500,2600,2700,2800,2900,3000,3100,3200,3300,3400,3500,
    3600,3700,3800,3900,4000,4100,4200,4300,4400,4500,4600,4700,4800,4900,
    5000,5100,5200,5300,5400,5500
]
# float64 copy of the level table used by the vectorized matcher.
DEPTH1_ARR = np.asarray(DEPTH1_LIST, dtype=np.float64)

# Output fields (and ONLY these), in CSV column order.
# "sigma_interp" is part of the layout but is always left empty by this script.
OUTPUT_COLUMNS = [
    "Date", "Time", "Pressure", "Latitude", "Longitude",
    "Temperature", "Salinity", "Oxygen", "Source", "sigma_interp"
]

# =========================
# Depth tolerance & matching (vectorized)
# =========================
def depth_range_vec(p: np.ndarray) -> np.ndarray:
    """
    Return the matching tolerance for each pressure/depth value.

    Bands (first match wins):
      <=10 -> 1, <=200 -> 1.5, <=1000 -> 2, <=2000 -> 5, everything else -> 10.
    NaN fails every comparison and therefore also gets 10.

    The result is a float64 array shaped like ``p``.
    """
    values = p.astype(np.float64, copy=False)

    # Start from the deepest-band default and overwrite from the widest limit
    # down to the narrowest, so the tightest applicable band is written last.
    tolerance = np.full(values.shape, 10.0, dtype=np.float64)
    for upper_limit, band_tol in ((2000, 5.0), (1000, 2.0), (200, 1.5), (10, 1.0)):
        tolerance[values <= upper_limit] = band_tol
    return tolerance

def match_depth1_firsthit(p: np.ndarray, r: np.ndarray) -> np.ndarray:
    """
    Return, per observation, the first standard level that satisfies
      depth1 - r <= p <= depth1 + r
    scanning DEPTH1_ARR in ascending order; NaN where no level qualifies.

    The smallest level >= p - r is located by binary search and accepted only
    if it does not exceed p + r.
    """
    p = p.astype(np.float64, copy=False)
    r = r.astype(np.float64, copy=False)

    first_at_or_above = np.searchsorted(DEPTH1_ARR, p - r, side="left")
    exists = first_at_or_above < DEPTH1_ARR.size

    # mode="clip" keeps out-of-table positions indexable; `exists` discards them.
    candidate = np.take(DEPTH1_ARR, first_at_or_above, mode="clip")

    accepted = exists & (candidate <= p + r)
    return np.where(accepted, candidate, np.nan)

# =========================
# Helper functions
# =========================
def ensure_depth_folders():
    """Make sure ROOT_DIR and every "{depth1}dbar" level folder exist."""
    targets = [ROOT_DIR]
    targets.extend(os.path.join(ROOT_DIR, f"{level}dbar") for level in DEPTH1_LIST)
    for target in targets:
        os.makedirs(target, exist_ok=True)

def yyyymmdd_to_yyyy_mm_dd_obj(date_int: np.ndarray) -> np.ndarray:
    """
    Convert numeric YYYYMMDD values into "YYYY-MM-DD" strings.

    Input:  1D array of YYYYMMDD numbers (int or float). It may contain the
            -999 sentinel, NaN/inf, or masked entries (numpy masked array).
    Output: object array of the same shape holding "YYYY-MM-DD", or None for
            every entry that is masked, non-finite, or not a positive 8-digit
            number.

    Only the digit count is checked; month/day calendar validity is not.
    """
    # Work in float64 so masked entries can become NaN and no non-finite value
    # is ever cast to an integer (that cast is undefined and emits warnings).
    raw = np.ma.filled(np.ma.asarray(date_int).astype(np.float64), np.nan)

    out = np.full(raw.shape, None, dtype=object)

    # "Positive with exactly 8 digits after truncation" <=> 10_000_000 <= x < 100_000_000.
    valid = np.isfinite(raw) & (raw >= 10_000_000) & (raw < 100_000_000)
    if not valid.any():
        return out

    ymd = raw[valid].astype(np.int64)  # truncates any fractional part
    year, month_day = np.divmod(ymd, 10_000)
    month, day = np.divmod(month_day, 100)

    out[valid] = [
        f"{y:04d}-{m:02d}-{d:02d}"
        for y, m, d in zip(year.tolist(), month.tolist(), day.tolist())
    ]
    return out

def append_df_to_depth_csv(df: pd.DataFrame):
    """
    Append the rows of ``df`` to the per-level CSV files.

    ``df`` must contain an integer-like ``matched_depth1`` column plus every
    column in OUTPUT_COLUMNS. Rows are grouped by ``matched_depth1`` and each
    group is appended to ROOT_DIR/{depth1}dbar/depth{depth1}.csv, keeping only
    OUTPUT_COLUMNS in that order.

    The header row is written when the target file is missing OR empty, so a
    zero-byte file left behind by an interrupted run still gets its header.
    """
    for depth1, group in df.groupby("matched_depth1", sort=False):
        depth1 = int(depth1)
        out_dir = os.path.join(ROOT_DIR, f"{depth1}dbar")
        out_csv = os.path.join(out_dir, f"depth{depth1}.csv")

        # Do not depend on ensure_depth_folders() having been called first.
        os.makedirs(out_dir, exist_ok=True)

        write_header = (not os.path.isfile(out_csv)) or os.path.getsize(out_csv) == 0

        # Append mode also creates the file when it does not exist yet.
        group[OUTPUT_COLUMNS].to_csv(
            out_csv,
            index=False,
            mode="a",
            header=write_header,
            encoding=ENCODING,
        )

# =========================
# Single netCDF file processing
# =========================
def _filled_float64(values) -> np.ndarray:
    """Return ``values`` as a float64 ndarray with masked entries replaced by NaN."""
    return np.ma.filled(np.ma.asarray(values).astype(np.float64), np.nan)


def process_one_nc(nc_path: str) -> dict:
    """
    Export the oxygen observations of one netCDF file to the per-level CSVs.

    Reads the oxygen, depth, Date, Latitude and Longitude variables, drops
    profiles with missing position/date, expands the remaining profiles to
    point level in blocks of CHUNK_PROFILES, assigns every point to a standard
    level (points without a match are discarded) and appends the result via
    append_df_to_depth_csv().

    Missing data is recognised as: masked entries (converted to NaN here, so
    the variable's own fill value is honoured), non-finite values, and the
    -999 sentinel.

    Returns a summary dict with the keys
    nc_file, profiles_in, profiles_valid, points_in, points_written.
    """
    print(f"\n[NC] {nc_path}")

    with Dataset(nc_path, "r") as nc:
        oxy = nc.variables["DOXY_QCed_interpolated_Adjusted_IAP"][:]       # (N_PROF, N_LEVELS)
        dep = nc.variables["Depth_QCed_interpolated_Adjusted_IAP"][:]      # (N_PROF, N_LEVELS) units=meters
        date_prof = nc.variables["Date"][:]                                # (N_PROF,)
        lat_prof  = nc.variables["Latitude"][:]                            # (N_PROF,)
        lon_prof  = nc.variables["Longitude"][:]                           # (N_PROF,)

    m, n = oxy.shape

    # Profile-level validity (Date/Lat/Lon). np.asarray() would silently drop a
    # mask and expose the raw fill value, so masked entries are turned into NaN.
    latv = _filled_float64(lat_prof)
    lonv = _filled_float64(lon_prof)
    datev = _filled_float64(date_prof)

    prof_ok = (
        np.isfinite(latv) & np.isfinite(lonv) &
        (latv != -999) & (lonv != -999) &
        np.isfinite(datev) & (datev != -999)
    )
    prof_idx = np.flatnonzero(prof_ok)

    summary = {
        "nc_file": nc_path,
        "profiles_in": int(m),
        "profiles_valid": int(prof_idx.size),
        "points_in": int(m * n),
        "points_written": 0,
    }
    if prof_idx.size == 0:
        return summary

    total_points_written = 0

    # Process profiles in blocks to bound the size of the expanded arrays.
    for start in range(0, prof_idx.size, CHUNK_PROFILES):
        block = prof_idx[start:start + CHUNK_PROFILES]

        # Convert Date to YYYY-MM-DD strings; parsing failure => None.
        date_str_prof = yyyymmdd_to_yyyy_mm_dd_obj(datev[block])
        prof_has_date = np.array([d is not None for d in date_str_prof], dtype=bool)
        if not prof_has_date.any():
            continue

        # Block arrays (dep is used as Pressure); converted per block so the
        # full float64 copy of the file is never held in memory.
        oxy_b = _filled_float64(oxy[block, :])
        p_b = _filled_float64(dep[block, :])

        # Point validity: pressure & oxygen present, and the profile date parsed.
        valid = (
            np.isfinite(p_b) & np.isfinite(oxy_b) &
            (p_b != -999) & (oxy_b != -999) &
            prof_has_date[:, None]
        )
        if not valid.any():
            continue

        # Row index (0..block.size-1) of every valid point. np.nonzero and
        # boolean indexing both walk the array in row-major order, so the
        # flattened arrays below stay aligned.
        prof_local = np.nonzero(valid)[0]
        p_flat = p_b[valid]
        oxy_flat = oxy_b[valid]

        # Expand profile-level lat/lon/date to point level (vectorized fancy indexing).
        lat_flat = np.round(latv[block][prof_local], LATLON_DECIMALS)
        lon_flat = np.round(lonv[block][prof_local], LATLON_DECIMALS)
        date_flat = date_str_prof[prof_local]

        # Pressure: integer
        if PRESSURE_ROUND_TO_INT:
            p_flat = np.rint(p_flat).astype(np.int32, copy=False)

        # Oxygen: 2 decimals
        oxy_flat = np.round(oxy_flat, OXY_DECIMALS)

        # Depth matching (first-hit): match using the Pressure exactly as it is
        # written (integer-rounded when PRESSURE_ROUND_TO_INT is set).
        p_match = p_flat.astype(np.float64, copy=False)
        matched = match_depth1_firsthit(p_match, depth_range_vec(p_match))

        keep = np.isfinite(matched)
        if not keep.any():
            continue

        # Time / Temperature / Salinity / sigma_interp are always empty.
        na_col = np.full(int(np.count_nonzero(keep)), pd.NA, dtype=object)

        df_out = pd.DataFrame({
            "Date": date_flat[keep],
            "Time": na_col,
            "Pressure": p_flat[keep],          # Depth(m) is treated as dbar (integer)
            "Latitude": lat_flat[keep],
            "Longitude": lon_flat[keep],
            "Temperature": na_col,
            "Salinity": na_col,
            "Oxygen": oxy_flat[keep],
            "Source": "OSDCTD",
            "sigma_interp": na_col,
            "matched_depth1": matched[keep].astype(np.int32)
        })

        append_df_to_depth_csv(df_out)
        total_points_written += len(df_out)

        # Release the block arrays before the next block is expanded.
        del df_out, oxy_b, p_b
        gc.collect()

    summary["points_written"] = int(total_points_written)
    return summary

# =========================
# Main entry
# =========================
def main():
    """
    Convert every "*.nc" file in INPUT_DIR and write a per-file summary CSV.

    Output folders are created first; files are processed in sorted name
    order. If no input file is found an error is printed and nothing else
    happens.
    """
    ensure_depth_folders()

    search_pattern = os.path.join(INPUT_DIR, "*.nc")
    nc_files = sorted(glob.glob(search_pattern))
    if len(nc_files) == 0:
        print(f"[ERROR] No nc files found in: {INPUT_DIR}")
        return

    logs = []
    for nc_file in nc_files:
        res = process_one_nc(nc_file)
        logs.append(res)
        print(f"[OK] profiles_valid={res['profiles_valid']} | points_written={res['points_written']}")

    # One summary row per processed netCDF file.
    log_path = os.path.join(ROOT_DIR, "iap_osdctd_do_only_processing_summary.csv")
    summary_table = pd.DataFrame(logs)
    summary_table.to_csv(log_path, index=False, encoding=ENCODING)

    print("\n[DONE] IAP-OSDCTD DO-only depth-layer CSV export finished")
    print(f"[LOG] {log_path}")


if __name__ == "__main__":
    main()

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Profile-wise PCHIP interpolation onto standard depth levels + post-filtering
+ an interpolation-uncertainty proxy field.

Proxy idea (1st-order):
  interp_unc(p0) ≈ |dO/dp|(p0) * d_min(p0)
where
  - d_min(p0) is distance to the nearest *oxygen* observation (after Pressure de-dup)
  - |dO/dp|(p0) is estimated from neighboring *oxygen* observations around p0
    (if bracketing exists use below+above; otherwise use nearest two on one side)

Profile definition (no rounding, Date kept as-is):
  profile_key = (Date, Latitude, Longitude)

Assumption:
  input file is ordered by profiles (all rows of a profile are contiguous).

Steps per profile:
  1) Deduplicate by Pressure: average Temperature/Salinity/Oxygen for identical Pressure
  2) Sort by Pressure (ascending)
  3) PCHIP interpolate whole profile onto standard depths within [minP, maxP], no extrapolation
  4) Keep standard levels by rules (using oxygen-observation pressures):
       - Rule1: within ±x(z) has >=1 oxygen obs
       - Rule2: for remaining, within ±y(z) has >=2 oxygen obs
  5) Output only kept standard levels AND only those with finite interpolated Oxygen,
     plus a new field: sigma_interp (the interpolation-uncertainty proxy, in the
     same units as Oxygen)

Overwrite:
  --overwrite => atomic replace original file
"""

import os
import csv
import argparse
from typing import Dict, List, Tuple, Optional

import numpy as np
from scipy.interpolate import PchipInterpolator


# ============================================================
# Config
# ============================================================

# CSV files to process. Each must carry the columns read below (Date, Pressure,
# Latitude, Longitude, Temperature, Salinity, Oxygen, Source) with all rows of
# a profile stored contiguously.
INPUT_FILES = [
    "/data/wang/CCHDO/cchdo_bottle_filtered.csv",
    "/data/wang/CCHDO/cchdo_ctd_filtered.csv",
    "/data/wang/Geotraces IDP2021/GEOTRACES_IDP2021_filtered.csv",
    "/data/wang/GLODAP/GLODAP2023_filtered.csv",
    "/data/wang/OceanSItes/OceanSITES_filtered.csv",
]

# Standard levels onto which every profile is interpolated (ascending order;
# compared directly against the Pressure column).
STD_DEPTHS = np.array([
    1,10,20,30,40,50,60,70,80,90,100,
    110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,
    270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,
    430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,
    590,600,610,620,630,640,650,660,670,680,690,700,710,720,730,740,
    750,760,770,780,790,800,820,840,860,880,900,920,940,960,980,1000,
    1020,1040,1060,1080,1100,1120,1140,1160,1180,1200,1220,1240,1260,
    1280,1300,1320,1340,1360,1380,1400,1420,1440,1460,1480,1500,1520,
    1540,1560,1580,1600,1620,1640,1660,1680,1700,1720,1740,1760,1780,
    1800,1820,1840,1860,1880,1900,1920,1940,1960,1980,2000,2100,2200,
    2300,2400,2500,2600,2700,2800,2900,3000,3100,3200,3300,3400,3500,
    3600,3700,3800,3900,4000,4100,4200,4300,4400,4500,4600,4700,4800,4900,
    5000,5100,5200,5300,5400,5500
], dtype=np.float64)

# --- Adaptive windows (x stricter than y) ---
def x_window(dbar: float) -> float:
    """
    Half-width of the strict support window (Rule 1) at standard level ``dbar``.

    <=50 -> 2, <=800 -> 5, <=2000 -> 10, deeper -> 20. Every branch returns a
    float, matching the annotation (the <=2000 branch used to return an int).
    """
    if dbar <= 50:
        return 2.0
    elif dbar <= 800:
        return 5.0
    elif dbar <= 2000:
        return 10.0
    else:
        return 20.0

def y_window(dbar: float) -> float:
    """
    Half-width of the relaxed support window (Rule 2) at standard level ``dbar``.

    <=50 -> 5, <=800 -> 15, <=2000 -> 30, deeper -> 120.
    """
    # (upper limit of the band, half-width), shallowest band first.
    for upper_limit, half_width in ((50, 5.0), (800, 15.0), (2000, 30.0)):
        if dbar <= upper_limit:
            return half_width
    return 120.0


# ============================================================
# Helpers
# ============================================================

def to_float(s: str) -> float:
    """
    Parse a CSV cell into a float, returning NaN for anything unusable.

    NaN is returned for None, blank text, the markers "nan"/"na"/"none"
    (case-insensitive) and any text float() cannot parse. Surrounding
    whitespace is ignored.
    """
    if s is None:
        return np.nan
    ss = str(s).strip()
    if ss == "" or ss.lower() in {"nan", "na", "none"}:
        return np.nan
    try:
        return float(ss)
    except ValueError:
        # float() on a str only fails with ValueError; anything else would be
        # a genuine bug and must not be swallowed.
        return np.nan


def atomic_replace(src_tmp: str, dst: str) -> None:
    """Move ``src_tmp`` onto ``dst`` via os.replace, overwriting ``dst`` if it exists."""
    os.replace(src=src_tmp, dst=dst)


def profile_key_from_row(row: Dict[str, str]) -> Tuple[str, str, str]:
    """
    Build the profile identity (Date, Latitude, Longitude) of a CSV row.

    The raw strings are used unchanged (no rounding or parsing); a missing
    column contributes an empty string.
    """
    date_s, lat_s, lon_s = (row.get(col, "") for col in ("Date", "Latitude", "Longitude"))
    return (date_s, lat_s, lon_s)


def estimate_local_grad_and_dmin(
    p_obs: np.ndarray,
    o_obs: np.ndarray,
    z: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Compute the interpolation-uncertainty proxy at every standard depth.

    For each z[i]:
      - dmin[i]: distance to the nearest observation pressure
      - grad[i]: |dO/dp| of the observation pair around z[i]. This is the
        bracketing pair when z[i] lies inside the observed range, otherwise
        the first two (z at/above the shallowest obs) or the last two
        (z below the deepest obs) observations.
      - unc[i]:  grad[i] * dmin[i], NaN unless both factors are finite

    p_obs must be strictly increasing and o_obs finite. With fewer than two
    observations all three arrays are returned filled with NaN.

    Returns (dmin, grad, unc), each a float64 array of length z.size.
    """
    n_levels = z.size
    n_obs = p_obs.size
    if n_obs < 2 or n_levels == 0:
        nan_template = np.full(n_levels, np.nan, dtype=np.float64)
        return nan_template, nan_template.copy(), nan_template.copy()

    pres = p_obs.astype(np.float64, copy=False)
    oxy = o_obs.astype(np.float64, copy=False)
    depth = z.astype(np.float64, copy=False)

    # Index of the first observation at or below each level (== n_obs if none).
    right = np.searchsorted(p_obs, z, side="left")

    # Nearest-observation distance: compare the neighbour on each side, using
    # +inf for a side that has no observation.
    gap_right = np.where(
        right < n_obs,
        np.abs(pres[np.minimum(right, n_obs - 1)] - depth),
        np.inf,
    )
    gap_left = np.where(
        right >= 1,
        np.abs(pres[np.maximum(right - 1, 0)] - depth),
        np.inf,
    )
    dmin = np.minimum(gap_right, gap_left)

    # Observation pair for the gradient. Clipping folds the three cases into
    # one: bracketing pair (right-1, right), first pair (0, 1), last pair.
    lower = np.clip(right - 1, 0, n_obs - 2)
    upper = lower + 1
    dp = pres[upper] - pres[lower]
    do = oxy[upper] - oxy[lower]

    with np.errstate(divide="ignore", invalid="ignore", over="ignore"):
        grad = np.where(dp > 0, np.abs(do / dp), np.nan)
        both_finite = np.isfinite(dmin) & np.isfinite(grad)
        unc = np.where(both_finite, grad * dmin, np.nan)

    return dmin, grad, unc


def finalize_profile_and_write(
    prof_rows: List[Dict[str, str]],
    writer: csv.DictWriter,
) -> Tuple[int, int]:
    """
    Interpolate one profile onto the standard depths and write the kept levels.

    Steps:
      1. Average Temperature/Salinity/Oxygen over rows sharing a Pressure.
      2. PCHIP-interpolate each variable onto the standard depths inside the
         observed pressure range (no extrapolation; T/S only when they have
         at least two finite points).
      3. Keep a level if an oxygen observation lies within +-x_window of it, or
         at least two lie within +-y_window, and the interpolated oxygen is
         finite.
      4. Write one row per kept level, including the uncertainty proxy
         ``sigma_interp``.

    Date, Latitude, Longitude and Source are copied from the first row.

    Returns (n_in_rows, n_out_rows_written) for this profile.
    """
    if not prof_rows:
        return 0, 0

    n_in = len(prof_rows)

    # Profile metadata comes from the first row.
    head = prof_rows[0]
    date_s = head.get("Date", "")
    lat_s = head.get("Latitude", "")
    lon_s = head.get("Longitude", "")
    src_s = head.get("Source", head.get("Sourcez", ""))  # tolerate typo

    # Group the (T, S, O) samples by pressure; rows without a usable pressure are dropped.
    by_pressure: Dict[float, List[Tuple[float, float, float]]] = {}
    for rec in prof_rows:
        pres = to_float(rec.get("Pressure", ""))
        if np.isfinite(pres):
            sample = (
                to_float(rec.get("Temperature", "")),
                to_float(rec.get("Salinity", "")),
                to_float(rec.get("Oxygen", "")),
            )
            by_pressure.setdefault(pres, []).append(sample)

    if len(by_pressure) < 2:
        return n_in, 0

    p_uniq = np.array(sorted(by_pressure), dtype=np.float64)

    def _mean_per_pressure(column: int) -> np.ndarray:
        """NaN-ignoring mean of one sample column per unique pressure (NaN if none finite)."""
        means = np.full_like(p_uniq, np.nan, dtype=np.float64)
        for pos, pres_key in enumerate(p_uniq):
            column_vals = np.array(
                [sample[column] for sample in by_pressure[pres_key]], dtype=np.float64
            )
            if np.isfinite(column_vals).any():
                means[pos] = np.nanmean(column_vals)
        return means

    t_uniq = _mean_per_pressure(0)
    s_uniq = _mean_per_pressure(1)
    o_uniq = _mean_per_pressure(2)

    # Oxygen interpolation, support tests and the uncertainty proxy all use
    # only the pressures that actually have a finite oxygen value.
    has_oxygen = np.isfinite(o_uniq)
    if np.count_nonzero(has_oxygen) < 2:
        return n_in, 0

    p_obs = p_uniq[has_oxygen]
    o_obs = o_uniq[has_oxygen]

    # Standard depths inside the overall observed pressure range.
    pmin = float(np.nanmin(p_uniq))
    pmax = float(np.nanmax(p_uniq))
    inside = (STD_DEPTHS >= pmin) & (STD_DEPTHS <= pmax)
    z = STD_DEPTHS[inside]
    if z.size == 0:
        return n_in, 0

    # Oxygen: PCHIP without extrapolation (NaN outside the oxygen range).
    o_z = PchipInterpolator(p_obs, o_obs, extrapolate=False)(z)

    # Temperature / Salinity: optional, each needs >= 2 finite points.
    t_z = np.full_like(o_z, np.nan, dtype=np.float64)
    s_z = np.full_like(o_z, np.nan, dtype=np.float64)

    has_temp = np.isfinite(t_uniq)
    if np.count_nonzero(has_temp) >= 2:
        t_z = PchipInterpolator(p_uniq[has_temp], t_uniq[has_temp], extrapolate=False)(z)

    has_sal = np.isfinite(s_uniq)
    if np.count_nonzero(has_sal) >= 2:
        s_z = PchipInterpolator(p_uniq[has_sal], s_uniq[has_sal], extrapolate=False)(z)

    # ----------------------------
    # Support rules on the oxygen-observation pressures:
    #   Rule1: >= 1 observation within +-x(z)
    #   Rule2: otherwise >= 2 observations within +-y(z)
    # ----------------------------
    separation = np.abs(z[:, None] - p_obs[None, :])  # (nz, n_oxygen_obs)

    x_half = np.array([x_window(float(level)) for level in z], dtype=np.float64)
    y_half = np.array([y_window(float(level)) for level in z], dtype=np.float64)

    n_within_x = np.sum(separation <= x_half[:, None], axis=1)
    n_within_y = np.sum(separation <= y_half[:, None], axis=1)

    keep = (n_within_x >= 1) | (n_within_y >= 2)
    # Never emit a level whose interpolated oxygen is NaN.
    keep = keep & np.isfinite(o_z)
    if not keep.any():
        return n_in, 0

    # Uncertainty proxy |dO/dp| * dmin, computed for all levels, written for kept ones.
    _dmin, _grad, unc = estimate_local_grad_and_dmin(p_obs, o_obs, z)

    n_out = 0
    for pos in np.flatnonzero(keep):
        level, temp, sal, oxy, sigma = z[pos], t_z[pos], s_z[pos], o_z[pos], unc[pos]
        writer.writerow({
            "Date": date_s,
            "Pressure": f"{float(level):.0f}",
            "Latitude": lat_s,
            "Longitude": lon_s,
            "Temperature": f"{float(temp):.6f}" if np.isfinite(temp) else "",
            "Salinity": f"{float(sal):.6f}" if np.isfinite(sal) else "",
            "Oxygen": f"{float(oxy):.6f}",
            "sigma_interp": f"{float(sigma):.6f}" if np.isfinite(sigma) else "",
            "Source": src_s,
        })
        n_out += 1

    return n_in, n_out


def process_one_file(path: str, overwrite: bool = False) -> None:
    """
    Process one profile CSV and write the result via a temporary file.

    Rows are streamed from ``path``. Consecutive rows that share the key
    returned by ``profile_key_from_row`` are collected into one profile and
    handed to ``finalize_profile_and_write`` together with the CSV writer.
    Output is written to ``<out>.tmp`` and moved into place with
    ``atomic_replace`` only after the whole input has been processed.

    Parameters
    ----------
    path : str
        Input CSV file.
    overwrite : bool
        If True, the input file itself is replaced. Otherwise the result is
        written beside it as ``<stem>_stddepth<ext>``.
    """
    in_path = path
    if overwrite:
        out_path = path
    else:
        # Rewrite only the final extension. A plain str.replace(".csv", ...)
        # also touches ".csv" inside directory names and is a no-op for other
        # extensions, which would make the output path equal the input path.
        stem, ext = os.path.splitext(path)
        out_path = f"{stem}_stddepth{ext}"
    tmp_path = out_path + ".tmp"

    fieldnames = [
        "Date", "Pressure", "Latitude", "Longitude",
        "Temperature", "Salinity", "Oxygen",
        "sigma_interp",
        "Source"
    ]

    total_profiles = 0
    total_in_rows = 0
    total_out_rows = 0

    try:
        with open(in_path, "r", encoding="utf-8", newline="") as fin, \
             open(tmp_path, "w", encoding="utf-8", newline="") as fout:

            reader = csv.DictReader(fin)
            writer = csv.DictWriter(fout, fieldnames=fieldnames, extrasaction="ignore")
            writer.writeheader()

            current_key: Optional[Tuple[str, str, str]] = None
            prof_rows: List[Dict[str, str]] = []

            for row in reader:
                key = profile_key_from_row(row)

                # A key change closes the profile collected so far.
                if prof_rows and key != current_key:
                    total_profiles += 1
                    n_in, n_out = finalize_profile_and_write(prof_rows, writer)
                    total_in_rows += n_in
                    total_out_rows += n_out
                    prof_rows = []

                current_key = key
                prof_rows.append(row)

            # last profile
            if prof_rows:
                total_profiles += 1
                n_in, n_out = finalize_profile_and_write(prof_rows, writer)
                total_in_rows += n_in
                total_out_rows += n_out
    except BaseException:
        # Do not leave a half-written temporary file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # best effort: the file was never created or is already gone
        raise

    atomic_replace(tmp_path, out_path)
    if overwrite:
        print(f"[OK] {in_path} -> {out_path} (overwritten)")
    else:
        print(f"[OK] {in_path} -> {out_path}")

    print(f"  profiles={total_profiles} | in_rows={total_in_rows} | out_rows={total_out_rows}")


def main():
    """
    Command-line entry point: process every path listed in INPUT_FILES.

    The only recognised option is ``--overwrite``. Unknown arguments are
    tolerated because Jupyter/ipykernel passes extras such as ``--f=xxx.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--overwrite", action="store_true", help="Atomic replace original file(s).")
    known, _ignored = parser.parse_known_args()
    replace_in_place = bool(known.overwrite)

    for input_path in INPUT_FILES:
        if os.path.exists(input_path):
            process_one_file(input_path, overwrite=replace_in_place)
        else:
            print(f"[SKIP] missing: {input_path}")


# Run the conversion only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
Remove duplicate sections.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Layered export of "metadata CSV" (CCHDO / GEOTRACES-IDP2021 / GLODAP / OceanSITES, etc.):

- One output file per standard depth level (depth1):
    ROOT_DIR/{depth1}dbar/depth{depth1}.csv

- Append mode:
    * If the file does not exist -> write header + data
    * If the file exists -> append rows (no header)
    * If an existing file has a different header:
        - back up the old file as *.bak
        - remove/replace and re-create with the correct header

- The final output must contain ONLY the following columns (fixed order):
    Date,Time,Pressure,Latitude,Longitude,Temperature,Salinity,Oxygen,Source,sigma_interp

- Date is normalized to YYYY-MM-DD
- Time is derived from Date:
    * If the original has no time component OR time is 00:00 -> NA
- Source filtering:
    * EXCLUDE_SOURCES (case-insensitive; internally compared in uppercase)

Numeric formatting enforced by this script:
- Latitude / Longitude: 4 decimals
- Pressure: integer (rounded to int)
- Temperature / Salinity: 2 decimals
- Oxygen: 2 decimals
- sigma_interp: if numeric -> 6 decimals; otherwise empty
"""

import os
import pandas as pd
import numpy as np

# =========================
# Configuration
# =========================

# Input CSV files; main() processes them in this order and skips missing ones.
# NOTE(review): the last directory is spelled "OceanSItes" (capital I) while the
# file name uses "OceanSITES" - confirm this matches the path on disk.
INPUT_CSVS = [
    "/data/wang/CCHDO/cchdo_bottle_filtered_stddepth.csv",
    "/data/wang/CCHDO/cchdo_ctd_filtered_stddepth.csv",
    "/data/wang/Geotraces IDP2021/GEOTRACES_IDP2021_filtered_stddepth.csv",
    "/data/wang/GLODAP/GLODAP2023_filtered_stddepth.csv",
    "/data/wang/OceanSItes/OceanSITES_filtered_stddepth.csv",
]

# Root folder for the per-depth sub-folders and the two log CSVs
ROOT_DIR = "/data/wang/Result_Data/alldoxy"
# Number of rows per chunk passed to pd.read_csv
CHUNKSIZE = 1_000_000
# Encoding used when writing the per-depth/log CSVs and when re-reading an
# existing output header; the input read_csv call does not pass an encoding.
ENCODING = "utf-8-sig"

# Input columns (explicitly includes sigma_interp); passed to read_csv as usecols
INPUT_COLUMNS = [
    "Date", "Latitude", "Longitude", "Pressure",
    "Temperature", "Salinity", "Oxygen", "Source",
    "sigma_interp"
]

# Final output columns (ONLY these; fixed order; sigma_interp after Source)
OUTPUT_COLUMNS = [
    "Date", "Time", "Pressure", "Latitude", "Longitude",
    "Temperature", "Salinity", "Oxygen", "Source", "sigma_interp"
]

# Source filtering: excluded sources are not exported (and not used for profile counts)
# Entries must be uppercase because Source is upper-cased before the comparison.
EXCLUDE_SOURCES = {"ARGO", "PFL", "GLD", "DRB", "UOR"}

# "Profile" definition: count unique (Date, Lat, Lon) per (input file × Source)
PROFILE_COORD_ROUND = 4   # None = no rounding; 4 = round to 4 decimals (recommended)

# Numeric formatting controls (number of decimals values are rounded to)
LATLON_DECIMALS = 4   # Latitude / Longitude
TS_DECIMALS = 2       # Temperature / Salinity
OXY_DECIMALS = 2      # Oxygen
UNC_DECIMALS = 6      # sigma_interp
# True: Pressure is rounded to the nearest integer before depth binning and output
PRESSURE_ROUND_TO_INT = True

# Standard depth levels (dbar)
# Spacing: 1, then every 10 up to 800, every 20 up to 2000, every 100 up to 5500.
# The list must stay sorted ascending: match_depth1_firsthit() looks levels up
# with np.searchsorted on DEPTH1_ARR.
DEPTH1_LIST = [
    1,10,20,30,40,50,60,70,80,90,100,
    110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,
    270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,
    430,440,450,460,470,480,490,500,510,520,530,540,550,560,570,580,
    590,600,610,620,630,640,650,660,670,680,690,700,710,720,730,740,
    750,760,770,780,790,800,820,840,860,880,900,920,940,960,980,1000,
    1020,1040,1060,1080,1100,1120,1140,1160,1180,1200,1220,1240,1260,
    1280,1300,1320,1340,1360,1380,1400,1420,1440,1460,1480,1500,1520,
    1540,1560,1580,1600,1620,1640,1660,1680,1700,1720,1740,1760,1780,
    1800,1820,1840,1860,1880,1900,1920,1940,1960,1980,2000,2100,2200,
    2300,2400,2500,2600,2700,2800,2900,3000,3100,3200,3300,3400,3500,
    3600,3700,3800,3900,4000,4100,4200,4300,4400,4500,4600,4700,4800,4900,
    5000,5100,5200,5300,5400,5500
]
# float64 copy of the levels used by the vectorized matching code
DEPTH1_ARR = np.asarray(DEPTH1_LIST, dtype=np.float64)


# =========================
# Depth matching (vectorized)
# =========================
def depth_range_vec(p: np.ndarray) -> np.ndarray:
    """
    Map each pressure (dbar) to the half-width of its first-hit matching window.

    Upper bound (inclusive) -> tolerance:
      10 -> 1, 200 -> 1.5, 1000 -> 2, 2000 -> 5, anything else -> 10

    Values that satisfy none of the bounds, NaN included, get the default 10.
    The result is a float64 array with the same shape as ``p``.
    """
    pressure = p.astype(np.float64, copy=False)

    # Inclusive upper bounds of the first four bands and the tolerance of each
    # band; the trailing entry is the default for everything above 2000.
    band_upper = np.array([10.0, 200.0, 1000.0, 2000.0], dtype=np.float64)
    band_tol = np.array([1.0, 1.5, 2.0, 5.0, 10.0], dtype=np.float64)

    # side="left" keeps a pressure equal to a bound inside that bound's band;
    # NaN sorts after every bound and therefore selects the default.
    band_idx = np.searchsorted(band_upper, pressure, side="left")
    return np.asarray(band_tol[band_idx], dtype=np.float64)


def match_depth1_firsthit(p: np.ndarray, r: np.ndarray) -> np.ndarray:
    """
    Assign each pressure to the shallowest standard level inside its window.

    A level ``depth1`` from DEPTH1_ARR (ascending) is a hit when
      depth1 - r <= p <= depth1 + r
    which is the same as ``p - r <= depth1 <= p + r``. The first level at or
    above ``p - r`` is therefore the only candidate that needs checking.

    Returns a float64 array shaped like ``p`` holding the matched level, or
    NaN where no level falls inside the window.
    """
    pressure = p.astype(np.float64, copy=False)
    tol = r.astype(np.float64, copy=False)

    window_lo = pressure - tol
    window_hi = pressure + tol

    # Index of the first level >= window_lo; equals the table size when the
    # window starts below the deepest level (NaN pressures land here too).
    first_idx = np.searchsorted(DEPTH1_ARR, window_lo, side="left")
    in_table = first_idx < DEPTH1_ARR.size

    result = np.full(pressure.shape, np.nan, dtype=np.float64)
    candidates = DEPTH1_ARR[first_idx[in_table]]
    result[in_table] = np.where(candidates <= window_hi[in_table], candidates, np.nan)
    return result


def ensure_depth_folders():
    """Create ROOT_DIR and one ``{depth1}dbar`` sub-folder per entry of DEPTH1_LIST."""
    folders = [ROOT_DIR]
    folders.extend(os.path.join(ROOT_DIR, f"{level}dbar") for level in DEPTH1_LIST)
    for folder in folders:
        # exist_ok makes repeated runs a no-op for folders that already exist
        os.makedirs(folder, exist_ok=True)


# =========================
# Date -> Date(YYYY-MM-DD) + Time(HH:MM or NA)
# =========================
def derive_date_time(date_series: pd.Series) -> tuple[pd.Series, pd.Series]:
    """
    Split a column of date / date-time strings into Date and Time columns.

    Returns ``(date_out, time_out)``, both string-dtype Series that keep the
    index of ``date_series``:
      - date_out: ``YYYY-MM-DD``, or <NA> when the value cannot be parsed
      - time_out: ``HH:MM``, or <NA> when the original text has no ``:``,
        when the time is 00:00, or when the value cannot be parsed
    """
    ss = date_series.astype("string")
    # A ":" in the original text is taken as the sign that a time was given.
    has_time = ss.str.contains(":", regex=False)

    dt = _parse_datetimes_with_retry(ss)

    date_out = dt.dt.strftime("%Y-%m-%d").astype("string")
    time_out = dt.dt.strftime("%H:%M").astype("string")

    time_out = time_out.mask(~has_time, pd.NA)
    time_out = time_out.mask(time_out == "00:00", pd.NA)

    date_out = date_out.mask(dt.isna(), pd.NA)
    time_out = time_out.mask(dt.isna(), pd.NA)

    return date_out, time_out


def _parse_datetimes_with_retry(ss: pd.Series) -> pd.Series:
    """
    Parse strings to datetimes, giving rows rejected by the first pass another try.

    A single ``pd.to_datetime`` call may infer one format from the first value
    and coerce every value written differently (for example date-only versus
    date-time) to NaT. Rejected, non-null values are therefore parsed again on
    their own until a pass recovers nothing more.
    """
    dt = pd.to_datetime(ss, errors="coerce")
    if not pd.api.types.is_datetime64_dtype(dt):
        # Not a plain (timezone-naive) datetime column: leave the result as is.
        return dt

    # Work on positional labels so duplicate index values cannot misalign rows.
    values = ss.reset_index(drop=True)
    parsed = dt.reset_index(drop=True)

    pending = parsed.isna() & values.notna()
    while pending.any():
        try:
            retry = pd.to_datetime(values[pending], errors="coerce")
        except (ValueError, TypeError):
            break
        if not pd.api.types.is_datetime64_dtype(retry):
            break
        recovered = retry.dropna()
        if recovered.empty:
            break  # whatever is left is genuinely unparseable
        parsed.loc[recovered.index] = recovered
        pending = parsed.isna() & values.notna()

    parsed.index = ss.index
    return parsed


def normalize_coord_series(s: pd.Series) -> pd.Series:
    """
    Convert a coordinate column to numbers for use in the profile key.

    Non-numeric entries become NaN. When PROFILE_COORD_ROUND is not None the
    values are additionally rounded to that many decimals.
    """
    numeric = pd.to_numeric(s, errors="coerce")
    return numeric if PROFILE_COORD_ROUND is None else numeric.round(int(PROFILE_COORD_ROUND))


def ensure_out_header_compatible(out_csv: str) -> None:
    """
    Make sure an existing output file can be appended to without misaligning columns.

    Nothing happens when ``out_csv`` does not exist or when its first line,
    split on commas, equals OUTPUT_COLUMNS. Otherwise the file is moved to
    ``out_csv + ".bak"`` so the caller re-creates it with the correct header.
    The same backup is made when the file cannot be read (OSError).

    Note: ``os.replace`` overwrites an existing ``.bak`` from an earlier run.
    Errors other than OSError (for example an invalid ENCODING) are not treated
    as a header problem and propagate to the caller with the file left in place.
    """
    if not os.path.isfile(out_csv):
        return

    bak = out_csv + ".bak"

    try:
        # utf-8-sig style encodings strip a leading BOM, so the first column
        # name compares cleanly; undecodable bytes are dropped, not fatal.
        with open(out_csv, "r", encoding=ENCODING, errors="ignore") as f:
            first = f.readline().strip("\n\r")
    except OSError:
        os.replace(out_csv, bak)
        print(f"  [WARN] failed to validate header, backed up -> {bak}")
        return

    existing = [x.strip() for x in first.split(",")]
    if existing == OUTPUT_COLUMNS:
        return

    os.replace(out_csv, bak)
    print(f"  [WARN] header mismatch: backed up existing file -> {bak}")


# =========================
# Per-file processing (per-depth outputs + profile stats)
# =========================
def process_one_file(csv_path: str):
    """
    Stream one metadata CSV in chunks and append its rows to the per-depth files.

    Per chunk, in this order:
      1. normalize Source (strip + uppercase) and drop EXCLUDE_SOURCES
      2. split Date into Date (YYYY-MM-DD) and Time; drop rows with invalid Date
      3. coerce the numeric columns (non-numeric text becomes NaN)
      4. record a hash of (Date, Lat, Lon) per Source for the profile counts
         (done before the pressure filter, so rows dropped later still count)
      5. drop rows without a finite Pressure and round values for output
      6. bin Pressure to a standard level (first-hit); drop rows with no hit
      7. append each depth group to ROOT_DIR/{depth1}dbar/depth{depth1}.csv

    Returns None when ``csv_path`` does not exist. Otherwise returns a dict with
    the keys "input_file", "rows_in", "rows_after_source_filter",
    "rows_after_date_valid", "rows_after_pressure_valid", "rows_written" and
    "profiles_by_source" (Source -> number of distinct key hashes).
    """
    if not os.path.isfile(csv_path):
        print(f"[SKIP] not found: {csv_path}")
        return None

    print(f"\n[FILE] {csv_path}")

    # For each file: source -> set(uint64 hashes)
    profile_hash_sets: dict[str, set[int]] = {}

    # Output files whose header was already validated during this call. Once
    # validated, a file is only ever appended to by this function, so the
    # header does not need to be re-read for every chunk.
    validated_outputs: set[str] = set()

    total_rows_in = 0
    total_rows_after_source_filter = 0
    total_rows_after_date_valid = 0
    total_rows_after_pressure_valid = 0
    total_rows_written = 0

    reader = pd.read_csv(
        csv_path,
        chunksize=CHUNKSIZE,
        usecols=INPUT_COLUMNS,
        low_memory=False
    )

    try:
        for chunk_idx, df in enumerate(reader, start=1):
            total_rows_in += len(df)
            print(f"  Processing chunk {chunk_idx} ... rows={len(df)}")

            # ---------- Normalize Source + filter ----------
            src_norm = df["Source"].astype("string").str.strip().str.upper()
            df = df.loc[~src_norm.isin(EXCLUDE_SOURCES)].copy()
            src_norm = src_norm.loc[df.index]
            total_rows_after_source_filter += len(df)
            if df.empty:
                continue

            # ---------- Date -> Date + Time (drop rows with invalid Date) ----------
            date_norm, time_norm = derive_date_time(df["Date"])
            df["Date"] = date_norm
            df["Time"] = time_norm

            df = df.dropna(subset=["Date"]).copy()
            src_norm = src_norm.loc[df.index]
            total_rows_after_date_valid += len(df)
            if df.empty:
                continue

            # ---------- Convert numeric columns ----------
            lat = pd.to_numeric(df["Latitude"], errors="coerce")
            lon = pd.to_numeric(df["Longitude"], errors="coerce")
            p   = pd.to_numeric(df["Pressure"], errors="coerce")
            tmp = pd.to_numeric(df["Temperature"], errors="coerce")
            sal = pd.to_numeric(df["Salinity"], errors="coerce")
            oxy = pd.to_numeric(df["Oxygen"], errors="coerce")
            sig = pd.to_numeric(df["sigma_interp"], errors="coerce")

            # ---------- Profile key: (Date, Lat, Lon) + Source ----------
            lat_key = normalize_coord_series(lat)
            lon_key = normalize_coord_series(lon)

            key_valid = df["Date"].notna() & lat_key.notna() & lon_key.notna() & src_norm.notna()
            if key_valid.any():
                key_df = pd.DataFrame({
                    "Date": df.loc[key_valid, "Date"].astype("string"),
                    "Lat":  lat_key.loc[key_valid].astype(np.float64),
                    "Lon":  lon_key.loc[key_valid].astype(np.float64),
                })
                key_hash = pd.util.hash_pandas_object(key_df, index=False).to_numpy(dtype=np.uint64, copy=False)
                src_sub = src_norm.loc[key_valid].to_numpy()

                # Every value returned by np.unique occurs at least once, so
                # each mask below selects at least one hash.
                for s in np.unique(src_sub):
                    u = np.unique(key_hash[src_sub == s])
                    profile_hash_sets.setdefault(s, set()).update(u.tolist())

            # ---------- Pressure validity (required for depth binning) ----------
            valid_p = np.isfinite(p.to_numpy(dtype=np.float64, copy=False))
            df = df.loc[valid_p].copy()
            if df.empty:
                continue

            # Re-align series to the filtered index
            lat = lat.loc[df.index]
            lon = lon.loc[df.index]
            p   = p.loc[df.index]
            tmp = tmp.loc[df.index]
            sal = sal.loc[df.index]
            oxy = oxy.loc[df.index]
            sig = sig.loc[df.index]
            src_norm = src_norm.loc[df.index]

            total_rows_after_pressure_valid += len(df)

            # ---------- Numeric formatting ----------
            lat = lat.round(LATLON_DECIMALS)
            lon = lon.round(LATLON_DECIMALS)

            if PRESSURE_ROUND_TO_INT:
                p_int = np.rint(p.to_numpy(dtype=np.float64, copy=False)).astype(np.int32, copy=False)
                p = pd.Series(p_int, index=df.index)
            else:
                p = p.astype(np.float64).copy()

            tmp = tmp.round(TS_DECIMALS)
            sal = sal.round(TS_DECIMALS)
            oxy = oxy.round(OXY_DECIMALS)
            sig = sig.round(UNC_DECIMALS)

            # ---------- Assign matched_depth1 (use integer Pressure for consistent binning/output) ----------
            p_arr = p.to_numpy(dtype=np.float64, copy=False)
            r = depth_range_vec(p_arr)
            matched = match_depth1_firsthit(p_arr, r)

            df["matched_depth1"] = matched
            df = df.dropna(subset=["matched_depth1"]).copy()
            if df.empty:
                continue
            df["matched_depth1"] = df["matched_depth1"].astype(np.int32)

            # ---------- Build final output (strict columns + formatted values) ----------
            # The Series still carry the pre-drop index; column assignment
            # aligns on the index, so only the surviving rows are taken.
            df["Latitude"] = lat
            df["Longitude"] = lon
            df["Pressure"] = p
            df["Temperature"] = tmp
            df["Salinity"] = sal
            df["Oxygen"] = oxy
            df["Source"] = src_norm  # write Source in normalized uppercase
            df["sigma_interp"] = sig

            # ---------- Grouped append-write by depth ----------
            for depth1, g in df.groupby("matched_depth1", sort=False):
                depth1 = int(depth1)
                out_dir = os.path.join(ROOT_DIR, f"{depth1}dbar")
                out_csv = os.path.join(out_dir, f"depth{depth1}.csv")

                if out_csv not in validated_outputs:
                    ensure_out_header_compatible(out_csv)
                    validated_outputs.add(out_csv)

                g_out = g[OUTPUT_COLUMNS]
                write_header = not os.path.isfile(out_csv)

                g_out.to_csv(
                    out_csv,
                    index=False,
                    mode="w" if write_header else "a",
                    header=write_header,
                    encoding=ENCODING
                )
                total_rows_written += len(g_out)
    finally:
        # Release the input file handle even when a chunk fails part-way.
        reader.close()

    per_source_profiles = {s: len(st) for s, st in profile_hash_sets.items()}

    return {
        "input_file": csv_path,
        "rows_in": int(total_rows_in),
        "rows_after_source_filter": int(total_rows_after_source_filter),
        "rows_after_date_valid": int(total_rows_after_date_valid),
        "rows_after_pressure_valid": int(total_rows_after_pressure_valid),
        "rows_written": int(total_rows_written),
        "profiles_by_source": per_source_profiles
    }


# =========================
# Main entry
# =========================
def main():
    """
    Run the layered export for every file in INPUT_CSVS and write two logs.

    Logs written to ROOT_DIR:
      - profile_counts_by_source.csv: one row per (input file, Source) with the
        number of distinct (Date, Lat, Lon) keys; a file without any key gets
        one row with an empty Source and a count of 0
      - file_processing_summary.csv: the row counters of each processed file

    Input files that do not exist are skipped and appear in neither log.
    """
    ensure_depth_folders()

    # Explicit column lists keep the header row in the logs even when no input
    # file was processed and the row lists are empty.
    log_columns = ["input_file", "Source", "n_profiles_DateLatLon"]
    summary_columns = [
        "input_file", "rows_in", "rows_after_source_filter",
        "rows_after_date_valid", "rows_after_pressure_valid", "rows_written",
    ]

    log_rows = []
    file_summaries = []

    for f in INPUT_CSVS:
        res = process_one_file(f)
        if res is None:
            continue

        file_summaries.append({col: res[col] for col in summary_columns})

        prof = res["profiles_by_source"]
        if not prof:
            log_rows.append({"input_file": res["input_file"], "Source": "", "n_profiles_DateLatLon": 0})
        else:
            for s, n in sorted(prof.items(), key=lambda x: x[0]):
                log_rows.append({"input_file": res["input_file"], "Source": s, "n_profiles_DateLatLon": int(n)})

    log_path = os.path.join(ROOT_DIR, "profile_counts_by_source.csv")
    pd.DataFrame(log_rows, columns=log_columns).to_csv(log_path, index=False, encoding=ENCODING)

    summary_path = os.path.join(ROOT_DIR, "file_processing_summary.csv")
    pd.DataFrame(file_summaries, columns=summary_columns).to_csv(summary_path, index=False, encoding=ENCODING)

    print("\n[DONE] Layered export of metadata CSVs completed.")
    print(f"[LOG] Profile counts: {log_path}")
    print(f"[LOG] File processing summary: {summary_path}")
    print("[INFO] Per-depth outputs include sigma_interp after Source.")


# Run the export only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()