<a href="https://colab.research.google.com/github/heytian/d2d-oco3-tools/blob/main/nc4-extract-NDGL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NC4 to CSV - CO2**

Work in progress as of Feb 25, 2026

This notebook batch-processes netCDF (.nc4) files of OCO-3 CO2 Lite data (from https://oco2.gesdisc.eosdis.nasa.gov/data/OCO3_DATA/OCO3_L2_Lite_FP.11r/) to create a CSV for SAM, using the Nadir & Glint CO2 working code from Feb 19, 2026 (https://github.com/heytian/d2d-oco3-tools/blob/main/nc4-extract-NDGL.ipynb) as a reference.

If the run ends with "No data found!", try clearing your browser history for the day, and double-check that your Earthdata credentials work when downloading directly from the GES DISC source.

**IMPORTANT: Clear all outputs and end runtime session before saving to Colab or Github to avoid exposing your Earthdata credentials!**

In [None]:
# Run this to write .netrc to Colab for your Earthdata credentials. RESTART SESSION AND CLEAR OUTPUTS AFTER USE!

import getpass
import os

# Both prompts use getpass so neither value is echoed into the cell output.
u = getpass.getpass("Earthdata Username: ")
p = getpass.getpass("Earthdata Password: ")

netrc_path = os.path.expanduser("~/.netrc")
# Create the file owner-only (0o600) from the start so the password is never
# on disk with broader default permissions.
netrc_fd = os.open(netrc_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(netrc_fd, "w") as f:
    # os.open() applies the mode only when it creates the file, so also
    # tighten a pre-existing ~/.netrc before the credentials are written.
    os.chmod(netrc_path, 0o600)
    f.write("machine urs.earthdata.nasa.gov\n"
            f"  login {u}\n"
            f"  password {p}\n")

# Make sure the cookie file exists (append mode keeps any existing content)
# and is readable by the owner only.
cookie_path = os.path.expanduser("~/.urs_cookies")
with open(cookie_path, "a"):
    pass
os.chmod(cookie_path, 0o600)

print("Credentials loaded securely.")


In [None]:
# Install the third-party packages that the processing cell imports
# (pycountry and timezonefinder).
!pip install pycountry
!pip install timezonefinder

In [None]:
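# Uncomment the lines below to (re)mount Google Drive; the reference CSV read by
# the processing cell is loaded from a path under /content/drive.
# !rm -rf /content/drive

# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)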

20260219 1715h ET integrated spatial join with Rob's centroids; working!

In [None]:
import os
import io
import time
import h5py
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import wkt
from tqdm import tqdm
from getpass import getpass
from urllib.parse import urljoin
import requests
from concurrent.futures import ThreadPoolExecutor
import pycountry
from functools import partial
import numpy.lib.recfunctions as rfn
from timezonefinder import TimezoneFinder
import pytz


# Root URL that per-year product directories are joined onto.
BASE_URL = "https://oco2.gesdisc.eosdis.nasa.gov/data/OCO3_DATA/"

# Directory that receives the combined CSV; created up front if missing.
OUTPUT_DIR = "./output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Rob's centroid csv; amend location to your local file path
ref_data = pd.read_csv(
    "/content/drive/MyDrive/Shortcuts/csv_xlsx/20260129_from_Rob/clasp_report.csv"
)

# Parse the WKT text in "Site Shape WKT" into geometries so the reference
# table can be used in spatial joins (coordinates declared as EPSG:4326).
ref_geodata = gpd.GeoDataFrame(
    ref_data,
    geometry=ref_data["Site Shape WKT"].map(wkt.loads),
    crs="EPSG:4326",
)


def get_earthdata_session():
    """Create the ``requests`` session used for all Earthdata requests.

    If ``~/.netrc`` exists, the session is returned with ``trust_env`` enabled
    and no explicit credentials attached. Otherwise the user is prompted for a
    username and password, which are stored as the session's ``auth`` tuple.

    Returns:
        requests.Session: The configured session.
    """
    earthdata = requests.Session()

    if not os.path.exists(os.path.expanduser("~/.netrc")):
        # No credentials file: ask interactively (username first, then password).
        earthdata.auth = (
            input("Earthdata username: "),
            getpass("Earthdata password: "),
        )
        return earthdata

    earthdata.trust_env = True
    return earthdata


session = get_earthdata_session()

def list_remote_nc4(product_dir):
    """Return the sorted ``.nc4`` link targets found in a remote directory listing.

    Args:
        product_dir: URL of the directory listing page, fetched with the shared
            module-level ``session``.

    Returns:
        A sorted list of the unique ``href`` targets that end in ``.nc4``.
        If the listing cannot be fetched, the reason is printed and an empty
        list is returned so that callers can skip this directory.
    """
    try:
        r = session.get(product_dir, timeout=120)
        r.raise_for_status()
    except requests.RequestException as err:
        # Best effort: one unreachable listing must not abort a multi-year run,
        # but say why so an empty result is not mistaken for "no data".
        print(f"Could not list {product_dir}: {err}")
        return []

    # Test each href target itself rather than the surrounding HTML line, so
    # sidecar links such as "*.nc4.xml" are excluded, lines holding several
    # links are handled, and a line without an href cannot raise.
    links = set()
    for fragment in r.text.split('href="')[1:]:
        target = fragment.split('"', 1)[0]
        if target.endswith(".nc4"):
            links.add(target)
    return sorted(links)

def safe_open_h5(bio):
    """Open an in-memory download with h5py after checking its leading bytes.

    Args:
        bio: Seekable binary file-like object holding the downloaded bytes.

    Returns:
        An ``h5py.File`` opened read-only on ``bio``.

    Raises:
        OSError: If the content starts like an HTML page, or does not start
            with the 8-byte HDF5 signature. ``bio`` is left at offset 0.
    """
    bio.seek(0)
    head = bio.read(15)
    bio.seek(0)  # always hand the buffer on (or back) rewound

    # An HTML body here means a web page was served in place of the data file.
    if head.startswith((b'<!DOCTYPE', b'<html')):
        raise OSError("Downloaded file is HTML, not HDF5")

    if head[:8] != b'\x89HDF\r\n\x1a\n':
        raise OSError("Not a valid HDF5 file")

    return h5py.File(bio, 'r')

def parse_city_country_pycountry(target_names):
    """Split underscore-separated target names into city and country Series.

    Every ``'fossil_'`` substring is removed, the remainder is split on ``'_'``,
    and the longest trailing run of one to three tokens that matches a
    pycountry country name (or common name, case-insensitively) is taken as
    the country. When nothing matches, the last token is used as the country
    and everything before it as the city.

    Args:
        target_names: pandas Series of target-name strings.

    Returns:
        Tuple ``(city, country)`` of pandas Series aligned with the input.
    """
    known_countries = {entry.name.lower() for entry in pycountry.countries} | {
        entry.common_name.lower()
        for entry in pycountry.countries
        if hasattr(entry, "common_name")
    }

    def split_tokens(tokens):
        # Longest suffix first, so multi-word countries win over their last word.
        for width in (3, 2, 1):
            if width > len(tokens):
                continue
            suffix = tokens[-width:]
            if ' '.join(suffix).lower() in known_countries:
                return ' '.join(tokens[:-width]), ' '.join(suffix)
        # No known country found: treat the final token as the country.
        return ' '.join(tokens[:-1]), tokens[-1]

    tokens = target_names.str.replace('fossil_', '', regex=False).str.split('_')
    pairs = tokens.apply(split_tokens)
    city = pairs.apply(lambda pair: pair[0])
    country = pairs.apply(lambda pair: pair[1])
    return city, country

def read_variable_chunked(h5file, var, chunk=50000):
    """Yield consecutive slices of ``h5file[var]`` along its first axis.

    Args:
        h5file: Mapping-like object (e.g. an open HDF5 file) indexed by name.
        var: Name of the dataset to read.
        chunk: Maximum number of leading-axis entries per yielded slice.

    Yields:
        ``dataset[start:stop]`` for each consecutive window of at most
        ``chunk`` entries, in order.
    """
    dataset = h5file[var]
    total = dataset.shape[0]
    yield from (
        dataset[lo:min(lo + chunk, total)]
        for lo in range(0, total, chunk)
    )

tf = TimezoneFinder()


def _add_time_columns(df):
    """Add ``datetime``, ``local_time`` and ``timezone`` columns to *df* in place."""
    # The first 14 digits of the sounding id are parsed as YYYYMMDDhhmmss.
    df["datetime"] = pd.to_datetime(
        df["sounding_id"].astype(str).str[:14],
        format="%Y%m%d%H%M%S"
    )

    local_times = []
    tz_abbrs = []
    for lon, lat, naive_dt in zip(df.longitude, df.latitude, df.datetime):
        # The parsed timestamps are treated as UTC, as in the original code.
        utc_dt = naive_dt.tz_localize("UTC")
        tz_name = tf.timezone_at(lng=lon, lat=lat)

        if tz_name:
            local_dt = utc_dt.tz_convert(tz_name)
            local_times.append(local_dt.strftime('%m/%d/%y %H:%M'))
            tz_abbrs.append(local_dt.strftime('%Z'))
        else:
            # No timezone found for this coordinate: fall back to UTC.
            local_times.append(utc_dt.strftime('%m/%d/%y %H:%M'))
            tz_abbrs.append("UTC")

    df["local_time"] = local_times
    df["timezone"] = tz_abbrs


def read_filter_remote_file(filename, product_dir, retries=3, fossil_only=True):
    """Download one remote file and return its filtered soundings.

    The file is fetched into memory with the shared ``session``, read in
    chunks, reduced to ND/GL soundings whose ``xco2_quality_flag`` is 0, and
    spatially joined against ``ref_geodata`` (points within reference shapes).

    Args:
        filename: File name (link target) relative to ``product_dir``.
        product_dir: URL of the directory that contains the file.
        retries: Number of download/processing attempts before giving up.
        fossil_only: When True, keep only rows whose joined reference
            ``"Target Name"`` starts with ``"fossil"``.

    Returns:
        A DataFrame of matching soundings with ``datetime``, ``local_time`` and
        ``timezone`` columns added, or ``None`` if nothing matched or every
        attempt failed (in which case the last error is printed).
        ``operation_mode`` holds text labels such as ``"ND"`` so the CSV does
        not contain ``b'ND'`` byte reprs.
    """
    mode_labels = {0: "ND", 1: "GL", 2: "TG", 3: "XS", 4: "AM"}
    variables = (
        'sounding_id',
        'xco2',
        'Sounding/operation_mode',
        'xco2_quality_flag',
        'Sounding/target_name',
        'latitude',
        'longitude',
    )
    last_error = None

    for attempt in range(retries):
        try:
            url = urljoin(product_dir, filename)
            with session.get(url, stream=True, timeout=120) as r:
                r.raise_for_status()
                bio = io.BytesIO(r.content)

            with safe_open_h5(bio) as f:
                dfs = []
                chunk_readers = [read_variable_chunked(f, name) for name in variables]

                for sid_chunk, x_chunk, op_chunk, q_chunk, tn_chunk, lat_chunk, lon_chunk in zip(*chunk_readers):
                    df_chunk = pd.DataFrame({
                        "sounding_id": sid_chunk,
                        "xco2": x_chunk,
                        "operation_mode": [mode_labels.get(int(v), "UN") for v in op_chunk],
                        "xco2_quality_flag": q_chunk,
                        # Accept both bytes and already-decoded strings.
                        "target_name": [
                            t.decode("utf-8").strip() if isinstance(t, bytes) else str(t).strip()
                            for t in tn_chunk
                        ],
                        "latitude": lat_chunk,
                        "longitude": lon_chunk
                    })

                    # ND/GL + good quality
                    df_chunk = df_chunk[
                        df_chunk["operation_mode"].isin(["ND", "GL"]) &
                        (df_chunk["xco2_quality_flag"] == 0)
                    ]

                    if df_chunk.empty:
                        continue

                    # Spatial filtering
                    points_gdf = gpd.GeoDataFrame(
                        df_chunk,
                        geometry=gpd.points_from_xy(df_chunk.longitude, df_chunk.latitude),
                        crs="EPSG:4326"
                    )

                    joined = gpd.sjoin(
                        points_gdf,
                        ref_geodata[["Target Name", "geometry"]],
                        how="inner",
                        predicate="within"
                    )

                    if not joined.empty:
                        dfs.append(joined.drop(columns="geometry"))

                if not dfs:
                    return None

                df_all = pd.concat(dfs, ignore_index=True)

                # Fossil filter; na=False drops rows with a missing reference
                # name, and .copy() lets the new columns be assigned below
                # without a SettingWithCopyWarning.
                if fossil_only:
                    df_all = df_all[
                        df_all["Target Name"].str.startswith("fossil", na=False)
                    ].copy()

                if df_all.empty:
                    return None

                _add_time_columns(df_all)
                return df_all

        except Exception as err:
            # Deliberate best effort: one bad file must not stop the batch.
            # Remember the error so the failure is reported, and only pause
            # when another attempt will follow.
            last_error = err
            if attempt + 1 < retries:
                time.sleep(1)

    if last_error is not None:
        print(f"Giving up on {filename} after {retries} attempt(s): {last_error!r}")
    return None

def combine_remote_files_years(years, product_template="OCO3_L2_Lite_FP.11r/{year}/", n_files=None, output_csv="CO2_NDGL.csv", max_workers=4, fossil_only=True):
    """Process the remote files for several years and write one combined CSV.

    Args:
        years: Iterable of years substituted into ``product_template``.
        product_template: Path template, relative to ``BASE_URL``, of the
            per-year directory; must contain ``{year}``.
        n_files: If given, only the first ``n_files`` listed files of each
            year are processed; ``None`` processes all of them.
        output_csv: Name of the CSV file written inside ``OUTPUT_DIR``.
        max_workers: Number of threads processing files concurrently.
        fossil_only: Forwarded to ``read_filter_remote_file``; the default
            (True) keeps the previous behaviour.

    Returns:
        The combined DataFrame, which is also written to CSV, or ``None``
        (after printing "No data found!") when no file produced any rows.
    """
    all_data = []
    for year in years:
        product_dir = urljoin(BASE_URL, product_template.format(year=year))
        remote_files = list_remote_nc4(product_dir)
        if not remote_files:
            # Name the skipped year so a failed listing is not mistaken for a
            # year that simply has no matching soundings.
            print(f"No .nc4 files listed for {year} at {product_dir}; skipping.")
            continue
        selected = remote_files if n_files is None else remote_files[:n_files]
        func = partial(read_filter_remote_file, product_dir=product_dir, fossil_only=fossil_only)
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(tqdm(executor.map(func, selected), total=len(selected)))
        all_data.extend(r for r in results if r is not None and not r.empty)

    if not all_data:
        print("No data found!")
        return None

    df = pd.concat(all_data, ignore_index=True)

    df['target_name'] = df['target_name'].apply(lambda x: x.decode('utf-8') if isinstance(x, (bytes, bytearray)) else x)
    # NOTE(review): city/country are parsed from the file's own 'target_name'
    # column, while the fossil filter uses the joined reference "Target Name".
    # Confirm that 'target_name' is the intended source here.
    city, country = parse_city_country_pycountry(df['target_name'])
    df['city'] = city
    df['country'] = country
    out_path = os.path.join(OUTPUT_DIR, output_csv)
    df.to_csv(out_path, index=False)
    print(out_path)
    return df


# Years to process (2019 through 2025 inclusive). The call is left as the
# final bare expression so the notebook displays the returned DataFrame.
years = list(range(2019, 2026))
combine_remote_files_years(years, max_workers=2, n_files=None)


# **NC4 to CSV - SIF**

(As of Feb 25 2026, yet to be written)

This script will batch-process netCDF (.nc4) files of OCO-3 SIF Lite data (from https://oco2.gesdisc.eosdis.nasa.gov/data/OCO3_DATA/OCO3_L2_Lite_SIF.11r/) to create a CSV containing SAM data.


**IMPORTANT: Clear all outputs and end runtime session before saving to Colab or Github to avoid exposing your Earthdata credentials!**