<a href="https://colab.research.google.com/github/heytian/d2d-oco3-tools/blob/main/nc4-extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NC4 to CSV - CO2**

This notebook batch-processes netCDF (.nc4) files of OCO-3 CO2 Lite data (from https://oco2.gesdisc.eosdis.nasa.gov/data/OCO3_DATA/OCO3_L2_Lite_FP.11r/) and creates a CSV of high-quality Snapshot Area Map (SAM) soundings whose target name is tagged "fossil".

**IMPORTANT: Clear all outputs and end the runtime session before saving to Colab or GitHub, to avoid exposing your Earthdata credentials!**

In [None]:
# Run this to write .netrc to Colab for your Earthdata credentials. RESTART SESSION AND CLEAR OUTPUTS AFTER USE!

import getpass
import os

u = getpass.getpass("Earthdata Username: ")
p = getpass.getpass("Earthdata Password: ")

# Create the credential file with owner-only permissions from the start.
# A plain open() followed by chmod() leaves a short window in which the file
# holding the password has the default (possibly group/world-readable) mode.
netrc_path = os.path.expanduser("~/.netrc")
netrc_fd = os.open(netrc_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(netrc_fd, "w") as f:
    # The mode given to os.open() only applies to newly created files, so
    # tighten a pre-existing .netrc before the password is written into it.
    os.chmod(netrc_path, 0o600)
    f.write("machine urs.earthdata.nasa.gov\n"
            f"  login {u}\n"
            f"  password {p}\n")

# Make sure the cookie file exists (existing content is kept) and is private.
cookie_path = os.path.expanduser("~/.urs_cookies")
cookie_fd = os.open(cookie_path, os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o600)
os.close(cookie_fd)
os.chmod(cookie_path, 0o600)

print("Credentials loaded securely.")


In [None]:
!pip install pycountry

20260219 fix below:

In [None]:
import os
import io
import time
import h5py
import numpy as np
import pandas as pd
from tqdm import tqdm
from getpass import getpass
from urllib.parse import urljoin
import requests
from concurrent.futures import ThreadPoolExecutor
import pycountry
from functools import partial
import numpy.lib.recfunctions as rfn

# Root URL of the OCO-3 data archive; product/year paths are joined onto it.
BASE_URL = "https://oco2.gesdisc.eosdis.nasa.gov/data/OCO3_DATA/"
# Directory the combined CSV is written to, relative to the working directory.
OUTPUT_DIR = "./output"
# Create the output directory up front; no error if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def get_earthdata_session():
    """Build the requests session used for the Earthdata downloads.

    When ``~/.netrc`` exists, the session carries no explicit credentials and
    ``trust_env`` is switched on. Otherwise the user is prompted for a
    username and password, which are attached to the session as ``auth``.

    Returns:
        A ``requests.Session``.
    """
    earthdata = requests.Session()
    if not os.path.exists(os.path.expanduser("~/.netrc")):
        # No stored credentials: ask interactively.
        username = input("Earthdata username: ")
        password = getpass("Earthdata password: ")
        earthdata.auth = (username, password)
    else:
        earthdata.trust_env = True
    return earthdata


# Module-level session shared by the listing and download helpers.
session = get_earthdata_session()

def list_remote_nc4(product_dir):
    """List the ``.nc4`` files linked from a remote directory index page.

    Args:
        product_dir: URL of the directory listing to scan.

    Returns:
        Sorted list of unique ``href`` values that end in ``.nc4``. Links such
        as ``*.nc4.xml`` are excluded. An empty list is returned (after
        printing a warning) when the listing cannot be fetched.
    """
    try:
        r = session.get(product_dir, timeout=60)
        r.raise_for_status()
    except requests.RequestException as e:
        # Stay best-effort so the caller can skip this directory, but say why.
        print(f"Warning: could not list {product_dir}: {e}")
        return []

    files = set()
    for line in r.text.splitlines():
        # Test the link target itself rather than the whole HTML line: a line
        # of markup does not end with ".xml", so a line-level check lets the
        # ".nc4.xml" links through.
        for fragment in line.split('href="')[1:]:
            href = fragment.split('"')[0]
            if href.endswith(".nc4"):
                files.add(href)
    return sorted(files)

def safe_open_h5(bio):
    """Open an in-memory download with h5py after checking its first bytes.

    Args:
        bio: Seekable binary file-like object holding the downloaded bytes.

    Returns:
        An ``h5py.File`` opened read-only on ``bio``.

    Raises:
        OSError: If the payload starts like an HTML page, or if it does not
            begin with the 8-byte HDF5 signature.
    """
    bio.seek(0)
    head = bio.read(15)
    bio.seek(0)
    # HTML payloads get their own message, separate from other non-HDF5 data.
    if head.startswith((b'<!DOCTYPE', b'<html')):
        raise OSError("Downloaded file is HTML, not HDF5")
    signature = bio.read(8)
    bio.seek(0)  # leave the stream at the start for h5py
    if signature == b'\x89HDF\r\n\x1a\n':
        return h5py.File(bio, 'r')
    raise OSError("Not a valid HDF5 file")

def parse_city_country_pycountry(target_names):
    """Split target names into a city part and a country part.

    Every ``fossil_`` substring is removed and the remainder is split on
    underscores. The trailing 3, 2 or 1 tokens (tried in that order) are
    taken as the country when, joined by spaces and lower-cased, they match a
    pycountry ``name`` or ``common_name``; the tokens before them form the
    city. When nothing matches, the last token is the country and everything
    before it is the city.

    Args:
        target_names: pandas Series of target-name strings.

    Returns:
        Tuple ``(city, country)`` of Series indexed like ``target_names``.
    """
    known_countries = set()
    for entry in pycountry.countries:
        known_countries.add(entry.name.lower())
        if hasattr(entry, "common_name"):
            known_countries.add(entry.common_name.lower())

    def split_tokens(tokens):
        # Longest candidate first, so multi-word country names win.
        for width in (3, 2, 1):
            if len(tokens) >= width:
                tail = tokens[-width:]
                if ' '.join(tail).lower() in known_countries:
                    return ' '.join(tokens[:-width]), ' '.join(tail)
        # No known country matched: fall back to the last token.
        return ' '.join(tokens[:-1]), tokens[-1]

    tokens_per_name = target_names.str.replace('fossil_', '', regex=False).str.split('_')
    pairs = tokens_per_name.apply(split_tokens)
    return pairs.apply(lambda pair: pair[0]), pairs.apply(lambda pair: pair[1])

def read_variable_chunked(h5file, var, chunk=50000):
    """Yield consecutive slices of ``h5file[var]`` along its first axis.

    Args:
        h5file: Mapping-like object indexed by variable name.
        var: Name (path) of the dataset to read.
        chunk: Maximum number of leading-axis entries per yielded slice.

    Yields:
        ``dataset[start:end]`` for each window; the last one may be shorter.
    """
    dataset = h5file[var]
    total = dataset.shape[0]
    yield from (
        dataset[lo:min(lo + chunk, total)]
        for lo in range(0, total, chunk)
    )

def read_filter_remote_file(filename, product_dir, retries=3):
    """Download one CO2 Lite file and keep its good-quality ``AM`` soundings.

    The file is fetched into memory and read in lockstep chunks. Rows are kept
    when the decoded operation mode is ``b'AM'`` and ``xco2_quality_flag`` is
    0. A ``datetime`` field, parsed from the first 14 digits of
    ``sounding_id`` (``%Y%m%d%H%M%S``), is appended to the kept rows.

    Args:
        filename: File name (href) relative to ``product_dir``.
        product_dir: URL of the directory that contains the file.
        retries: Number of download/read attempts before giving up.

    Returns:
        NumPy structured array of the kept soundings, including ``datetime``.
        An empty array without the ``datetime`` field is returned when no row
        is kept or when every attempt fails.
    """
    dtype = [
        ('sounding_id', 'int64'),
        ('xco2', 'f8'),
        ('operation_mode', 'S2'),
        ('xco2_quality_flag', 'i1'),
        ('target_name', 'S100'),
        ('latitude', 'f8'),
        ('longitude', 'f8'),
    ]
    # Integer operation-mode codes -> two-letter labels; other codes become b'UN'.
    mapping = {0: b'ND', 1: b'GL', 2: b'TG', 3: b'XS', 4: b'AM'}
    url = urljoin(product_dir, filename)

    for attempt in range(retries):
        try:
            with session.get(url, stream=True, timeout=120) as r:
                r.raise_for_status()
                bio = io.BytesIO(r.content)

            out = []
            with safe_open_h5(bio) as f:
                for sid_chunk, x_chunk, op_chunk, q_chunk, tn_chunk, lat_chunk, lon_chunk in zip(
                    read_variable_chunked(f, 'sounding_id'),
                    read_variable_chunked(f, 'xco2'),
                    read_variable_chunked(f, 'Sounding/operation_mode'),
                    read_variable_chunked(f, 'xco2_quality_flag'),
                    read_variable_chunked(f, 'Sounding/target_name'),
                    read_variable_chunked(f, 'latitude'),
                    read_variable_chunked(f, 'longitude')
                ):
                    op_decoded = np.array([mapping.get(int(v), b'UN') for v in op_chunk])
                    chunk_struct = np.zeros(len(sid_chunk), dtype=dtype)
                    chunk_struct['sounding_id'] = sid_chunk
                    chunk_struct['xco2'] = x_chunk
                    chunk_struct['operation_mode'] = op_decoded
                    chunk_struct['xco2_quality_flag'] = q_chunk
                    chunk_struct['target_name'] = tn_chunk
                    chunk_struct['latitude'] = lat_chunk
                    chunk_struct['longitude'] = lon_chunk
                    mask = (chunk_struct['operation_mode'] == b'AM') & (chunk_struct['xco2_quality_flag'] == 0)
                    out.append(chunk_struct[mask])

            if not out:
                return np.empty(0, dtype=dtype)
            data = np.concatenate(out)
            if data.size == 0:
                # Nothing survived the filter: skip the datetime parsing, which
                # has nothing to parse. Callers only keep non-empty results.
                return data

            dt_strings = np.array([str(s)[:14] for s in data['sounding_id']])
            return rfn.append_fields(
                data,
                'datetime',
                pd.to_datetime(dt_strings, format='%Y%m%d%H%M%S'),
                usemask=False,
            )
        except (requests.HTTPError, OSError, KeyError) as e:
            # Report the failure instead of dropping the file silently.
            print(f"Warning: {filename} attempt {attempt + 1} failed: {e}")
            if attempt < retries - 1:
                time.sleep(1)  # pause only when another attempt follows

    return np.empty(0, dtype=dtype)

def combine_remote_files_years(years, product_template="OCO3_L2_Lite_FP.11r/{year}/", n_files=None, output_csv="CO2_combined.csv", max_workers=4):
    """Download, filter and aggregate CO2 Lite files for several years.

    For each year the remote directory is listed and the selected files are
    processed with ``read_filter_remote_file`` in a thread pool. Rows whose
    target name starts with ``fossil_`` are kept, city/country columns are
    derived from the target name, and rows are aggregated per
    ``(target_name, datetime)``: median ``xco2``, mean latitude/longitude and
    the first value of the remaining columns. The result is written as CSV
    into ``OUTPUT_DIR``.

    Args:
        years: Iterable of years substituted into ``product_template``.
        product_template: Path template, relative to ``BASE_URL``, with a
            ``{year}`` placeholder.
        n_files: If given, only the first ``n_files`` listed files of each
            year are processed; ``None`` processes all of them.
        output_csv: File name of the CSV written inside ``OUTPUT_DIR``.
        max_workers: Number of download threads per year.

    Returns:
        The aggregated DataFrame, or ``None`` when no file produced any row.
    """
    all_data = []
    for year in years:
        product_dir = urljoin(BASE_URL, product_template.format(year=year))
        remote_files = list_remote_nc4(product_dir)
        if not remote_files:
            continue
        selected = remote_files if n_files is None else remote_files[:n_files]
        func = partial(read_filter_remote_file, product_dir=product_dir)
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(tqdm(executor.map(func, selected), total=len(selected)))
        all_data.extend(r for r in results if len(r) > 0)

    if not all_data:
        print("No data found!")
        return None

    df = pd.DataFrame(np.concatenate(all_data))

    def _to_text(value):
        return value.decode('utf-8') if isinstance(value, (bytes, bytearray)) else value

    # Decode every bytes column so the CSV holds "AM" rather than "b'AM'".
    for column in ('target_name', 'operation_mode'):
        df[column] = df[column].apply(_to_text)

    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of the unfiltered one.
    df = df[df['target_name'].str.startswith('fossil_')].copy()
    city, country = parse_city_country_pycountry(df['target_name'])
    df['city'] = city
    df['country'] = country

    agg = df.groupby(['target_name', 'datetime'], as_index=False).agg({
        'xco2': 'median',
        'latitude': 'mean',
        'longitude': 'mean',
        'operation_mode': 'first',
        'xco2_quality_flag': 'first',
        'city': 'first',
        'country': 'first',
    })
    out_path = os.path.join(OUTPUT_DIR, output_csv)
    agg.to_csv(out_path, index=False)
    print(out_path)
    return agg

# Full run (disabled): uncomment the next two lines to process every listed file for 2019-2024.
# years = [2019, 2020, 2021, 2022, 2023, 2024]
# combine_remote_files_years(years, n_files=None, max_workers=2)


# Current run: only the first 10 listed files of 2025, using 2 worker threads.
years = [2025]
combine_remote_files_years(years, n_files=10, max_workers=2)


# **NC4 to CSV - SIF**

This notebook batch-processes netCDF (.nc4) files of OCO-3 SIF Lite data (from https://oco2.gesdisc.eosdis.nasa.gov/data/OCO3_DATA/OCO3_L2_Lite_SIF.11r/) and creates a CSV of high-quality Snapshot Area Map (SAM) soundings.


**IMPORTANT: Clear all outputs and end the runtime session before saving to Colab or GitHub, to avoid exposing your Earthdata credentials!**

In [None]:
# Run this to write .netrc to Colab for your Earthdata credentials. RESTART SESSION AND CLEAR OUTPUTS AFTER USE!

import getpass
import os

u = getpass.getpass("Earthdata Username: ")
p = getpass.getpass("Earthdata Password: ")

# Create the credential file with owner-only permissions from the start.
# A plain open() followed by chmod() leaves a short window in which the file
# holding the password has the default (possibly group/world-readable) mode.
netrc_path = os.path.expanduser("~/.netrc")
netrc_fd = os.open(netrc_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(netrc_fd, "w") as f:
    # The mode given to os.open() only applies to newly created files, so
    # tighten a pre-existing .netrc before the password is written into it.
    os.chmod(netrc_path, 0o600)
    f.write("machine urs.earthdata.nasa.gov\n"
            f"  login {u}\n"
            f"  password {p}\n")

# Make sure the cookie file exists (existing content is kept) and is private.
cookie_path = os.path.expanduser("~/.urs_cookies")
cookie_fd = os.open(cookie_path, os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o600)
os.close(cookie_fd)
os.chmod(cookie_path, 0o600)

print("Credentials loaded securely.")


In [None]:
pip install pycountry

In [None]:
import os
import h5py
import numpy as np
import pandas as pd
import requests
import pycountry
from tqdm import tqdm
from getpass import getpass
from urllib.parse import urljoin


# Root URL of the OCO-3 data archive; product/year paths are joined onto it.
BASE_URL = "https://oco2.gesdisc.eosdis.nasa.gov/data/OCO3_DATA/"
# Per-year directory of the SIF Lite product, relative to BASE_URL.
PRODUCT_TEMPLATE = "OCO3_L2_Lite_SIF.11r/{year}/"
# NOTE(review): this looks like a Google Drive path inside Colab — presumably
# Drive has to be mounted before this cell runs; confirm.
OUTPUT_DIR = "/content/drive/MyDrive/Shortcuts/DATA/output" # replace with your own path
# Create the output directory up front; no error if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def get_earthdata_session():
    """Build the requests session used for the Earthdata downloads.

    When ``~/.netrc`` exists, the session carries no explicit credentials and
    ``trust_env`` is switched on. Otherwise the user is prompted for a
    username and password, which are attached to the session as ``auth``.

    Returns:
        A ``requests.Session``.
    """
    earthdata = requests.Session()
    if not os.path.exists(os.path.expanduser("~/.netrc")):
        # No stored credentials: ask interactively.
        username = input("Earthdata username: ")
        password = getpass("Earthdata password: ")
        earthdata.auth = (username, password)
    else:
        earthdata.trust_env = True
    return earthdata


# Module-level session shared by the listing and download helpers.
session = get_earthdata_session()

def list_remote_nc4(product_dir):
    """List the ``.nc4`` files linked from a remote directory index page.

    Only the first ``href`` on each line of the page is considered. Links
    ending in ``.nc4.xml`` are skipped. The number of matches is printed.

    Args:
        product_dir: URL of the directory listing to scan.

    Returns:
        Sorted list of the matching ``href`` values.

    Raises:
        requests.HTTPError: If the listing request returns an error status.
    """
    response = session.get(product_dir, timeout=60)
    response.raise_for_status()

    # First link target of every line that contains one.
    hrefs = (
        row.split('href="')[1].split('"')[0]
        for row in response.text.splitlines()
        if 'href="' in row
    )
    granules = [
        target for target in hrefs
        if target.endswith(".nc4") and not target.endswith(".nc4.xml")
    ]

    print(f"Found {len(granules)} remote NC4 files.")
    return sorted(granules)

def safe_open_h5(bio):
    """Open an in-memory download with h5py after checking its signature.

    Args:
        bio: Seekable binary file-like object holding the downloaded bytes.

    Returns:
        An ``h5py.File`` opened read-only on ``bio``.

    Raises:
        OSError: If the first 8 bytes are not the HDF5 signature.
    """
    hdf5_magic = b'\x89HDF\r\n\x1a\n'
    bio.seek(0)
    header = bio.read(8)
    bio.seek(0)  # leave the stream at the start for h5py
    if header == hdf5_magic:
        return h5py.File(bio, 'r')
    raise OSError("Not a valid HDF5 file")

def parse_city_country(df):
    """Add ``city`` and ``country`` columns derived from ``sequence_name``.

    Each name is split on underscores. The last token becomes ``country``
    (empty when the name has a single token). The tokens between the first
    and the last, joined by spaces, become ``city`` (empty when the name has
    fewer than three tokens). The country token is stored as found; it is not
    normalized against pycountry.

    Args:
        df: DataFrame with a ``sequence_name`` column. Modified in place.

    Returns:
        The same ``df`` object with the two columns added.
    """
    parts = df['sequence_name'].astype(str).str.split('_')
    df['city'] = parts.apply(lambda tokens: ' '.join(tokens[1:-1]) if len(tokens) > 2 else '')
    df['country'] = parts.apply(lambda tokens: tokens[-1] if len(tokens) > 1 else '')
    return df

def read_filter_remote_file(filename, product_dir, retries=3):
    """Download one SIF Lite file and keep its good-quality ``AM`` soundings.

    The file is fetched into memory and filtered to soundings whose decoded
    measurement mode is ``b'AM'`` and whose ``SimplyGoodOrBadQualityFlag`` is
    0. Failed attempts are reported and retried with exponential back-off.

    Args:
        filename: File name (href) relative to ``product_dir``.
        product_dir: URL of the directory that contains the file.
        retries: Number of download/read attempts before giving up.

    Returns:
        DataFrame with columns ``Daily_SIF_757nm``, ``latitude``,
        ``longitude``, ``datetime``, ``measurement_mode``, ``sequence_name``,
        plus the columns added by ``parse_city_country``. ``None`` when no
        sounding passes the filter or every attempt fails.
    """
    # Imported here because this cell's import block does not provide them.
    import io
    import time

    # Integer measurement-mode codes -> two-letter labels; other codes become b'UN'.
    mapping = {0: b'ND', 1: b'GL', 2: b'TG', 3: b'AM', 4: b'XS'}
    url = urljoin(product_dir, filename)

    for attempt in range(retries):
        try:
            # Without a timeout a stalled connection would block the whole
            # batch. requests' timeout errors derive from OSError, so they are
            # caught and retried below.
            with session.get(url, stream=True, timeout=120) as r:
                r.raise_for_status()
                bio = io.BytesIO(r.content)

            with safe_open_h5(bio) as f:
                sif = f['Daily_SIF_757nm'][:]
                op_mode_arr = np.atleast_1d(f['Metadata/MeasurementMode'][:])
                qflag = f['SimplyGoodOrBadQualityFlag'][:]
                lat = f['Geolocation/latitude'][:]
                lon = f['Geolocation/longitude'][:]
                tai93 = f['Geolocation/time_tai93'][:]
                seq_idx = f['Sequences/SequencesIndex'][:]
                seq_names_all = f['Sequences/SequencesName'][:]

                op_decoded = np.array([mapping.get(int(v), b'UN') for v in op_mode_arr])

                # Filter for good quality SAMs
                mask = (op_decoded == b'AM') & (qflag == 0)
                if not np.any(mask):
                    return None

                sif = sif[mask]
                op_decoded = op_decoded[mask]
                lat = lat[mask]
                lon = lon[mask]
                tai93 = tai93[mask]
                seq_idx = seq_idx[mask]

                # Out-of-range sequence indices are labelled 'UNKNOWN'.
                sequence_names = [
                    seq_names_all[i].decode('utf-8') if 0 <= i < len(seq_names_all) else 'UNKNOWN'
                    for i in seq_idx
                ]

                # time_tai93 is applied as plain seconds since 1993-01-01.
                # NOTE(review): if the variable is true TAI, leap seconds are
                # not accounted for here — confirm whether that matters.
                epoch = pd.Timestamp("1993-01-01T00:00:00Z")
                timestamps = epoch + pd.to_timedelta(tai93.astype(float), unit="s")

                df = pd.DataFrame({
                    'Daily_SIF_757nm': sif,
                    'latitude': lat,
                    'longitude': lon,
                    'datetime': pd.to_datetime(timestamps).tz_localize(None),
                    'measurement_mode': [v.decode('utf-8') for v in op_decoded],
                    'sequence_name': sequence_names
                })

                df = parse_city_country(df)

                print(f"{filename}: kept {len(df)} AM/SAM SIF soundings")
                return df

        except (requests.HTTPError, OSError, KeyError) as e:
            print(f"Warning: {filename} attempt {attempt+1} failed: {e}")
            if attempt < retries - 1:
                time.sleep(2**attempt)  # back off only when another attempt follows

    return None

def combine_remote_files_years(years, n_files=None, max_workers=2, output_csv="combined_SIF_2019-2025.csv"):
    """Download and filter SIF Lite files for several years into one CSV.

    For each year the remote directory is listed and the selected files are
    processed with ``read_filter_remote_file`` in a thread pool. All
    non-empty per-file frames are concatenated and written into
    ``OUTPUT_DIR``.

    Args:
        years: Iterable of years substituted into ``PRODUCT_TEMPLATE``.
        n_files: If given, only the first ``n_files`` listed files of each
            year are processed; ``None`` processes all of them.
        max_workers: Number of download threads per year. Falsy values and
            values below 1 fall back to a single worker.
        output_csv: File name of the CSV written inside ``OUTPUT_DIR``.

    Returns:
        The combined DataFrame, or ``None`` when no file produced any row.
    """
    # Imported here because this cell's import block does not provide them.
    from concurrent.futures import ThreadPoolExecutor
    from functools import partial

    # Earlier versions ignored max_workers, so tolerate any value here.
    workers = max(1, int(max_workers)) if max_workers else 1

    all_dfs = []
    for year in years:
        print(f"\nProcessing year: {year}")
        product_dir = urljoin(BASE_URL, PRODUCT_TEMPLATE.format(year=year))
        remote_files = list_remote_nc4(product_dir)
        selected_files = remote_files if n_files is None else remote_files[:n_files]
        if not selected_files:
            continue

        fetch = partial(read_filter_remote_file, product_dir=product_dir)
        with ThreadPoolExecutor(max_workers=workers) as executor:
            # executor.map keeps the results in the order of selected_files.
            results = list(tqdm(executor.map(fetch, selected_files), total=len(selected_files)))

        dfs = [df for df in results if df is not None and len(df) > 0]
        if dfs:
            all_dfs.append(pd.concat(dfs, ignore_index=True))

    if not all_dfs:
        print("No data found!")
        return None

    df_all = pd.concat(all_dfs, ignore_index=True)
    out_path = os.path.join(OUTPUT_DIR, output_csv)
    df_all.to_csv(out_path, index=False)
    print(f"\nSaved combined SIF all-values CSV: {out_path}")
    return df_all

# Years to process.
# NOTE(review): 2021 is absent from this list although the default output
# file name says 2019-2025 — confirm whether that is intentional.
years = [2019,2020,2022,2023,2024,2025]
# n_files=None processes every listed file of each year.
combine_remote_files_years(years, n_files=None, max_workers=2)
