<a href="https://colab.research.google.com/github/heytian/d2d-oco3-tools/blob/main/nc4_plot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# NOTE: Work in progress - still needs debugging (as of 2025-11-11).

# -----------------------
# 0. Install prerequisites
# -----------------------
!pip install xarray netcdf4 h5netcdf requests tqdm --quiet

# -----------------------
# 1. Mount Google Drive
# -----------------------
from pathlib import Path
from getpass import getpass
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV output folder lives under it.
drive.mount('/content/drive')

# Destination directory for the generated CSV files.
output_folder = Path("/content/drive/MyDrive/OCO3_CSV")
# Create the directory together with any missing parents; an already
# existing directory is not an error.
output_folder.mkdir(parents=True, exist_ok=True)

# -----------------------
# 2. Earthdata credentials (interactive)
# -----------------------
# Prompt for the credentials used as HTTP basic auth by the request helpers.
# Surrounding whitespace (easy to pick up when pasting) is stripped from the
# username so it is not sent as part of the credential.
earthdata_user = input("Enter Earthdata username: ").strip()
# getpass keeps the password from being echoed into the notebook output.
# The password is used exactly as typed.
earthdata_pass = getpass("Enter Earthdata password: ")

# -----------------------
# 3. Imports
# -----------------------
import requests
import xarray as xr
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

# -----------------------
# 4. Functions
# -----------------------
def list_files_for_year(year):
    """List the NC4 file links published for one year on the GESDISC server.

    Fetches the HTML directory listing for ``year`` and collects the ``href``
    of every anchor that ends in ``.nc4``.

    Args:
        year: Year component of the listing URL (string or int).

    Returns:
        The ``.nc4`` hrefs in listing order with duplicates removed. An empty
        list is returned (after printing a message) when the request fails or
        the server does not answer with HTTP 200.
    """
    base_url = f"https://oco2.gesdisc.eosdis.nasa.gov/data/OCO3_DATA/OCO3_L2_Lite_FP.11r/{year}/"
    try:
        # A timeout keeps an unresponsive server from hanging the notebook.
        resp = requests.get(
            base_url, auth=(earthdata_user, earthdata_pass), timeout=60
        )
    except requests.RequestException as exc:
        print(f"Failed to list files for {year}: {exc}")
        return []
    if resp.status_code != 200:
        print(f"Failed to list files for {year}: HTTP {resp.status_code}")
        return []

    soup = BeautifulSoup(resp.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing such a tag
    # with a['href'] would raise KeyError.
    hrefs = [
        a['href']
        for a in soup.find_all('a', href=True)
        if a['href'].endswith('.nc4')
    ]
    # dict.fromkeys drops repeated links while preserving order, so a file
    # that is linked more than once in the listing is only returned once.
    return list(dict.fromkeys(hrefs))

def process_nc4_url(url, variables=None):
    """Download a remote NC4 file over HTTPS and convert it to a DataFrame.

    The file is fetched with an authenticated ``requests`` session and opened
    from memory. The previous implementation passed ``session=`` to
    ``xr.open_dataset`` with the ``netcdf4`` engine, which does not accept
    that argument, so every call failed.

    Args:
        url: HTTPS URL of the ``.nc4`` file.
        variables: Optional list of variable names to keep. When empty or
            ``None`` the whole dataset is converted.

    Returns:
        A ``pandas.DataFrame`` with the index reset, or ``None`` (after
        printing a message) when the download or open fails or a requested
        variable is not present in the file.
    """
    import io  # local import: only needed to wrap the downloaded bytes

    try:
        with requests.Session() as session:
            session.auth = (earthdata_user, earthdata_pass)
            # NOTE(review): Earthdata downloads may redirect to the URS login
            # host, and requests drops basic-auth credentials on cross-host
            # redirects. Confirm whether a redirect-aware session or a .netrc
            # entry is needed here.
            resp = session.get(url, timeout=300)
            resp.raise_for_status()
        # The whole file is held in memory; h5netcdf can read netCDF4 data
        # from a file-like object.
        ds = xr.open_dataset(
            io.BytesIO(resp.content), engine='h5netcdf', decode_times=True
        )
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this file.
        print(f"Error opening {url}: {e}")
        return None

    # The context manager closes the dataset once the data has been copied
    # into the DataFrame.
    with ds:
        try:
            ds_sel = ds[variables] if variables else ds
        except KeyError as e:
            print(f"Variable not found in {url}: {e}")
            return None
        return ds_sel.to_dataframe().reset_index()

# -----------------------
# 5. Interactive selection
# -----------------------
years = input("Enter years to process (comma-separated, e.g., 2022,2023,2024): ")
# Empty tokens (blank input, trailing or doubled commas) are dropped so they
# do not turn into a listing request for an empty year.
years = [y.strip() for y in years.split(',') if y.strip()]

# int() raises ValueError on non-numeric input.
num_files = int(input("Enter number of files to process per year: "))
if num_files < 0:
    # A negative count would slice from the end of the listing
    # (links[:-n]) instead of limiting how many files are processed.
    raise ValueError("Number of files to process per year must be non-negative")

variables_to_extract = input(
    "Enter variables to extract (comma-separated, e.g., xco2,latitude,longitude,time): "
)
# Dropping empty tokens makes a blank answer yield an empty list rather than
# [''], which is truthy and would be treated as a request for a variable
# named ''. An empty list means "all variables" in process_nc4_url.
variables_to_extract = [
    v.strip() for v in variables_to_extract.split(',') if v.strip()
]

# -----------------------
# 6. Loop over years and files
# -----------------------
for year in years:
    print(f"\nListing files for {year} ...")
    links = list_files_for_year(year)

    # Only years with a non-empty listing are processed.
    if links:
        # Limit the work to the first num_files entries of the listing.
        selected_files = links[:num_files]
        progress = tqdm(selected_files, desc=f"Processing {year}")

        for fname in progress:
            file_url = f"https://oco2.gesdisc.eosdis.nasa.gov/data/OCO3_DATA/OCO3_L2_Lite_FP.11r/{year}/{fname}"
            df = process_nc4_url(file_url, variables=variables_to_extract)
            if df is None:
                # The file could not be converted; move on to the next one.
                continue

            # One CSV per input file, named after the file without its suffix.
            csv_path = output_folder / f"{Path(fname).stem}.csv"
            df.to_csv(csv_path, index=False)
