<a href="https://colab.research.google.com/github/heytian/d2d-oco3-tools/blob/main/nc4_plot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. NC4 Files to CSV**

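This tool filters a folder of multiple nc4 files for high-quality SAM soundings (operation mode `AM` with `xco2_quality_flag == 0`) and aggregates them into a single CSV containing one CO2 value per SAM location, i.e. per `target_name` (taking the median CO2 of its soundings). The lat/lon reported for each location is the center point (mean latitude and mean longitude) of all the soundings with the same `target_name`.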

In [None]:
import os
import h5py
import numpy as np
import pandas as pd
from tqdm import tqdm
from ipywidgets import interact, IntSlider

# -----------------------------
# PATHS
# -----------------------------
# Input folder holding the .nc4 files, and the folder the CSV output goes to.
DATA_DIR = "/content/drive/MyDrive/Shortcuts/DATA/2024-CO2-netcdfs"
OUTPUT_DIR = "/content/drive/MyDrive/Shortcuts/DATA/output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Discover NC4 files (alphabetical order, names only -- no directory prefix)
files = [entry for entry in os.listdir(DATA_DIR) if entry.endswith('.nc4')]
files.sort()
print(f"Found {len(files)} NC4 files.")

# -----------------------------
# INTERACTIVE FILE SELECTION
# -----------------------------
@interact(
    n_files=IntSlider(
        min=1,
        max=len(files),
        step=1,
        value=3,
        description='Files to process',
    )
)
def select_files(n_files):
    """Slider callback: keep the first ``n_files`` discovered files.

    The chosen names are stored in the module-level ``SELECTED_FILES`` list
    and echoed, one per line, so the selection is visible under the slider.
    """
    global SELECTED_FILES
    picked = files[:n_files]
    SELECTED_FILES = picked
    print(f"Selected {len(picked)} files:")
    for fname in picked:
        print(f" - {fname}")

# -----------------------------
# READ AND FILTER ONE FILE
# -----------------------------
def read_filter_file(path):
    """Read one NC4 file and return only its good-quality SAM soundings.

    Parameters
    ----------
    path : str
        Path of the NC4 (HDF5) file to open.

    Returns
    -------
    numpy.ndarray
        Structured array with the fields ``sounding_id``, ``xco2``,
        ``operation_mode``, ``xco2_quality_flag``, ``target_name``,
        ``latitude``, ``longitude`` and an appended ``datetime`` field.
        Only rows with ``operation_mode == b'AM'`` and
        ``xco2_quality_flag == 0`` are kept.
    """
    # Import the submodule explicitly: `import numpy` alone does not guarantee
    # that `np.lib.recfunctions` is loaded, so relying on the attribute only
    # works if some other code happened to import it first.
    import numpy.lib.recfunctions as rfn

    with h5py.File(path, 'r') as f:
        ns = len(f['sounding_id'])
        data = np.zeros(ns, dtype=[('sounding_id','int64'),
                                   ('xco2','f8'),
                                   ('operation_mode','S2'),
                                   ('xco2_quality_flag','i1'),
                                   ('target_name','S100'),
                                   ('latitude','f8'),
                                   ('longitude','f8')])
        # Direct extraction
        data['sounding_id'] = f['sounding_id'][...]
        data['xco2'] = f['xco2'][...]
        # operation_mode is stored as an integer code; translate it to the
        # two-letter label. Unmapped codes become b'UNK', which the 2-byte
        # field truncates -- harmless, since only b'AM' rows survive below.
        op = f['Sounding/operation_mode'][...]
        mapping = {0:b'ND',1:b'GL',2:b'TG',3:b'XS',4:b'AM'}
        data['operation_mode'] = np.array([mapping.get(int(v), b'UNK') for v in op])
        data['xco2_quality_flag'] = f['xco2_quality_flag'][...]
        data['target_name'] = f['Sounding/target_name'][...]
        data['latitude'] = f['latitude'][...]
        data['longitude'] = f['longitude'][...]

    # Filter: AM + quality_flag 0
    mask = (data['operation_mode'] == b'AM') & (data['xco2_quality_flag'] == 0)
    data = data[mask]

    # Add datetime (optional, per sounding): the first 14 digits of the
    # sounding_id are parsed as YYYYMMDDHHMMSS.
    dt_strings = np.array([str(s)[:14] for s in data['sounding_id']])
    data = rfn.append_fields(data, 'datetime',
                             pd.to_datetime(dt_strings, format='%Y%m%d%H%M%S'),
                             usemask=False)
    return data

# -----------------------------
# COMBINE FILES INTO ONE CSV
# -----------------------------
def combine_files(output_csv="combined_median.csv"):
    """Aggregate all selected NC4 files into one CSV with a row per target.

    Every file in ``SELECTED_FILES`` is read and filtered with
    ``read_filter_file``; the results are concatenated and grouped by
    ``target_name``. Each group contributes the median ``xco2``, the mean
    ``latitude``/``longitude`` and the first ``operation_mode`` and
    ``xco2_quality_flag`` value.

    Parameters
    ----------
    output_csv : str
        File name (inside ``OUTPUT_DIR``) of the CSV to write.

    Returns
    -------
    None
        The result is written to disk; nothing is written when there are no
        selected files.
    """
    # Keep this import: it loads the numpy.lib.recfunctions submodule so that
    # attribute access via `np.lib.recfunctions` resolves.
    import numpy.lib.recfunctions as rfn  # noqa: F401

    all_data = [
        read_filter_file(os.path.join(DATA_DIR, f))
        for f in tqdm(SELECTED_FILES, desc=f"Processing {len(SELECTED_FILES)} files")
    ]

    if not all_data:
        print("No data to combine!")
        return

    combined = np.concatenate(all_data)

    # Convert to DataFrame
    df = pd.DataFrame(combined)

    # The structured array stores text fields as byte strings. Decode them so
    # the CSV contains e.g. AM rather than the Python repr b'AM'.
    for col in ('target_name', 'operation_mode'):
        df[col] = df[col].map(
            lambda v: v.decode('utf-8', errors='replace') if isinstance(v, bytes) else v
        )

    # Aggregate by target_name
    agg_df = df.groupby('target_name', as_index=False).agg({
        'xco2': 'median',
        'latitude': 'mean',
        'longitude': 'mean',
        'operation_mode': 'first',    # keep a representative value
        'xco2_quality_flag': 'first'  # keep a representative value
    })

    # Save CSV
    out_path = os.path.join(OUTPUT_DIR, output_csv)
    agg_df.to_csv(out_path, index=False)
    print(f"Saved combined CSV: {out_path}")


In [None]:
# After choosing number of files in the slider above, run this.
# It aggregates the files currently held in SELECTED_FILES and writes the
# default "combined_median.csv" into OUTPUT_DIR.

combine_files()


# **2. Data Exploration of NC4 File Variables**

This is a tool to interactively select variables in a SINGLE nc4 file. It lets you preview every variable in the file, including those nested in subgroups ("hidden subfolders"), and export the selected variables to a CSV for further processing. It is intended more for **data exploration** than for data processing, since the individual CSVs produced here still have large file sizes.

In [None]:
# Mount Google Drive at /content/drive; DATA_DIR and OUTPUT_DIR below live
# under that mount point. force_remount=True is passed on every run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py
from ipywidgets import interact, IntSlider, SelectMultiple
from tqdm import tqdm

# Base folder containing .nc4 files (replace with your own google drive shortcut to D2D shared drive)
DATA_DIR = "/content/drive/MyDrive/Shortcuts/DATA/2025-CO2-netcdfs"
OUTPUT_DIR = "/content/drive/MyDrive/Shortcuts/DATA/output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Discover years and files
files = [entry for entry in os.listdir(DATA_DIR) if entry.endswith(".nc4")]
# Two-digit year prefix of the third '_'-separated filename token,
# e.g. extract '24' from '240716'; duplicates collapse through the set.
years = sorted({entry.split('_')[2][:2] for entry in files})
print(f"Found {len(files)} total NC4 files.")


In [None]:
# Display sample files per year
# Bucket the discovered files under a four-digit year key ("20" + the
# two-digit prefix of the third '_'-separated filename token).
files_by_year = {}
for f in files:
    year = "20" + f.split('_')[2][:2]
    if year not in files_by_year:
        files_by_year[year] = []
    files_by_year[year].append(f)

available_years = sorted(files_by_year)
print("Available years:", available_years)

# Let user choose interactively
@interact(
    year=available_years,
    n_files=IntSlider(description='Files to process', min=1, max=10, step=1, value=3),
)
def choose_year(year, n_files):
    """Widget callback: select the first ``n_files`` files of ``year``.

    Prints the selection and stores it in the module-level
    ``SELECTED_FILES`` list.
    """
    global SELECTED_FILES
    subset = files_by_year[year][:n_files]
    print(f"Will process {len(subset)} files from {year}:")
    for fname in subset:
        print(f" - {fname}")
    SELECTED_FILES = subset


In [None]:
# Function to recursively collect dataset paths in an HDF5 file
def collect_datasets(h5file):
    """Return the path of every dataset reachable from ``h5file``.

    ``h5file.visititems`` is used to walk the file; entries that are not
    ``h5py.Dataset`` instances are skipped. Paths are returned in visit order.
    """
    found = []

    def _keep_if_dataset(path, node):
        # Returning None tells visititems to keep walking.
        if not isinstance(node, h5py.Dataset):
            return None
        found.append(path)
        return None

    h5file.visititems(_keep_if_dataset)
    return found

# Variables offered as the default selection in the variable picker
preselected_vars = [
    'sounding_id',
    'Sounding/operation_mode',
    'xco2',
    'xco2_quality_flag',
    'latitude',
    'longitude',
    'Sounding/target_name',
]

# First, pick a sample file to list variables
sample_file = os.path.join(DATA_DIR, SELECTED_FILES[0])  # test 1 file
# sample_file = os.path.join(DATA_DIR, files[0]) # all files
with h5py.File(sample_file, 'r') as f:
    all_datasets = collect_datasets(f)

# Let user select variables interactively

@interact(
    variables=SelectMultiple(
        options=all_datasets,
        value=[name for name in preselected_vars if name in all_datasets],
        description='Variables',
        layout={'width': 'max-content'},
        rows=15,
    )
)
def choose_variables(variables):
    """Widget callback: remember which dataset paths to export.

    Prints the current selection and stores it in the module-level
    ``SELECTED_VARS``.
    """
    global SELECTED_VARS
    print("Selected variables for CSV extraction:")
    for var_path in variables:
        print(f" - {var_path}")
    SELECTED_VARS = variables


In [None]:
# Get time from sounding_id and filter functions for good quality SAMs

def filter_and_extract_time(df):
    """
    Filter the DataFrame for SAM/AM mode and good quality, and extract datetime from sounding_id.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sounding_id``, ``xco2_quality_flag`` and an operation
        mode column named ``sounding_operation_mode`` in any letter case
        (the CSV extraction in this notebook produces
        ``Sounding_operation_mode``). Operation mode values are expected to
        be the string labels, e.g. ``'AM'``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only rows with operation mode ``'AM'`` and
        ``xco2_quality_flag == 0``, with ``sounding_id`` converted to string
        and a ``datetime`` column (minute resolution) appended. The input
        frame is not modified.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # Ensure sounding_id is string
    df['sounding_id'] = df['sounding_id'].astype(str)

    # Extract datetime from the first 12 digits of sounding_id (YYYYMMDDHHMM)
    df['datetime'] = pd.to_datetime(df['sounding_id'].str[:12], format='%Y%m%d%H%M')

    # Locate the operation mode column regardless of letter case.
    mode_col = next(
        (c for c in df.columns if str(c).lower() == 'sounding_operation_mode'),
        None,
    )
    if mode_col is None:
        raise KeyError('sounding_operation_mode')

    # Keep SAM (AM) soundings with a good quality flag only.
    keep = (df[mode_col] == 'AM') & (df['xco2_quality_flag'] == 0)
    return df[keep]

In [None]:
# CSV file extraction

def process_nc4_file(local_path, variables): # all files
    """Export the selected datasets of one NC4 file to a CSV in OUTPUT_DIR.

    Parameters
    ----------
    local_path : str
        Path of the NC4 (HDF5) file to read.
    variables : iterable of str
        Dataset paths to extract. A path that is not present as given is
        looked up by its last component anywhere in the file.

    Behaviour
    ---------
    * Column names are the dataset paths with ``/`` replaced by ``_``.
    * An operation mode column, if extracted, is translated from integer
      codes to labels (0 ND, 1 GL, 2 TG, 3 XS, 4 AM; anything else UNK).
    * When both the operation mode column and ``xco2_quality_flag`` are
      present, only rows with mode ``AM`` and quality flag 0 are kept.
    * When ``sounding_id`` is present, a ``datetime`` string column
      (``YYYY-MM-DD HH:MM:00``) is derived from its leading digits.
    * The CSV is named after the input file with ``.nc4`` -> ``.csv``.

    Any exception is caught and reported with a printed message so that one
    bad file does not stop a batch run; nothing is returned.
    """
    filename = os.path.basename(local_path)
    csv_name = os.path.join(OUTPUT_DIR, filename.replace(".nc4", ".csv"))

    try:
        with h5py.File(local_path, 'r') as f:
            out_dict = {}
            for var in variables:
                # Direct access
                if var in f:
                    out_dict[var.replace('/', '_')] = f[var][...]
                else:
                    # Search for nested dataset ending with last component of var
                    var_name = var.split('/')[-1]
                    found = False
                    def find_dataset(name, obj):
                        nonlocal found
                        if isinstance(obj, h5py.Dataset) and name.endswith(var_name):
                            out_dict[name.replace('/', '_')] = obj[...]
                            found = True
                    f.visititems(find_dataset)
                    if not found:
                        print(f"Warning: variable {var} not found in {filename}")

            # Map Sounding/operation_mode integers to strings if present.
            # Remember the actual column name: it keeps the group's letter
            # case (e.g. 'Sounding_operation_mode'), so the filter below must
            # not assume a hard-coded lowercase name.
            key_opmode = [k for k in out_dict if k.endswith('operation_mode')]
            opmode_col = key_opmode[0] if key_opmode else None
            if opmode_col is not None:
                mapping = {0: 'ND', 1: 'GL', 2: 'TG', 3: 'XS', 4: 'AM'}
                out_dict[opmode_col] = [mapping.get(int(v), 'UNK') for v in out_dict[opmode_col]]

            df = pd.DataFrame(out_dict)

            # Filter for SAM mode (AM) and good quality
            if opmode_col is not None and 'xco2_quality_flag' in df.columns:
                keep = (df[opmode_col] == 'AM') & (df['xco2_quality_flag'] == 0)
                # .copy() so the column assignment below acts on an
                # independent frame rather than a slice of the original.
                df = df[keep].copy()

            # Extract datetime: sounding_id starts with YYYYMMDDHHMM, so the
            # year is the first four digits (not a two-digit year).
            if 'sounding_id' in df.columns:
                df['datetime'] = df['sounding_id'].astype(str).apply(
                    lambda x: f"{x[:4]}-{x[4:6]}-{x[6:8]} {x[8:10]}:{x[10:12]}:00"
                )

            df.to_csv(csv_name, index=False)
            print(f"Saved CSV: {csv_name}")

    except Exception as e:
        print(f"Failed {local_path}: {e}")


In [None]:
def run_all():
    """Export every file in ``SELECTED_FILES`` to CSV, with a progress bar.

    Each file name is resolved against ``DATA_DIR`` and handed to
    ``process_nc4_file`` together with the currently selected variables.
    """
    progress = tqdm(SELECTED_FILES, desc=f"Processing {len(SELECTED_FILES)} files")
    for fname in progress:
        process_nc4_file(os.path.join(DATA_DIR, fname), SELECTED_VARS)

run_all()
