<a href="https://colab.research.google.com/github/heytian/d2d-oco3-tools/blob/main/nc4_plot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only) so that the /content/drive/... paths used for
# DATA_DIR and OUTPUT_DIR below resolve.
from google.colab import drive
drive.mount('/content/drive')

import os
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py
from ipywidgets import interact, IntSlider, SelectMultiple
from tqdm import tqdm

# Base folder containing .nc4 files (replace with your own google drive shortcut to D2D shared drive)
DATA_DIR = "/content/drive/MyDrive/Shortcuts/DATA/2019-CO2-netcdfs"
OUTPUT_DIR = "/content/drive/MyDrive/Shortcuts/DATA/output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Discover years and files.
# os.listdir() returns entries in arbitrary order, so sort the names: this keeps
# files[0] (used later as the sample file) and the "first n files" per-year
# selections identical from one run to the next.
files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith(".nc4"))
# Two-digit year taken from the third underscore-separated field of each name
# (e.g. '24' from '240716').
years = sorted({f.split('_')[2][:2] for f in files})
print(f"Found {len(files)} total NC4 files.")


In [None]:
# # Data exploration - list every group and dataset path inside one NetCDF file.
# # NOTE: `sample_file` is defined in a later cell; run that cell before uncommenting this one.

# def print_hdf5_structure(name, obj):
#     print(name)

# with h5py.File(sample_file, "r") as f:
#     f.visititems(print_hdf5_structure)

In [None]:
# Display sample files per year
# Bucket the discovered file names by four-digit year.
# The year comes from the first two characters of the third '_'-separated
# field of the file name, prefixed with "20".
files_by_year = {}
for f in files:
    year = "20" + f.split('_')[2][:2]
    if year not in files_by_year:
        files_by_year[year] = []
    files_by_year[year].append(f)

available_years = sorted(files_by_year)
print("Available years:", available_years)

# Let user choose interactively
@interact(
    year=available_years,
    n_files=IntSlider(value=3, min=1, max=10, step=1, description='Files to process'),
)
def choose_year(year, n_files):
    """Widget callback: pick the first `n_files` files of `year` for processing.

    Prints the selection and publishes it as the module-level name
    SELECTED_FILES so that later cells can read it.

    Parameters
    ----------
    year : str
        A key of `files_by_year` (four-digit year string).
    n_files : int
        Maximum number of files to take from the start of that year's list.
    """
    global SELECTED_FILES

    selection = files_by_year[year][:n_files]
    print(f"Will process {len(selection)} files from {year}:")
    for fname in selection:
        print(" -", fname)
    SELECTED_FILES = selection


In [None]:
# Function to recursively collect dataset paths in an HDF5 file
def collect_datasets(h5file):
    """Return the paths of all datasets reachable from `h5file`.

    Parameters
    ----------
    h5file : object exposing `visititems(callback)` (e.g. an open h5py.File)
        The callback is invoked as `callback(name, obj)` for each visited item.

    Returns
    -------
    list of str
        Names of the visited items that are `h5py.Dataset` instances, in
        visit order. Other items (such as groups) are skipped.
    """
    found = []

    def _keep_if_dataset(path, node):
        # Falls through with an implicit None in both cases, which lets the
        # traversal continue to the next item.
        if not isinstance(node, h5py.Dataset):
            return
        found.append(path)

    h5file.visititems(_keep_if_dataset)
    return found

# First, pick a sample file to list variables.
# Fail with an actionable message instead of a bare IndexError when no .nc4
# files were discovered in DATA_DIR.
if not files:
    raise FileNotFoundError(
        f"No .nc4 files found in {DATA_DIR}; cannot pick a sample file."
    )
sample_file = os.path.join(DATA_DIR, files[0])
with h5py.File(sample_file, 'r') as f:
    all_datasets = collect_datasets(f)

# Let user select variables interactively

# Variables highlighted by default in the picker below; entries that are not
# present in the sample file's dataset list are dropped from the default.
preselected_vars = [
    'Sounding/target_id',
    'Sounding/operation_mode',
    'xco2',
    'xco2_quality_flag',
    'latitude',
    'longitude',
    'time',
]

@interact(
    variables=SelectMultiple(
        options=all_datasets,
        value=[name for name in preselected_vars if name in all_datasets],
        rows=15,
        description='Variables',
        layout={'width': 'max-content'},
    )
)
def choose_variables(variables):
    """Widget callback: echo the chosen dataset paths and remember them.

    The selection is published as the module-level name SELECTED_VARS so that
    later cells can read it.

    Parameters
    ----------
    variables : iterable of str
        Dataset paths currently selected in the widget.
    """
    global SELECTED_VARS

    print("Selected variables for CSV extraction:")
    for selected in variables:
        print(" -", selected)
    SELECTED_VARS = variables


**Note:** Do not select "date" as one of the variables; "time" alone already gives the date and time. "date" splits each day into 7+ rows (year, month, day, time, etc.), with each component becoming its own row.

In [None]:
# # Let user preview variables in the first file
# sample_file = os.path.join(DATA_DIR, files[0])
# ds = xr.open_dataset(sample_file)
# vars_list = list(ds.variables)
# print(f"Variables found in sample file ({len(vars_list)}):")
# print(vars_list[:20])  # show a subset

# @interact
# def choose_vars(
#     csv_vars=SelectMultiple(options=vars_list, value=("xco2", "latitude", "longitude", "time"), description="CSV vars"),
#     geo_var=SelectMultiple(options=vars_list, value=("xco2",), description="Geo var")
# ):
#     globals()['CSV_VARS'] = list(csv_vars)
#     globals()['GEO_VARS'] = list(geo_var)[0]
#     print(f"Selected CSV vars: {CSV_VARS}")
#     print(f"Selected Geo var: {GEO_VARS}")


In [None]:
# CSV file extraction

def _dataset_to_columns(column_name, values):
    """Turn the values read from one dataset into 1-D, CSV-friendly columns.

    - Byte strings are decoded to `str` so the CSV does not contain b'...' reprs.
    - Arrays with more than one dimension are flattened along every axis but
      the first and split into `<column_name>_<i>` columns, so each row of the
      output still corresponds to one entry along the first axis.

    Returns a dict mapping column name -> column data.
    """
    arr = np.asarray(values)

    if arr.dtype.kind in ('S', 'O'):
        # Fill element by element so nested objects are stored as-is.
        decoded = np.empty(arr.size, dtype=object)
        for i, item in enumerate(arr.ravel()):
            if isinstance(item, bytes):
                item = item.decode('utf-8', errors='replace')
            decoded[i] = item
        arr = decoded.reshape(arr.shape)

    if arr.ndim <= 1:
        return {column_name: arr}

    flat = arr.reshape(arr.shape[0], -1)
    return {f"{column_name}_{i}": flat[:, i] for i in range(flat.shape[1])}


def process_nc4_file(local_path, variables):
    """Extract the requested datasets from one .nc4 file into a CSV in OUTPUT_DIR.

    Parameters
    ----------
    local_path : str
        Path of the NetCDF4/HDF5 file to read.
    variables : iterable of str
        Dataset paths to extract. A path that is not a dataset in the file is
        looked up by its last path component anywhere in the file; if nothing
        matches, a warning is printed and the variable is skipped.

    Output
    ------
    Writes `<OUTPUT_DIR>/<file name with its extension replaced by .csv>`.
    Column names are the dataset paths with '/' replaced by '_'. Columns whose
    name ends with 'operation_mode' are mapped from integer codes to short
    string labels ('UNK' for unmapped codes).

    Any exception raised while reading or writing is caught and reported with
    a printed message, so that one bad file does not abort a batch run.
    Returns None.
    """
    filename = os.path.basename(local_path)
    # splitext swaps only the final extension; str.replace would also rewrite
    # any ".nc4" that happens to occur earlier in the file name.
    csv_name = os.path.join(OUTPUT_DIR, os.path.splitext(filename)[0] + ".csv")

    try:
        with h5py.File(local_path, 'r') as f:
            out_dict = {}
            for var in variables:
                # Direct access
                node = f.get(var)
                if isinstance(node, h5py.Dataset):
                    out_dict.update(
                        _dataset_to_columns(var.replace('/', '_'), node[...])
                    )
                    continue

                # Search for nested datasets whose last path component equals
                # the last component of var. An exact comparison is used:
                # a suffix test would also match e.g. 'solar_time' for 'time'.
                var_name = var.split('/')[-1]
                matches = []

                def find_dataset(name, obj):
                    if isinstance(obj, h5py.Dataset) and name.split('/')[-1] == var_name:
                        matches.append((name, obj[...]))

                f.visititems(find_dataset)
                if not matches:
                    print(f"Warning: variable {var} not found in {filename}")
                for name, values in matches:
                    out_dict.update(
                        _dataset_to_columns(name.replace('/', '_'), values)
                    )

            # Map operation_mode integers to strings if present
            mapping = {0: 'ND', 1: 'GL', 2: 'TG', 3: 'XS', 4: 'AM'}
            for k in [k for k in out_dict if k.endswith('operation_mode')]:
                out_dict[k] = [mapping.get(int(v), 'UNK') for v in out_dict[k]]

            df = pd.DataFrame(out_dict)
            df.to_csv(csv_name, index=False)
            print(f"Saved CSV: {csv_name}")

    except Exception as e:
        # Deliberate best effort: report the failure and let the caller move on.
        print(f"Failed {local_path}: {e}")


In [None]:
def run_all():
    """Convert every file in SELECTED_FILES to CSV, showing a progress bar.

    Reads the module-level SELECTED_FILES and SELECTED_VARS names and calls
    process_nc4_file once per file, with the file name joined onto DATA_DIR.
    """
    progress = tqdm(SELECTED_FILES, desc=f"Processing {len(SELECTED_FILES)} files")
    for fname in progress:
        process_nc4_file(os.path.join(DATA_DIR, fname), SELECTED_VARS)

run_all()
