In [None]:
import xarray as xr
import numpy as np
import glob
import os
from xarray.coding.times import CFDatetimeCoder

In [None]:
def get_MIP(experiment):
    """
    Return the CMIP6 activity (MIP) name that a given experiment belongs to.

    Exact experiment names are matched first, then name prefixes; any
    experiment that matches neither is reported as 'CMIP'.
    """
    # Experiments whose activity differs from what their prefix would suggest.
    exact_matches = (
        ('ssp245-covid', 'DAMIP'),
        ('ssp370-lowNTCF', 'AerChemMIP'),
    )
    for name, activity in exact_matches:
        if experiment == name:
            return activity

    # Prefix rules, checked in order.
    prefix_rules = (
        ('ssp', 'ScenarioMIP'),
        ('hist-', 'DAMIP'),
    )
    for prefix, activity in prefix_rules:
        if experiment.startswith(prefix):
            return activity

    return 'CMIP'

In [None]:
def get_data_Amon(variable, experiment, member):
    """
    Read a monthly (Amon table) CMIP6 variable from NorESM2-LM.

    The known dataset versions are tried newest first and the first one that
    contains any ``*.nc`` file is used. Times are decoded with cftime
    (``CFDatetimeCoder(use_cftime=True)``) for compatibility with
    non-standard calendars.

    Parameters
    ----------
    variable : str
        CMIP6 variable name, e.g. 'tas' or 'pr'.
    experiment : str
        CMIP6 experiment name, e.g. 'historical' or 'ssp245'.
    member : str
        Ensemble member label, e.g. 'r1i1p1f1'.

    Returns
    -------
    xarray.DataArray or int
        The requested variable opened from all matching files, or the
        sentinel ``-1`` when no files exist for any known version
        (check with ``is_missing``).
    """
    base = f"/glade/collections/cmip/CMIP6/{get_MIP(experiment)}/NCC/NorESM2-LM/{experiment}/{member}/Amon/{variable}/gn"

    # Version directories, newest first; stop at the first one that has files.
    files = []
    for version in ("v20210118", "v20191108", "v20190815"):
        files = glob.glob(f"{base}/{version}/{variable}/*.nc")
        if files:
            break

    if not files:
        return -1

    time_coder = CFDatetimeCoder(use_cftime=True)
    return xr.open_mfdataset(files, decode_times=time_coder, combine='by_coords')[variable]


def get_data_day(variable, experiment, member):
    """
    Open a daily ('day' table) CMIP6 variable from NorESM2-LM for one
    experiment and ensemble member.

    Version directories are searched newest first and the first one holding
    any NetCDF file is used. Time values are decoded through cftime so that
    non-standard calendars work. Returns -1 when nothing is found.
    """
    root = (
        f"/glade/collections/cmip/CMIP6/{get_MIP(experiment)}/NCC/NorESM2-LM/"
        f"{experiment}/{member}/day/{variable}/gn"
    )

    matches = []
    for version in ("v20210118", "v20191108", "v20190815"):
        matches = glob.glob(f"{root}/{version}/{variable}/*.nc")
        if matches:
            break

    if not matches:
        return -1

    opened = xr.open_mfdataset(
        matches,
        decode_times=CFDatetimeCoder(use_cftime=True),
        combine='by_coords',
    )
    return opened[variable]


def is_missing(var):
    """Tell whether ``var`` is the integer sentinel -1 rather than loaded data."""
    if not isinstance(var, int):
        return False
    return var == -1


# def get_data_Amon(variable, experiment, member):
#     """
#     Read CMIP6 variable, handling overlapping or unsorted time coordinates.
#     This is an alternative to xarray's open_mfdataset for cases where time
#     coordinates are not monotonic or have overlaps. (only ssp585)
#     """

#     base = f"/glade/collections/cmip/CMIP6/{get_MIP(experiment)}/NCC/NorESM2-LM/{experiment}/{member}/Amon/{variable}/gn"
#     versions = ["v20210118", "v20191108", "v20190815"]

#     files = []
#     for v in versions:
#         files = glob.glob(f"{base}/{v}/{variable}/*.nc")
#         if files:
#             break

#     if not files:
#         print(f"No files found for {variable}, {experiment}, {member}")
#         return -1

#     time_coder = CFDatetimeCoder(use_cftime=True)

#     # Open files one at a time, sort and fix time
#     datasets = []
#     for f in sorted(files):
#         try:
#             ds = xr.open_dataset(f, decode_times=time_coder)
#             ds = ds.sortby("time")  # ensure monotonic
#             datasets.append(ds)
#         except Exception as e:
#             print(f"⚠️ Skipping {f}: {e}")

#     combined = xr.concat(datasets, dim="time")

#     # Drop any duplicate timestamps (common issue)
#     _, index = np.unique(combined["time"], return_index=True)
#     combined = combined.isel(time=index)

#     return combined[variable]

In [None]:
def process(experiment, member, model="NorESM2-LM", output_dir="./processed"):
    """
    Process daily/monthly data for one CMIP6 experiment + ensemble member.

    Loads daily tasmin/tasmax and monthly tas/pr, reduces each to one value
    per year, and saves whatever could be derived to a single NetCDF file
    ``{output_dir}/{model}_{experiment}_{member}.nc``:

    - ``diurnal_temperature_range``: annual mean of ``tasmax - tasmin``
      (only when both daily variables are available)
    - ``tas``: annual mean
    - ``pr``: annual mean
    - ``pr90``: 0.9 quantile of the monthly ``pr`` values within each year

    Parameters
    ----------
    experiment : str
        CMIP6 experiment name, e.g. 'historical'.
    member : str
        Ensemble member label, e.g. 'r1i1p1f1'.
    model : str
        Only used in the output file name; the loaders always read NorESM2-LM.
    output_dir : str
        Directory for the output file; created if it does not exist.

    Returns
    -------
    None
        Prints "skip" and writes nothing when no output variable can be derived.
    """
    os.makedirs(output_dir, exist_ok=True)
    outfile = os.path.join(output_dir, f"{model}_{experiment}_{member}.nc")

    print(f"\n Processing {model} | {experiment} | {member}")

    tasmin = get_data_day('tasmin', experiment, member)
    tasmax = get_data_day('tasmax', experiment, member)
    tas = get_data_Amon('tas', experiment, member)
    pr = get_data_Amon('pr', experiment, member)

    data_vars = {}

    # The diurnal range needs both daily extremes.
    if not (is_missing(tasmin) or is_missing(tasmax)):
        dtr = tasmax - tasmin
        data_vars['diurnal_temperature_range'] = dtr.groupby('time.year').mean('time')

    if not is_missing(tas):
        data_vars['tas'] = tas.groupby('time.year').mean('time')

    if not is_missing(pr):
        # quantile() needs the whole time axis in a single chunk.
        # persist() returns a NEW object (it does not modify pr in place), so
        # the result must be kept; pr then feeds both the mean and the
        # quantile without the input files being read twice.
        pr = pr.chunk({'time': -1}).persist()
        data_vars['pr'] = pr.groupby('time.year').mean('time')
        data_vars['pr90'] = pr.groupby('time.year').quantile(0.9, skipna=True)

    # Nothing derivable (e.g. all inputs missing, or only one of
    # tasmin/tasmax present): do not write an empty file.
    if not data_vars:
        print("skip")
        return

    ds = xr.Dataset(data_vars)
    ds.to_netcdf(outfile)
    print(f"Saved processed data to {outfile}")


def main():
    """
    Main processing loop over experiments and ensemble members.

    Calls ``process`` for the first three realizations (r1, r2, r3) of every
    experiment in the list below; each call writes one NetCDF file.
    """
    experiments = [
        'ssp126', 'ssp245', 'ssp370',  # ScenarioMIP skip 'ssp585'
        'ssp370-lowNTCF',  # AerChemMIP
        'historical', 'abrupt-4xCO2', '1pctCO2', 'piControl',  # CMIP6
        'hist-GHG'  # DAMIP skips 'hist-aer'
    ]
    # To process ssp585 on its own instead, use: experiments = ['ssp585']

    n_members = 3  # realizations r1..r3

    for experiment in experiments:
        # In a member label r<k>i<l>p<m>f<n> the trailing index is the forcing
        # index. 'ssp245-covid' uses f2; everything else uses f1.
        # ('ssp245-covid' is not in the list above at the moment.)
        forcing_index = 2 if experiment == 'ssp245-covid' else 1
        for realization in range(1, n_members + 1):
            member = f"r{realization}i1p1f{forcing_index}"
            process(experiment, member)

    print("\n🎉 All preprocessing complete.")

In [None]:
# Run the full preprocessing loop defined in main().
# This can take 30-60 minutes.
main()

Testing and troubleshooting: scratch cells used while debugging the problems hit above.

In [None]:
def get_data(variable, experiment, member):
    """
    Scratch loader for a monthly (Amon) NorESM2-LM variable.

    Looks through the known version directories, newest first, and opens the
    files of the first version that has any, decoding times with cftime.
    Returns -1 if no version contains files.
    """
    base = (
        f"/glade/collections/cmip/CMIP6/{get_MIP(experiment)}/NCC/NorESM2-LM/"
        f"{experiment}/{member}/Amon/{variable}/gn"
    )
    versions = ("v20210118", "v20191108", "v20190815")

    # Lazily glob each version and keep the first non-empty result.
    per_version = (glob.glob(f"{base}/{v}/{variable}/*.nc") for v in versions)
    files = next((found for found in per_version if found), None)
    if files is None:
        return -1

    coder = CFDatetimeCoder(use_cftime=True)
    dataset = xr.open_mfdataset(files, decode_times=coder, combine='by_coords')
    return dataset[variable]

In [None]:
# Scratch check: annual-mean tas for a single experiment/member, saved to a test file.
tas = get_data('tas', 'ssp126', 'r1i1p1f1')
tas_annual = tas.groupby('time.year').mean('time')
ds = xr.Dataset({'tas': tas_annual})

scratch_dir = "./processed"
os.makedirs(scratch_dir, exist_ok=True)
outfile = os.path.join(scratch_dir, "yay.nc")
ds.to_netcdf(outfile)
print('done')

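^ The interesting bit is that this file is half the size of the main processed file and runs in about 1/5 of the time. I'm not too sure why; it is likely because the main pipeline also loads daily tasmin/tasmax and writes pr, pr90 and the diurnal temperature range, whereas this cell only handles tas.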

In [None]:
# Scratch check: annual mean and 0.9 quantile of pr for a single experiment/member.
pr = get_data('pr', 'ssp245', 'r1i1p1f1')
pr = pr.chunk({'time': -1})  # one chunk along time before taking the quantile
pr_by_year = pr.groupby('time.year')
ds = xr.Dataset({
    'pr': pr_by_year.mean('time'),
    'pr90': pr_by_year.quantile(0.9, skipna=True),
})

scratch_dir = "./processed"
os.makedirs(scratch_dir, exist_ok=True)
outfile = os.path.join(scratch_dir, "yay1.nc")
ds.to_netcdf(outfile)
print('done')

^ I was having trouble with encoding

In [None]:
# Try loading ssp585, the scenario left out of main().
# NOTE(review): presumably fails because the ssp585 files have overlapping or
# unsorted time coordinates (see the commented-out loader above) — confirm.
tas1 = get_data('tas', 'ssp585', 'r1i1p1f1')
tas1

^ I think this raises an error, possibly because the ssp585 files have overlapping or unsorted time coordinates.