# Save out o2_sat, aou, o2 + metrics for all models



This is honestly a bit overkill. I could integrate this into a major preprocessing script, which is run per source_id and does:
- Diagnoses volume
- Adds o2sat + aou
- Averages annually
    - Write out volume/export values for this, since we want to show timeseries.
- Computes historical + linear trend for all variables
    - write out these results
- Reload and regrid the output for plotting purposes


## Notes

- 2022/2/23 Ran for all members of the Pacific OMZ study

In [1]:
%load_ext autoreload
%autoreload 2

import cf_xarray
import intake
import xarray as xr
import numpy as np

from cmip6_preprocessing.utils import cmip6_dataset_id
from cmip6_preprocessing.preprocessing import combined_preprocessing
from cmip6_preprocessing.postprocessing import (
    match_metrics,
    interpolate_grid_label,
    merge_variables,
    concat_experiments,
)

from cmip6_preprocessing.drift_removal import match_and_remove_trend
from fastprogress.fastprogress import progress_bar

from xhistogram.xarray import histogram

from cmip6_omz.utils import cmip6_collection, o2_models
from cmip6_omz.upstream_stash import (
    pick_first_member,
    construct_static_dz,
    concat_time,
    zarr_exists,
    pick_first_member,
)
from cmip6_omz.units import convert_mol_m3_mymol_kg

from xarrayutils.file_handling import maybe_create_folder

### needs cleaning
from cmip6_omz.omz_tools import omz_thickness_efficient
import matplotlib.pyplot as plt
from cmip6_omz.upstream_stash import append_write_zarr

from dask.diagnostics import ProgressBar

from busecke_etal_2021_aguadv.utils import fail_age, cut_long_members

In [2]:
import dask
from multiprocessing.pool import ThreadPool
# Make dask use a fixed pool of 10 worker threads.
dask.config.set(pool=ThreadPool(10))

<dask.config.set at 0x7fbbd2122970>

## Start the processing

In [3]:
# Output folder; the zarr stores at the end of this notebook are written into it.
# (presumably created if missing, judging by the helper's name -- confirm)
ofolder = maybe_create_folder('/projects/GEOCLIM/LRGROUP/jbusecke/projects_data/full_o2_sat_v0.2/')



In [4]:
# Display the model list returned by `o2_models()`.
o2_models()

['ACCESS-ESM1-5',
 'CESM2',
 'CESM2-WACCM',
 'CMCC-ESM2',
 'CNRM-ESM2-1',
 'CanESM5',
 'CanESM5-CanOE',
 'EC-Earth3-CC',
 'GFDL-CM4',
 'GFDL-ESM4',
 'IPSL-CM5A2-INCA',
 'IPSL-CM6A-LR',
 'KIOST-ESM',
 'MIROC-ES2L',
 'MPI-ESM-1-2-HAM',
 'MPI-ESM1-2-HR',
 'MPI-ESM1-2-LR',
 'MRI-ESM2-0',
 'NorESM2-LM',
 'NorESM2-MM',
 'UKESM1-0-LL']

In [5]:
col = cmip6_collection(zarr=False)

# Options handed to every `to_dataset_dict` call in this cell.
kwargs = {
    "aggregate": False,
    "zarr_kwargs": {"decode_times": True, "use_cftime": True, "consolidated": True},
    "cdf_kwargs": {"decode_times": True, "use_cftime": True, "chunks": {"time": 1}},
    "preprocess": combined_preprocessing,
}

variable_ids = ["thetao", "so", "o2", "agessc"]
metric_variable_ids = ["thkcello", "areacello"]

# Explicit model selection (instead of the full list from `o2_models()`).
# models = o2_models()
models = [
    "IPSL-CM6A-LR",
    "CanESM5",
    "ACCESS-ESM1-5",
    "MPI-ESM1-2-LR",
    "MPI-ESM1-2-HR",
    "UKESM1-0-LL",
    "CNRM-ESM2-1",
    "CanESM5-CanOE",
    "MIROC-ES2L",
    "GFDL-CM4",
    "GFDL-ESM4",
    "MRI-ESM2-0",
    "NorESM2-LM",
    "NorESM2-MM",
    ## new models (currently disabled)
    # "EC-Earth3-CC",
    # "KIOST-ESM",
    # "MPI-ESM-1-2-HAM",
    # "IPSL-CM5A2-INCA",
    # "CESM2",
    # "CESM2-WACCM",
    # "CMCC-ESM2",
]

# Tracer data: monthly ocean output for the historical and ssp585 experiments.
cat = col.search(
    source_id=models,
    grid_label=["gr", "gn"],
    experiment_id=["historical", "ssp585"],
    table_id=["Omon"],
    variable_id=variable_ids,
)
ds_dict = cat.to_dataset_dict(**kwargs)

# Metrics are searched separately and without further restrictions,
# to catch all possible metrics!
cat_metrics = col.search(source_id=models, variable_id=metric_variable_ids)
ds_metric_dict = cat_metrics.to_dataset_dict(**kwargs)

# Combine in time (only needed for the netcdf collection).
ds_dict = concat_time(ds_dict)
ds_metric_dict = concat_time(ds_metric_dict)

# For now remove one member of a Norwegian model that returns this error:
# KeyError: cftime.DatetimeNoLeap(2100, 12, 31, 23, 59, 59, 999999, has_year_zero=True)
# I wonder if I could resolve this by replacing the calendar type.
ds_dict = {
    name: member
    for name, member in ds_dict.items()
    if 'NorESM2-MM' not in name or 'r3i1p1f1' not in name
}
ds_metric_dict = {
    name: metric
    for name, metric in ds_metric_dict.items()
    if 'NorESM2-MM' not in name or 'r3i1p1f1' not in name
}

# Cut longer members.
ds_dict = {name: cut_long_members(member) for name, member in ds_dict.items()}
ds_metric_dict = {
    name: cut_long_members(metric) for name, metric in ds_metric_dict.items()
}

# Brute-force add the GFDL age
patch_source_ids = [source_id for source_id in models if 'GFDL' in source_id]

if 'agessc' in variable_ids and patch_source_ids:
    # TODO: Make this nicer with the original netcdf files (not tonight though)
    col_gfdl = cmip6_collection(zarr=True)
    # BUG: There is something weird going on in the reading process here
    # Just drop everything that is not GFDL (and not agessc) from the catalog table
    df = col_gfdl.df
    keep_row = [
        'GFDL' in source_id and 'agessc' in variable_id
        for source_id, variable_id in zip(df['source_id'], df['variable_id'])
    ]
    df = df.iloc[keep_row, :]
    col_gfdl.df = df

    cat_gfdl = col_gfdl.search(
        source_id=patch_source_ids,
        variable_id=["agessc"],
        experiment_id=["historical", "ssp585"],
    )

    # Add the GFDL age datasets to the main dictionary of datasets.
    ddict_gfdl_age = dict(cat_gfdl.to_dataset_dict(**kwargs))
    ds_dict.update(ddict_gfdl_age)

    chunk_dict = {'lev': 5}

    # rechunk the GFDL models in depth
    def maybe_rechunk(ds):
        """Rechunk GFDL-CM4/GFDL-ESM4 datasets according to `chunk_dict`.

        Only the entries of `chunk_dict` whose dimension exists on `ds` are
        applied. Datasets from other models, or without any of those
        dimensions, are returned unchanged.
        """
        if ds.source_id not in ["GFDL-CM4", "GFDL-ESM4"]:
            return ds
        subset_chunk_dict = {
            dim: size for dim, size in chunk_dict.items() if dim in ds.dims
        }
        if not subset_chunk_dict:
            return ds
        return ds.chunk(subset_chunk_dict)

    ds_dict = {name: maybe_rechunk(member) for name, member in ds_dict.items()}

Dataframe size before picking latest version: 2280
Getting latest version...

Dataframe size after picking latest version: 2258

Done....


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.dcpp_init_year.version.time_range.path'


    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit


    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit

    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit

    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit

    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit

    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit

    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit

    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit
    incompatible units for variable 'lev': cannot convert a non-quantity using 'm' as unit


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.dcpp_init_year.version.time_range.path'


KeyError: cftime.DatetimeNoLeap(2100, 12, 31, 23, 59, 59, 999999, has_year_zero=True)

## Cleanup datasets early

I am currently: 
- dropping all variables except for the one specified in ds.variable_id
- Checking if datasets have the expected length (otherwise drop)

I am currently allowing longer ssp585 runs, but could cut them here!

In [19]:
import warnings

def _expected_length(ds):
    if ds.experiment_id == "historical":
        if ds.table_id == "Omon":
            return 1980
        else:
            warnings.warn(
                f"unknown table_id [{ds.table_id}] for {cmip6_dataset_id(ds)}"
            )
            return 1

    elif "ssp" in ds.experiment_id:
        if ds.table_id == "Omon":
            return 1032
        else:
            warnings.warn(
                f"unknown table_id [{ds.table_id}] for {cmip6_dataset_id(ds)}"
            )
            return 1

    elif "Control" in ds.experiment_id:
        if ds.table_id == "Omon":
            return (
                12 * 50
            )  # just give a low number here so none of the controls are dropped
        else:
            warnings.warn(
                f"unknown table_id [{ds.table_id}] for {cmip6_dataset_id(ds)}"
            )
            return 1
    else:
        warnings.warn(
            f"unknown experiment_id [{ds.experiment_id}] for {cmip6_dataset_id(ds)}"
        )
        return 1


def filter_ddict(ddict):
    """Return a cleaned-up copy of a dictionary of datasets.

    For every dataset in `ddict`:

    - all data variables other than the one named in `ds.variable_id` are
      dropped,
    - datasets with a `rho` dimension (output in density coordinates) are
      discarded,
    - datasets with a `time` dimension shorter than `_expected_length(ds)`
      are discarded, and a short report is printed for each of them.

    Datasets without a `time` dimension are kept as they are (apart from
    the dropped variables).

    Parameters
    ----------
    ddict : dict
        Mapping of dataset name to dataset.

    Returns
    -------
    dict
        New mapping containing only the datasets that passed the checks.
    """
    ddict_filtered = {}
    for name, ds in ddict.items():
        # drop everything but main variable
        ds = ds.drop_vars([v for v in ds.data_vars if v != ds.variable_id])

        # remove any output in density coordinates (NorESM?)
        if "rho" in ds.dims:
            continue

        # filter out too short runs
        if "time" in ds.dims:
            # Evaluate once, so a possible warning is not emitted twice.
            expected = _expected_length(ds)
            actual = len(ds.time)
            if actual < expected:
                print("---------DROPPED--------")
                print(name)
                print(expected)
                print(actual)
                print("---------DROPPED--------")
                continue

        ddict_filtered[name] = ds
    return ddict_filtered


# Apply the filter to both the tracer data and the metrics; progress for any
# dropped dataset is printed by `filter_ddict`.
ds_dict_filtered = filter_ddict(ds_dict)
ds_metric_dict_filtered = filter_ddict(ds_metric_dict)

---------DROPPED--------
CNRM-ESM2-1.gn.historical.Omon.r6i1p1f2.so
1980
1380
---------DROPPED--------
---------DROPPED--------
ACCESS-ESM1-5.gn.ssp245.Omon.r26i1p1f1.thkcello
1032
432
---------DROPPED--------
---------DROPPED--------
ACCESS-ESM1-5.gn.ssp245.Omon.r20i1p1f1.thkcello
1032
432
---------DROPPED--------
---------DROPPED--------
ACCESS-ESM1-5.gn.ssp245.Omon.r30i1p1f1.thkcello
1032
240
---------DROPPED--------
---------DROPPED--------
ACCESS-ESM1-5.gn.ssp245.Omon.r25i1p1f1.thkcello
1032
312
---------DROPPED--------
---------DROPPED--------
ACCESS-ESM1-5.gn.ssp245.Omon.r16i1p1f1.thkcello
1032
360
---------DROPPED--------
---------DROPPED--------
ACCESS-ESM1-5.gn.ssp245.Omon.r17i1p1f1.thkcello
1032
360
---------DROPPED--------
---------DROPPED--------
ACCESS-ESM1-5.gn.ssp245.Omon.r29i1p1f1.thkcello
1032
120
---------DROPPED--------
---------DROPPED--------
ACCESS-ESM1-5.gn.ssp245.Omon.r13i1p1f1.thkcello
1032
360
---------DROPPED--------
---------DROPPED--------
ACCESS-ESM1-5.gn



In [20]:
import pathlib
# New files (switch to these later and get rid of `load_trend_dict` (or refactor?) and `fix_trend_metadata`)
# Load all trend files whose path mentions one of the requested variables.
# NOTE(review): this is a substring match against the whole absolute path, not
# only the file name -- confirm short ids such as 'so' cannot match by accident.
drift_folder = pathlib.Path('../../data/external/cmip6_control_drifts/').absolute()
flist = [
    path
    for path in drift_folder.glob('*.nc')
    if any(variable in str(path) for variable in variable_ids)
]

# One lazily opened dataset per file, keyed by the file name without suffix.
trend_dict = {}
for f in progress_bar(flist):
    trend_dict[f.stem] = xr.open_mfdataset([f])

In [21]:
# These trend files are messed up...need a better way to deal with that in the previous step
# see https://github.com/jbusecke/cmip6_preprocessing/issues/175
incomplete_keys = ['CMIP.IPSL.IPSL-CM6A-LR.historical.r3i1p1f1.Omon.gn.none.area_o2']
trend_dict = {
    key: trend for key, trend in trend_dict.items() if key not in incomplete_keys
}
# I think this should be taken care of in the filtering step above...TODO check at a later point

# Match each tracer dataset with its trend and remove it.
ddict_tracers_detrended = match_and_remove_trend(ds_dict_filtered, trend_dict)





















## Match metrics

In [22]:
# These metrics cause trouble (TODO: See if this remains after cutting the long runs).
# "too long" could probably be fixed with a join='inner', but that is not worth it now.
problem_metrics = [
    'ACCESS-ESM1-5.gn.ssp585.Omon.r3i1p1f1.thkcello',  # metric too short
    'ACCESS-ESM1-5.gn.ssp585.Omon.r2i1p1f1.thkcello',  # metric too short
    'ACCESS-ESM1-5.gn.ssp585.Omon.r1i1p1f1.thkcello',  # metric too long
    'ACCESS-ESM1-5.gn.ssp585.Omon.r9i1p1f1.thkcello',  # metric too short
    'ACCESS-ESM1-5.gn.ssp585.Omon.r6i1p1f1.thkcello',  # metric too long
    'ACCESS-ESM1-5.gn.ssp585.Omon.r4i1p1f1.thkcello',  # metric too long
    'ACCESS-ESM1-5.gn.ssp585.Omon.r8i1p1f1.thkcello',  # metric too long
    'ACCESS-ESM1-5.gn.ssp585.Omon.r10i1p1f1.thkcello',  # metric too long
    'ACCESS-ESM1-5.gn.ssp585.Omon.r5i1p1f1.thkcello',  # metric too long
]
# Remove them in place; keys that are not present are simply skipped.
for key in problem_metrics:
    ds_metric_dict_filtered.pop(key, None)

In [23]:
# Squeeze every detrended dataset before matching the metrics.
#
# All members are kept here. A condition that was meant to exclude
# ACCESS-ESM1-5 r3i1p1f1 (its thickness output is shorter than the tracer data;
# apparently they stopped writing the thickness) ended in `or not ()`, which is
# always True, so it never removed anything. It has been dropped so the code
# states what it actually does.
# NOTE(review): confirm that keeping ACCESS-ESM1-5 r3i1p1f1 is the intended behavior.
ddict_tracers_detrended_filtered = {
    k: ds.squeeze() for k, ds in ddict_tracers_detrended.items()
}

# Attach the cell area and cell thickness metrics to the tracer datasets.
ddict_matched = match_metrics(
    ddict_tracers_detrended_filtered,
    ds_metric_dict_filtered,
    ["areacello", "thkcello"],
    print_statistics=True,
)



Processed 800 datasets.
Exact matches:{'areacello': 0, 'thkcello': 341}
Other matches:{'areacello': 749, 'thkcello': 271}
No match found:{'areacello': 51, 'thkcello': 130}


## Interpolate Grids and merge variables

- handle the Norwegian Models inside `interpolate_grid_label` (TODO: Check if this works)

In [24]:
# import dask
# with dask.config.set(**{'array.slicing.split_large_chunks': True}): # only necessary for ACCESS, they are all different lengths?

print("interpolate grids\n")
# Merge options for the regridded output. This should be a default soon.
regrid_merge_kwargs = {"compat": "override"}
ddict_matched_regrid = interpolate_grid_label(
    ddict_matched,
    merge_kwargs=regrid_merge_kwargs,
)

# #patch the norwegian model in manually
# ddict_patch = merge_variables(ddict_matched)
# for name, ds in ddict_patch.items():
#     if 'Nor' in name and 'gr' in name:
#         patch_name = name.replace('.gr','')
#         ddict_matched_regrid[patch_name] = ds

interpolate grids



  ds_out = xr.apply_ufunc(
  ds_out = xr.apply_ufunc(
  ds_out = xr.apply_ufunc(
  ds_out = xr.apply_ufunc(


In [25]:
# Display the sorted keys of the regridded datasets.
np.sort(list(ddict_matched_regrid.keys()))

array(['ACCESS-ESM1-5.historical.Omon.r10i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r1i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r2i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r3i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r4i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r5i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r6i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r8i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r9i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r10i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r1i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r2i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r3i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r4i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r5i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r6i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r8i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r9i1p1f1',
       'CNRM-ESM2-1.historical.Omon.r10i1p1f2',
       'CNRM-ESM2-1.historical.Omon.r11i1p1f2',
       'CNRM-ESM2-1.historical.Omon.r1i1p1f2',
       'C

In [26]:
# TODO: This needs to go to cmip6_pp: force-remove the attribute that the datasets were just combined over; otherwise subsequent steps fail
def _del_attr(ds, attr):
    if attr in ds.attrs:
        del ds.attrs[attr]
    return ds

# Strip the `variable_id` attribute from every regridded dataset.
ddict_matched_regrid = {k:_del_attr(ds, 'variable_id') for k,ds in ddict_matched_regrid.items()}

In [27]:
# Display the sorted keys again after the attribute cleanup.
np.sort(list(ddict_matched_regrid.keys()))

array(['ACCESS-ESM1-5.historical.Omon.r10i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r1i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r2i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r3i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r4i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r5i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r6i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r8i1p1f1',
       'ACCESS-ESM1-5.historical.Omon.r9i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r10i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r1i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r2i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r3i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r4i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r5i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r6i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r8i1p1f1',
       'ACCESS-ESM1-5.ssp585.Omon.r9i1p1f1',
       'CNRM-ESM2-1.historical.Omon.r10i1p1f2',
       'CNRM-ESM2-1.historical.Omon.r11i1p1f2',
       'CNRM-ESM2-1.historical.Omon.r1i1p1f2',
       'C

## Concatenate experiments and pick only 'full (both hist and ssp)' runs

In [28]:
# Options passed through to the concatenation of the experiments.
experiment_concat_kwargs = dict(
    combine_attrs='drop_conflicts',
    compat='override',
    coords='minimal',
)
ddict_ex_combined = concat_experiments(
    ddict_matched_regrid, concat_kwargs=experiment_concat_kwargs
)



Still need to deal with the ACCESS stuff here...

In [29]:
# Only pick full runs (historical and ssp585): anything with 3000 or fewer
# time steps is discarded.
ddict_ex_combined_full = {
    name: member
    for name, member in ddict_ex_combined.items()
    if len(member.time) > 3000
}

# # TODO/!!! this eliminates some of the models I would like to show? O2 only (for now that should be fine)
# ddict_ex_combined_full = {k:ds for k,ds in ddict_ex_combined_full.items() if not fail_age(ds)}
list(np.sort(list(ddict_ex_combined_full.keys())))

['ACCESS-ESM1-5.gn.Omon.r10i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r1i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r2i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r3i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r4i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r5i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r6i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r8i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r9i1p1f1',
 'CNRM-ESM2-1.gn.Omon.r1i1p1f2',
 'CNRM-ESM2-1.gn.Omon.r2i1p1f2',
 'CNRM-ESM2-1.gn.Omon.r3i1p1f2',
 'CNRM-ESM2-1.gn.Omon.r4i1p1f2',
 'CNRM-ESM2-1.gn.Omon.r5i1p1f2',
 'CanESM5-CanOE.gn.Omon.r1i1p2f1',
 'CanESM5-CanOE.gn.Omon.r2i1p2f1',
 'CanESM5-CanOE.gn.Omon.r3i1p2f1',
 'CanESM5.gn.Omon.r10i1p1f1',
 'CanESM5.gn.Omon.r10i1p2f1',
 'CanESM5.gn.Omon.r1i1p1f1',
 'CanESM5.gn.Omon.r1i1p2f1',
 'CanESM5.gn.Omon.r2i1p1f1',
 'CanESM5.gn.Omon.r2i1p2f1',
 'CanESM5.gn.Omon.r3i1p1f1',
 'CanESM5.gn.Omon.r3i1p2f1',
 'CanESM5.gn.Omon.r4i1p1f1',
 'CanESM5.gn.Omon.r4i1p2f1',
 'CanESM5.gn.Omon.r5i1p1f1',
 'CanESM5.gn.Omon.r5i1p2f1',
 'CanESM5.gn.Omon.r6i1p1f1',
 'CanESM5.gn.Omon.r6i1p2f1',
 'CanE


## Check for problems and fix missing area/thickness manually

This should be wrapped and brought upstream

In [30]:
from cmip6_preprocessing.grids import combine_staggered_grid
# Bookkeeping of everything that was missing or had to be reconstructed,
# keyed by the kind of problem.
problems = {
    'missing_variables': [],
    'missing_area': [],
    'missing_thickness': [],
    'reconstructed_area': [],
    'reconstructed_thickness': [],
}
# Datasets that have all variables, an area and a thickness (`dz_t`).
ddict_filtered = {}
for name, ds in ddict_ex_combined_full.items():
    # Becomes True as soon as something is missing and cannot be reconstructed;
    # flagged datasets are left out of `ddict_filtered`.
    flag = False

    # Check that all necessary variables are given
    missing_variables = [va for va in ["thetao", "so", 'o2'] if va not in ds.variables]
    if missing_variables:
        flag = True
        problems['missing_variables'].append((name, missing_variables))

    # Check for area
    if 'areacello' not in ds.coords:
        if ds.attrs['grid_label'] == 'gr':  # only reconstruct for regular grids
            grid, ds = combine_staggered_grid(ds, recalculate_metrics=True)
            # I am dropping dz_t here so it can be uniformly reconstructed
            ds = ds.drop_vars('dz_t')
            ds = ds.assign_coords(areacello=(ds.dx_t * ds.dy_t).reset_coords(drop=True))
            problems['reconstructed_area'].append(name)
            assert 'areacello' in ds.coords
        else:
            flag = True
            problems['missing_area'].append(name)

    # Check for thickness (and rename) TODO: We should probably not rename and just refactor to use `thkcello`
    if "thkcello" in ds.coords:
        ds = ds.rename({'thkcello': 'dz_t'})
    else:
        # try to reconstruct the thickness from static info
        try:
#             lev_vertices = cf_xarray.bounds_to_vertices(ds.lev_bounds, 'bnds').load()
#             dz_t = lev_vertices.diff('lev_vertices')
#             ds = ds.assign_coords(dz_t=('lev', dz_t.data))
            ds = construct_static_dz(ds).rename({'thkcello': 'dz_t'})
            problems['reconstructed_thickness'].append(name)
        except Exception as e:
            # Deliberately best-effort: report the failure, record it and move on
            # to the next dataset instead of aborting the whole loop.
            print(f'{name} thickness reconstruction failed with {e}')
            print(ds)
            problems['missing_thickness'].append(name)
            flag = True

    if not flag:
        ddict_filtered[name] = ds
problems

{'missing_variables': [('IPSL-CM6A-LR.gn.Omon.r3i1p1f1', ['o2']),
  ('MRI-ESM2-0.gn.Omon.r1i1p1f1', ['o2']),
  ('ACCESS-ESM1-5.gn.Omon.r10i1p1f1', ['thetao'])],
 'missing_area': [],
 'missing_thickness': [],
 'reconstructed_area': ['NorESM2-MM.gr.Omon.r1i1p1f1',
  'NorESM2-LM.gr.Omon.r1i1p1f1'],
 'reconstructed_thickness': ['ACCESS-ESM1-5.gn.Omon.r8i1p1f1',
  'ACCESS-ESM1-5.gn.Omon.r4i1p1f1',
  'ACCESS-ESM1-5.gn.Omon.r9i1p1f1',
  'MRI-ESM2-0.gn.Omon.r1i2p1f1',
  'MRI-ESM2-0.gn.Omon.r1i1p1f1',
  'ACCESS-ESM1-5.gn.Omon.r5i1p1f1',
  'ACCESS-ESM1-5.gn.Omon.r3i1p1f1',
  'ACCESS-ESM1-5.gn.Omon.r2i1p1f1',
  'ACCESS-ESM1-5.gn.Omon.r10i1p1f1',
  'MIROC-ES2L.gn.Omon.r2i1p1f2',
  'MIROC-ES2L.gn.Omon.r4i1p1f2',
  'MIROC-ES2L.gn.Omon.r9i1p1f2',
  'MIROC-ES2L.gn.Omon.r1i1p1f2',
  'MIROC-ES2L.gn.Omon.r3i1p1f2',
  'MIROC-ES2L.gn.Omon.r8i1p1f2',
  'MIROC-ES2L.gn.Omon.r7i1p1f2',
  'MIROC-ES2L.gn.Omon.r10i1p1f2',
  'MIROC-ES2L.gn.Omon.r6i1p1f2',
  'MIROC-ES2L.gn.Omon.r5i1p1f2']}

In [31]:
# ddict_final = pick_first_member(ddict_filtered)#

# Final version: write out all full members
ddict_final = ddict_filtered

# Display the sorted names of the datasets in the final selection.
list(np.sort(list(ddict_final.keys())))

['ACCESS-ESM1-5.gn.Omon.r1i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r2i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r3i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r4i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r5i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r6i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r8i1p1f1',
 'ACCESS-ESM1-5.gn.Omon.r9i1p1f1',
 'CNRM-ESM2-1.gn.Omon.r1i1p1f2',
 'CNRM-ESM2-1.gn.Omon.r2i1p1f2',
 'CNRM-ESM2-1.gn.Omon.r3i1p1f2',
 'CNRM-ESM2-1.gn.Omon.r4i1p1f2',
 'CNRM-ESM2-1.gn.Omon.r5i1p1f2',
 'CanESM5-CanOE.gn.Omon.r1i1p2f1',
 'CanESM5-CanOE.gn.Omon.r2i1p2f1',
 'CanESM5-CanOE.gn.Omon.r3i1p2f1',
 'CanESM5.gn.Omon.r10i1p1f1',
 'CanESM5.gn.Omon.r10i1p2f1',
 'CanESM5.gn.Omon.r1i1p1f1',
 'CanESM5.gn.Omon.r1i1p2f1',
 'CanESM5.gn.Omon.r2i1p1f1',
 'CanESM5.gn.Omon.r2i1p2f1',
 'CanESM5.gn.Omon.r3i1p1f1',
 'CanESM5.gn.Omon.r3i1p2f1',
 'CanESM5.gn.Omon.r4i1p1f1',
 'CanESM5.gn.Omon.r4i1p2f1',
 'CanESM5.gn.Omon.r5i1p1f1',
 'CanESM5.gn.Omon.r5i1p2f1',
 'CanESM5.gn.Omon.r6i1p1f1',
 'CanESM5.gn.Omon.r6i1p2f1',
 'CanESM5.gn.Omon.r7i1p1f1',
 'CanESM5.gn.

## Quick update for revision: Add O2_sat and aou

In [32]:
from cmip6_omz.omz_tools import o2sat
def _add_o2sat(ds):
    """Add the variables `o2sat` and `aou` to `ds` in place and return it.

    `o2sat` is computed by `o2sat(ds.thetao, ds.so)`; `aou` is the
    difference `o2sat - o2`.
    """
    saturation = o2sat(ds.thetao, ds.so)
    ds['o2sat'] = saturation
    ds['aou'] = ds['o2sat'] - ds['o2']
    return ds

In [33]:
# Add `o2sat` and `aou` to every dataset of the final selection.
ddict_final = {k:_add_o2sat(ds) for k,ds in ddict_final.items()}

In [36]:
# Set to True to rewrite stores that already exist.
overwrite = False
for i, (name, ds) in enumerate(ddict_final.items()):
    print(f'-----{i+1}/{len(ddict_final)}------')
    print(name)
    # Only write the oxygen related variables (and the age, where available).
    ds_out = ds[[var for var in ['o2', 'o2sat', 'aou', 'agessc'] if var in ds]]

    ds_out.attrs = {k: v for k, v in ds.attrs.items() if k not in ['intake_esm_varname']}
    # Average over non-overlapping blocks of 12 time steps (annual means for
    # the monthly input). Coarsen reductions take no dimension argument: their
    # first positional parameter is `keep_attrs`, so it is passed explicitly.
    # The attributes are kept because the file name below is built from them
    # via `cmip6_dataset_id` (presumably read from `ds_out.attrs` -- confirm).
    ds_out = ds_out.coarsen(time=12).mean(keep_attrs=True)

    filename = ofolder.joinpath(f"{cmip6_dataset_id(ds_out)}_full_o2_sat.zarr")

    # GFDL and 'HR' models get a split value of 1 instead of 30
    # (presumably because of their larger grids -- TODO confirm).
    if 'GFDL' in name or 'HR' in name:
        dim_split = 1
    else:
        dim_split = 30

    if not zarr_exists(filename) or overwrite:
        print(filename)
        append_write_zarr(ds_out, filename, dim_split)
    else:
        print('exists')

-----1/72------
MPI-ESM1-2-LR.gn.Omon.r7i1p1f1
exists
-----2/72------
ACCESS-ESM1-5.gn.Omon.r8i1p1f1
exists
-----3/72------
CanESM5.gn.Omon.r8i1p1f1
exists
-----4/72------
MPI-ESM1-2-LR.gn.Omon.r4i1p1f1
exists
-----5/72------
CanESM5.gn.Omon.r7i1p1f1
exists
-----6/72------
NorESM2-MM.gr.Omon.r1i1p1f1
exists
-----7/72------
CanESM5.gn.Omon.r5i1p2f1
exists
-----8/72------
CanESM5.gn.Omon.r4i1p1f1
exists
-----9/72------
ACCESS-ESM1-5.gn.Omon.r4i1p1f1
exists
-----10/72------
CanESM5.gn.Omon.r2i1p2f1
exists
-----11/72------
CanESM5.gn.Omon.r3i1p1f1
exists
-----12/72------
CanESM5.gn.Omon.r9i1p1f1
exists
-----13/72------
CanESM5.gn.Omon.r1i1p1f1
exists
-----14/72------
ACCESS-ESM1-5.gn.Omon.r9i1p1f1
exists
-----15/72------
CanESM5-CanOE.gn.Omon.r3i1p2f1
exists
-----16/72------
CanESM5-CanOE.gn.Omon.r2i1p2f1
exists
-----17/72------
MPI-ESM1-2-LR.gn.Omon.r6i1p1f1
exists
-----18/72------
ACCESS-ESM1-5.gn.Omon.r1i1p1f1
exists
-----19/72------
MPI-ESM1-2-HR.gn.Omon.r1i1p1f1
exists
-----20/72-----