# Maintenance Notebook: Keeping the staggered_grid_config.yaml up to date

Progress is hindered by a weird problem where I cant replace the datafram in the esm store anymore, like I used to. I raised an [issue](https://github.com/NCAR/intake-esm/issues/246) on github.

## TODO: 

- One of the MPI versions has two different shift conventions (I might need to implement a 'force override' option.

In [1]:
# ! pip install git+https://github.com/NCAR/intake-esm.git
# ! pip install git+https://github.com/jbusecke/cmip6_preprocessing.git

In [2]:
%load_ext autoreload
%autoreload 2
import intake
import pandas as pd
import xarray as xr
from cmip6_preprocessing.grids import detect_shift
from cmip6_preprocessing.preprocessing import combined_preprocessing



In [7]:
# Grab all available ocean output.


def available_output(variables=None):
    """Show which source_id/grid_label combos have any data, and return a df that picks only one dataset for each combo"""
    url = "https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json"
    col = intake.open_esm_datastore(url)
    query = dict(table_id=['Omon', 'Oyr'], variable_id=variables) # pick all available ocean fields for now
    cat = col.search(**query)
    
    available = []
    dataframes = []
    df = cat.df.copy()
    groups = df.groupby(['source_id', 'grid_label'])
    for group in groups:
        
        # add source_id/grid_label combo to list
        label = '.'.join(group[0])
        # pick only the first index of each group
        line = group[1].iloc[0,:]
        
        available.append(label)
        dataframes.append(line)

    new_df = pd.concat(dataframes, axis=1).transpose()
    cat.df = new_df
    
    return cat, available

_, all_models = available_output()
    
# print(len(all_models))

cat_tracer, tracer_models = available_output(['tos', 'thetao'])
missing_tracer_models = set(tracer_models).symmetric_difference(set(all_models))
print(f"Did not find tracer data for these models:{missing_tracer_models}\n")

cat_u, u_models = available_output(['uo'])
missing_u_models = set(u_models).symmetric_difference(set(all_models))
print(f"Did not find u data for these models:{missing_u_models}\n")

cat_v, v_models = available_output(['vo'])
missing_v_models = set(v_models).symmetric_difference(set(all_models))
print(f"Did not find v data for these models:{missing_v_models}\n")

print(f"Any models that have only u or v:{set(v_models).symmetric_difference(set(u_models))}")

Did not find tracer data for these models:{'UKESM1-0-LL.gm', 'HadGEM3-GC31-LL.gnz', 'HadGEM3-GC31-MM.gm', 'NorESM1-F.gr', 'HadGEM3-GC31-LL.gm', 'NorESM2-LM.gm', 'NorESM2-MM.gm', 'IPSL-CM6A-LR.gr', 'NorCPM1.grz', 'NorCPM1.gm', 'MRI-ESM2-0.gr2z', 'NorESM2-LM.grz', 'NorESM2-MM.grz', 'UKESM1-0-LL.gnz', 'MRI-ESM2-0.gm', 'MIROC6.gm', 'EC-Earth3-Veg.gr', 'MRI-ESM2-0.gnz'}

Did not find u data for these models:{'UKESM1-0-LL.gm', 'HadGEM3-GC31-LL.gnz', 'HadGEM3-GC31-MM.gm', 'NorESM1-F.gr', 'FGOALS-g3.gn', 'IITM-ESM.gn', 'CNRM-CM6-1.gr1', 'CESM1-1-CAM5-CMIP5.gn', 'HadGEM3-GC31-LL.gm', 'NorESM2-LM.gm', 'EC-Earth3.gr', 'KACE-1-0-G.gr', 'GISS-E2-1-H.gn', 'NorESM2-MM.gm', 'NorCPM1.gn', 'CESM2.gr', 'CESM1-1-CAM5-CMIP5.gr', 'IPSL-CM6A-LR.gr', 'EC-Earth3-Veg-LR.gn', 'NorCPM1.grz', 'CESM2-WACCM.gr', 'GFDL-OM4p5B.gn', 'NorCPM1.gm', 'GFDL-CM4.gr', 'MRI-ESM2-0.gr2z', 'NorESM2-LM.grz', 'CAS-ESM2-0.gn', 'NorESM2-MM.grz', 'CESM2-WACCM-FV2.gr', 'GFDL-OM4p5B.gr', 'CESM2-FV2.gr', 'UKESM1-0-LL.gnz', 'AWI-ESM-1-1-

In [4]:
# for now load them manually
import fsspec
import xarray as xr
super_dict = {}
for var, cat in zip(['thetao', 'uo', 'vo'],[cat_tracer, cat_u, cat_v]):
    super_dict[var]={}
    for ri,(rr,row) in enumerate(cat.df.iterrows()):
#         print(ri)
        ds = combined_preprocessing(xr.open_zarr(fsspec.get_mapper(row['zstore']), consolidated=True, decode_times=False))
        label = f"{row['source_id']}.{row['grid_label']}"
        super_dict[var][label] = ds

0
1
2
3
4
5
6
7
8
9
10
CESM2: Unexpected unit (centimeters) for coordinate `lev` detected.
	 Converted to `m`
11
12


  return func(*args2)
  return func(*args2)
  return func(*args2)
  return func(*args2)


13
14
CESM2-WACCM: Unexpected unit (centimeters) for coordinate `lev` detected.
	 Converted to `m`
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34




35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
MIROC-ES2L: No units found
57
MIROC6: No units found
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
0
1
2
3
4
5
6
CESM2: Unexpected unit (centimeters) for coordinate `lev` detected.
	 Converted to `m`
7
CESM2-FV2: Unexpected unit (centimeters) for coordinate `lev` detected.
	 Converted to `m`


  return func(*args2)
  return func(*args2)
  return func(*args2)
  return func(*args2)


8
CESM2-WACCM: Unexpected unit (centimeters) for coordinate `lev` detected.
	 Converted to `m`
9
CESM2-WACCM-FV2: Unexpected unit (centimeters) for coordinate `lev` detected.
	 Converted to `m`
10
11
12
13
14
15
16
17
18
19
20
21
22




23
24
25
26
27
28
29
30
31
32
33
34
35
MIROC-ES2L: No units found
36
MIROC6: No units found
37
38
39
40
41
42
43
44
45
46
47


  f"Found {ed} as dimension in `{co}`. Assuming this is an error and just picking the first step along that dimension."


48
49
0
1
2
3
4
5
6
CESM2: Unexpected unit (centimeters) for coordinate `lev` detected.
	 Converted to `m`
7
CESM2-FV2: Unexpected unit (centimeters) for coordinate `lev` detected.
	 Converted to `m`


  return func(*args2)
  return func(*args2)
  return func(*args2)
  return func(*args2)


8
CESM2-WACCM: Unexpected unit (centimeters) for coordinate `lev` detected.
	 Converted to `m`
9
CESM2-WACCM-FV2: Unexpected unit (centimeters) for coordinate `lev` detected.
	 Converted to `m`
10
11
12
13
14
15
16
17
18
19
20
21
22




23
24
25
26
27
28
29
30
31
32
33
34
35
MIROC-ES2L: No units found
36
MIROC6: No units found
37
38
39
40
41
42
43
44
45
46
47


  f"Found {ed} as dimension in `{co}`. Assuming this is an error and just picking the first step along that dimension."


48
49


In [29]:
staggered_grid_dict = {}
for k in super_dict['thetao'].keys():
    ds_ref = super_dict['thetao'][k]
    s_id = ds_ref.attrs['source_id']
    g_la = ds_ref.attrs['grid_label']
    
    if not ('AWI' in k and 'gn' in k):
        print(f"############### {k} #######################")
        if k in super_dict['uo'].keys() and k in super_dict['vo'].keys():
            
            ds_u = super_dict['uo'][k]
            ds_v = super_dict['vo'][k]
            
            if 'x' not in ds_ref.dims:
                print(f'THIS IS SOME ERROR IN THE PREPROCESSSING. INVESTIGATE {k}')
                # a nevermind, these are just the AWI ones...remove them earlier...
            else:
                x_shift_u = detect_shift(ds_ref, ds_u, 'X')
                y_shift_u = detect_shift(ds_ref, ds_u, 'Y')

                x_shift_v = detect_shift(ds_ref, ds_v, 'X')
                y_shift_v = detect_shift(ds_ref, ds_v, 'Y')

                # check that there is only one left after removing 'center'
                x_shift = set([x_shift_u, x_shift_v]) - set(['center'])
                y_shift = set([y_shift_u, y_shift_v]) - set(['center'])
                # if they are all on center default to left

                if len(x_shift) == 0:
                    x_shift = 'left'
                elif len(x_shift) == 1:
                    x_shift = list(x_shift)[0]
                else:
                    print(x_shift)
                    print('SCHEISSE')
                    x_shift=None

                if len(y_shift) == 0:
                    y_shift = 'left'
                elif len(y_shift) == 1:
                    y_shift = list(y_shift)[0]
                else:
                    print(y_shift)
                    print('SCHEISSE')
                    y_shift = None
        else:
            print(f"ATTENTION: Setting shift to left for {k}, since there is no velocity data")
            x_shift = 'left'
            y_shift = 'left'
            
        if x_shift is not None and y_shift is not None:
            if not s_id in staggered_grid_dict.keys():
                staggered_grid_dict[s_id] = {}
            staggered_grid_dict[s_id][g_la] = {'axis_shift':{'X': x_shift, 'Y': y_shift}}

############### ACCESS-CM2.gn #######################
############### ACCESS-ESM1-5.gn #######################
############### BCC-CSM2-MR.gn #######################
############### BCC-ESM1.gn #######################
############### CAMS-CSM1-0.gn #######################
############### CAS-ESM2-0.gn #######################
ATTENTION: Setting shift to left for CAS-ESM2-0.gn, since there is no velocity data
############### CESM1-1-CAM5-CMIP5.gn #######################
ATTENTION: Setting shift to left for CESM1-1-CAM5-CMIP5.gn, since there is no velocity data
############### CESM1-1-CAM5-CMIP5.gr #######################
ATTENTION: Setting shift to left for CESM1-1-CAM5-CMIP5.gr, since there is no velocity data
############### CESM2.gn #######################
############### CESM2.gr #######################
ATTENTION: Setting shift to left for CESM2.gr, since there is no velocity data
############### CESM2-FV2.gn #######################


  return func(*args2)
  return func(*args2)
  return func(*args2)
  return func(*args2)
  return func(*args2)
  return func(*args2)
  return func(*args2)
  return func(*args2)


############### CESM2-FV2.gr #######################
ATTENTION: Setting shift to left for CESM2-FV2.gr, since there is no velocity data
############### CESM2-WACCM.gn #######################
############### CESM2-WACCM.gr #######################
ATTENTION: Setting shift to left for CESM2-WACCM.gr, since there is no velocity data
############### CESM2-WACCM-FV2.gn #######################
############### CESM2-WACCM-FV2.gr #######################
ATTENTION: Setting shift to left for CESM2-WACCM-FV2.gr, since there is no velocity data
############### CIESM.gn #######################
############### CNRM-CM6-1.gn #######################
############### CNRM-CM6-1.gr1 #######################
ATTENTION: Setting shift to left for CNRM-CM6-1.gr1, since there is no velocity data
############### CNRM-CM6-1-HR.gn #######################
############### CNRM-ESM2-1.gn #######################
############### CNRM-ESM2-1.gr1 #######################
ATTENTION: Setting shift to left for CNRM-ESM2-1.gr

## Check with old dict...

In [30]:
import yaml
ff = open('/home/jovyan/cmip6_preprocessing/cmip6_preprocessing/specs/staggered_grid_config.yaml', "r")
grid_dict = yaml.safe_load(ff)
ff.close()

In [31]:
# any keys in the old dict that are not in the new one?
print(f"Keys in the old grid, which are not in the new one {set(grid_dict.keys())- set(staggered_grid_dict.keys())}")

print(f"Newly added models {set(staggered_grid_dict.keys()) - set(grid_dict.keys())}")

Keys in the old grid, which are not in the new one set()
Newly added models {'GFDL-ESM4', 'INM-CM5-0', 'CESM1-1-CAM5-CMIP5', 'NorCPM1', 'INM-CM4-8', 'FGOALS-g3', 'KACE-1-0-G', 'E3SM-1-1-ECA', 'E3SM-1-1', 'GFDL-OM4p5B', 'GISS-E2-1-H', 'CAS-ESM2-0', 'IITM-ESM', 'E3SM-1-0', 'EC-Earth3-Veg-LR', 'NorESM1-F'}


In [32]:
staggered_grid_dict['GFDL-CM4']

{'gn': {'axis_shift': {'X': 'left', 'Y': 'left'}},
 'gr': {'axis_shift': {'X': 'left', 'Y': 'left'}}}

In [33]:
with open('test.yaml', 'w') as file:
    documents = yaml.dump(staggered_grid_dict, file)

# Simple example for issue

In [1]:

# # Test PR from anderson
# ! pip install git+https://github.com/NCAR/intake-esm.git@refs/pull/247/head   

In [2]:
import intake_esm



In [3]:
intake_esm.__version__

'2020.5.21.post4'

In [4]:
import intake
import pandas as pd
url = "https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json"
col = intake.open_esm_datastore(url)

cat = col.search(table_id=['Omon', 'Oyr'])

# now modify the dataframe manually
cat.df = cat.df.iloc[0:3, :]

In [5]:
cat

Unnamed: 0,unique
activity_id,1
institution_id,1
source_id,1
experiment_id,1
member_id,1
table_id,1
variable_id,3
grid_label,1
zstore,3
dcpp_init_year,0


In [6]:
ddict = cat.to_dataset_dict()


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'
