## Create Test Data
A quick script that loads a data file, shrinks it by keeping only a couple of time steps and a sparse subset of latitudes and longitudes, and saves the result as output.

In [1]:
import os
import xarray as xr
import time
import numpy as np

In [53]:
# Positional indices to retain along each dimension: 5 evenly spaced latitudes,
# 10 evenly spaced longitudes and two time steps.
# 191 and 287 are the last indices of axes assumed to hold 192 and 288 points
# respectively - TODO confirm against the input files.
lat_ind_keep = np.linspace(start=0, stop=191, num=5, dtype=int)
lon_ind_keep = np.linspace(start=0, stop=287, num=10, dtype=int)
time_ind_keep = [4, 5]

def preprocess(ds):
    """Subset `ds` by position to the retained lat, lon and time indices.

    The indices come from the module-level `lat_ind_keep`, `lon_ind_keep`
    and `time_ind_keep`.
    """
    keep_indices = dict(lat=lat_ind_keep, lon=lon_ind_keep, time=time_ind_keep)
    return ds.isel(**keep_indices)
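# Alternative `preprocess` definitions, kept for reference. Each one selects a
# single variable instead of subsetting by index; uncomment one to use it.
#
# def preprocess(ds):
#     return ds['SOILLIQ']
#
# def preprocess(ds):
#     return ds['T']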

In [54]:
# Input files are expected on the current user's Desktop. `expanduser` is used
# instead of os.environ['HOME'] so the lookup does not raise KeyError on systems
# where HOME is not set. The trailing '' keeps the final path separator, so file
# names can still be appended with `+`.
dir_desktop = os.path.join(os.path.expanduser('~'), 'Desktop', '')
# Names of the land and atmosphere input files (one file each).
exp_name_land = 'e.e20.E1850TEST.f09_g17.daily_output.clm2.h1.0031-01-02-00000.nc'
exp_name_atm = 'e.e20.E1850TEST.f09_g17.daily_output.cam.h1.0031-01-02-00000.nc'

In [55]:
def load_with_profile(file_path, preprocess=preprocess, process_first=True, chunks=None, chunk_first=True):
    """Open, optionally chunk and preprocess, then fully load a dataset, printing stage timings.

    Args:
        file_path: Path or glob handed straight to `xr.open_mfdataset`.
        preprocess: Callable taking a dataset and returning a dataset. Defaults to
            the module-level `preprocess` as it was bound when this function was
            defined. Pass `None` to skip preprocessing entirely.
        process_first: If `True`, `preprocess` is given to `xr.open_mfdataset`;
            otherwise it is applied to the opened dataset afterwards.
        chunks: Chunk specification, or `None` to not request any chunking here.
        chunk_first: If `True`, `chunks` is given to `xr.open_mfdataset`;
            otherwise it is applied afterwards with `ds.chunk`.

    Returns:
        The dataset, after `load()` has been called on it.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    time_info = [time.perf_counter()]

    def _lap(label):
        # Record a checkpoint and print the time elapsed since the previous one.
        time_info.append(time.perf_counter())
        print("{}: {:.3f}s".format(label, time_info[-1] - time_info[-2]))

    ds = xr.open_mfdataset(
        file_path,
        preprocess=preprocess if process_first else None,
        chunks=chunks if chunk_first else None,
    )
    _lap("Lazy loading")
    if not chunk_first and chunks is not None:
        ds = ds.chunk(chunks)
        _lap("Chunking")
    # Guard against preprocess=None, which would otherwise fail with a TypeError.
    if not process_first and preprocess is not None:
        ds = preprocess(ds)
        _lap("Processing")
    ds.load()
    _lap("Load in full")
    print("Total time: {:.3f}s".format(time_info[-1] - time_info[0]))
    return ds

In [56]:
# Load the index-subsetted land dataset and print how long each stage took.
ds_land = load_with_profile(f'{dir_desktop}{exp_name_land}')

Lazy loading: 0.053s
Load in full: 3.442s
Total time: 3.495s


In [57]:
# Load the index-subsetted atmosphere dataset and print how long each stage took.
ds_atm = load_with_profile(f'{dir_desktop}{exp_name_atm}')

Lazy loading: 0.059s
Load in full: 27.488s
Total time: 27.547s


In [109]:
# Set to True to write the reduced datasets next to the input files.
save = False
if save:
    # NOTE(review): the list is formatted verbatim, so the file names contain
    # its repr (brackets, comma and space) - confirm this naming is intended.
    time_suffix = f'_time={time_ind_keep}.nc'
    ds_atm.to_netcdf(dir_desktop + 'atm' + time_suffix)
    ds_land.to_netcdf(dir_desktop + 'land' + time_suffix)

## Check that processing of these datasets works

In [106]:
def preprocess_land(ds):
    """Reduce a land dataset to two variables.

    Returns a new dataset holding `SOILLIQ` summed over the `levsoi`
    dimension, together with `landmask` unchanged.
    """
    # NOTE(review): xarray's `sum` skips NaNs by default for float data, so a
    # column that is entirely NaN would come out as 0 - confirm that is intended.
    return xr.Dataset({
        'SOILLIQ': ds['SOILLIQ'].sum(dim='levsoi'),
        'landmask': ds['landmask'],
    })
# Open the full land file, applying preprocess_land as it is opened.
ds_full_land = xr.open_mfdataset(
    f'{dir_desktop}{exp_name_land}',
    preprocess=preprocess_land,
)

In [107]:
def preprocess_atm(ds):
    """Keep a fixed set of atmospheric variables on two levels.

    The levels kept are the ones on the `lev` coordinate nearest to 1000
    and 500.
    """
    keep_vars = ['T', 'Q', 'Z3', 'PS', 'P0', 'hyam', 'hybm']
    # Approximate targets for the near-surface level and a second, higher level
    # (presumably pressures in hPa, the second in the free troposphere - confirm).
    target_lev = xr.DataArray([1000, 500], dims='lev')
    # Subset the variables before selecting levels, so the entire dataset does
    # not have to be loaded.
    return ds[keep_vars].sel(lev=target_lev, method='nearest')
# Open the full atmosphere file, applying preprocess_atm as it is opened.
ds_full_atm = xr.open_mfdataset(
    f'{dir_desktop}{exp_name_atm}',
    preprocess=preprocess_atm,
)

In [102]:
# Put the land data on the coordinates of the atmospheric `PS` field, matching
# each coordinate to the nearest land value within a tolerance of 0.01
# (presumably to absorb small coordinate mismatches between files - confirm).
ds_full_land = ds_full_land.reindex_like(
    other=ds_full_atm['PS'],
    method='nearest',
    tolerance=0.01,
)

In [130]:
# Check opening all test data together.
# Paths are built from `dir_desktop` rather than a hard-coded user directory, so
# the cell agrees with the rest of the notebook and runs on other machines.
dir_test_cesm = os.path.join(dir_desktop, 'test_cesm')
ds_land_use = xr.open_mfdataset(os.path.join(dir_test_cesm, 'land', '*.nc'), preprocess=preprocess_land).load()
ds_atm_use = xr.open_mfdataset(os.path.join(dir_test_cesm, 'atm', '*.nc'), preprocess=preprocess_atm).load()
# Put the land data on the coordinates of the atmospheric `PS` field.
ds_land_use = ds_land_use.reindex_like(ds_atm_use['PS'], method="nearest", tolerance=0.01)

In [157]:
# Boolean mask from the first time step: True where `landmask` is positive.
land_mask = ds_land_use['landmask'].isel(time=0, drop=True) > 0
land_mask

In [163]:
# Variable `T` along the first latitude row, restricted to the longitudes that
# are not flagged as land in that row.
not_land_first_row = (~land_mask).isel(lat=0, drop=True)
ds_atm_use.isel(lat=0, drop=True).sel(lon=not_land_first_row, drop=True)['T']