## Get Max Variables
This notebook takes raw daily data and outputs variables conditioned on the maximum `t2m` in each year.

There is an option to condition on either the daily mean or the daily maximum `t2m` through the `t2m_av_use` variable. Zhang 2023 used the daily maximum.

In [1]:
import os
import xarray as xr
import numpy as np
from numba import jit
from tqdm.notebook import tqdm
from isca_tools.utils.moist_physics import sphum_from_dew
print(os.getcwd())

/home/users/jamd1/Isca


In [2]:
# Root of the ERA5 archive; all input directories below are built relative to it
dir_base = '/gws/nopw/j04/global_ex/jamd1/era5/'

# Variables read from the `daily_mean` sub-directory
dir_t500, dir_z500, dir_sp, dir_d2m = (
    os.path.join(dir_base, 'daily_mean', var_name) for var_name in ('t500', 'z500', 'sp', 'd2m')
)

# t2m has one directory per daily statistic, keyed by that statistic
dir_t2m = dict((key, os.path.join(dir_base, f'daily_{key}', 't2m')) for key in ['mean', 'max'])

In [3]:
# Daily statistic of t2m used to find the hottest day of each year. It is used as a key into
# `dir_t2m` and in the output path, so it must be either 'mean' or 'max'.
t2m_av_use = 'mean'     # get variables conditioned on annual maximum of daily max or mean temperature

In [4]:
@jit
def _at_t2m_max_ufunc(t2m, y):
    """Return the element of `y` at the position where `t2m` is largest.

    Args:
        t2m: Array whose maximum is located with `np.argmax`.
        y: Array that is indexed with the position of that maximum.

    Returns:
        The value `y[np.argmax(t2m)]`.
    """
    ind_max = np.argmax(t2m)
    return y[ind_max]

def xr_at_t2m_max(t2m, y):
    """Find the value of a variable on the annual hottest day for each location.

    Applies `_at_t2m_max_ufunc` along the `time` dimension of both inputs, vectorized over all
    remaining dimensions, so `time` does not appear in the result.

    Args:
        t2m: Temperature with a `time` dimension; its maximum along `time` selects the day.
        y: Variable to sample on that day. Must have a `time` dimension and a `dtype` attribute.

    Returns:
        `y` evaluated at the `time` index where `t2m` is largest, with the same dtype as `y`.
    """
    time_core_dims = [['time'], ['time']]   # both inputs are consumed along `time`
    return xr.apply_ufunc(
        _at_t2m_max_ufunc, t2m, y,
        input_core_dims=time_core_dims,
        vectorize=True,
        dask='parallelized',
        output_dtypes=[y.dtype],
    )

In [5]:
def load_ds(path, rename_valid_time=True):
    """Open the dataset(s) at `path`, optionally renaming the `valid_time` dimension to `time`.

    The t500 dataset has time as `valid_time` rather than `time`; the rename gives all datasets
    a common `time` dimension.

    Args:
        path: Path passed straight to `xr.open_mfdataset`.
        rename_valid_time: If True and the dataset has a `valid_time` dimension, rename it to `time`.

    Returns:
        The opened dataset, renamed if requested and applicable.
    """
    dataset = xr.open_mfdataset(path)
    needs_rename = rename_valid_time and 'valid_time' in dataset.dims
    return dataset.rename({'valid_time': 'time'}) if needs_rename else dataset

In [None]:
# For each year: load the raw daily data, find the day with maximum t2m at each location, and record
# every requested variable on that day, as well as the index of the day itself.
var_input_dir = {'t2m': dir_t2m[t2m_av_use], 't500': dir_t500, 'sp': dir_sp, 'd2m': dir_d2m, 'day_ind': None}
var_id = {'t2m': 't2m', 't500': 't', 'sp': 'sp', 'd2m': 'd2m'}    # name of var within input directory

# t2m is stored as its own annual max; every other variable is stored as its value on that day
var_out_file_name = {var: (f"{var}_max" if var == 't2m' else f"{var}_of_t2m_max") for var in var_input_dir}
var_out_path = {
    key: f'/home/users/jamd1/Isca/jobs/era5/zhang_2023/processed/daily_{t2m_av_use}/{var_out_file_name[key]}.nc'
    for key in var_out_file_name
}
var_out = {key: [] for key in var_out_path}      # one list of per-year results for each variable

# A variable is only computed if its output file is not already on disk
var_save = {}
for var in var_out_path:
    var_save[var] = not os.path.exists(var_out_path[var])
    if var_save[var]:
        print(f"Data for {var} does not exist, will create it")
    else:
        print(f'Data for {var} already exists, will skip')
if not any(var_save.values()):
    raise ValueError('Data exists for all variables')

for year in tqdm(range(2011, 2012)):
    # t2m is always needed to locate the hottest day; other variables are loaded only if they still
    # need saving. `day_ind` has no raw data (its input dir is None), so it is built afterwards.
    var_in_use = {}
    for key in var_input_dir:
        if var_input_dir[key] is None or not (key == 't2m' or var_save[key]):
            continue
        ds_year = load_ds(f"{var_input_dir[key]}/{year}.nc").chunk(dict(time=-1))
        var_in_use[key] = ds_year[var_id[key]]
    var_in_use['day_ind'] = np.arange(var_in_use['t2m'].time.size)   # varies as some years are leap years

    for key in var_out:
        if var_save[key]:
            # Sample the variable on the hottest day, then tag the result with the year it belongs to
            var_out_use = xr_at_t2m_max(var_in_use['t2m'], var_in_use[key]).compute().expand_dims(year=[year])
            var_out[key].append(var_out_use)

Data for t2m already exists, will skip
Data for t500 already exists, will skip
Data for sp does not exist, will create it
Data for d2m does not exist, will create it
Data for day_ind already exists, will skip


  0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
# For `sp` and `d2m`, replace `var_out[key]` with the concatenation of three collections of results.
# Judging by the names, these hold the years before 2011, 2011 itself, and the years after 2011,
# in that order — TODO confirm.
# NOTE(review): `var_out_prior2011`, `var_out_2011` and `var_out_post2011` are not defined in any
# cell visible in this notebook; presumably they were copied by hand from `var_out` between
# separate runs of the yearly loop. As written this cell raises NameError on a fresh
# Restart & Run All — confirm, then either define them explicitly or loop over all years at once.
for key in ['sp', 'd2m']:
    var_out[key] = var_out_prior2011[key] + var_out_2011[key] + var_out_post2011[key]

In [31]:
def set_attrs(var, long_name, units, description):
    """Set the main descriptive attributes of `var` in place and return it.

    Args:
        var: Object with a dict-like `attrs` attribute.
        long_name: Value stored under `attrs['long_name']`.
        units: Value stored under `attrs['units']`.
        description: Value stored under `attrs['description']`.

    Returns:
        The same `var` object, with the three attributes set.
    """
    var.attrs.update(long_name=long_name, units=units, description=description)
    return var

In [33]:
# Save data for all years in a single file for each variable
complevel = 4           # how much to compress by

# Metadata written with each output variable. t2m is always loaded by the yearly loop, so its raw
# attributes are always available; t500 and day_ind use fixed values.
var_out_attrs = {'t2m': {'long_name': var_in_use['t2m'].long_name, 'units': var_in_use['t2m'].units,
                         'description': f'Max of daily {t2m_av_use} 2m temperature in a given year.'},
                 't500': {'long_name': '500hPa temperature', 'units': 'K',
                          'description': f'500 hPa temperature on day with maximum daily {t2m_av_use} 2m temperature in a given year.'},
                 'day_ind': {'long_name': 'Day index', 'units': 'Day',
                              'description': f'Day (0 is Jan 1st) with maximum daily {t2m_av_use} 2m temperature in a given year.'},
                 }
# sp and d2m copy `long_name` and `units` from the raw input. That input is only loaded when the
# variable is being saved, so only build their metadata when it is present; reading it
# unconditionally raises KeyError whenever one of them already exists on disk.
# `units` is taken from the raw `units` attribute (it was previously set to `long_name` by mistake).
var_out_description = {
    'sp': f'Surface pressure on day with maximum daily {t2m_av_use} 2m temperature in a given year.',
    'd2m': f'2m dew temperature on day with maximum daily {t2m_av_use} 2m temperature in a given year.',
}
for key in var_out_description:
    if key in var_in_use:
        var_out_attrs[key] = {'long_name': var_in_use[key].long_name, 'units': var_in_use[key].units,
                              'description': var_out_description[key]}

for key in var_out_path:
    if not var_save[key]:
        continue
    if os.path.exists(var_out_path[key]):
        # `var_save` was decided before the yearly loop ran; never overwrite a file created since then
        raise ValueError(f'Data exists for var={key} at\n{var_out_path[key]}')
    var_use = xr.concat(var_out[key], dim='year')
    if key == 'day_ind':
        # time ind has max value of 365 so can use int16
        var_use = var_use.astype('int16')
    # Keys of var_out_attrs[key] match the keyword arguments of set_attrs
    var_use = set_attrs(var_use, **var_out_attrs[key])
    var_use = xr.Dataset({key: var_use})    # make sure save as dataset rather than data array
    encoding = {var: {'zlib': True, 'complevel': complevel} for var in var_use.data_vars}
    var_use.to_netcdf(var_out_path[key], encoding=encoding)
    print(f'{key} data has been saved at:\n{var_out_path[key]}')

ValueError: Data exists for var=sp at
/home/users/jamd1/Isca/jobs/era5/zhang_2023/processed/daily_max/sp_of_t2m_max.nc