# Input normalization

In [1]:
import os
import glob
import xarray as xr
import numpy as np
from IPython.display import display, Latex

Here we will build upon the existing input scaling files (input_mean.nc, input_max.nc, and input_min.nc). We will also use output_scale_std_nopenalty.nc, which is based on the standard deviation of each level for each output variable (see the other output scaling notebook for details).

In [2]:
# Root directory holding the existing normalization files.
path = '/global/homes/z/zeyuanhu/nvidia_codes/Climsim_private/preprocessing/normalizations/'

# dsm / dsa / dsi: input mean / max / min scaling; dso: output scaling.
dsm, dsa, dsi, dso = [
    xr.open_dataset(path + relative_file)
    for relative_file in (
        'inputs/input_mean.nc',
        'inputs/input_max.nc',
        'inputs/input_min.nc',
        'outputs/output_scale_std_nopenalty.nc',
    )
]

Below is the list of input features that will be used in the v5 Unet, which only uses and predicts total cloud (liquid+ice) information. We will modify/expand the original input scaling files according to the normalization method listed below. All variables are ultimately scaled as (x-mean)/(max-min), so each method is encoded through the saved mean/max/min values. For variables using (x-mean)/(max-min), we calculate mean, max, and min per level and save them as usual. For variables with blank normalization, we simply set mean=0, max=1, min=0. For variables using x/std, we set mean=0, max=std, min=0, so that (x-mean)/(max-min) reduces to x/std. For variables using x/(max-min), we set mean=0 and save max/min as usual. For cloud (liquid, ice, and total cloud) input, we have a separate exponential transformation, and we set mean=0, max=1, min=0.

| **Variable**                   | **Units**      | **Description**                               | **Normalization**          |
|--------------------------------|----------------|-----------------------------------------------|----------------------------|
| $T(z)$                         | K              | Temperature                                   | (x-mean)/(max-min)         |
| $RH(z)$                        |                | Relative humidity                             |                            |
| $liq\_partition(z)$            |                | Fraction of liquid cloud                      |                            |
| $q_n(z)$                       | kg/kg          | Total cloud (liquid + ice) mixing ratio       | 1 - exp(-$\lambda x$)      |
| $u(z)$                         | m/s            | Zonal wind                                    | (x-mean)/(max-min)         |
| $v(z)$                         | m/s            | Meridional wind                               | (x-mean)/(max-min)         |
| $dT_{adv}(z,t_0,t_{-1})$       | K/s            | Large-scale forcing of temperature            | x/(max-min)                |
| $dq_{T,adv}(z,t_0,t_{-1})$     | kg/kg/s        | Large-scale forcing of total water            | x/(max-min)                |
| $du_{adv}(z,t_0,t_{-1})$       | m/s$^2$        | Large-scale forcing of zonal wind             | x/(max-min)                |
| $dT(z,t_{-1},t_{-2})$          | K/s            | Temperature tendency                          | x/std                      |
| $dq_v(z,t_{-1},t_{-2})$        | kg/kg/s        | Water vapor tendency                          | x/std                      |
| $dq_n(z,t_{-1},t_{-2})$        | kg/kg/s        | Total cloud tendency                          | x/std                      |
| $du(z,t_{-1},t_{-2})$          | m/s$^2$        | Zonal wind tendency                           | x/std                      |
| O3$(z)$                        | mol/mol        | Ozone volume mixing ratio                     | (x-mean)/(max-min)         |
| CH4$(z)$                       | mol/mol        | Methane volume mixing ratio                   | (x-mean)/(max-min)         |
| N2O$(z)$                       | mol/mol        | Nitrous oxide volume mixing ratio             | (x-mean)/(max-min)         |
| PS                             | Pa             | Surface pressure                              | (x-mean)/(max-min)         |
| SOLIN                          | W/m$^2$        | Solar insolation                              | x/(max-min)                |
| LHFLX                          | W/m$^2$        | Surface latent heat flux                      | x/(max-min)                |
| SHFLX                          | W/m$^2$        | Surface sensible heat flux                    | x/(max-min)                |
| TAUX                           | N/m$^2$        | Zonal surface stress                          | (x-mean)/(max-min)         |
| TAUY                           | N/m$^2$        | Meridional surface stress                     | (x-mean)/(max-min)         |
| COSZRS                         |                | Cosine of solar zenith angle                  | (x-mean)/(max-min)         |
| ALDIF                          |                | Albedo for diffuse longwave radiation         | (x-mean)/(max-min)         |
| ALDIR                          |                | Albedo for direct longwave radiation          | (x-mean)/(max-min)         |
| ASDIF                          |                | Albedo for diffuse shortwave radiation        | (x-mean)/(max-min)         |
| ASDIR                          |                | Albedo for direct shortwave radiation         | (x-mean)/(max-min)         |
| LWUP                           | W/m$^2$        | Upward longwave flux                          | (x-mean)/(max-min)         |
| ICEFRAC                        |                | Sea-ice area fraction                         |                            |
| LANDFRAC                       |                | Land area fraction                            |                            |
| OCNFRAC                        |                | Ocean area fraction                           |                            |
| SNOWHLAND                      | m              | Snow depth over land                          | (x-mean)/(max-min)         |
| cos(lat)                       |                | Cosine of latitude                            |                            |
| sin(lat)                       |                | Sine of latitude                              |                            |
| **Footnote**                   |                | $^{a}$Footnote text here.                     |                            |


## First retrieve the large-scale forcings from the expanded training data and calculate their mean/max/min

In [3]:
# Collect every training input file below base_dir, in sorted order.
base_dir = "/global/homes/z/zeyuanhu/hugging/E3SM-MMF_ne4/train"
input_pattern = os.path.join(base_dir, '**/E3SM-MMF.ml2steploc.*.nc')
nc_files_in = glob.glob(input_pattern, recursive=True)
nc_files_in.sort()
len(nc_files_in)

210236

In [4]:
# We used a stride of 5 to sample a total of 40k time steps in our actual work:
# ntime = 40000
# stride = 5

# The values below are used here just as a quick example.
ntime = 500
stride = 400

# Fail before reading anything if the sampling would run past the file list
# (otherwise the IndexError only shows up after most files were processed).
if ntime > 0 and stride * (ntime - 1) >= len(nc_files_in):
    raise IndexError(
        f"ntime={ntime} with stride={stride} needs file index "
        f"{stride * (ntime - 1)}, but only {len(nc_files_in)} files were found"
    )

# One 60 x 384 slab per sampled file for each forcing
# (presumably level x column -- confirm against the data files).
t_dyn_tmp = np.zeros((ntime, 60, 384))
u_dyn_tmp = np.zeros((ntime, 60, 384))
q0_dyn_tmp = np.zeros((ntime, 60, 384))

for i in range(ntime):
    ifile = stride * i
    # Open each file in a context manager so its handle is released right after
    # the fields are copied out; a long run would otherwise keep every sampled
    # file open.
    with xr.open_dataset(nc_files_in[ifile]) as ds:
        t_dyn_tmp[i, :, :] = ds['state_t_dyn'].values
        u_dyn_tmp[i, :, :] = ds['state_u_dyn'].values
        q0_dyn_tmp[i, :, :] = ds['state_q0_dyn'].values

In [5]:
# Scaling for the large-scale forcings, which use x/(max-min): the mean is
# fixed at 0 and the max / min are taken over the samples and the last axis
# (axes 0 and 2) of the sampled arrays.
#
# dsm, dsa and dsi are the mean, max and min datasets respectively, so the
# maxima must go into dsa and the minima into dsi. Storing them the other way
# round makes (max-min) negative and flips the sign of the normalized input.
# NOTE(review): scaling files written with the swapped assignment have the
# opposite sign for these variables; models trained on them need regenerated
# files to stay consistent.
forcing_samples = (
    ('state_t_dyn', t_dyn_tmp),
    ('state_u_dyn', u_dyn_tmp),
    ('state_q0_dyn', q0_dyn_tmp),
)

for forcing_name, samples in forcing_samples:
    # state_t is only used as a template for the array layout and coordinates.
    forcing_mean = dsm['state_t'].copy()
    forcing_max = dsa['state_t'].copy()
    forcing_min = dsi['state_t'].copy()
    forcing_mean[:] = 0.0
    forcing_max[:] = np.max(samples, axis=(0, 2))
    forcing_min[:] = np.min(samples, axis=(0, 2))
    dsm[forcing_name] = forcing_mean
    dsa[forcing_name] = forcing_max
    dsi[forcing_name] = forcing_min

# The tm_-prefixed forcings reuse the same scaling as their counterparts above.
for forcing_name in ('state_t_dyn', 'state_q0_dyn', 'state_u_dyn'):
    tm_mean = dsm[forcing_name].copy()
    tm_max = dsa[forcing_name].copy()
    tm_min = dsi[forcing_name].copy()
    tm_mean[:] = 0.0
    dsm['tm_' + forcing_name] = tm_mean
    dsa['tm_' + forcing_name] = tm_max
    dsi['tm_' + forcing_name] = tm_min

## Update the input mean/max/min of the other variables according to the normalization methods listed in the table above

In [6]:
# Identity scaling (mean=0, max=1, min=0) written in place for the existing
# state_q0002 / state_q0003 entries.
for existing_var in ('state_q0002', 'state_q0003'):
    dsm[existing_var][:] = 0.0
    dsa[existing_var][:] = 1.0
    dsi[existing_var][:] = 0.0

# New entries state_rh and state_qn get the same identity scaling; state_t is
# copied only to obtain an array with the right layout.
for new_var in ('state_rh', 'state_qn'):
    for stats, fill_value in ((dsm, 0.0), (dsa, 1.0), (dsi, 0.0)):
        filled = stats['state_t'].copy()
        filled[:] = fill_value
        stats[new_var] = filled

In [7]:
# x/std scaling for the *_prvphy inputs: mean=0, min=0 and max=std, so that
# (x-mean)/(max-min) becomes x/std.
# Each entry: (new variable, existing variable used as template, dso variable).
prvphy_specs = (
    ('state_t_prvphy', 'state_t', 'ptend_t'),
    ('state_q0001_prvphy', 'state_q0001', 'ptend_q0001'),
    ('state_qn_prvphy', 'state_q0001', 'ptend_qn'),
    ('state_q0002_prvphy', 'state_q0002', 'ptend_q0002'),
    ('state_q0003_prvphy', 'state_q0003', 'ptend_q0003'),
    ('state_u_prvphy', 'state_u', 'ptend_u'),
)

for prvphy_name, template_name, tendency_name in prvphy_specs:
    scale_mean = dsm[template_name].copy()
    scale_max = dsa[template_name].copy()
    scale_min = dsi[template_name].copy()
    scale_mean[:] = 0.0
    scale_max[:] = 1./dso[tendency_name]  # dso is 1/std, so this is std
    scale_min[:] = 0.0
    dsm[prvphy_name] = scale_mean
    dsa[prvphy_name] = scale_max
    dsi[prvphy_name] = scale_min

# The tm_-prefixed variables take an unchanged copy of the scaling set above.
for prvphy_name, _, _ in prvphy_specs:
    for stats in (dsm, dsa, dsi):
        stats['tm_' + prvphy_name] = stats[prvphy_name].copy()

In [8]:
# pbuf_SOLIN / pbuf_LHFLX / pbuf_SHFLX use range scaling with zero mean:
# only the mean is overridden, max and min stay as loaded.
for flux_var in ('pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX'):
    dsm[flux_var] = 0.0

# The surface fractions are passed through unchanged, i.e. mean=0, max=1, min=0.
for fraction_var in ('cam_in_ICEFRAC', 'cam_in_LANDFRAC', 'cam_in_OCNFRAC'):
    dsm[fraction_var] = 0.0
    dsa[fraction_var] = 1.0
    dsi[fraction_var] = 0.0

In [9]:
# tm_state_ps, tm_pbuf_SOLIN, tm_pbuf_LHFLX, tm_pbuf_SHFLX and tm_pbuf_COSZRS
# reuse the scaling of the variable without the tm_ prefix.
for source_var in ('state_ps', 'pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX', 'pbuf_COSZRS'):
    for stats in (dsm, dsa, dsi):
        stats['tm_' + source_var] = stats[source_var].copy()

# clat, slat and icol are passed through unchanged: mean=0, max=1, min=0.
for passthrough_var in ('clat', 'slat', 'icol'):
    dsm[passthrough_var] = 0.0
    dsa[passthrough_var] = 1.0
    dsi[passthrough_var] = 0.0

In [10]:
# liq_partition is used without normalization: mean=0, max=1, min=0.
# state_t is copied only to obtain an array with the right layout.
for stats, fill_value in ((dsm, 0.0), (dsa, 1.0), (dsi, 0.0)):
    partition_scale = stats['state_t'].copy()
    partition_scale[:] = fill_value
    stats['liq_partition'] = partition_scale

## saving the updated input scaling files

In [11]:
# Paths used in our actual work:
# climsim_path = '/global/u2/z/zeyuanhu/nvidia_codes/Climsim_private/'
# norm_path = climsim_path+'/preprocessing/normalizations/'

# Example paths used for this walkthrough.
climsim_path = '/global/u2/z/zeyuanhu/nvidia_codes/climsim_tests'
norm_path = climsim_path+'/normalization/'

# Write the updated mean / max / min scaling datasets under norm_path.
for scaling, relative_file in (
    (dsm, 'inputs/input_mean_v5_pervar.nc'),
    (dsa, 'inputs/input_max_v5_pervar.nc'),
    (dsi, 'inputs/input_min_v5_pervar.nc'),
):
    scaling.to_netcdf(norm_path + relative_file)