## Data processing for "An upper bound for extreme temperatures over midlatitude land" by Zhang and Boos

This notebook details how the data were processed in the cases where the data files used in the other notebook are not raw data. It is provided for reference only and does not run as-is.

In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sts
from glob import glob 
import warnings
import cartopy.crs as ccrs
from cartopy.util import add_cyclic_point
warnings.filterwarnings('ignore')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import os
os.environ["HDF5_USE_FILE_LOCKING"]="FALSE"
xr.set_options(display_style='text')

<xarray.core.options.set_options at 0x155550aef0d0>

In [None]:
%%time
## Heatwave regions, expressed as label slices for `.sel(latitude=..., longitude=...)`.
## Latitude slices run from north to south, so they only select data when the latitude
## coordinate is stored in descending order -- presumably true for the ERA5 files used here; TODO confirm.
## Longitudes are given in degrees east (237-244 corresponds to the 123W-116W in the header below).
## Western North America (45N, 55N, 116W, 123W)
lat_slice_pn = slice(55,45)
lon_slice_pn =slice(237,244)

## Russia (50N, 57N, 35E, 45E)
lat_slice_rs = slice(57,50)
lon_slice_rs =slice(35,45)

## Western europe (45N, 53N, 3E, 13E)
## NOTE(review): the header above says 45N but the slice below starts at 46N -- confirm which is intended.
lat_slice_fr = slice(53,46)
lon_slice_fr =slice(3,13)

## Time slices of three heatwaves
time_pn = slice('2021-06-26','2021-07-01')
time_fr = slice('2019-07-23','2019-07-25')
time_rs = slice('2010-07-31','2010-08-11')

# Land masks
# `land` is 1 where the land-sea mask exceeds 0.5 and NaN elsewhere; `ocean` is 1 where the
# mask is <= 0.5 and NaN elsewhere. Multiplying a field by one of them blanks out the other surface type.
lsm = xr.open_dataset('./e5.oper.invariant.128_172_lsm.ll025sc.1979010100_1979010100.nc').LSM.squeeze()
land = lsm.where(lsm>0.5)*0+1
ocean = lsm.where(lsm<=0.5)*0+1
# Cosine-of-latitude weights, passed to `.weighted(...)` for the regional means in the Figure 3 cell.
weights = np.cos(np.deg2rad(land.latitude))

# Physical constants
# NOTE(review): units are not stated in this file. The magnitudes suggest cp in kJ/kg/K and L in kJ/kg,
# which would match the geopotential being divided by 1e3 before it is added to cp*T further down -- confirm.
cp = 1.0047090 
L = 2.5008e3
ep = 0.621981 # the same value is hard-coded inside e2q
# The module-level e, R and g below are not referenced by any other cell in this notebook.
e = 1e-6
R = 287.058 
g = 9.81



def e_sat(t):
    """Saturation vapour pressure as a function of temperature.

    Evaluates 611.21 * exp(17.502 * (t - 273.16) / (t - 32.19)).
    Per the original inline note, `t` is in K and the result is in Pa.
    The expression is element-wise, so scalars, NumPy arrays and xarray
    objects are all accepted.
    """
    exponent = 17.502*((t-273.16)/(t-32.19))
    return 611.21*np.exp(exponent)

def alpha(t):
    """Return 4217.457 / (t - 32.19)**2 for a temperature `t` in K.

    Because 4217.457 = 17.502 * (273.16 - 32.19), this is the temperature
    derivative of the exponent used in `e_sat`, i.e. d ln(e_sat)/dT (1/K).
    NOTE(review): the original inline note labelled this "de_sat/dT [Pa]";
    that quantity would be e_sat(t) * alpha(t). Confirm how callers use it.
    Constants follow ECMWF (per the original note).
    """
    shifted = t-32.19
    return 4217.457/shifted**2

def e2q(e,sp):
    """Convert vapour pressure `e` to specific humidity at pressure `sp`.

    Computes eps*e / (sp - (1 - eps)*e) with eps = 0.621981. `e` and `sp`
    must be expressed in the same pressure unit; the result is dimensionless.
    """
    eps = 0.621981
    return eps*e/(sp-(1-eps)*e)

# Reference 500-hPa values; neither name is used by any other cell in this notebook.
# NOTE(review): presumably a mean 500-hPa geopotential in the same /1e3 scaling applied to z500 in the
# Figure 2 cell, and a mean 500-hPa temperature in K -- the averaging domain and period are not shown; confirm.
z500_mean =55.74162890625
t500_mean = 258.82523



------------
### Figure 1

In [None]:
def _clim_and_anom(daily):
    """Return (day-of-year climatology, daily anomaly) for a daily field.

    The climatology is the day-of-year mean over the first 20 years
    (1979-1998) and is computed eagerly; the anomaly stays lazy.
    """
    base_period = daily.sel(time=slice('1979','1998'))
    clim = base_period.groupby('time.dayofyear').mean('time').compute()
    return clim, daily.groupby('time.dayofyear')-clim


def _event_mean(anom, period):
    """Time-mean of an anomaly field over one heatwave period."""
    return anom.sel(time=period).mean('time').compute()


# Daily mean T500 from ERA5 1979-2021
t500 = xr.open_mfdataset('/global/cscratch1/sd/y-zhang/ERA5/t500_*.nc').t500me
t500_clim, t500_anom = _clim_and_anom(t500)

# Daily maximum 2-m temperature from ERA5 1979-2021
TX = xr.open_mfdataset('/global/cscratch1/sd/y-zhang/ERA5/TX_*.nc').TX
TX_clim, TX_anom = _clim_and_anom(TX)

# Event-mean anomalies for the three heatwaves
t500_pn_anom = _event_mean(t500_anom, time_pn)
t500_fr_anom = _event_mean(t500_anom, time_fr)
t500_rs_anom = _event_mean(t500_anom, time_rs)

TX_pn_anom = _event_mean(TX_anom, time_pn)
TX_fr_anom = _event_mean(TX_anom, time_fr)
TX_rs_anom = _event_mean(TX_anom, time_rs)

------------
### Figure 2

In [None]:


# Inputs for the lag composites, restricted to 40N-65N where a latitude slice is applied.
TX = xr.open_dataset('/global/cscratch1/sd/y-zhang/ERA5/TX_2010.nc').TX.sel(latitude=slice(65,40)) # Daily maximum temperature from ERA5
t2m = xr.open_dataset('/global/cscratch1/sd/y-zhang/ERA5/2m_temperature_2010.nc').t2m.sel(latitude=slice(65,40)) # hourly 2-m temperature from ERA5
pr = xr.open_dataset('/global/cscratch1/sd/y-zhang/ERA5/pr_GPM_2010.nc').rename({'lat':'latitude', 'lon':'longitude'}).sel(latitude=slice(65,40)).pr    # GPM daily mean precipitation
cape = xr.open_dataset('/global/cscratch1/sd/y-zhang/ERA5/cape_2010.nc').cape.sel(latitude=slice(65,40)) # hourly CAPE from ERA5

# Calculation of 500-hPa saturation MSE: cp*T + L*q_sat(T, 50000 Pa) + geopotential/1e3
t500 = xr.open_dataset('/global/cscratch1/sd/y-zhang/ERA5/t500_2010.nc').t500 # hourly 500-hPa temperature from ERA5
z500 = xr.open_dataset('/global/cscratch1/sd/y-zhang/ERA5/z500_2010.nc').z500/1e3 # hourly 500-hPa geopotential from ERA5
hsat500 = cp*t500+L*e2q(e_sat(t500), 50000)+z500

# Calculation of 2-m MSE
gzs = xr.open_dataset('/global/homes/y/y-zhang/cmip6/ERA5/e5.oper.invariant/197901/e5.oper.invariant.128_129_z.ll025sc.1979010100_1979010100.nc').Z.squeeze()/1e3
# The 2-m MSE was computed offline as
#     h2m = cp*t2m+L*e2q(e_sat(d2m), sp)+gzs
# d2m and sp (presumably ERA5 2-m dewpoint and surface pressure) are never loaded in this notebook,
# so evaluating that line raised NameError, and its result was overwritten by the next statement anyway.
# The formula is therefore kept as documentation only and the precomputed field is read from disk.
h2m = xr.open_dataset('/global/cscratch1/sd/y-zhang/ERA5/h2m_2010.nc').h2m.sel(latitude=slice(65,40))

from numba import jit
# Number of samples in each lag window: 240 on either side of the peak plus the peak itself.
length=481

@jit(nopython=True)
def _lag_ufunc(da1, da2): 
    """Extract a window of `da2` centred on the position of the maximum of `da1`.

    Parameters
    ----------
    da1 : 1-D array
        Series whose maximum defines lag 0.
    da2 : 1-D array
        Series to sample; expected to have the same length as `da1`.

    Returns
    -------
    1-D float array of size `length` (module-level constant). Element
    `(length-1)//2` holds `da2` at the peak of `da1`; positions that would
    fall before the start or after the end of the series are left as NaN.
    """
    # Integer division is required here: the original used `/`, which produced
    # float slice bounds, and those are rejected by NumPy and by numba nopython mode.
    half = (length-1)//2
    n = len(da1)
    peak = da1.argmax()

    result = np.full(length, np.nan)
    # Destination range inside the fixed-size window ...
    dst_start = max(half-peak, 0)
    dst_stop = min(half-peak+n, length)
    # ... and the matching source range inside the series (same number of elements).
    src_start = max(peak-half, 0)
    src_stop = min(peak+half+1, n)
    result[dst_start:dst_stop] = da2[src_start:src_stop]
    return result
                    
def xr_lag(da1, da2, convert_to_dataset=True, dim='time'):
    """Apply `_lag_ufunc` along `dim` for every location of `da1` / `da2`.

    Parameters
    ----------
    da1, da2 : xarray.DataArray
        Inputs sharing the dimension `dim`; `da1` defines the peak and `da2`
        is sampled around it.
    convert_to_dataset : bool
        Accepted for compatibility; not used by the implementation.
    dim : str
        Name of the dimension that is replaced by the new `lag` dimension.

    Returns
    -------
    xarray.DataArray with `dim` replaced by `lag` of size `length`. The lag
    coordinate is the sample offset divided by 24 (i.e. days when the input
    is hourly -- TODO confirm for each caller).
    """
    lagged = xr.apply_ufunc(
        _lag_ufunc, da1, da2,
        input_core_dims=[[dim], [dim]],
        output_core_dims=[['lag']],
        output_sizes={'lag': length},
        output_dtypes=[np.float64],
        vectorize=True,
        dask='parallelized',
    )
    half_window = (length-1)/2
    lagged['lag'] = np.arange(-half_window, half_window+1)/24
    return lagged


def xr_lag_year(ds):
    """Lag composite for one group of `ds`, for use with `groupby(...).apply`.

    The variable names are taken from the module-level globals `x` (variable
    whose maximum defines lag 0) and `y` (variable that is sampled), so both
    must be assigned before this function is called.
    """
    peak_field = ds[x]
    sampled_field = ds[y]
    return xr_lag(peak_field, sampled_field)

# Lag composites around the yearly maximum of the 'TX' entry, one per sampled variable.
# xr_lag_year reads the module-level names x and y, so they are reassigned before every groupby call;
# the statements below therefore have to run in this order.
# NOTE(review): the 'TX' entry of every dataset holds the hourly t2m field; the daily-maximum TX
# loaded at the top of this cell is not used here -- confirm this is intended.

# 2-m temperature around its own peak
d = xr.Dataset({'TX': t2m})
x='TX'
y='TX'
TX_TX_year = d.groupby('time.year').apply(xr_lag_year).compute()

# CAPE around the temperature peak
d = xr.Dataset({'TX': t2m, 'cape': cape})
x='TX'
y='cape'
cape_TX_year = d.groupby('time.year').apply(xr_lag_year).compute()

# 2-m MSE around the temperature peak
d = xr.Dataset({'TX': t2m, 'h2m': h2m})
x='TX'
y='h2m'
h2m_TX_year = d.groupby('time.year').apply(xr_lag_year).compute()

# 500-hPa saturation MSE around the temperature peak
d = xr.Dataset({'TX': t2m, 'hsat500': hsat500})
x='TX'
y='hsat500'
hsat500_TX_year = d.groupby('time.year').apply(xr_lag_year).compute()

# Precipitation around the temperature peak
# NOTE(review): pr is described above as a daily mean while t2m is hourly, and the lag axis is relabelled
# to -10..10 below. Confirm that both inputs share the same time step and that the relabelled unit is intended.
d = xr.Dataset({'TX': t2m, 'pr': pr})
x='TX'
y='pr'
pr_TX_year = d.groupby('time.year').apply(xr_lag_year).compute()
# Keep the 21 central lag samples (offsets -10..10) and relabel them with those integer offsets.
pr_TX_year = pr_TX_year.sel(lag=slice(-10/24,10/24))
pr_TX_year = pr_TX_year.assign_coords(lag = np.arange(-10,10.01,1))


------------
### Figure 3

In [None]:
# Every time-dependent Figure 3 field is restricted to the same 20-year window.
analysis_years = slice('2001','2020')

t500 = (
    xr.open_mfdataset('/global/cscratch1/sd/y-zhang/ERA5/t500_*.nc')
    .t500me
    .sel(time=analysis_years)
)
TX = (
    xr.open_mfdataset('/global/cscratch1/sd/y-zhang/ERA5/TX_*.nc')
    .TX
    .sel(time=analysis_years)
)
sm = (
    xr.open_mfdataset('/global/cscratch1/sd/y-zhang/ERA5/swvl1_30daybefore_*.nc')
    .swvl1
    .sel(time=analysis_years)
)
# The vorticity files carry an extra `expver` dimension; only its first entry is kept.
vo = (
    xr.open_mfdataset('/global/cscratch1/sd/y-zhang/ERA5/500hpa_vorticity_*_summer-daily-mean.nc')
    .vome
    .sel(time=analysis_years)
    .isel(expver=0)
)
# Surface geopotential from the invariant file, divided by 1e3.
gzs = (
    xr.open_dataset('/global/homes/y/y-zhang/cmip6/ERA5/e5.oper.invariant/197901/e5.oper.invariant.128_129_z.ll025sc.1979010100_1979010100.nc')
    .Z
    .squeeze()
    / 1e3
)

@jit
def _histogram2d_ufunc(x,y):
    h2d, x_bins, y_bins = np.histogram2d(x.ravel(),y.ravel(), bins = (np.arange(200,284.01,0.25), np.arange(210,345.01,0.5)))
    return h2d

   
def xr_histogram2d(x,y): # 2-D histogram 
    """Per-time-step 2-D histogram of `x` against `y` over latitude/longitude.

    Parameters
    ----------
    x, y : xarray.DataArray
        Fields with dimensions (time, latitude, longitude). x is binned on
        200..284 in steps of 0.25 and y on 210..345 in steps of 0.5; these
        edges must stay in sync with those used by `_histogram2d_ufunc`.

    Returns
    -------
    xarray.DataArray with dimensions (time, x_bins, y_bins); the bin
    coordinates are the bin centres and `time` is copied from `x`.
    """
    x_edges = np.arange(200,284.01,0.25)
    y_edges = np.arange(210,345.01,0.5)
    h2d = xr.apply_ufunc(_histogram2d_ufunc, x, y, 
                         input_core_dims=[['latitude','longitude'],['latitude','longitude']],
                         dask ='parallelized',
                         vectorize=True,
                         output_dtypes=[x.dtype], 
                         output_core_dims = [['x_bins','y_bins']],
                         # The histogram has one entry per bin, i.e. len(edges)-1. The original
                         # passed len(edges), which disagrees with both the kernel output and
                         # the coordinates assigned below.
                         output_sizes = {'x_bins': len(x_edges)-1, 
                                         'y_bins': len(y_edges)-1})

    # Bin centres: left edge plus half a bin width.
    return xr.DataArray(data = h2d, dims=['time', 'x_bins', 'y_bins'], 
                       coords={'x_bins': x_edges[0:-1]+0.125, 
                              'y_bins': y_edges[0:-1]+0.25,
                              'time': x.time.values})

@jit
def _histogram2d_weight_ufunc(x,y,z):
    h2d, x_bins, y_bins = np.histogram2d(x.ravel(),y.ravel(), 
                                         bins = (np.arange(200,284.01,0.25), np.arange(210,345.01,0.5))
                                        ,weights=z.ravel())
    return h2d

   
def xr_histogram2d_weight(x,y,z): # weighted 2-D histogram; For example, weighting by soil moisture gives the sum of soil moisture in each bin, then dividing by the count in each bin gives mean soil moisture in each bin
    """Per-time-step weighted 2-D histogram of `x` against `y`, weighted by `z`.

    Parameters
    ----------
    x, y, z : xarray.DataArray
        Fields with dimensions (time, latitude, longitude). x is binned on
        200..284 in steps of 0.25 and y on 210..345 in steps of 0.5; these
        edges must stay in sync with those used by
        `_histogram2d_weight_ufunc`. `z` supplies the weight of each point.

    Returns
    -------
    xarray.DataArray with dimensions (time, x_bins, y_bins) holding the sum
    of `z` in each bin; the bin coordinates are the bin centres and `time`
    is copied from `x`.
    """
    x_edges = np.arange(200,284.01,0.25)
    y_edges = np.arange(210,345.01,0.5)
    h2d = xr.apply_ufunc(_histogram2d_weight_ufunc, x, y, z,
                         input_core_dims=[['latitude','longitude'],['latitude','longitude'],['latitude','longitude']],
                         dask ='parallelized',
                         vectorize=True,
                         output_dtypes=[x.dtype], 
                         output_core_dims = [['x_bins','y_bins']],
                         # The histogram has one entry per bin, i.e. len(edges)-1. The original
                         # passed len(edges), which disagrees with both the kernel output and
                         # the coordinates assigned below.
                         output_sizes = {'x_bins': len(x_edges)-1, 
                                         'y_bins': len(y_edges)-1})

    # Bin centres: left edge plus half a bin width.
    return xr.DataArray(data = h2d, dims=['time', 'x_bins', 'y_bins'], 
                       coords={'x_bins': x_edges[0:-1]+0.125, 
                              'y_bins': y_edges[0:-1]+0.25,
                              'time': x.time.values})

# ERA5: joint histogram of T500 (x axis) and TX + gzs/cp (y axis) over land between 40N and 65N.
# NOTE(review): the output file name says 1979-2021, but t500 and TX were restricted to 2001-2020
# when they were loaded at the top of this cell -- confirm which period the saved file should cover.
h2d_era5 = xr_histogram2d(t500.sel(latitude=slice(65,40))*land, 
                     (TX+gzs/cp).sel(latitude=slice(65,40))*land)
xr.Dataset({'histogram': h2d_era5}).to_netcdf('/global/cscratch1/sd/y-zhang/ERA5/histogram2d_TX_t500_1979-2021.nc')

# Observations: same histogram from HadGHCND TX and AIRS T500, sampled at the TX_obs time steps.
# Note that all data are interpolated to HadGHCND grid first
# NOTE(review): `land` is derived from the ERA5 land-sea mask file opened in the setup cell; multiplying
# aligns on latitude/longitude labels, so confirm the mask used here is on the HadGHCND grid as well.
TX_obs = xr.open_dataset('/global/cscratch1/sd/y-zhang/Had/HadGHCND_TX_ERA5.nc').TX
t500_obs = xr.open_dataset('/global/cscratch1/sd/y-zhang/AIRS/t500_AIRS.nc').t500.sel(time=TX_obs.time)
gzs_obs = xr.open_dataset('/global/cscratch1/sd/y-zhang/Had/gzs_ERA5.nc').gzs
h2d_obs = xr_histogram2d(t500_obs.sel(latitude=slice(65,40))*land, 
                     (TX_obs+gzs_obs/cp).sel(latitude=slice(65,40))*land)
xr.Dataset({'histogram': h2d_obs}).to_netcdf('/global/cscratch1/sd/y-zhang/Had/histogram2d_TX_t500_Had_AIRS_2003-2014.nc')


# Soil-moisture-weighted histogram for JJA days only: each bin holds the sum of sm,
# to be divided by the bin count afterwards to obtain the mean sm per bin.
h2d_sm = xr_histogram2d_weight(t500.sel(latitude=slice(65,40), time=t500['time.season']=='JJA')*land, 
                               (TX+gzs/cp).sel(latitude=slice(65,40), time=TX['time.season']=='JJA')*land,
                               sm.sel(latitude=slice(65,40), time=sm['time.season']=='JJA')*land)
xr.Dataset({'histogram': h2d_sm}).to_netcdf('/global/cscratch1/sd/y-zhang/Had/histogram2dweighted_sm_TX_t500_2001-2020.nc')

# Same as above, weighted by 500-hPa vorticity instead of soil moisture.
h2d_vo = xr_histogram2d_weight(t500.sel(latitude=slice(65,40), time=t500['time.season']=='JJA')*land, 
                               (TX+gzs/cp).sel(latitude=slice(65,40), time=TX['time.season']=='JJA')*land,
                               vo.sel(latitude=slice(65,40), time=vo['time.season']=='JJA')*land)
xr.Dataset({'histogram': h2d_vo}).to_netcdf('/global/cscratch1/sd/y-zhang/Had/histogram2dweighted_vo_TX_t500_2001-2020.nc')

def _box_mean(field, lat_box, lon_box):
    """Cosine-latitude-weighted mean of `field` over a latitude/longitude box (lazy)."""
    boxed = field.sel(latitude=lat_box, longitude=lon_box)
    return boxed.weighted(weights).mean(['latitude','longitude'])


# Regional-mean T500 time series for the three heatwave regions
t500_pn = _box_mean(t500, lat_slice_pn, lon_slice_pn).compute()
t500_fr = _box_mean(t500, lat_slice_fr, lon_slice_fr).compute()
t500_rs = _box_mean(t500, lat_slice_rs, lon_slice_rs).compute()

# Regional-mean TX time series
TX_pn = _box_mean(TX, lat_slice_pn, lon_slice_pn).compute()
TX_fr = _box_mean(TX, lat_slice_fr, lon_slice_fr).compute()
TX_rs = _box_mean(TX, lat_slice_rs, lon_slice_rs).compute()

# Regional-mean surface geopotential (time-independent), extracted as plain NumPy values
gzs_pn = _box_mean(gzs, lat_slice_pn, lon_slice_pn).values
gzs_fr = _box_mean(gzs, lat_slice_fr, lon_slice_fr).values
gzs_rs = _box_mean(gzs, lat_slice_rs, lon_slice_rs).values


------------
### Figure 4

In [None]:
@jit
def _of_txx_ufunc(t2m,y):    
    """Return the element of `y` at the index where `t2m` is largest."""
    hottest = np.argmax(t2m)
    return y[hottest]

   
def xr_of_txx(t2m,y): # Find the level of a variable on the annual hottest day for each location
    """Value of `y` at the time step where `t2m` peaks, for every location.

    Parameters
    ----------
    t2m : xarray.DataArray
        Temperature with a `time` dimension; its maximum along `time`
        selects the time step.
    y : xarray.DataArray
        Field with the same `time` dimension, sampled at that time step.

    Returns
    -------
    xarray.DataArray without the `time` dimension, with the dtype of `y`.
    """
    # Fixed: the original referenced `_at_txx_ufunc`, which is not defined anywhere in this
    # notebook; the kernel defined next to this function is `_of_txx_ufunc`.
    return xr.apply_ufunc(_of_txx_ufunc, t2m, y, 
                             input_core_dims=[['time'],['time']],
                            dask ='parallelized',
                            vectorize=True,
                            output_dtypes=[y.dtype])
 
    
# T500 on the annual hottest days: one input pair and one output file per year, 1979-2021
for year in range(1979, 2022):
    tag = str(year)
    t500 = xr.open_dataset('/global/cscratch1/sd/y-zhang/ERA5/t500_'+tag+'.nc').t500me
    TX = xr.open_dataset('/global/cscratch1/sd/y-zhang/ERA5/TX_'+tag+'.nc').TX

    # T500 at the time step where TX is largest within this year, for every location
    res = xr_of_txx(TX, t500).compute()
    out_path = '/global/cscratch1/sd/y-zhang/ERA5/t500_of_TXx_'+tag+'.nc'
    xr.Dataset({'t500': res}).to_netcdf(out_path)
    