# Modules and Dask settings

In [1]:
# Connect this notebook to an already-running Dask cluster.
# The cluster is located through the scheduler file it wrote on startup, so
# the scheduler must be up before this cell is executed.

import os
import sys
from dask.distributed import Client
# Alternative connection settings, kept for reference:
# client = Client(scheduler_file='/proj/kimyy/Dropbox/source/python/all/mpi/scheduler.json', threads_per_worker=2, n_workers=6)
client = Client(scheduler_file='/proj/kimyy/Dropbox/source/python/all/mpi/scheduler.json')
# client = Client(scheduler_file='/proj/kimyy/Dropbox/source/python/all/mpi/scheduler_10.json')  

def setup_module_path():
    """Put the private CESM2 module directory on ``sys.path`` (at most once).

    Takes no arguments and returns ``None``; the only effect is appending
    the directory to the end of ``sys.path`` when it is not listed yet.
    """
    cesm2_module_dir = '/proj/kimyy/Dropbox/source/python/all/Modules/CESM2'
    if cesm2_module_dir in sys.path:
        # Already importable: leave sys.path untouched.
        return
    sys.path.append(cesm2_module_dir)

# Run setup_module_path on every Dask worker so the workers can import the
# private CESM2 modules.
client.run(setup_module_path)

# Bare expression: only the last expression of a cell is displayed, so this
# line produces no output here.
client

# get path for path changes in Jupyter notebook: File - Open from Path - insert relative_path
notebook_path = os.path.abspath(".")
# Keep the part of the path after the first '/all/'. The partition result is
# indexed instead of unpacked into `_`, because assigning `_` at notebook top
# level shadows IPython's "last output" variable.
relative_path = '/all/' + notebook_path.partition('/all/')[2]
relative_path

'/all/Model/CESM2/Earth_System_Predictability/ASSM/Aleph'

In [2]:
# load public modules

import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as patches
import matplotlib.ticker as mticker
import matplotlib.path as mpath
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from scipy import stats
from scipy.interpolate import griddata
import cmocean
from cmcrameri import cm
import warnings
warnings.simplefilter(action='ignore')
import pandas as pd
import cftime
import pop_tools
from pprint import pprint
import time
import subprocess
import re as re_mod
import cftime
import datetime
from scipy.stats import ttest_1samp
import xcesm
# from scipy.stats import pearsonr
from scipy.stats import t

In [3]:
# load private modules

import sys
# Make the private CESM2 helpers importable in this kernel. The membership
# check keeps sys.path free of duplicate entries when the cell is re-run.
cesm2_module_dir = '/proj/kimyy/Dropbox/source/python/all/Modules/CESM2'
if cesm2_module_dir not in sys.path:
    sys.path.append(cesm2_module_dir)
from KYY_CESM2_preprocessing import CESM2_config

# Directory used for the NetCDF files written and read by later cells.
savefilepath = "/mnt/lustre/proj/kimyy/tmp_python/HCST_skills_HCST-LE"


In [4]:
# To reuse this notebook for another variable, find-and-replace (command+F) the
# variable name throughout. Variables handled this way:
# S-ST, T-REFHT, T-WS, G-PP, S-SH, P-SL, P-RECT, p-hotoC_TOT_zint_100m, F-AREA_BURNED (not for N-O3).
# NOTE(review): the names above are presumably hyphenated on purpose so that
# the find-and-replace does not rewrite this list itself -- confirm.

# Configuration object for the NO3 analysis: analysis period and variable.
cfg_var_NO3=CESM2_config()
cfg_var_NO3.year_s=1960
cfg_var_NO3.year_e=2020
cfg_var_NO3.setvar('NO3')

# Time window handed to the preprocess functions as sd/ed:
# 1 Feb of the first year through 1 Jan of the year after the last year.
# (Those functions later shift the time stamps back by 15 days before
# grouping by calendar year.)
start_date = cftime.DatetimeNoLeap(cfg_var_NO3.year_s, 2, 1)
end_date = cftime.DatetimeNoLeap(cfg_var_NO3.year_e+1, 1, 1)

# Grid dataset for the 'POP_gx1v7' grid, obtained from pop_tools.
ds_grid = pop_tools.get_grid('POP_gx1v7')


In [5]:
def process_coords_surface(
    ds, sd, ed, varname, comp, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT"]
):
    """Reduce one file to annual means of `varname` at the first ``z_t`` level.

    Intended as the ``preprocess`` callback of ``xr.open_mfdataset``.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset opened from a single input file.
    sd, ed
        Bounds used in ``ds.sel(time=slice(sd, ed))``.
    varname : str
        Name of the data variable to keep.
    comp : str
        Component tag. ``"atm"``/``"lnd"`` round ``lat``/``lon`` and
        ``"ocn"``/``"ice"`` round ``TLAT``/``TLONG`` to 4 decimals.
    drop : bool, default True
        If True, drop every variable that is not kept and return annual
        means. If False, return `ds` with those variables promoted to
        coordinates and nothing else changed.
    except_coord_vars : list of str
        Names that are always kept. The list is copied, never modified.

    Returns
    -------
    xarray.Dataset
        With ``drop=True`` the ``time`` dimension is replaced by ``year``.
    """
    # xcesm is not referenced below; it is kept so the commented-out
    # `.utils.regrid()` call can be re-enabled.
    # NOTE(review): confirm the import is still needed.
    import xcesm
    import numpy as np
    import datetime

    # Build a fresh list instead of appending to the argument: appending
    # would mutate the shared default (and any list passed by the caller),
    # so names kept in one call would leak into every later call.
    keep = [*except_coord_vars, varname]
    coord_vars = [v for v in ds.coords if v not in keep]
    coord_vars += [v for v in ds.data_vars if v not in keep]

    if not drop:
        return ds.set_coords(coord_vars)

    # drop_vars is the non-deprecated spelling of Dataset.drop for variables.
    ds = ds.drop_vars(coord_vars)
    ds = ds.sel(time=slice(sd, ed))
    ds = ds.isel(z_t=0)
    # ds_rgd = ds[varname].utils.regrid()

    # Shift every time stamp back by 15 days before grouping by year.
    # NOTE(review): presumably the stamps mark the end of each averaging
    # period, so the shift moves them into the period they describe -- confirm.
    new_time = ds.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds = ds.assign_coords(time=new_time)
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Round the horizontal coordinates to 4 decimals.
    if comp == "atm" or comp == "lnd":
        ds['lat'] = ds['lat'].round(4)
        ds['lon'] = ds['lon'].round(4)
    if comp == "ocn" or comp == "ice":
        ds['TLAT'] = ds['TLAT'].round(4)
        ds['TLONG'] = ds['TLONG'].round(4)
    return ds


def process_coords_2d(
    ds, sd, ed, varname, comp, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT"]

):
    """Reduce one file to annual means of `varname` (no vertical selection).

    Intended as the ``preprocess`` callback of ``xr.open_mfdataset``.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset opened from a single input file.
    sd, ed
        Bounds used in ``ds.sel(time=slice(sd, ed))``.
    varname : str
        Name of the data variable to keep.
    comp : str
        Component tag. ``"atm"``/``"lnd"`` round ``lat``/``lon`` and
        ``"ocn"``/``"ice"`` round ``TLAT``/``TLONG`` to 4 decimals.
    drop : bool, default True
        If True, drop every variable that is not kept and return annual
        means. If False, return `ds` with those variables promoted to
        coordinates and nothing else changed.
    except_coord_vars : list of str
        Names that are always kept. The list is copied, never modified.

    Returns
    -------
    xarray.Dataset
        With ``drop=True`` the ``time`` dimension is replaced by ``year``.
    """
    # xcesm is not referenced below; it is kept so the commented-out
    # `.utils.regrid()` call can be re-enabled.
    # NOTE(review): confirm the import is still needed.
    import xcesm
    import numpy as np
    import datetime

    # Build a fresh list instead of appending to the argument: appending
    # would mutate the shared default (and any list passed by the caller),
    # so names kept in one call would leak into every later call.
    keep = [*except_coord_vars, varname]
    coord_vars = [v for v in ds.coords if v not in keep]
    coord_vars += [v for v in ds.data_vars if v not in keep]

    if not drop:
        return ds.set_coords(coord_vars)

    # drop_vars is the non-deprecated spelling of Dataset.drop for variables.
    ds = ds.drop_vars(coord_vars)
    ds = ds.sel(time=slice(sd, ed))
    # ds_rgd = ds[varname].utils.regrid()
    # new_time = ds_rgd.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    # ds_rgd = ds_rgd.assign_coords(time=new_time)

    # Shift every time stamp back by 15 days before grouping by year.
    # NOTE(review): presumably the stamps mark the end of each averaging
    # period, so the shift moves them into the period they describe -- confirm.
    new_time = ds.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds = ds.assign_coords(time=new_time)
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Round the horizontal coordinates to 4 decimals.
    if comp == "atm" or comp == "lnd":
        ds['lat'] = ds['lat'].round(4)
        ds['lon'] = ds['lon'].round(4)
    if comp == "ocn" or comp == "ice":
        ds['TLAT'] = ds['TLAT'].round(4)
        ds['TLONG'] = ds['TLONG'].round(4)
    return ds



def process_coords_surface_hcst(
    ds, sd, ed, varname, comp, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT"]
):
    """Reduce one hindcast file to annual means of `varname` at the first ``z_t`` level.

    Intended as the ``preprocess`` callback of ``xr.open_mfdataset``.
    Unlike ``process_coords_surface`` this variant applies no time-window
    selection.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset opened from a single input file.
    sd, ed
        Accepted for signature parity with ``process_coords_surface``;
        not used by this function.
    varname : str
        Name of the data variable to keep.
    comp : str
        Component tag. ``"atm"``/``"lnd"`` round ``lat``/``lon`` and
        ``"ocn"``/``"ice"`` round ``TLAT``/``TLONG`` to 4 decimals.
    drop : bool, default True
        If True, drop every variable that is not kept and return annual
        means. If False, return `ds` with those variables promoted to
        coordinates and nothing else changed.
    except_coord_vars : list of str
        Names that are always kept. The list is copied, never modified.

    Returns
    -------
    xarray.Dataset
        With ``drop=True`` the ``time`` dimension is replaced by ``year``.
    """
    # xcesm is not referenced below; it is kept so the commented-out
    # `.utils.regrid()` call can be re-enabled.
    # NOTE(review): confirm the import is still needed.
    import xcesm
    import numpy as np
    import datetime

    # Build a fresh list instead of appending to the argument: appending
    # would mutate the shared default (and any list passed by the caller),
    # so names kept in one call would leak into every later call.
    keep = [*except_coord_vars, varname]
    coord_vars = [v for v in ds.coords if v not in keep]
    coord_vars += [v for v in ds.data_vars if v not in keep]

    if not drop:
        return ds.set_coords(coord_vars)

    # drop_vars is the non-deprecated spelling of Dataset.drop for variables.
    ds = ds.drop_vars(coord_vars)
    # ds = ds.isel(z_t_150m=slice(0,10))
    ds = ds.isel(z_t=0)
    # ds_rgd = ds[varname].utils.regrid()

    # Shift every time stamp back by 15 days before grouping by year.
    # NOTE(review): presumably the stamps mark the end of each averaging
    # period, so the shift moves them into the period they describe -- confirm.
    new_time = ds.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds = ds.assign_coords(time=new_time)
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Round the horizontal coordinates to 4 decimals.
    if comp == "atm" or comp == "lnd":
        ds['lat'] = ds['lat'].round(4)
        ds['lon'] = ds['lon'].round(4)
    if comp == "ocn" or comp == "ice":
        ds['TLAT'] = ds['TLAT'].round(4)
        ds['TLONG'] = ds['TLONG'].round(4)
    return ds


def process_coords_2d_hcst(
    ds, sd, ed, varname, comp, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT"]
):
    """Reduce one hindcast file to annual means of `varname` (no vertical selection).

    Intended as the ``preprocess`` callback of ``xr.open_mfdataset``.
    Unlike ``process_coords_2d`` this variant applies no time-window
    selection.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset opened from a single input file.
    sd, ed
        Accepted for signature parity with ``process_coords_2d``;
        not used by this function.
    varname : str
        Name of the data variable to keep.
    comp : str
        Component tag. ``"atm"``/``"lnd"`` round ``lat``/``lon`` and
        ``"ocn"``/``"ice"`` round ``TLAT``/``TLONG`` to 4 decimals.
    drop : bool, default True
        If True, drop every variable that is not kept and return annual
        means. If False, return `ds` with those variables promoted to
        coordinates and nothing else changed.
    except_coord_vars : list of str
        Names that are always kept. The list is copied, never modified.

    Returns
    -------
    xarray.Dataset
        With ``drop=True`` the ``time`` dimension is replaced by ``year``.
    """
    # xcesm is not referenced below; it is kept so the commented-out
    # `.utils.regrid()` call can be re-enabled.
    # NOTE(review): confirm the import is still needed.
    import xcesm
    import numpy as np
    import datetime

    # Build a fresh list instead of appending to the argument: appending
    # would mutate the shared default (and any list passed by the caller),
    # so names kept in one call would leak into every later call.
    keep = [*except_coord_vars, varname]
    coord_vars = [v for v in ds.coords if v not in keep]
    coord_vars += [v for v in ds.data_vars if v not in keep]

    if not drop:
        return ds.set_coords(coord_vars)

    # drop_vars is the non-deprecated spelling of Dataset.drop for variables.
    ds = ds.drop_vars(coord_vars)
    # ds_rgd = ds[varname].utils.regrid()
    # new_time = ds_rgd.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    # ds_rgd = ds_rgd.assign_coords(time=new_time)

    # Shift every time stamp back by 15 days before grouping by year.
    # NOTE(review): presumably the stamps mark the end of each averaging
    # period, so the shift moves them into the period they describe -- confirm.
    new_time = ds.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds = ds.assign_coords(time=new_time)
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Round the horizontal coordinates to 4 decimals.
    if comp == "atm" or comp == "lnd":
        ds['lat'] = ds['lat'].round(4)
        ds['lon'] = ds['lon'].round(4)
    if comp == "ocn" or comp == "ice":
        ds['TLAT'] = ds['TLAT'].round(4)
        ds['TLONG'] = ds['TLONG'].round(4)
    return ds

In [6]:
# Read LE dataset (NO3)
# Opens every large-ensemble (LE) member, reduces each file to annual surface
# means through process_coords_surface, stacks the members along a new
# 'ens_LE' dimension and loads the result into memory.

start_time = time.time()

# NOTE(review): presumably fills cfg_var_NO3.LE_file_list, which is read
# below -- confirm in CESM2_config.
cfg_var_NO3.LE_path_load(cfg_var_NO3.var)

# Not used in this cell.
fpath_NO3 = savefilepath + "/LE_NO3_output*.nc"

# Component tag handed to the preprocess callback; the HCST reading cell
# further down reads this same variable.
tmp_comp=cfg_var_NO3.comp
cfg_var_NO3.LE_ds = []
# One open_mfdataset call per entry of LE_file_list[0] (one per ensemble member).
for imem in range(0, len(cfg_var_NO3.LE_file_list[0])):
    # preprocess runs on each file before the files are combined.
    NO3_LE_ds_tmp = xr.open_mfdataset(cfg_var_NO3.LE_file_list[0][imem], 
                           chunks={'time': 5}, 
                           combine='nested',
                           parallel=True,
                           preprocess=lambda ds: process_coords_surface(ds, start_date, end_date, 'NO3', tmp_comp),
                           decode_cf=True, 
                           decode_times=True)

    # Give the member a length-1 'ens_LE' dimension so members can be concatenated.
    NO3_LE_ds_tmp = NO3_LE_ds_tmp.expand_dims({'ens_LE': 1})
    cfg_var_NO3.LE_ds.append(NO3_LE_ds_tmp)

    # Cumulative wall-clock time since the start of the cell.
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('elasped time for reading LE, ' + str(imem) + ': ' + str(elapsed_time))

# Replace the list of per-member datasets by one dataset with members
# labelled 0..n-1, then load it into memory.
cfg_var_NO3.LE_ds = xr.concat(cfg_var_NO3.LE_ds, dim='ens_LE')
cfg_var_NO3.LE_ds['ens_LE']=range(0, len(cfg_var_NO3.LE_file_list[0]))
cfg_var_NO3.LE_ds=cfg_var_NO3.LE_ds.compute()

# Total time including compute(); imem still holds the last loop index here.
end_time = time.time()
elapsed_time = end_time - start_time
print('elasped time for reading LE, ' + str(imem) + ': ' + str(elapsed_time))

elasped time for reading LE, 0: 55.50500798225403
elasped time for reading LE, 1: 96.99230337142944
elasped time for reading LE, 2: 140.2125904560089
elasped time for reading LE, 3: 185.29255890846252
elasped time for reading LE, 4: 229.017160654068
elasped time for reading LE, 5: 273.022803068161
elasped time for reading LE, 6: 318.4810199737549
elasped time for reading LE, 7: 362.6931939125061
elasped time for reading LE, 8: 407.3013348579407
elasped time for reading LE, 9: 449.7417688369751
elasped time for reading LE, 10: 485.672598361969
elasped time for reading LE, 11: 520.5183184146881
elasped time for reading LE, 12: 554.8926792144775
elasped time for reading LE, 13: 592.8374900817871
elasped time for reading LE, 14: 627.5657813549042
elasped time for reading LE, 15: 661.2479491233826
elasped time for reading LE, 16: 697.7980778217316
elasped time for reading LE, 17: 738.3698835372925
elasped time for reading LE, 18: 781.3830614089966
elasped time for reading LE, 19: 816.967456

In [7]:
# HCST NO3
# Opens the hindcast (HCST) files one start year at a time, reduces each file
# to annual surface means, and builds one dataset with dimensions
# iyear (start year) x ens_HCST (member) x lyears (lead year).
# Relies on tmp_comp, which is defined in the LE reading cell.

cfg_var_NO3.HCST_ds = []
# Not used in this cell.
HCST_ds_xr = []

start_time = time.time()

# NOTE(review): presumably fills cfg_var_NO3.HCST_file_list and
# cfg_var_NO3.HCST_ensembles, which are read below -- confirm in CESM2_config.
cfg_var_NO3.HCST_path_load(cfg_var_NO3.var)

for iyear in range(0, len(cfg_var_NO3.HCST_file_list)):
    # Two-level nested combine. The outer level is given as a list of values
    # rather than a name; the resulting dimension is called "concat_dim" and
    # is renamed to 'ens_HCST' after the loop. The inner level is 'year'.
    # start_date/end_date are passed along, but process_coords_surface_hcst
    # does not use them.
    NO3_HCST_ds_tmp = xr.open_mfdataset(cfg_var_NO3.HCST_file_list[iyear], 
                           chunks={'time': 4}, 
                           combine='nested',
                           concat_dim=[[*cfg_var_NO3.HCST_ensembles], 'year'], 
                           parallel=True,
                           preprocess=lambda ds: process_coords_surface_hcst(ds, start_date, end_date, 'NO3', tmp_comp),
                           decode_cf=True, 
                           decode_times=True)

    # Relabel calendar years as lead years 1..5 (assumes exactly five
    # annual means per start year).
    NO3_HCST_ds_tmp = NO3_HCST_ds_tmp.rename({"year": "lyears"})
    NO3_HCST_ds_tmp['lyears']=range(1,6)
    # Length-1 'iyear' dimension so start years can be concatenated.
    NO3_HCST_ds_tmp = NO3_HCST_ds_tmp.expand_dims({'iyear': 1})

    cfg_var_NO3.HCST_ds.append(NO3_HCST_ds_tmp)

    # Cumulative wall-clock time since the start of the cell.
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('elasped time for reading HCST NO3, ' + str(iyear) + ': ' + str(elapsed_time))

# Stack all start years, give the dimensions their final names and labels
# (members 0..n-1, start years year_s..year_e), then load into memory.
cfg_var_NO3.HCST_ds = xr.concat(cfg_var_NO3.HCST_ds, dim='iyear')
cfg_var_NO3.HCST_ds = cfg_var_NO3.HCST_ds.rename({"concat_dim": "ens_HCST"})
cfg_var_NO3.HCST_ds['ens_HCST']=range(0, len(cfg_var_NO3.HCST_file_list[0]))
cfg_var_NO3.HCST_ds['iyear']=range(cfg_var_NO3.year_s, cfg_var_NO3.year_e+1)
cfg_var_NO3.HCST_ds=cfg_var_NO3.HCST_ds.compute()

# Total time including compute(); iyear still holds the last loop index here.
end_time = time.time()
elapsed_time = end_time - start_time
print('elasped time for reading HCST NO3, ' + str(iyear) + ': ' + str(elapsed_time))

elasped time for reading HCST NO3, 0: 27.56215739250183
elasped time for reading HCST NO3, 1: 32.09673738479614
elasped time for reading HCST NO3, 2: 36.88039803504944
elasped time for reading HCST NO3, 3: 41.53854775428772
elasped time for reading HCST NO3, 4: 46.022348403930664
elasped time for reading HCST NO3, 5: 50.90765380859375
elasped time for reading HCST NO3, 6: 55.53791427612305
elasped time for reading HCST NO3, 7: 59.907596826553345
elasped time for reading HCST NO3, 8: 64.38851618766785
elasped time for reading HCST NO3, 9: 68.81868004798889
elasped time for reading HCST NO3, 10: 73.41756534576416
elasped time for reading HCST NO3, 11: 77.8585274219513
elasped time for reading HCST NO3, 12: 82.50738143920898
elasped time for reading HCST NO3, 13: 86.96654677391052
elasped time for reading HCST NO3, 14: 91.55275964736938
elasped time for reading HCST NO3, 15: 96.03670501708984
elasped time for reading HCST NO3, 16: 100.58891129493713
elasped time for reading HCST NO3, 17: 

In [8]:
# Lead-year-1 hindcast series (first entry of `lyears`), with the start-year
# dimension renamed to `year`.
# `iyear` holds integer years (range(year_s, year_e + 1)), so it is selected
# with integer bounds; date strings such as "1960-01-01" only match an integer
# index through number-to-string comparison, which is not reliable.
cfg_var_NO3.HCST_ds_yearly = cfg_var_NO3.HCST_ds.sel(iyear=slice(cfg_var_NO3.year_s, cfg_var_NO3.year_e)).isel(lyears=0).rename({"iyear": "year"})


In [9]:
# get rolling mean variables (NO3)

# LE: 4-year running mean over `year`; windows with fewer than 4 years are NaN.
cfg_var_NO3.LE_ds_4yr = cfg_var_NO3.LE_ds.rolling(year=4, min_periods=4).mean()
# Label every window with the mean of the years it covers.
obs_rolling_time_mean = cfg_var_NO3.LE_ds['year'].rolling(year=4, min_periods=4).mean()
cfg_var_NO3.LE_ds_4yr = cfg_var_NO3.LE_ds_4yr.assign_coords(year=obs_rolling_time_mean)
# Drop the incomplete windows (their label is NaN).
valid_index = np.where(~np.isnan(cfg_var_NO3.LE_ds_4yr['year']))[0]
cfg_var_NO3.LE_ds_4yr = cfg_var_NO3.LE_ds_4yr.isel(year=valid_index)
# Skip the first complete window and keep 57 windows.
# NOTE(review): presumably chosen to line up with the 57 hindcast start years
# selected below -- confirm.
cfg_var_NO3.LE_ds_4yr = cfg_var_NO3.LE_ds_4yr.isel(year=range(1,58))

# HCST: mean over lead years 2-5 for start years up to 2016.
# `iyear` is an integer coordinate, so integer bounds are used instead of
# string labels.
# hcst_da_ly25=cfg_var_NO3.HCST_ds.sel(iyear=slice("1996-01-01", "2016-12-31"), lyears=slice(2, 5))
hcst_da_ly25=cfg_var_NO3.HCST_ds.sel(iyear=slice(1959, 2016), lyears=slice(2, 5))
cfg_var_NO3.HCST_ds_4yr=hcst_da_ly25.mean(dim='lyears')
cfg_var_NO3.HCST_ds_4yr=cfg_var_NO3.HCST_ds_4yr.rename(iyear="year")
# Give the hindcast means the same `year` labels as the LE windows
# (both must have the same length).
cfg_var_NO3.HCST_ds_4yr['year']=cfg_var_NO3.LE_ds_4yr['year']


In [10]:
# get skills
# Correlation over `year` between the large ensemble (LE) and the hindcast
# (HCST): per member pair and for the ensemble means ("_em"), for the full
# period and for the observation period ("_op"), at lead year 1 and for the
# lead-year 2-5 means ("_ly25").
start_time = time.time()

# `year` is numeric in every dataset used here (integer calendar years for the
# yearly series, fractional labels for the 4-year means), so selections use
# numeric bounds. String labels only match a numeric index through
# number-to-string comparison, which is not reliable.
ly1_years = slice(1964, 2020)

#LY1
corr_NO3_LE_HCST = xr.corr(
                            cfg_var_NO3.LE_ds['NO3'].sel(year=ly1_years), 
                            cfg_var_NO3.HCST_ds_yearly['NO3'].sel(year=ly1_years), dim='year')
# corr_NO3_LE_HCST_obs_period
# obs_years=slice("1960", "2020") # others
# obs_years=slice("1989", "2019") # VODCA2G-PP
# obs_years=slice("1993", "2020") # SS-H
# obs_years=slice("1960", "2019") # PREC-T
obs_years=slice("1998", "2020") # PREC-T
# Numeric twin of obs_years, used for the selections; obs_years itself is
# left as a string slice in case other cells read it.
obs_years_num = slice(int(obs_years.start), int(obs_years.stop))

corr_NO3_LE_HCST_op = xr.corr(
                            cfg_var_NO3.LE_ds['NO3'].sel(year=obs_years_num), 
                            cfg_var_NO3.HCST_ds_yearly['NO3'].sel(year=obs_years_num), dim='year')

corr_NO3_LE_HCST_em = xr.corr(
                            cfg_var_NO3.LE_ds['NO3'].mean(dim='ens_LE').sel(year=ly1_years), 
                            cfg_var_NO3.HCST_ds_yearly['NO3'].mean(dim='ens_HCST').sel(year=ly1_years), dim='year')
# em-obs_period
corr_NO3_LE_HCST_op_em = xr.corr(
                            cfg_var_NO3.LE_ds['NO3'].mean(dim='ens_LE').sel(year=obs_years_num), 
                            cfg_var_NO3.HCST_ds_yearly['NO3'].mean(dim='ens_HCST').sel(year=obs_years_num), dim='year')


#LY2_5
corr_NO3_LE_HCST_ly25 = xr.corr(cfg_var_NO3.LE_ds_4yr['NO3'], cfg_var_NO3.HCST_ds_4yr['NO3'], dim='year')
# corr_NO3_LE_HCST_obs_period
# The lead-year 2-5 observation period starts two years after obs_years.
start_int = int(obs_years.start)  
stop_int = int(obs_years.stop)    
start_int += 2  
obs_years_ly25 = slice(str(start_int), str(stop_int))
# Numeric twin of obs_years_ly25, used for the selections.
obs_years_ly25_num = slice(start_int, stop_int)
corr_NO3_LE_HCST_ly25_op = xr.corr(cfg_var_NO3.LE_ds_4yr['NO3'].sel(year=obs_years_ly25_num), cfg_var_NO3.HCST_ds_4yr['NO3'].sel(year=obs_years_ly25_num), dim='year')

corr_NO3_LE_HCST_ly25_em = xr.corr(cfg_var_NO3.LE_ds_4yr['NO3'].mean(dim='ens_LE'), cfg_var_NO3.HCST_ds_4yr['NO3'].mean(dim='ens_HCST'), dim='year')
# em-obs_period
corr_NO3_LE_HCST_ly25_op_em = xr.corr(cfg_var_NO3.LE_ds_4yr['NO3'].mean(dim='ens_LE').sel(year=obs_years_ly25_num), cfg_var_NO3.HCST_ds_4yr['NO3'].mean(dim='ens_HCST').sel(year=obs_years_ly25_num), dim='year')


end_time = time.time()
elapsed_time = end_time - start_time
# The message used to be a copy of the HCST-reading one and printed the
# leftover loop index `iyear` from that cell.
print('elasped time for computing NO3 skills: ' + str(elapsed_time))

elasped time for reading HCST NO3, 60: 1155.4734489917755


In [11]:
 # save temporary file (HCST)
start_time = time.time()

# Each correlation field is written to <savefilepath>/<name>.nc, where <name>
# is the variable name. Dict order is the write order.
corr_outputs = {
    "corr_NO3_LE_HCST": corr_NO3_LE_HCST,
    "corr_NO3_LE_HCST_op": corr_NO3_LE_HCST_op,
    "corr_NO3_LE_HCST_em": corr_NO3_LE_HCST_em,
    "corr_NO3_LE_HCST_op_em": corr_NO3_LE_HCST_op_em,
    "corr_NO3_LE_HCST_ly25": corr_NO3_LE_HCST_ly25,
    "corr_NO3_LE_HCST_ly25_op": corr_NO3_LE_HCST_ly25_op,
    "corr_NO3_LE_HCST_ly25_em": corr_NO3_LE_HCST_ly25_em,
    "corr_NO3_LE_HCST_ly25_op_em": corr_NO3_LE_HCST_ly25_op_em,
}
for corr_name, corr_da in corr_outputs.items():
    corr_da.to_netcdf(savefilepath + "/" + corr_name + ".nc")

end_time = time.time()
elapsed_time = end_time - start_time
# The message no longer prints `iyear`, a loop index left over from the
# HCST-reading cell that has no meaning here.
print('elasped time for saving NO3 corr: ' + str(elapsed_time))

elasped time for saving NO3 corr, 60: 15.6161630153656


In [12]:
# critical correlation |r| for a two-sided test of zero correlation
# (N samples, significance level alpha; currently alpha=0.34)
N, alpha = 19, 0.34  # two-sided, so alpha/2 = 0.17 in each tail
df = N - 2

# 1) Critical t value of the two-sided test: quantile at 1 - alpha/2
#    (ppf(0.83) for alpha=0.34)
t_crit = t.ppf(1 - alpha/2, df)

# 2) Convert the critical t into a correlation: r = sqrt(t^2 / (df + t^2))
t_crit_sq = t_crit**2
rho_crit = np.sqrt(t_crit_sq / (df + t_crit_sq))
print("\n".join([
    f"N={N}, alpha={alpha}, df={df}",
    f"t_crit = {t_crit:.3f}",
    f"Correlation threshold ≈ ±{rho_crit:.3f}",
]))

N=19, alpha=0.34, df=17
t_crit = 0.982
Correlation threshold ≈ ±0.232
