# Module and Dask settings

In [1]:
# DASK client set

import os
import sys
from dask.distributed import Client
# Connect to an already-running Dask scheduler through its scheduler file.
# Alternative configurations that were tried are kept below for reference.
# client = Client(scheduler_file='/proj/kimyy/Dropbox/source/python/all/mpi/scheduler.json', threads_per_worker=2, n_workers=6)
client = Client(scheduler_file='/proj/kimyy/Dropbox/source/python/all/mpi/scheduler.json')
# client = Client(scheduler_file='/proj/kimyy/Dropbox/source/python/all/mpi/scheduler_10.json')

def setup_module_path():
    """Make the private CESM2 module directory importable.

    Appends the directory to ``sys.path`` once; repeated calls are no-ops.
    Takes no arguments and returns ``None`` so it can be handed to
    ``client.run`` unchanged.
    """
    cesm2_modules = '/proj/kimyy/Dropbox/source/python/all/Modules/CESM2'
    # Guard clause: nothing to do when the directory is already registered.
    if cesm2_modules in sys.path:
        return
    sys.path.append(cesm2_modules)

# Execute setup_module_path through the Dask client so the private module
# directory is also on sys.path in the worker processes.
client.run(setup_module_path)

# NOTE(review): a bare expression in the middle of a cell is not displayed;
# only the last expression of the cell (relative_path) is rendered.
client

# get path for path changes in Jupyter notebook: File - Open from Path - insert relative_path
# Keep only the part of the working directory that follows the first '/all/'
# and re-attach the '/all/' prefix. If '/all/' does not occur in the path,
# partition() returns an empty tail and relative_path is just '/all/'.
notebook_path = os.path.abspath(".")
_, _, relative_path = notebook_path.partition('/all/')
relative_path = '/all/' + relative_path
relative_path

'/all/Model/CESM2/Earth_System_Predictability/ASSM/Aleph'

In [2]:
# load public modules

import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as patches
import matplotlib.ticker as mticker
import matplotlib.path as mpath
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from scipy import stats
from scipy.interpolate import griddata
import cmocean
from cmcrameri import cm
import warnings
warnings.simplefilter(action='ignore')
import pandas as pd
import cftime
import pop_tools
from pprint import pprint
import time
import subprocess
import re as re_mod
import cftime
import datetime
from scipy.stats import ttest_1samp
import xcesm
# from scipy.stats import pearsonr
from scipy.stats import t

In [3]:
# load private modules

import sys
sys.path.append('/proj/kimyy/Dropbox/source/python/all/Modules/CESM2')
from KYY_CESM2_preprocessing import CESM2_config

# Directory used as the prefix for the NetCDF files this notebook builds paths
# for (e.g. the LE_SST_output*.nc pattern and HCST_SST_corr.nc).
savefilepath = "/mnt/lustre/proj/kimyy/tmp_python/HCST_skills_HCST-LE"


In [10]:
# Configuration object for the SST analysis: analysis period and variable name.
cfg_var_SST=CESM2_config()
cfg_var_SST.year_s=1960
cfg_var_SST.year_e=2020
cfg_var_SST.setvar('SST')

# Time bounds later passed to the preprocessing functions (used there in
# ds.sel(time=slice(sd, ed))): 1 Feb of the first year to 1 Jan of the year
# after the last year, on a no-leap calendar.
# NOTE(review): presumably chosen because monthly means are stamped at the end
# of the averaging month -- confirm against the model output.
start_date = cftime.DatetimeNoLeap(cfg_var_SST.year_s, 2, 1)
end_date = cftime.DatetimeNoLeap(cfg_var_SST.year_e+1, 1, 1)

# POP ocean grid description for the 'POP_gx1v7' grid.
ds_grid = pop_tools.get_grid('POP_gx1v7')


In [100]:
def process_coords_bgc_surface(
    ds, sd, ed, varname, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT","SST", "SSH","NO3","photoC_TOT_zint_100m", "TREFHT", "PSL", "PRECT", "TWS", "GPP", "FAREA_BURNED"]
):
    """Preprocess one file: keep the wanted variables, take the top level, regrid.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset of a single input file (as handed over by ``open_mfdataset``).
    sd, ed : time-like
        Inclusive bounds used in ``ds.sel(time=slice(sd, ed))``.
    varname : str
        Name of the variable that is regridded and returned.
    drop : bool, default True
        If true, drop every coordinate/data variable not listed in
        ``except_coord_vars`` and return the regridded ``varname``.
        If false, return ``ds`` with those variables promoted to coordinates.
    except_coord_vars : list of str
        Names that are never dropped / promoted.

    Returns
    -------
    xarray.DataArray or xarray.Dataset
        Regridded ``varname`` at ``z_t`` index 0 with shifted time stamps
        (``drop`` true), otherwise the dataset returned by ``set_coords``.
    """
    # Imported inside the function so it is also available when the function
    # is executed on a Dask worker. xcesm is not referenced by name;
    # presumably it registers the ``.utils`` accessor used for regridding.
    import xcesm
    import numpy as np
    import datetime

    # Every coordinate and data variable that is not explicitly protected.
    coord_vars = [v for v in ds.coords if v not in except_coord_vars]
    coord_vars += [v for v in ds.data_vars if v not in except_coord_vars]

    if not drop:
        return ds.set_coords(coord_vars)

    # drop_vars replaces the deprecated Dataset.drop for removing variables.
    ds = ds.drop_vars(coord_vars)
    ds = ds.sel(time=slice(sd, ed))
    ds = ds.isel(z_t=0)
    ds_rgd = ds[varname].utils.regrid()

    # Shift every time stamp back by 15 days.
    # NOTE(review): presumably moves end-of-month stamps into the month they
    # represent -- confirm against the input files.
    new_time = ds_rgd.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds_rgd = ds_rgd.assign_coords(time=new_time)

    # Round the coordinates of the array that is actually returned. Previously
    # the rounding was applied to ``ds`` after regridding, so it never reached
    # the result and failed when ``ds`` had no 'lat'/'lon' variables.
    rounded = {
        name: ds_rgd[name].round(4)
        for name in ("lat", "lon")
        if name in ds_rgd.coords
    }
    if rounded:
        ds_rgd = ds_rgd.assign_coords(rounded)
    return ds_rgd


def process_coords_2d(
    ds, sd, ed, varname, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT","SST", "SSH","NO3","photoC_TOT_zint_100m", "TREFHT", "PSL", "PRECT", "TWS", "GPP", "FAREA_BURNED"]

):
    """Preprocess one 2-D file: keep the wanted variables and form yearly means.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset of a single input file (as handed over by ``open_mfdataset``).
    sd, ed : time-like
        Inclusive bounds used in ``ds.sel(time=slice(sd, ed))``.
    varname : str
        Currently unused (the regridding step that needed it is disabled);
        kept so the signature matches the other preprocessing functions.
    drop : bool, default True
        If true, drop every coordinate/data variable not listed in
        ``except_coord_vars`` and return yearly means.
        If false, return ``ds`` with those variables promoted to coordinates.
    except_coord_vars : list of str
        Names that are never dropped / promoted.

    Returns
    -------
    xarray.Dataset
        Dataset averaged over ``time`` per calendar year (dimension ``year``)
        with 'lat'/'lon' rounded to 4 decimals, or the ``set_coords`` result.
    """
    # Imported inside the function so the names are also available when the
    # function is executed on a Dask worker.
    import xcesm
    import numpy as np
    import datetime

    # Every coordinate and data variable that is not explicitly protected.
    coord_vars = [v for v in ds.coords if v not in except_coord_vars]
    coord_vars += [v for v in ds.data_vars if v not in except_coord_vars]

    if not drop:
        return ds.set_coords(coord_vars)

    # drop_vars replaces the deprecated Dataset.drop for removing variables.
    ds = ds.drop_vars(coord_vars)
    ds = ds.sel(time=slice(sd, ed))

    # Shift every time stamp back by 15 days before grouping by year.
    # NOTE(review): presumably so that end-of-month stamps (e.g. a December
    # mean stamped 1 January) are counted in the correct year -- confirm.
    new_time = ds.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds = ds.assign_coords(time=new_time)
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Rounding removes tiny floating-point differences in the coordinates.
    ds['lat'] = ds['lat'].round(4)
    ds['lon'] = ds['lon'].round(4)
    return ds



def process_coords_bgc_surface_hcst(
    ds, sd, ed, varname, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT","SST", "SSH","NO3","photoC_TOT_zint_100m", "TREFHT", "PSL", "PRECT", "TWS", "GPP", "FAREA_BURNED"]
):
    """Preprocess one hindcast file: keep wanted variables, top level, regrid.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset of a single input file (as handed over by ``open_mfdataset``).
    sd, ed : time-like
        Currently unused: no time selection is applied in this function.
        Kept so the signature matches the other preprocessing functions.
    varname : str
        Name of the variable that is regridded and returned.
    drop : bool, default True
        If true, drop every coordinate/data variable not listed in
        ``except_coord_vars`` and return the regridded ``varname``.
        If false, return ``ds`` with those variables promoted to coordinates.
    except_coord_vars : list of str
        Names that are never dropped / promoted.

    Returns
    -------
    xarray.DataArray or xarray.Dataset
        Regridded ``varname`` at ``z_t_150m`` index 0 with shifted time stamps
        (``drop`` true), otherwise the dataset returned by ``set_coords``.
    """
    # Imported inside the function so it is also available when the function
    # is executed on a Dask worker. xcesm is not referenced by name;
    # presumably it registers the ``.utils`` accessor used for regridding.
    import xcesm
    import numpy as np
    import datetime

    # Every coordinate and data variable that is not explicitly protected.
    coord_vars = [v for v in ds.coords if v not in except_coord_vars]
    coord_vars += [v for v in ds.data_vars if v not in except_coord_vars]

    if not drop:
        return ds.set_coords(coord_vars)

    # drop_vars replaces the deprecated Dataset.drop for removing variables.
    ds = ds.drop_vars(coord_vars)
    # ds = ds.isel(z_t_150m=slice(0,10))
    ds = ds.isel(z_t_150m=0)
    ds_rgd = ds[varname].utils.regrid()

    # Shift every time stamp back by 15 days.
    # NOTE(review): presumably moves end-of-month stamps into the month they
    # represent -- confirm against the input files.
    new_time = ds_rgd.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds_rgd = ds_rgd.assign_coords(time=new_time)

    # Round the coordinates of the array that is actually returned. Previously
    # the rounding was applied to ``ds`` after regridding, so it never reached
    # the result and failed when ``ds`` had no 'lat'/'lon' variables.
    rounded = {
        name: ds_rgd[name].round(4)
        for name in ("lat", "lon")
        if name in ds_rgd.coords
    }
    if rounded:
        ds_rgd = ds_rgd.assign_coords(rounded)
    return ds_rgd


def process_coords_2d_hcst(
    ds, sd, ed, varname, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT","SST", "SSH","NO3","photoC_TOT_zint_100m", "TREFHT", "PSL", "PRECT", "TWS", "GPP", "FAREA_BURNED"]
):
    """Preprocess one 2-D hindcast file: keep wanted variables, yearly means.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset of a single input file (as handed over by ``open_mfdataset``).
    sd, ed : time-like
        Currently unused: no time selection is applied in this function.
        Kept so the signature matches the other preprocessing functions.
    varname : str
        Currently unused (the regridding step that needed it is disabled).
    drop : bool, default True
        If true, drop every coordinate/data variable not listed in
        ``except_coord_vars`` and return yearly means.
        If false, return ``ds`` with those variables promoted to coordinates.
    except_coord_vars : list of str
        Names that are never dropped / promoted.

    Returns
    -------
    xarray.Dataset
        Dataset averaged over ``time`` per calendar year (dimension ``year``)
        with 'lat'/'lon' rounded to 4 decimals, or the ``set_coords`` result.
    """
    # Imported inside the function so the names are also available when the
    # function is executed on a Dask worker.
    import xcesm
    import numpy as np
    import datetime

    # Every coordinate and data variable that is not explicitly protected.
    coord_vars = [v for v in ds.coords if v not in except_coord_vars]
    coord_vars += [v for v in ds.data_vars if v not in except_coord_vars]

    if not drop:
        return ds.set_coords(coord_vars)

    # drop_vars replaces the deprecated Dataset.drop for removing variables.
    ds = ds.drop_vars(coord_vars)

    # Shift every time stamp back by 15 days before grouping by year.
    # NOTE(review): presumably so that end-of-month stamps (e.g. a December
    # mean stamped 1 January) are counted in the correct year -- confirm.
    new_time = ds.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds = ds.assign_coords(time=new_time)
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Rounding removes tiny floating-point differences in the coordinates.
    ds['lat'] = ds['lat'].round(4)
    ds['lon'] = ds['lon'].round(4)
    return ds

In [111]:
# Read LE dataset (SST)
# Opens the files of each LE member separately, reduces them to yearly means
# in the preprocess step, and stacks the members along a new 'ens_LE' dimension.

start_time = time.time()

# Fills cfg_var_SST.LE_file_list, which is indexed below as [0][member].
cfg_var_SST.LE_path_load(cfg_var_SST.var)

# NOTE(review): fpath_SST is not used anywhere else in this cell.
fpath_SST = savefilepath + "/LE_SST_output*.nc"


cfg_var_SST.LE_ds = []
for imem in range(0, len(cfg_var_SST.LE_file_list[0])):
    # process_coords_2d is applied to every file before the files are combined.
    SST_LE_ds_tmp = xr.open_mfdataset(cfg_var_SST.LE_file_list[0][imem], 
                           chunks={'time': 5}, 
                           combine='nested',
                           parallel=True,
                           preprocess=lambda ds: process_coords_2d(ds, start_date, end_date, 'SST'),
                           decode_cf=True, 
                           decode_times=True)
    
    # Add a length-1 member dimension so the members can be concatenated later.
    SST_LE_ds_tmp = SST_LE_ds_tmp.expand_dims({'ens_LE': 1})
    cfg_var_SST.LE_ds.append(SST_LE_ds_tmp)

    # Elapsed time is measured from the start of the cell, i.e. it is cumulative.
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('elasped time for reading LE, ' + str(imem) + ': ' + str(elapsed_time))

# Stack all members, label them 0..n-1, and load the result into memory.
cfg_var_SST.LE_ds = xr.concat(cfg_var_SST.LE_ds, dim='ens_LE')
cfg_var_SST.LE_ds['ens_LE']=range(0, len(cfg_var_SST.LE_file_list[0]))
cfg_var_SST.LE_ds=cfg_var_SST.LE_ds.compute()
    


elasped time for reading LE, 0: 12.47792363166809
elasped time for reading LE, 1: 16.664344787597656
elasped time for reading LE, 2: 20.938654899597168
elasped time for reading LE, 3: 25.055885314941406
elasped time for reading LE, 4: 29.2139310836792
elasped time for reading LE, 5: 33.66727638244629
elasped time for reading LE, 6: 37.89186668395996
elasped time for reading LE, 7: 42.085633277893066
elasped time for reading LE, 8: 46.397172689437866
elasped time for reading LE, 9: 50.30598306655884
elasped time for reading LE, 10: 53.992342472076416
elasped time for reading LE, 11: 58.35973238945007
elasped time for reading LE, 12: 62.42756485939026
elasped time for reading LE, 13: 66.776846408844
elasped time for reading LE, 14: 70.94828081130981
elasped time for reading LE, 15: 75.18238687515259
elasped time for reading LE, 16: 79.21568846702576
elasped time for reading LE, 17: 83.41791105270386
elasped time for reading LE, 18: 87.42975163459778
elasped time for reading LE, 19: 93.99

In [112]:
# HCST SST
# Opens the hindcast files of each entry of HCST_file_list, reduces them to
# yearly means in the preprocess step, and stacks the entries along 'iyear'.

cfg_var_SST.HCST_ds = []
# NOTE(review): HCST_ds_xr is not used anywhere else in this cell.
HCST_ds_xr = []

start_time = time.time()

# Fills cfg_var_SST.HCST_file_list (one entry per element iterated below).
cfg_var_SST.HCST_path_load(cfg_var_SST.var)

for iyear in range(0, len(cfg_var_SST.HCST_file_list)):
    # Nested combine over two levels: the outer level uses the list of
    # HCST_ensembles as concat dimension (renamed from 'concat_dim' to
    # 'ens_HCST' after the loop), the inner level concatenates along 'year'.
    SST_HCST_ds_tmp = xr.open_mfdataset(cfg_var_SST.HCST_file_list[iyear], 
                           chunks={'time': 4}, 
                           combine='nested',
                           concat_dim=[[*cfg_var_SST.HCST_ensembles], 'year'], 
                           parallel=True,
                           preprocess=lambda ds: process_coords_2d_hcst(ds, start_date, end_date, 'SST'),
                           decode_cf=True, 
                           decode_times=True)
    
    # Replace the calendar years by lead-year labels 1..5; this assumes every
    # entry contains exactly five yearly values.
    SST_HCST_ds_tmp = SST_HCST_ds_tmp.rename({"year": "lyears"})
    SST_HCST_ds_tmp['lyears']=range(1,6)
    # Add a length-1 'iyear' dimension so the entries can be concatenated later.
    SST_HCST_ds_tmp = SST_HCST_ds_tmp.expand_dims({'iyear': 1})
    
    cfg_var_SST.HCST_ds.append(SST_HCST_ds_tmp)

    # Elapsed time is measured from the start of the cell, i.e. it is cumulative.
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('elasped time for reading HCST SST, ' + str(iyear) + ': ' + str(elapsed_time))

# Stack all entries, give the dimensions integer labels, and load into memory.
# 'iyear' is labelled with the integer years year_s..year_e.
cfg_var_SST.HCST_ds = xr.concat(cfg_var_SST.HCST_ds, dim='iyear')
cfg_var_SST.HCST_ds = cfg_var_SST.HCST_ds.rename({"concat_dim": "ens_HCST"})
cfg_var_SST.HCST_ds['ens_HCST']=range(0, len(cfg_var_SST.HCST_file_list[0]))
cfg_var_SST.HCST_ds['iyear']=range(cfg_var_SST.year_s, cfg_var_SST.year_e+1)
cfg_var_SST.HCST_ds=cfg_var_SST.HCST_ds.compute()

elasped time for reading HCST SST, 0: 17.715754508972168
elasped time for reading HCST SST, 1: 18.281638622283936
elasped time for reading HCST SST, 2: 18.826735734939575
elasped time for reading HCST SST, 3: 19.376030683517456
elasped time for reading HCST SST, 4: 19.901851654052734
elasped time for reading HCST SST, 5: 20.7691650390625
elasped time for reading HCST SST, 6: 21.305104970932007
elasped time for reading HCST SST, 7: 21.868179082870483
elasped time for reading HCST SST, 8: 22.414310455322266
elasped time for reading HCST SST, 9: 22.967474460601807
elasped time for reading HCST SST, 10: 23.517557859420776
elasped time for reading HCST SST, 11: 24.03641629219055
elasped time for reading HCST SST, 12: 24.579558849334717
elasped time for reading HCST SST, 13: 25.112762212753296
elasped time for reading HCST SST, 14: 25.648260831832886
elasped time for reading HCST SST, 15: 26.2024564743042
elasped time for reading HCST SST, 16: 26.747222900390625
elasped time for reading HCST

In [76]:
# Yearly hindcast series built from the first lead year of every start year.
# 'iyear' is an integer coordinate (year_s..year_e), so the label slice must use
# integer bounds; date strings cannot be used to slice a numeric index.
cfg_var_SST.HCST_ds_yearly = (
    cfg_var_SST.HCST_ds
    .sel(iyear=slice(cfg_var_SST.year_s, cfg_var_SST.year_e))
    .isel(lyears=0)
    .rename({"iyear": "year"})
)


In [105]:
# get rolling mean variables (SST)

# 4-year running mean of the LE; windows with fewer than 4 years become NaN.
cfg_var_SST.LE_ds_4yr = cfg_var_SST.LE_ds.rolling(year=4, min_periods=4).mean()
# Label every window with the mean of the years it covers and drop the
# incomplete (NaN-labelled) windows.
obs_rolling_time_mean = cfg_var_SST.LE_ds['year'].rolling(year=4, min_periods=4).mean()
cfg_var_SST.LE_ds_4yr = cfg_var_SST.LE_ds_4yr.assign_coords(year=obs_rolling_time_mean)
valid_index = np.where(~np.isnan(cfg_var_SST.LE_ds_4yr['year']))[0]
cfg_var_SST.LE_ds_4yr = cfg_var_SST.LE_ds_4yr.isel(year=valid_index)
# Skip the first complete window and keep windows 1..57.
# NOTE(review): the hard-coded 58 matches the 1960-2020 period -- adjust if
# year_s/year_e change.
cfg_var_SST.LE_ds_4yr = cfg_var_SST.LE_ds_4yr.isel(year=range(1,58))

# Hindcast mean over lead years 2-5 for the start years up to 2016.
# 'iyear' is an integer coordinate, so the slice bounds are integers; date
# strings cannot be used to slice a numeric index.
# hcst_da_ly25=cfg_var_SST.HCST_ds.sel(iyear=slice(1996, 2016), lyears=slice(2, 5))
hcst_da_ly25=cfg_var_SST.HCST_ds.sel(iyear=slice(1959, 2016), lyears=slice(2, 5))
cfg_var_SST.HCST_ds_4yr=hcst_da_ly25.mean(dim='lyears')
cfg_var_SST.HCST_ds_4yr=cfg_var_SST.HCST_ds_4yr.rename(iyear="year")
# Give the hindcast the same window labels as the LE. Plain values are assigned
# so that the LE's own 'year' index cannot take part in any alignment; a length
# mismatch between the two series raises here instead of passing silently.
cfg_var_SST.HCST_ds_4yr = cfg_var_SST.HCST_ds_4yr.assign_coords(
    year=cfg_var_SST.LE_ds_4yr['year'].values
)



In [110]:
# get skills
start_time = time.time()


# Correlation over 'year' between the LE and the lead-year-1 hindcast series.
# NOTE(review): the two inputs have different member dimensions (ens_LE vs
# ens_HCST); presumably the result holds one map per member pair, which makes
# this step expensive -- confirm that this is intended.
corr_SST_LE_HCST = xr.corr(cfg_var_SST.LE_ds['SST'], cfg_var_SST.HCST_ds_yearly['SST'], dim='year').compute()
# corr_SST_LE_HCST_obs_period = xr.corr(cfg_var_SST.LE_ds, cfg_var_SST.HCST_ds_yearly, dim='year')
# em
# em-obs_period

end_time = time.time()
elapsed_time = end_time - start_time
# The message previously said "reading HCST SST" and printed the loop variable
# iyear left over from another cell; report what this cell actually timed.
print('elapsed time for computing LE-HCST SST correlation: ' + str(elapsed_time))

Task exception was never retrieved
future: <Task finished name='Task-40846' coro=<Client._gather.<locals>.wait() done, defined at /mnt/lustre/proj/kimyy/APP/conda/miniconda3/envs/kdask/lib/python3.11/site-packages/distributed/client.py:2212> exception=AllExit()>
Traceback (most recent call last):
  File "/mnt/lustre/proj/kimyy/APP/conda/miniconda3/envs/kdask/lib/python3.11/site-packages/distributed/client.py", line 2221, in wait
    raise AllExit()
distributed.client.AllExit
Task exception was never retrieved
future: <Task finished name='Task-40837' coro=<Client._gather.<locals>.wait() done, defined at /mnt/lustre/proj/kimyy/APP/conda/miniconda3/envs/kdask/lib/python3.11/site-packages/distributed/client.py:2212> exception=AllExit()>
Traceback (most recent call last):
  File "/mnt/lustre/proj/kimyy/APP/conda/miniconda3/envs/kdask/lib/python3.11/site-packages/distributed/client.py", line 2221, in wait
    raise AllExit()
distributed.client.AllExit
Task exception was never retrieved
futur

KeyboardInterrupt: 

Task exception was never retrieved
future: <Task finished name='Task-40801' coro=<Client._gather.<locals>.wait() done, defined at /mnt/lustre/proj/kimyy/APP/conda/miniconda3/envs/kdask/lib/python3.11/site-packages/distributed/client.py:2212> exception=AllExit()>
Traceback (most recent call last):
  File "/mnt/lustre/proj/kimyy/APP/conda/miniconda3/envs/kdask/lib/python3.11/site-packages/distributed/client.py", line 2221, in wait
    raise AllExit()
distributed.client.AllExit
Task exception was never retrieved
future: <Task finished name='Task-40798' coro=<Client._gather.<locals>.wait() done, defined at /mnt/lustre/proj/kimyy/APP/conda/miniconda3/envs/kdask/lib/python3.11/site-packages/distributed/client.py:2212> exception=AllExit()>
Traceback (most recent call last):
  File "/mnt/lustre/proj/kimyy/APP/conda/miniconda3/envs/kdask/lib/python3.11/site-packages/distributed/client.py", line 2221, in wait
    raise AllExit()
distributed.client.AllExit
Task exception was never retrieved
futur

In [109]:
 # save temporary file (HCST)
# Writes the LE-HCST SST correlation to <savefilepath>/HCST_SST_corr.nc.
# NOTE(review): the recorded run of this cell failed with "NetCDF: HDF error".
# The cause is not visible here -- check whether the target file already exists
# or is still open elsewhere, and whether the directory is writable.
corr_SST_LE_HCST.to_netcdf(savefilepath + "/HCST_SST_corr" + ".nc")
        

OSError: [Errno -101] NetCDF: HDF error: b'/mnt/lustre/proj/kimyy/tmp_python/HCST_skills_HCST-LE/HCST_SST_corr.nc'

In [52]:
# Critical correlation coefficient for a two-sided significance test.
def critical_correlation(n, alpha):
    """Return ``(t_crit, r_crit)`` for a two-sided test of zero correlation.

    Parameters
    ----------
    n : int
        Sample size; the test uses ``n - 2`` degrees of freedom.
    alpha : float
        Two-sided significance level, 0 < alpha < 1 (each tail holds alpha/2).

    Returns
    -------
    tuple of float
        The critical Student-t value and the matching correlation magnitude
        ``sqrt(t**2 / (n - 2 + t**2))``.

    Raises
    ------
    ValueError
        If ``n`` is not larger than 2 or ``alpha`` is outside (0, 1).
    """
    if n <= 2:
        raise ValueError("n must be larger than 2")
    if not 0 < alpha < 1:
        raise ValueError("alpha must lie strictly between 0 and 1")
    dof = n - 2
    # Upper alpha/2 quantile of the t distribution.
    t_critical = t.ppf(1 - alpha/2, dof)
    # Invert t = r * sqrt(dof / (1 - r**2)) for r.
    r_critical = np.sqrt(t_critical**2 / (dof + t_critical**2))
    return t_critical, r_critical


N = 19
alpha = 0.34  # two-sided level, so each tail holds alpha/2 = 0.17
df = N - 2

t_crit, rho_crit = critical_correlation(N, alpha)
print(f"N={N}, alpha={alpha}, df={df}")
print(f"t_crit = {t_crit:.3f}")
print(f"Correlation threshold ≈ ±{rho_crit:.3f}")

N=19, alpha=0.34, df=17
t_crit = 0.982
Correlation threshold ≈ ±0.232
