# Module and Dask settings

In [1]:
# DASK client set

import os
import sys
from dask.distributed import Client
# Connect to the Dask cluster described by the scheduler file below.
# The commented-out lines are alternative connection settings kept for reference.
# client = Client(scheduler_file='/proj/kimyy/Dropbox/source/python/all/mpi/scheduler.json', threads_per_worker=2, n_workers=6)
client = Client(scheduler_file='/proj/kimyy/Dropbox/source/python/all/mpi/scheduler.json')
# client = Client(scheduler_file='/proj/kimyy/Dropbox/source/python/all/mpi/scheduler_10.json')  

def setup_module_path():
    """Make the private CESM2 helper modules importable in the current process.

    Appends the CESM2 module directory to ``sys.path`` unless it is already
    listed there. Returns ``None``.
    """
    cesm2_dir = '/proj/kimyy/Dropbox/source/python/all/Modules/CESM2'
    if cesm2_dir in sys.path:
        # Already registered: nothing to do.
        return
    sys.path.append(cesm2_dir)

# Execute the path setup on the Dask workers so they can import the private CESM2 modules.
client.run(setup_module_path)

client

# Location of this notebook relative to the '/all/' directory.
# To jump between notebooks in Jupyter: File - Open from Path - insert relative_path
notebook_path = os.path.abspath(os.curdir)
relative_path = '/all/' + notebook_path.partition('/all/')[2]
relative_path

'/all/Model/CESM2/Earth_System_Predictability/ASSM/Aleph'

In [2]:
# load public modules

import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as patches
import matplotlib.ticker as mticker
import matplotlib.path as mpath
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from scipy import stats
from scipy.interpolate import griddata
import cmocean
from cmcrameri import cm
import warnings
warnings.simplefilter(action='ignore')
import pandas as pd
import cftime
import pop_tools
from pprint import pprint
import time
import subprocess
import re as re_mod
import cftime
import datetime
from scipy.stats import ttest_1samp
import xcesm
# from scipy.stats import pearsonr
from scipy.stats import t

In [3]:
# load private modules

import sys

# Register the private CESM2 module directory only once: the membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
if '/proj/kimyy/Dropbox/source/python/all/Modules/CESM2' not in sys.path:
    sys.path.append('/proj/kimyy/Dropbox/source/python/all/Modules/CESM2')
from KYY_CESM2_preprocessing import CESM2_config

# Directory into which the result files of this notebook are written.
savefilepath = "/mnt/lustre/proj/kimyy/tmp_python/HCST_skills_autocorr"


In [4]:
# change variables by command+F, for S-ST, T-REFHT, T-WS, P-SL, P-RECT, G-PP, S-SH, p-hotoC_TOT_zint_100m, F-AREA_BURNED (not for N-O3). 

# Configuration object for the NO3 analysis (project class CESM2_config).
cfg_var_NO3=CESM2_config()
# First and last calendar year of the analysis period.
cfg_var_NO3.year_s=1960
cfg_var_NO3.year_e=2020
cfg_var_NO3.setvar('NO3')
# Note: this is the string 'nan', not a float NaN. Later cells compare OBS_var with
# 'NO3' to decide whether near-zero values are masked before correlating.
cfg_var_NO3.OBS_var = 'nan'
# Bounds of the time slice applied while reading model output:
# 1 Feb of the first year through 1 Jan of the year after the last year.
# NOTE(review): presumably chosen because monthly time stamps mark the end of the
# averaging interval (the readers shift time back by 15 days) — confirm.
start_date = cftime.DatetimeNoLeap(cfg_var_NO3.year_s, 2, 1)
end_date = cftime.DatetimeNoLeap(cfg_var_NO3.year_e+1, 1, 1)

# POP ocean grid definition obtained from pop_tools for the 'POP_gx1v7' grid.
ds_grid = pop_tools.get_grid('POP_gx1v7')


In [12]:
# Re-assigns the same string value ('nan') that is already set when cfg_var_NO3 is configured.
cfg_var_NO3.OBS_var = 'nan'

In [5]:

def process_coords_2d(
    ds, sd, ed, varname, comp, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT"]
):
    """Reduce one model-output dataset to annual means of ``varname``.

    Written to be used as the ``preprocess`` callback of ``xr.open_mfdataset``.

    Parameters
    ----------
    ds : dataset of a single file (xarray.Dataset API).
    sd, ed : start and end labels of the time slice that is kept.
    varname : name of the data variable to keep.
    comp : model component; "atm"/"lnd" round ``lat``/``lon``,
        "ocn"/"ice" round ``TLAT``/``TLONG``.
    drop : if True, drop every other variable and return annual means;
        if False, return ``ds`` with the other variables promoted to coordinates.
    except_coord_vars : names that are always kept. The argument is not modified.

    Returns
    -------
    The annual-mean dataset (``drop=True``) or ``ds.set_coords(...)`` (``drop=False``).
    """
    import xcesm  # not referenced below; NOTE(review): presumably imported for its side effects — confirm
    import numpy as np
    import datetime

    # Work on a copy: appending to the argument itself would grow the shared
    # default list on every call and leak variable names between calls.
    keep_vars = list(except_coord_vars) + [varname]

    # Everything that is neither protected nor the requested variable.
    coord_vars = [
        name for name in (*ds.coords, *ds.data_vars) if name not in keep_vars
    ]

    if not drop:
        return ds.set_coords(coord_vars)

    ds = ds.drop_vars(coord_vars)
    ds = ds.sel(time=slice(sd, ed))

    # Shift the time stamps back by 15 days before grouping by calendar year.
    # NOTE(review): presumably so that end-of-interval stamps fall into the right year — confirm.
    new_time = ds.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds = ds.assign_coords(time=new_time)
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Round the horizontal coordinates to 4 decimals.
    if comp in ("atm", "lnd"):
        ds['lat'] = ds['lat'].round(4)
        ds['lon'] = ds['lon'].round(4)
    if comp in ("ocn", "ice"):
        ds['TLAT'] = ds['TLAT'].round(4)
        ds['TLONG'] = ds['TLONG'].round(4)
    return ds

def process_coords_surface(
    ds, sd, ed, varname, comp, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT"]
):
    """Reduce one model-output dataset to annual means of ``varname`` at the first ``z_t`` level.

    Written to be used as the ``preprocess`` callback of ``xr.open_mfdataset``.

    Parameters
    ----------
    ds : dataset of a single file (xarray.Dataset API) with a ``z_t`` dimension.
    sd, ed : start and end labels of the time slice that is kept.
    varname : name of the data variable to keep.
    comp : model component; "atm"/"lnd" round ``lat``/``lon``,
        "ocn"/"ice" round ``TLAT``/``TLONG``.
    drop : if True, drop every other variable and return annual means;
        if False, return ``ds`` with the other variables promoted to coordinates.
    except_coord_vars : names that are always kept. The argument is not modified.

    Returns
    -------
    The annual-mean dataset (``drop=True``) or ``ds.set_coords(...)`` (``drop=False``).
    """
    import xcesm  # not referenced below; NOTE(review): presumably imported for its side effects — confirm
    import numpy as np
    import datetime

    # Work on a copy: appending to the argument itself would grow the shared
    # default list on every call and leak variable names between calls.
    keep_vars = list(except_coord_vars) + [varname]

    # Everything that is neither protected nor the requested variable.
    coord_vars = [
        name for name in (*ds.coords, *ds.data_vars) if name not in keep_vars
    ]

    if not drop:
        return ds.set_coords(coord_vars)

    ds = ds.drop_vars(coord_vars)
    ds = ds.sel(time=slice(sd, ed))
    ds = ds.isel(z_t=0)  # first index along z_t
    # ds_rgd = ds[varname].utils.regrid()

    # Shift the time stamps back by 15 days before grouping by calendar year.
    # NOTE(review): presumably so that end-of-interval stamps fall into the right year — confirm.
    new_time = ds.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds = ds.assign_coords(time=new_time)
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Round the horizontal coordinates to 4 decimals.
    if comp in ("atm", "lnd"):
        ds['lat'] = ds['lat'].round(4)
        ds['lon'] = ds['lon'].round(4)
    if comp in ("ocn", "ice"):
        ds['TLAT'] = ds['TLAT'].round(4)
        ds['TLONG'] = ds['TLONG'].round(4)
    return ds



def process_coords_2d_obs(
    ds, sd, ed, varname, comp, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT"]
):
    """Reduce one observational dataset to annual means.

    Parameters
    ----------
    ds : dataset (xarray.Dataset API); a ``T`` coordinate/dimension is renamed to ``time``.
    sd, ed : accepted for signature parity with the model readers; not used.
    varname : name of the data variable of interest (only used when ``drop=False``).
    comp : model component; "atm"/"lnd" round ``lat``/``lon``,
        "ocn"/"ice" round ``TLAT``/``TLONG``.
    drop : if True, return annual means of ``ds`` (no variable is removed);
        if False, return ``ds`` with all variables other than ``varname`` and
        ``except_coord_vars`` promoted to coordinates.
    except_coord_vars : names that are never promoted to coordinates. Not modified.

    Returns
    -------
    The annual-mean dataset (``drop=True``) or ``ds.set_coords(...)`` (``drop=False``).
    """
    import xcesm  # not referenced below; NOTE(review): presumably imported for its side effects — confirm
    import numpy as np
    import datetime

    if not drop:
        # The names to promote were previously never computed, so this branch
        # raised NameError; build them the same way the model readers do.
        keep_vars = list(except_coord_vars) + [varname]
        coord_vars = [
            name for name in (*ds.coords, *ds.data_vars) if name not in keep_vars
        ]
        return ds.set_coords(coord_vars)

    if 'T' in ds.coords or 'T' in ds.dims:
        ds = ds.rename({'T': 'time'})
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Round the horizontal coordinates to 4 decimals.
    if comp in ("atm", "lnd"):
        ds['lat'] = ds['lat'].round(4)
        ds['lon'] = ds['lon'].round(4)
    if comp in ("ocn", "ice"):
        ds['TLAT'] = ds['TLAT'].round(4)
        ds['TLONG'] = ds['TLONG'].round(4)
    return ds


def process_coords_2d_hcst(
    ds, sd, ed, varname, comp, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT"]
):
    """Reduce one hindcast dataset to annual means of ``varname``.

    Written to be used as the ``preprocess`` callback of ``xr.open_mfdataset``.
    Unlike ``process_coords_2d`` no time slice is applied.

    Parameters
    ----------
    ds : dataset of a single file (xarray.Dataset API).
    sd, ed : accepted for signature parity with the other readers; not used.
    varname : name of the data variable to keep.
    comp : model component; "atm"/"lnd" round ``lat``/``lon``,
        "ocn"/"ice" round ``TLAT``/``TLONG``.
    drop : if True, drop every other variable and return annual means;
        if False, return ``ds`` with the other variables promoted to coordinates.
    except_coord_vars : names that are always kept. The argument is not modified.

    Returns
    -------
    The annual-mean dataset (``drop=True``) or ``ds.set_coords(...)`` (``drop=False``).
    """
    import xcesm  # not referenced below; NOTE(review): presumably imported for its side effects — confirm
    import numpy as np
    import datetime

    # Work on a copy: appending to the argument itself would grow the shared
    # default list on every call and leak variable names between calls.
    keep_vars = list(except_coord_vars) + [varname]

    # Everything that is neither protected nor the requested variable.
    coord_vars = [
        name for name in (*ds.coords, *ds.data_vars) if name not in keep_vars
    ]

    if not drop:
        return ds.set_coords(coord_vars)

    ds = ds.drop_vars(coord_vars)
    # ds_rgd = ds[varname].utils.regrid()
    # new_time = ds_rgd.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    # ds_rgd = ds_rgd.assign_coords(time=new_time)

    # Shift the time stamps back by 15 days before grouping by calendar year.
    # NOTE(review): presumably so that end-of-interval stamps fall into the right year — confirm.
    new_time = ds.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds = ds.assign_coords(time=new_time)
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Round the horizontal coordinates to 4 decimals.
    if comp in ("atm", "lnd"):
        ds['lat'] = ds['lat'].round(4)
        ds['lon'] = ds['lon'].round(4)
    if comp in ("ocn", "ice"):
        ds['TLAT'] = ds['TLAT'].round(4)
        ds['TLONG'] = ds['TLONG'].round(4)
    return ds


def process_coords_surface_hcst(
    ds, sd, ed, varname, comp, drop=True,
    except_coord_vars=["time","lon","lat","TLONG","TLAT"]
):
    """Reduce one hindcast dataset to annual means of ``varname`` at the first ``z_t`` level.

    Written to be used as the ``preprocess`` callback of ``xr.open_mfdataset``.
    Unlike ``process_coords_surface`` no time slice is applied.

    Parameters
    ----------
    ds : dataset of a single file (xarray.Dataset API) with a ``z_t`` dimension.
    sd, ed : accepted for signature parity with the other readers; not used.
    varname : name of the data variable to keep.
    comp : model component; "atm"/"lnd" round ``lat``/``lon``,
        "ocn"/"ice" round ``TLAT``/``TLONG``.
    drop : if True, drop every other variable and return annual means;
        if False, return ``ds`` with the other variables promoted to coordinates.
    except_coord_vars : names that are always kept. The argument is not modified.

    Returns
    -------
    The annual-mean dataset (``drop=True``) or ``ds.set_coords(...)`` (``drop=False``).
    """
    import xcesm  # not referenced below; NOTE(review): presumably imported for its side effects — confirm
    import numpy as np
    import datetime

    # Work on a copy: appending to the argument itself would grow the shared
    # default list on every call and leak variable names between calls.
    keep_vars = list(except_coord_vars) + [varname]

    # Everything that is neither protected nor the requested variable.
    coord_vars = [
        name for name in (*ds.coords, *ds.data_vars) if name not in keep_vars
    ]

    if not drop:
        return ds.set_coords(coord_vars)

    ds = ds.drop_vars(coord_vars)
    # ds = ds.isel(z_t_150m=slice(0,10))
    ds = ds.isel(z_t=0)  # first index along z_t
    # ds_rgd = ds[varname].utils.regrid()

    # Shift the time stamps back by 15 days before grouping by calendar year.
    # NOTE(review): presumably so that end-of-interval stamps fall into the right year — confirm.
    new_time = ds.time - np.array([datetime.timedelta(days=15)] * len(ds.time))
    ds = ds.assign_coords(time=new_time)
    ds = ds.groupby('time.year').mean(dim='time', skipna=True)

    # Round the horizontal coordinates to 4 decimals.
    if comp in ("atm", "lnd"):
        ds['lat'] = ds['lat'].round(4)
        ds['lon'] = ds['lon'].round(4)
    if comp in ("ocn", "ice"):
        ds['TLAT'] = ds['TLAT'].round(4)
        ds['TLONG'] = ds['TLONG'].round(4)
    return ds

In [6]:
# Read ODA dataset (NO3)
# Each ensemble member is opened separately, reduced to annual means at the first
# z_t level by process_coords_surface, and the members are then stacked along 'ens_ODA'.

start_time = time.time()

# NOTE(review): presumably fills cfg_var_NO3.ODA_file_list (read below) — confirm in CESM2_config.
cfg_var_NO3.ODA_path_load(cfg_var_NO3.var)

# Component name handed to the preprocess callback; it selects which coordinates are rounded.
tmp_comp=cfg_var_NO3.comp
cfg_var_NO3.ODA_ds = []
for imem in range(0, len(cfg_var_NO3.ODA_file_list[0])):
    # preprocess is applied to every file of this member before the files are combined.
    NO3_ODA_ds_tmp = xr.open_mfdataset(cfg_var_NO3.ODA_file_list[0][imem], 
                           chunks={'time': 5}, 
                           combine='nested',
                           parallel=True,
                           preprocess=lambda ds: process_coords_surface(ds, start_date, end_date, 'NO3', tmp_comp),
                           decode_cf=True, 
                           decode_times=True)
    
    # Add a length-1 ensemble dimension so the members can be concatenated afterwards.
    NO3_ODA_ds_tmp = NO3_ODA_ds_tmp.expand_dims({'ens_ODA': 1})
    cfg_var_NO3.ODA_ds.append(NO3_ODA_ds_tmp)

    # Time elapsed since start_time, i.e. cumulative over members, not per member.
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('elasped time for reading ODA, ' + str(imem) + ': ' + str(elapsed_time))

# Stack the members, label them 0..n-1 and load the result into memory.
cfg_var_NO3.ODA_ds = xr.concat(cfg_var_NO3.ODA_ds, dim='ens_ODA')
cfg_var_NO3.ODA_ds['ens_ODA']=range(0, len(cfg_var_NO3.ODA_file_list[0]))
cfg_var_NO3.ODA_ds=cfg_var_NO3.ODA_ds.compute()

# Total time including concat/compute; imem still holds the last loop index here.
end_time = time.time()
elapsed_time = end_time - start_time
print('elasped time for reading ODA, ' + str(imem) + ': ' + str(elapsed_time))

elasped time for reading ODA, 0: 74.62341022491455
elasped time for reading ODA, 1: 148.36190915107727
elasped time for reading ODA, 2: 219.9632408618927
elasped time for reading ODA, 3: 294.2169530391693
elasped time for reading ODA, 4: 363.48272013664246
elasped time for reading ODA, 5: 440.5602767467499
elasped time for reading ODA, 6: 509.79437923431396
elasped time for reading ODA, 7: 578.7457957267761
elasped time for reading ODA, 8: 648.8710770606995
elasped time for reading ODA, 9: 720.3828880786896
elasped time for reading ODA, 10: 785.4098088741302
elasped time for reading ODA, 11: 852.6737434864044
elasped time for reading ODA, 12: 919.8347978591919
elasped time for reading ODA, 13: 982.7610414028168
elasped time for reading ODA, 14: 1047.1239540576935
elasped time for reading ODA, 15: 1118.424021244049
elasped time for reading ODA, 16: 1183.6339428424835
elasped time for reading ODA, 17: 1249.9023110866547
elasped time for reading ODA, 18: 1315.4203963279724
elasped time fo

In [7]:
# Read LE dataset (NO3)
# Each ensemble member is opened separately, reduced to annual means at the first
# z_t level by process_coords_surface, and the members are then stacked along 'ens_LE'.

start_time = time.time()

# NOTE(review): presumably fills cfg_var_NO3.LE_file_list (read below) — confirm in CESM2_config.
cfg_var_NO3.LE_path_load(cfg_var_NO3.var)

# NOTE(review): fpath_NO3 is not referenced again in the visible part of this notebook.
fpath_NO3 = savefilepath + "/LE_NO3_output*.nc"

# Component name handed to the preprocess callback; it selects which coordinates are rounded.
tmp_comp=cfg_var_NO3.comp
cfg_var_NO3.LE_ds = []
for imem in range(0, len(cfg_var_NO3.LE_file_list[0])):
    # preprocess is applied to every file of this member before the files are combined.
    NO3_LE_ds_tmp = xr.open_mfdataset(cfg_var_NO3.LE_file_list[0][imem], 
                           chunks={'time': 5}, 
                           combine='nested',
                           parallel=True,
                           preprocess=lambda ds: process_coords_surface(ds, start_date, end_date, 'NO3', tmp_comp),
                           decode_cf=True, 
                           decode_times=True)
    
    # Add a length-1 ensemble dimension so the members can be concatenated afterwards.
    NO3_LE_ds_tmp = NO3_LE_ds_tmp.expand_dims({'ens_LE': 1})
    cfg_var_NO3.LE_ds.append(NO3_LE_ds_tmp)

    # Time elapsed since start_time, i.e. cumulative over members, not per member.
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('elasped time for reading LE, ' + str(imem) + ': ' + str(elapsed_time))

# Stack the members, label them 0..n-1 and load the result into memory.
cfg_var_NO3.LE_ds = xr.concat(cfg_var_NO3.LE_ds, dim='ens_LE')
cfg_var_NO3.LE_ds['ens_LE']=range(0, len(cfg_var_NO3.LE_file_list[0]))
cfg_var_NO3.LE_ds=cfg_var_NO3.LE_ds.compute()
    
# Total time including concat/compute; imem still holds the last loop index here.
end_time = time.time()
elapsed_time = end_time - start_time
print('elasped time for reading LE, ' + str(imem) + ': ' + str(elapsed_time))

elasped time for reading LE, 0: 54.70848989486694
elasped time for reading LE, 1: 96.36748456954956
elasped time for reading LE, 2: 140.1586754322052
elasped time for reading LE, 3: 183.7445194721222
elasped time for reading LE, 4: 228.88884925842285
elasped time for reading LE, 5: 274.0359888076782
elasped time for reading LE, 6: 320.6613619327545
elasped time for reading LE, 7: 363.5642306804657
elasped time for reading LE, 8: 406.83827233314514
elasped time for reading LE, 9: 450.9124381542206
elasped time for reading LE, 10: 488.1293435096741
elasped time for reading LE, 11: 525.7322189807892
elasped time for reading LE, 12: 560.7542510032654
elasped time for reading LE, 13: 596.3222925662994
elasped time for reading LE, 14: 632.4994781017303
elasped time for reading LE, 15: 668.1446907520294
elasped time for reading LE, 16: 703.5543518066406
elasped time for reading LE, 17: 743.3858289718628
elasped time for reading LE, 18: 787.665876865387
elasped time for reading LE, 19: 824.990

In [8]:
# HCST NO3
# One open_mfdataset call per initialisation year: the files of that year are reduced
# to annual means at the first z_t level by process_coords_surface_hcst and combined
# along an ensemble dimension and the 'year' dimension (renamed to lead year 'lyears').

cfg_var_NO3.HCST_ds = []
HCST_ds_xr = []

start_time = time.time()

# NOTE(review): presumably fills cfg_var_NO3.HCST_file_list / HCST_ensembles — confirm in CESM2_config.
cfg_var_NO3.HCST_path_load(cfg_var_NO3.var)

# Define the component name here instead of relying on the value left behind by the
# ODA/LE reading cells, so this cell also runs on its own after a kernel restart.
tmp_comp = cfg_var_NO3.comp

for iyear in range(0, len(cfg_var_NO3.HCST_file_list)):
    NO3_HCST_ds_tmp = xr.open_mfdataset(cfg_var_NO3.HCST_file_list[iyear],
                           chunks={'time': 4},
                           combine='nested',
                           concat_dim=[[*cfg_var_NO3.HCST_ensembles], 'year'],
                           parallel=True,
                           preprocess=lambda ds: process_coords_surface_hcst(ds, start_date, end_date, 'NO3', tmp_comp),
                           decode_cf=True,
                           decode_times=True)

    # Calendar years of one hindcast become lead years 1..5.
    NO3_HCST_ds_tmp = NO3_HCST_ds_tmp.rename({"year": "lyears"})
    NO3_HCST_ds_tmp['lyears']=range(1,6)
    # Add a length-1 initialisation-year dimension so the years can be concatenated afterwards.
    NO3_HCST_ds_tmp = NO3_HCST_ds_tmp.expand_dims({'iyear': 1})

    cfg_var_NO3.HCST_ds.append(NO3_HCST_ds_tmp)

    # Time elapsed since start_time, i.e. cumulative over years, not per year.
    end_time = time.time()
    elapsed_time = end_time - start_time
    print('elasped time for reading HCST NO3, ' + str(iyear) + ': ' + str(elapsed_time))

# Stack the initialisation years, give the dimensions their final names/labels
# (integer member index, integer initialisation year) and load the result into memory.
cfg_var_NO3.HCST_ds = xr.concat(cfg_var_NO3.HCST_ds, dim='iyear')
cfg_var_NO3.HCST_ds = cfg_var_NO3.HCST_ds.rename({"concat_dim": "ens_HCST"})
cfg_var_NO3.HCST_ds['ens_HCST']=range(0, len(cfg_var_NO3.HCST_file_list[0]))
cfg_var_NO3.HCST_ds['iyear']=range(cfg_var_NO3.year_s, cfg_var_NO3.year_e+1)
cfg_var_NO3.HCST_ds=cfg_var_NO3.HCST_ds.compute()

# Total time including concat/compute; iyear still holds the last loop index here.
end_time = time.time()
elapsed_time = end_time - start_time
print('elasped time for reading HCST NO3, ' + str(iyear) + ': ' + str(elapsed_time))

elasped time for reading HCST NO3, 0: 28.442138671875
elasped time for reading HCST NO3, 1: 33.0088152885437
elasped time for reading HCST NO3, 2: 37.62539768218994
elasped time for reading HCST NO3, 3: 42.31944513320923
elasped time for reading HCST NO3, 4: 46.81011724472046
elasped time for reading HCST NO3, 5: 51.33808398246765
elasped time for reading HCST NO3, 6: 55.989657402038574
elasped time for reading HCST NO3, 7: 60.6488778591156
elasped time for reading HCST NO3, 8: 65.86664843559265
elasped time for reading HCST NO3, 9: 70.37450361251831
elasped time for reading HCST NO3, 10: 74.95180678367615
elasped time for reading HCST NO3, 11: 79.52054691314697
elasped time for reading HCST NO3, 12: 84.10340881347656
elasped time for reading HCST NO3, 13: 88.74954223632812
elasped time for reading HCST NO3, 14: 93.27501630783081
elasped time for reading HCST NO3, 15: 97.98826432228088
elasped time for reading HCST NO3, 16: 103.54740571975708
elasped time for reading HCST NO3, 17: 108.

In [9]:
# Lead-year-1 hindcast series (first entry along 'lyears'), indexed by initialisation year.
# The 'iyear' coordinate holds integer years, so it is sliced with integer labels;
# date strings such as "1960-01-01" are not comparable with integer labels in a
# well-defined way.
cfg_var_NO3.HCST_ds_yearly = cfg_var_NO3.HCST_ds.sel(iyear=slice(1960, 2020)).isel(lyears=0).rename({"iyear": "year"})


In [10]:
# get rolling mean variables (NO3)

# ODA: 4-year running mean; the 'year' coordinate is replaced by the mean year of each window.
cfg_var_NO3.ODA_ds_4yr = cfg_var_NO3.ODA_ds.rolling(year=4, min_periods=4).mean()
obs_rolling_time_mean = cfg_var_NO3.ODA_ds['year'].rolling(year=4, min_periods=4).mean()
cfg_var_NO3.ODA_ds_4yr = cfg_var_NO3.ODA_ds_4yr.assign_coords(year=obs_rolling_time_mean)
# Remove the incomplete windows, whose averaged year is NaN.
valid_index = np.where(~np.isnan(cfg_var_NO3.ODA_ds_4yr['year']))[0]
cfg_var_NO3.ODA_ds_4yr = cfg_var_NO3.ODA_ds_4yr.isel(year=valid_index)
# Keep windows 1..57 (the first complete window is skipped).
# NOTE(review): presumably so the windows line up with lead years 2-5 of the hindcasts — confirm.
cfg_var_NO3.ODA_ds_4yr = cfg_var_NO3.ODA_ds_4yr.isel(year=range(1,58))

# LE: same processing as for ODA (obs_rolling_time_mean / valid_index are reused as scratch names).
cfg_var_NO3.LE_ds_4yr = cfg_var_NO3.LE_ds.rolling(year=4, min_periods=4).mean()
obs_rolling_time_mean = cfg_var_NO3.LE_ds['year'].rolling(year=4, min_periods=4).mean()
cfg_var_NO3.LE_ds_4yr = cfg_var_NO3.LE_ds_4yr.assign_coords(year=obs_rolling_time_mean)
valid_index = np.where(~np.isnan(cfg_var_NO3.LE_ds_4yr['year']))[0]
cfg_var_NO3.LE_ds_4yr = cfg_var_NO3.LE_ds_4yr.isel(year=valid_index)
cfg_var_NO3.LE_ds_4yr = cfg_var_NO3.LE_ds_4yr.isel(year=range(1,58))

# HCST: mean over lead years 2-5 for each initialisation year.
# 'iyear' holds integer years, so the slice bounds are integers rather than strings.
# hcst_da_ly25=cfg_var_NO3.HCST_ds.sel(iyear=slice("1996-01-01", "2016-12-31"), lyears=slice(2, 5))
hcst_da_ly25=cfg_var_NO3.HCST_ds.sel(iyear=slice(1959, 2016), lyears=slice(2, 5))
cfg_var_NO3.HCST_ds_4yr=hcst_da_ly25.mean(dim='lyears')
cfg_var_NO3.HCST_ds_4yr=cfg_var_NO3.HCST_ds_4yr.rename(iyear="year")
# Label the hindcast means with the window-centre years of the LE running mean.
cfg_var_NO3.HCST_ds_4yr['year']=cfg_var_NO3.LE_ds_4yr['year']



In [13]:

def _lag1_autocorr(da, mask_zeros=False):
    """Lag-1 autocorrelation of ``da`` along its ``year`` dimension.

    Parameters
    ----------
    da : xarray.DataArray with a ``year`` dimension.
    mask_zeros : if True, values close to zero (``np.isclose``) are set to NaN first.

    Returns
    -------
    ``xr.corr`` between the series and itself shifted by one step along ``year``;
    all other dimensions of ``da`` are kept.
    """
    if mask_zeros:
        da = da.where(~np.isclose(da, 0), np.nan)
    da_lag1 = da.shift(year=-1)
    # Use only the steps where both the value and its successor are present.
    valid = (~np.isnan(da)) & (~np.isnan(da_lag1))
    return xr.corr(da.where(valid), da_lag1.where(valid), dim='year')


# individual (ODA): one map per member, keeps the ens_ODA dimension
autocorr_NO3_ODA = _lag1_autocorr(
    cfg_var_NO3.ODA_ds['NO3'].sel(year=slice(1965, 2020)),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

# ensemble mean (ODA): members are averaged first, no ensemble dimension remains
autocorr_NO3_ODA_em = _lag1_autocorr(
    cfg_var_NO3.ODA_ds['NO3'].sel(year=slice(1965, 2020)).mean(dim='ens_ODA'),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

# individual 4yr (ODA): keeps the ens_ODA dimension
autocorr_NO3_ODA_ds_4yr = _lag1_autocorr(
    cfg_var_NO3.ODA_ds_4yr['NO3'].sel(year=slice(1965, 2020)),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

# ensemble mean 4yr (ODA): no ensemble dimension remains
autocorr_NO3_ODA_ds_4yr_em = _lag1_autocorr(
    cfg_var_NO3.ODA_ds_4yr['NO3'].sel(year=slice(1965, 2020)).mean(dim='ens_ODA'),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

In [14]:

def _lag1_autocorr(da, mask_zeros=False):
    """Lag-1 autocorrelation of ``da`` along its ``year`` dimension.

    Parameters
    ----------
    da : xarray.DataArray with a ``year`` dimension.
    mask_zeros : if True, values close to zero (``np.isclose``) are set to NaN first.

    Returns
    -------
    ``xr.corr`` between the series and itself shifted by one step along ``year``;
    all other dimensions of ``da`` are kept.
    """
    if mask_zeros:
        da = da.where(~np.isclose(da, 0), np.nan)
    da_lag1 = da.shift(year=-1)
    # Use only the steps where both the value and its successor are present.
    valid = (~np.isnan(da)) & (~np.isnan(da_lag1))
    return xr.corr(da.where(valid), da_lag1.where(valid), dim='year')


# individual (LE): one map per member, keeps the ens_LE dimension
autocorr_NO3_LE = _lag1_autocorr(
    cfg_var_NO3.LE_ds['NO3'].sel(year=slice(1965, 2020)),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

# ensemble mean (LE): members are averaged first, no ensemble dimension remains
autocorr_NO3_LE_em = _lag1_autocorr(
    cfg_var_NO3.LE_ds['NO3'].sel(year=slice(1965, 2020)).mean(dim='ens_LE'),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

# individual 4yr (LE): keeps the ens_LE dimension
autocorr_NO3_LE_ds_4yr = _lag1_autocorr(
    cfg_var_NO3.LE_ds_4yr['NO3'].sel(year=slice(1965, 2020)),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

# ensemble mean 4yr (LE): no ensemble dimension remains
autocorr_NO3_LE_ds_4yr_em = _lag1_autocorr(
    cfg_var_NO3.LE_ds_4yr['NO3'].sel(year=slice(1965, 2020)).mean(dim='ens_LE'),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

In [15]:

def _lag1_autocorr(da, mask_zeros=False):
    """Lag-1 autocorrelation of ``da`` along its ``year`` dimension.

    Parameters
    ----------
    da : xarray.DataArray with a ``year`` dimension.
    mask_zeros : if True, values close to zero (``np.isclose``) are set to NaN first.

    Returns
    -------
    ``xr.corr`` between the series and itself shifted by one step along ``year``;
    all other dimensions of ``da`` are kept.
    """
    if mask_zeros:
        da = da.where(~np.isclose(da, 0), np.nan)
    da_lag1 = da.shift(year=-1)
    # Use only the steps where both the value and its successor are present.
    valid = (~np.isnan(da)) & (~np.isnan(da_lag1))
    return xr.corr(da.where(valid), da_lag1.where(valid), dim='year')


# individual (HCST): one map per member, keeps the ens_HCST dimension
autocorr_NO3_HCST = _lag1_autocorr(
    cfg_var_NO3.HCST_ds_yearly['NO3'].sel(year=slice(1965, 2020)),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

# ensemble mean (HCST): members are averaged first, no ensemble dimension remains
autocorr_NO3_HCST_em = _lag1_autocorr(
    cfg_var_NO3.HCST_ds_yearly['NO3'].sel(year=slice(1965, 2020)).mean(dim='ens_HCST'),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

# individual 4yr (HCST): keeps the ens_HCST dimension
autocorr_NO3_HCST_ds_4yr = _lag1_autocorr(
    cfg_var_NO3.HCST_ds_4yr['NO3'].sel(year=slice(1965, 2020)),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

# ensemble mean 4yr (HCST): no ensemble dimension remains
autocorr_NO3_HCST_ds_4yr_em = _lag1_autocorr(
    cfg_var_NO3.HCST_ds_4yr['NO3'].sel(year=slice(1965, 2020)).mean(dim='ens_HCST'),
    mask_zeros=cfg_var_NO3.OBS_var == 'NO3')

In [17]:
# save the autocorrelation maps (ODA, LE, HCST) as NetCDF files in savefilepath
start_time = time.time()

# File stem -> result; every entry is written to "<savefilepath>/<stem>.nc".
_autocorr_outputs = {
    "autocorr_NO3_ODA": autocorr_NO3_ODA,
    "autocorr_NO3_ODA_em": autocorr_NO3_ODA_em,
    "autocorr_NO3_ODA_ds_4yr": autocorr_NO3_ODA_ds_4yr,
    "autocorr_NO3_ODA_ds_4yr_em": autocorr_NO3_ODA_ds_4yr_em,
    "autocorr_NO3_LE": autocorr_NO3_LE,
    "autocorr_NO3_LE_em": autocorr_NO3_LE_em,
    "autocorr_NO3_LE_ds_4yr": autocorr_NO3_LE_ds_4yr,
    "autocorr_NO3_LE_ds_4yr_em": autocorr_NO3_LE_ds_4yr_em,
    "autocorr_NO3_HCST": autocorr_NO3_HCST,
    "autocorr_NO3_HCST_em": autocorr_NO3_HCST_em,
    "autocorr_NO3_HCST_ds_4yr": autocorr_NO3_HCST_ds_4yr,
    "autocorr_NO3_HCST_ds_4yr_em": autocorr_NO3_HCST_ds_4yr_em,
}
for _stem, _result in _autocorr_outputs.items():
    _result.to_netcdf(savefilepath + "/" + _stem + ".nc")

end_time = time.time()
elapsed_time = end_time - start_time
# The message no longer includes `iyear`: that was a leftover loop index from the
# hindcast reading cell and has no meaning for this step.
print('elasped time for saving NO3 corr: ' + str(elapsed_time))

elasped time for saving NO3 corr, 60: 1.3934228420257568


In [18]:
# Display the output directory that the result files are written to.
savefilepath

'/mnt/lustre/proj/kimyy/tmp_python/HCST_skills_autocorr'

In [19]:
# Critical correlation coefficient for a two-sided t-test at significance level alpha.
# NOTE(review): N is non-integer, presumably an effective sample size — confirm its origin.
N = 10.37
alpha = 0.1  # two-sided, i.e. alpha/2 = 0.05 in each tail
df = N - 2   # degrees of freedom

# Step 1: t value that cuts off the upper alpha/2 tail (ppf(0.95) when alpha = 0.1)
t_crit = t.ppf(1 - alpha/2, df)

# Step 2: turn the critical t into a critical correlation, r = sqrt(t^2 / ((N - 2) + t^2))
_t_crit_sq = t_crit**2
rho_crit = np.sqrt(_t_crit_sq / ((N - 2) + _t_crit_sq))
print(
    f"N={N}, alpha={alpha}, df={df}",
    f"t_crit = {t_crit:.3f}",
    f"Correlation threshold ≈ ±{rho_crit:.3f}",
    sep="\n",
)

N=10.37, alpha=0.1, df=8.37
t_crit = 1.849
Correlation threshold ≈ ±0.539


In [21]:
# cfg_var_NO3.OBS_ds['lat']=cfg_var_NO3.HCST_ds_yearly['lat']
# cfg_var_NO3.OBS_ds['lon']=cfg_var_NO3.HCST_ds_yearly['lon']
# hcst_skill = xr.corr(cfg_var_NO3.OBS_ds['NO3'].sel(year=slice(1965, 2020)),
#                      cfg_var_NO3.HCST_ds_yearly['NO3'].sel(year=slice(1965, 2020)),
#                      dim='year')

# le_skill = xr.corr(cfg_var_NO3.OBS_ds['NO3'].sel(year=slice(1965, 2020)),
#                      cfg_var_NO3.LE_ds['NO3'].sel(year=slice(1965, 2020)),
#                      dim='year')


# common_years = np.intersect1d(cfg_var_NO3.OBS_ds.year, 
#                               cfg_var_NO3.HCST_ds_yearly.year)

# Correlation over 1965-2020 (along 'year') between the lead-year-1 hindcast series and the LE series.
# NOTE(review): neither input is averaged over its ensemble dimension here, so the result
# presumably carries both ens_HCST and ens_LE (every member pair) — confirm this is intended.
hcst_le_skill = xr.corr(cfg_var_NO3.HCST_ds_yearly['NO3'].sel(year=slice(1965, 2020)),
                     cfg_var_NO3.LE_ds['NO3'].sel(year=slice(1965, 2020)),
                     dim='year')

In [22]:
# Write the HCST-vs-LE correlation to NetCDF.
# NOTE(review): the file name contains "obs" although hcst_le_skill is computed from the
# hindcast and LE data only — confirm the name is intended.
hcst_le_skill.to_netcdf(savefilepath + "/hcst_le_skill_obs_NO3" + ".nc")