In [None]:
# this script is for the calculation of PDFs in Figure S5 of Rodgers et al. 2021 (https://doi.org/10.5194/esd-2021-50). 
# If you have any questions, please contact the author of this notebook.
# Author: Lei Huang (huanglei[AT]pusan[DOT]ac[DOT]kr)

# import

In [64]:
%matplotlib inline

import numpy as np
import xarray as xr 
import matplotlib.pyplot as plt
import matplotlib as mpl
import glob
import dask.array as da
import pandas as pd


# setting up parallel computing

In [None]:
## Start the dask scheduler/workers from the command line before running this cell:
## mpirun --np 6 dask-mpi --scheduler-file scheduler.json --no-nanny --dashboard-address :8785 --memory-limit=60e9

# Connect this notebook to that scheduler. Replace the placeholder string with the
# path of the scheduler file named in the command above (--scheduler-file scheduler.json).
from dask.distributed import Client
client = Client(scheduler_file = 'the_path_for_your_scheduler_json_file')

# functions for reading ensembles in parallel

In [66]:
# preprocess dataset prior to concatenation
# Initial keep-list (names that must survive preprocessing). Every later cell
# reassigns `variables` and `exceptcv` before it reads its own files.
variables = list()
exceptcv = ['time', 'nlat', 'nlon', 'z_t', 'lon', 'lat', 'gw', 'landfrac', 'area'] + variables
def def_process_coords(exceptcv = ()):
    """Build a ``preprocess`` callback that keeps only the listed variables.

    Parameters
    ----------
    exceptcv : iterable of str
        Names of the coordinates and data variables to keep. A private copy
        is taken, so mutating the caller's list afterwards does not change a
        callback that was already built.

    Returns
    -------
    callable
        ``process_coords(ds, except_coord_vars=...)``, which returns ``ds``
        without every coordinate and data variable whose name is not in
        ``except_coord_vars`` (default: the names given here).
    """
    keep_names = list(exceptcv)

    def process_coords(ds, except_coord_vars=keep_names):
        # Iterating ds.coords / ds.data_vars yields the variable names.
        unwanted = [name for name in ds.coords if name not in except_coord_vars]
        unwanted += [name for name in ds.data_vars if name not in except_coord_vars]
        # drop_vars replaces the deprecated Dataset.drop for removing variables by name.
        return ds.drop_vars(unwanted)
    return process_coords



In [67]:
# define function to read in files for the projection (BSSP370) simulations
def read_in(var, exceptcv, domain='lnd/', freq='day_1/', stream='h6', chunks=dict(time=365), ens_s = 0, ens_e = 100,
            ens_dir="mother_directory_for_ensemble_files"):
    """Open the last two time-series files of every BSSP370 member as one dataset.

    Parameters
    ----------
    var : str
        Variable part of the file name, dots included (e.g. '.PRECT.').
    exceptcv : list of str
        Names of the coordinates / data variables to keep; forwarded to
        ``def_process_coords``.
    domain : str
        Component sub-directory inside each member directory (e.g. 'atm/').
    freq : str
        Sub-directory below 'proc/tseries/' (e.g. 'day_1/').
    stream : str
        Part of the file name that precedes ``var``; may contain glob wildcards.
    chunks : dict
        Chunk sizes handed to ``xr.open_mfdataset``.
    ens_s, ens_e : int
        Start / stop positions used to slice the sorted list of member directories.
    ens_dir : str
        Directory that contains the member directories. A trailing path
        separator is optional.

    Returns
    -------
    xarray.Dataset
        Members stacked along a new 'ensemble' dimension (labelled with the
        text that follows 'LE2-' in the member directory name), with each
        member's files joined along 'time'.

    Raises
    ------
    FileNotFoundError
        If no member directory matches, or a member has no file for ``var``.
    """
    import os

    # os.path.join makes the search work whether or not ens_dir ends with a
    # separator; the trailing "" keeps the final "/" so only directories match.
    member_dirs = sorted(glob.glob(
        os.path.join(ens_dir, "b.e21.BSSP370*.f09_g17*", "")))[ens_s:ens_e]
    if not member_dirs:
        raise FileNotFoundError(
            "no BSSP370 member directory found under " + repr(ens_dir))

    # Member name = last path component; this does not depend on the parent
    # directory being called 'archive'.
    projens_names = [os.path.basename(os.path.normpath(member)) for member in member_dirs]

    proj_ncfiles = []
    for member_dir in member_dirs:
        proj_fnames = sorted(glob.glob(os.path.join(
            member_dir, domain, "proc", "tseries", freq, "*" + stream + var + "*")))
        if not proj_fnames:
            raise FileNotFoundError(
                "no file matching " + repr("*" + stream + var + "*") + " for member " + member_dir)
        # Only the last two files (in sorted order) of each member are opened.
        proj_ncfiles.append(proj_fnames[-2:])

    ens_numbers = [name.split('LE2-')[1] for name in projens_names]
    # Nested combine: the outer level is the member (its dimension comes out
    # named 'concat_dim' because a plain list of labels is passed), the inner
    # level is 'time'.
    proj_ds = xr.open_mfdataset(proj_ncfiles,
                                chunks=chunks,
                                preprocess=def_process_coords(exceptcv),
                                combine='nested',
                                concat_dim=[[*ens_numbers], 'time'],
                                parallel=True)

    ens_ds = proj_ds.rename({'concat_dim': 'ensemble'})
    return ens_ds


# PDF for precipitation in India

In [6]:
# Historical daily PRECT: files whose name contains '.PRECT.1980',
# BHISTcmip6 runs first, then BHISTsmbb runs (each group sorted by path).
variables = ['PRECT']
exceptcv = ['time', 'lat', 'lon', 'gw'] + variables
prect_hist_patterns = (
    'mother_directory_for_ensemble_files/b.e21.BHISTcmip6*/atm/proc/tseries/day_1/*.PRECT.1980*',
    'mother_directory_for_ensemble_files/b.e21.BHISTsmbb*/atm/proc/tseries/day_1/*.PRECT.1980*',
)
ncfiles = [path for pattern in prect_hist_patterns for path in sorted(glob.glob(pattern))]
# member label = the 8 characters that follow 'LE2-' in each path
hist_ens_numbers = [path.split('LE2-')[1][:8] for path in ncfiles]
prect_hist_opened = xr.open_mfdataset(
    ncfiles,
    chunks=dict(time=365),
    combine='nested',
    preprocess=def_process_coords(exceptcv),
    concat_dim=[list(hist_ens_numbers)],
    parallel=True,
)
# the member dimension comes out named 'concat_dim'; call it 'ensemble'
prect_hist_ds = prect_hist_opened.rename({'concat_dim': 'ensemble'})


In [7]:
# Daily PRECT from the projection runs, with read_in's default member range.
# `exceptcv` is the keep-list defined for PRECT in the cell above.
prect_proj_ds = read_in(
    '.PRECT.',
    exceptcv,
    domain='atm/',
    freq='day_1/',
    stream='h1',
    chunks={'time': 365},
)

In [8]:
# NEP is read here only to obtain the continent mask (landfrac) from CLM5,
# which is used to clip the continent out of the precipitation arrays.
variables = ['NEP']
exceptcv = ['time', 'lat', 'lon', 'gw', 'landfrac', 'area'] + variables
nep_hist_patterns = (
    'mother_directory_for_ensemble_files/b.e21.BHISTcmip6*/lnd/proc/tseries/day_1/*.NEP.1980*',
    'mother_directory_for_ensemble_files/b.e21.BHISTsmbb*/lnd/proc/tseries/day_1/*.NEP.1980*',
)
ncfiles = [path for pattern in nep_hist_patterns for path in sorted(glob.glob(pattern))]
# member label = the 8 characters that follow 'LE2-' in each path
hist_ens_numbers = [path.split('LE2-')[1][:8] for path in ncfiles]
nep_hist_opened = xr.open_mfdataset(
    ncfiles,
    chunks=dict(time=365),
    combine='nested',
    preprocess=def_process_coords(exceptcv),
    concat_dim=[list(hist_ens_numbers)],
    parallel=True,
)
nep_hist_ds = nep_hist_opened.rename({'concat_dim': 'ensemble'})

# Projection counterpart; only member directories 10..99 of the sorted list are read.
nep_proj_ds = read_in(
    '.NEP.',
    exceptcv,
    domain='lnd/',
    freq='day_1/',
    stream='h5',
    chunks={'time': 365},
    ens_s=10,
    ens_e=100,
)

In [288]:
# India box: lat 7..30, lon 68..89; the projection is limited to 2090-2099.
# NOTE(review): the factor 24*3600*1000 presumably converts m/s to mm/day — confirm
# against the units attribute of PRECT.
india_box = dict(lat=slice(7, 30), lon=slice(68, 89))
proj_decade = slice('2090-01-01', '2099-12-31')
prect_hist_india = prect_hist_ds.PRECT.sel(**india_box) * 24 * 3600 * 1000
prect_proj_india = prect_proj_ds.PRECT.sel(time=proj_decade, **india_box) * 24 * 3600 * 1000

In [289]:
# Histogram weights: gw over the same latitude band, expanded to the shape of the
# precipitation arrays and chunked along time exactly like them (axis 1 of .chunks).
gw_hist_band = prect_hist_ds.gw.sel(lat=slice(7, 30))
gw_hist_india = gw_hist_band.broadcast_like(prect_hist_india).chunk(
    dict(time=prect_hist_india.chunks[1]))

gw_proj_band = prect_proj_ds.gw.sel(lat=slice(7, 30), time=slice('2090-01-01', '2099-12-31'))
gw_proj_india = gw_proj_band.broadcast_like(prect_proj_india).chunk(
    dict(time=prect_proj_india.chunks[1]))

In [290]:
# Land fraction over the India box, taken from the NEP (land model) datasets.
india_latlon = dict(lat=slice(7, 30), lon=slice(68, 89))
landfrac_hist_india = nep_hist_ds.landfrac.sel(**india_latlon)
landfrac_proj_india = nep_proj_ds.landfrac.sel(**india_latlon)

In [291]:
# Overwrite the lat/lon coordinates of landfrac with those of the precipitation arrays.
# NOTE(review): presumably needed so that `where` in the next cell aligns both arrays on
# identical coordinate values — confirm that the land and atmosphere files differ here.
landfrac_hist_india['lat'] = prect_hist_india.lat
landfrac_hist_india['lon'] = prect_hist_india.lon
landfrac_proj_india['lat'] = prect_proj_india.lat
landfrac_proj_india['lon'] = prect_proj_india.lon

In [292]:
# Clip the continent from the precipitation arrays: grid cells with a land
# fraction of at least 0.9 count as continent, all others become NaN.
# The mask comes from the first member (and, for the projection, the first time step).
india_land_hist = landfrac_hist_india[0, ...] >= 0.9
india_land_proj = landfrac_proj_india[0, 0, ...] >= 0.9
prect_hist_india = prect_hist_india.where(india_land_hist)
prect_proj_india = prect_proj_india.where(india_land_proj)

In [297]:
# gw-weighted PDF of historical precipitation: bin edges 0, 4, ..., 600, normalised to a density.
prect_edges_hist = np.arange(0, 600.1, 4)
h_hist_prect_india_raw, bins_hist_prect_india_raw = np.histogram(
    prect_hist_india,
    bins=prect_edges_hist,
    weights=gw_hist_india,
    density=True,
)

In [298]:
# gw-weighted PDF of projected precipitation: bin edges 0, 4, ..., 600, normalised to a density.
prect_edges_proj = np.arange(0, 600.1, 4)
h_proj_prect_india_raw, bins_proj_prect_india_raw = np.histogram(
    prect_proj_india,
    bins=prect_edges_proj,
    weights=gw_proj_india,
    density=True,
)

In [299]:
# One row per bin: bin centre (right edge minus half of the 4-wide bin),
# historical density, projected density.
prect_bin_centres = bins_hist_prect_india_raw[1:] - 2
prect_pdf_table = pd.DataFrame(
    data=np.column_stack((prect_bin_centres, h_hist_prect_india_raw, h_proj_prect_india_raw)),
    columns=['bins', 'h_hist', 'h_proj'],
)
prect_pdf_table.to_csv('path_csv_file', index=False)


# PDF for Arctic Temperature

In [175]:
# Historical daily TS: files whose name contains '.TS.1980',
# BHISTcmip6 runs first, then BHISTsmbb runs (each group sorted by path).
variables = ['TS']
exceptcv = ['time', 'lat', 'lon', 'gw'] + variables
ts_hist_patterns = (
    'mother_directory_for_ensemble_files/b.e21.BHISTcmip6*/atm/proc/tseries/day_1/*.TS.1980*',
    'mother_directory_for_ensemble_files/b.e21.BHISTsmbb*/atm/proc/tseries/day_1/*.TS.1980*',
)
ncfiles = [path for pattern in ts_hist_patterns for path in sorted(glob.glob(pattern))]
# member label = the 8 characters that follow 'LE2-' in each path
hist_ens_numbers = [path.split('LE2-')[1][:8] for path in ncfiles]
ts_hist_opened = xr.open_mfdataset(
    ncfiles,
    chunks=dict(time=365),
    combine='nested',
    preprocess=def_process_coords(exceptcv),
    concat_dim=[list(hist_ens_numbers)],
    parallel=True,
)
ts_hist_ds = ts_hist_opened.rename({'concat_dim': 'ensemble'})


In [176]:
# Daily TS from the projection runs, with read_in's default member range.
ts_proj_ds = read_in(
    '.TS.',
    exceptcv,
    domain='atm/',
    freq='day_1/',
    stream='h1',
    chunks={'time': 365},
)

In [180]:
# Arctic band (lat 60..90); subtracting 273.15 turns Kelvin into degrees Celsius.
arctic_band = slice(60, 90)
ts_hist_Ac = ts_hist_ds.TS.sel(lat=arctic_band) - 273.15
ts_proj_Ac = ts_proj_ds.TS.sel(lat=arctic_band, time=slice('2090-01-01', '2099-12-31')) - 273.15

In [185]:
# Histogram weights: gw over the Arctic band, expanded to the shape of the
# temperature arrays. Each weight array is chunked along time like the array it
# weights (axis 1 of .chunks), so the projection weights follow ts_proj_Ac.
gw_hist_Ac = ts_hist_ds.gw.sel(lat = slice(60,90)).broadcast_like(ts_hist_Ac).chunk({'time':ts_hist_Ac.chunks[1]})
gw_proj_Ac = ts_proj_ds.gw.sel(lat = slice(60,90), time = slice('2090-01-01','2099-12-31')).broadcast_like(ts_proj_Ac).chunk({'time':ts_proj_Ac.chunks[1]})

In [186]:
# gw-weighted PDF of historical Arctic temperature: bin edges -50, -49.5, ..., 40.
ts_edges_hist = np.arange(-50, 40.5, 0.5)
h_hist_ts_Ac_raw, bins_hist_ts_Ac_raw = np.histogram(
    ts_hist_Ac,
    bins=ts_edges_hist,
    weights=gw_hist_Ac,
    density=True,
)

In [187]:
# gw-weighted PDF of projected Arctic temperature: bin edges -50, -49.5, ..., 40.
ts_edges_proj = np.arange(-50, 40.5, 0.5)
h_proj_ts_Ac_raw, bins_proj_ts_Ac_raw = np.histogram(
    ts_proj_Ac,
    bins=ts_edges_proj,
    weights=gw_proj_Ac,
    density=True,
)

In [188]:
# One row per bin: bin centre (right edge minus half of the 0.5-wide bin),
# historical density, projected density.
ts_bin_centres = bins_hist_ts_Ac_raw[1:] - 0.25
ts_pdf_table = pd.DataFrame(
    data=np.column_stack((ts_bin_centres, h_hist_ts_Ac_raw, h_proj_ts_Ac_raw)),
    columns=['bins', 'h_hist', 'h_proj'],
)
ts_pdf_table.to_csv('path_csv_file', index=False)


# PDF for AMOC

In [68]:
# Historical monthly MOC: files whose name contains '.MOC.1980',
# BHISTcmip6 runs first, then BHISTsmbb runs (each group sorted by path).
# No `chunks` argument is passed for this variable.
variables = ['MOC']
exceptcv = ['time', 'moc_comp', 'transport_reg', 'lat_aux_grid'] + variables
amoc_hist_patterns = (
    'mother_directory_for_ensemble_files/b.e21.BHISTcmip6*/ocn/proc/tseries/month_1/*.MOC.1980*',
    'mother_directory_for_ensemble_files/b.e21.BHISTsmbb*/ocn/proc/tseries/month_1/*.MOC.1980*',
)
ncfiles = [path for pattern in amoc_hist_patterns for path in sorted(glob.glob(pattern))]
# member label = the 8 characters that follow 'LE2-' in each path
hist_ens_numbers = [path.split('LE2-')[1][:8] for path in ncfiles]
amoc_hist_opened = xr.open_mfdataset(
    ncfiles,
    combine='nested',
    preprocess=def_process_coords(exceptcv),
    concat_dim=[list(hist_ens_numbers)],
    parallel=True,
)
amoc_hist_ds = amoc_hist_opened.rename({'concat_dim': 'ensemble'})


In [69]:
# Monthly MOC from the projection runs; chunking and member range are read_in's defaults.
amoc_proj_ds = read_in(
    '.MOC.',
    exceptcv,
    domain='ocn/',
    freq='month_1/',
    stream='h',
)


In [70]:
# AMOC index: MOC at the auxiliary-grid latitude nearest 26.5, maximum over moc_z.
# NOTE(review): moc_comp=0 / transport_reg=1 presumably select the total overturning
# in the Atlantic region — confirm against the file metadata.
amoc_hist_at_lat = amoc_hist_ds.MOC.isel(moc_comp=0, transport_reg=1).sel(
    lat_aux_grid=[26.5], method='nearest')
amoc_hist = amoc_hist_at_lat.max('moc_z').mean('lat_aux_grid').compute()

amoc_proj_at_lat = amoc_proj_ds.MOC.isel(moc_comp=0, transport_reg=1).sel(
    lat_aux_grid=[26.5], method='nearest')
amoc_proj_decade = amoc_proj_at_lat.sel(time=slice('2090-02-01', '2100-01-01'))
amoc_proj = amoc_proj_decade.max('moc_z').mean('lat_aux_grid').compute()

In [71]:
# Unweighted PDF of the historical AMOC index: bin edges 0, 0.5, ..., 30.
amoc_edges_hist = np.arange(0, 30.5, 0.5)
h_hist_amoc_raw, bins_hist_amoc_raw = np.histogram(
    amoc_hist,
    bins=amoc_edges_hist,
    density=True,
)

In [72]:
# Unweighted PDF of the projected AMOC index: bin edges 0, 0.5, ..., 30.
amoc_edges_proj = np.arange(0, 30.5, 0.5)
h_proj_amoc_raw, bins_proj_amoc_raw = np.histogram(
    amoc_proj,
    bins=amoc_edges_proj,
    density=True,
)

In [73]:
# One row per bin: bin centre (right edge minus half of the 0.5-wide bin),
# historical density, projected density.
amoc_bin_centres = bins_hist_amoc_raw[1:] - 0.25
amoc_pdf_table = pd.DataFrame(
    data=np.column_stack((amoc_bin_centres, h_hist_amoc_raw, h_proj_amoc_raw)),
    columns=['bins', 'h_hist', 'h_proj'],
)
amoc_pdf_table.to_csv('path_csv_file', index=False)


# PDF for Nino3.4 U-850hpa

In [203]:
# Historical daily U850: files whose name contains '.U850.1980',
# BHISTcmip6 runs first, then BHISTsmbb runs (each group sorted by path).
variables = ['U850']
exceptcv = ['time', 'lat', 'lon', 'gw'] + variables
u850_hist_patterns = (
    'mother_directory_for_ensemble_files/b.e21.BHISTcmip6*/atm/proc/tseries/day_1/*.U850.1980*',
    'mother_directory_for_ensemble_files/b.e21.BHISTsmbb*/atm/proc/tseries/day_1/*.U850.1980*',
)
ncfiles = [path for pattern in u850_hist_patterns for path in sorted(glob.glob(pattern))]
# member label = the 8 characters that follow 'LE2-' in each path
hist_ens_numbers = [path.split('LE2-')[1][:8] for path in ncfiles]
u850_hist_opened = xr.open_mfdataset(
    ncfiles,
    chunks=dict(time=365),
    combine='nested',
    preprocess=def_process_coords(exceptcv),
    concat_dim=[list(hist_ens_numbers)],
    parallel=True,
)
u850_hist_ds = u850_hist_opened.rename({'concat_dim': 'ensemble'})


In [204]:
# Daily U850 from the projection runs, with read_in's default member range.
u850_proj_ds = read_in(
    '.U850.',
    exceptcv,
    domain='atm/',
    freq='day_1/',
    stream='h1',
    chunks={'time': 365},
)

In [205]:
# Nino3.4 box: lat -5..5, lon 190..240; the projection is limited to 2090-2099.
nino34_box = dict(lat=slice(-5, 5), lon=slice(190, 240))
u850_hist_nino = u850_hist_ds.U850.sel(**nino34_box)
u850_proj_nino = u850_proj_ds.U850.sel(time=slice('2090-01-01', '2099-12-31'), **nino34_box)

In [209]:
# Histogram weights: gw over the same latitude band, expanded to the shape of the
# wind arrays and chunked along time exactly like them (axis 1 of .chunks).
gw_hist_equator = u850_hist_ds.gw.sel(lat=slice(-5, 5))
gw_hist_nino = gw_hist_equator.broadcast_like(u850_hist_nino).chunk(
    dict(time=u850_hist_nino.chunks[1]))

gw_proj_equator = u850_proj_ds.gw.sel(lat=slice(-5, 5), time=slice('2090-01-01', '2099-12-31'))
gw_proj_nino = gw_proj_equator.broadcast_like(u850_proj_nino).chunk(
    dict(time=u850_proj_nino.chunks[1]))

In [210]:
# gw-weighted PDF of historical U850 in the Nino3.4 box: bin edges -30, -29.5, ..., 20.
u850_edges_hist = np.arange(-30, 20.5, 0.5)
h_hist_u850_nino_raw, bins_hist_u850_nino_raw = np.histogram(
    u850_hist_nino,
    bins=u850_edges_hist,
    weights=gw_hist_nino,
    density=True,
)

In [211]:
# gw-weighted PDF of projected U850 in the Nino3.4 box: bin edges -30, -29.5, ..., 20.
u850_edges_proj = np.arange(-30, 20.5, 0.5)
h_proj_u850_nino_raw, bins_proj_u850_nino_raw = np.histogram(
    u850_proj_nino,
    bins=u850_edges_proj,
    weights=gw_proj_nino,
    density=True,
)

In [212]:
# One row per bin: bin centre (right edge minus half of the 0.5-wide bin),
# historical density, projected density.
u850_bin_centres = bins_hist_u850_nino_raw[1:] - 0.25
u850_pdf_table = pd.DataFrame(
    data=np.column_stack((u850_bin_centres, h_hist_u850_nino_raw, h_proj_u850_nino_raw)),
    columns=['bins', 'h_hist', 'h_proj'],
)
u850_pdf_table.to_csv('path_csv_file', index=False)


# PDF for Chlorophyll in Southern Ocean

In [86]:
# In the Biogeochemistry module, chlorophyll concentration equals the sum of diatChl_SURF, diazChl_SURF, and spChl_SURF
## read in chlorophyll for 1980-1989
# (variable name, BHISTcmip6 file pattern, BHISTsmbb file pattern); the three
# variables are opened in this order with identical settings.
chl_hist_sources = (
    ('diatChl_SURF',
     'mother_directory_for_ensemble_files/b.e21.BHISTcmip6*/ocn/proc/tseries/day_1/*.diatChl_SURF.1980*',
     'mother_directory_for_ensemble_files/b.e21.BHISTsmbb*/ocn/proc/tseries/day_1/*.diatChl_SURF.1980*'),
    ('diazChl_SURF',
     'mother_directory_for_ensemble_files/b.e21.BHISTcmip6*/ocn/proc/tseries/day_1/*.diazChl_SURF.1980*',
     'mother_directory_for_ensemble_files/b.e21.BHISTsmbb*/ocn/proc/tseries/day_1/*.diazChl_SURF.1980*'),
    ('spChl_SURF',
     'mother_directory_for_ensemble_files/b.e21.BHISTcmip6*/ocn/proc/tseries/day_1/*.spChl_SURF.1980*',
     'mother_directory_for_ensemble_files/b.e21.BHISTsmbb*/ocn/proc/tseries/day_1/*.spChl_SURF.1980*'),
)
chl_hist_opened = {}
for chl_name, cmip6_pattern, smbb_pattern in chl_hist_sources:
    variables = [chl_name]
    exceptcv = ['time', 'nlat', 'nlon', 'z_t', 'TAREA', *variables]
    ncfiles = sorted(glob.glob(cmip6_pattern)) + sorted(glob.glob(smbb_pattern))
    # member label = the 8 characters that follow 'LE2-' in each path
    hist_ens_numbers = [path.split('LE2-')[1][:8] for path in ncfiles]
    chl_hist_opened[chl_name] = xr.open_mfdataset(
        ncfiles,
        chunks=dict(nlat=192, nlon=160, time=365),
        combine='nested',
        preprocess=def_process_coords(exceptcv),
        concat_dim=[list(hist_ens_numbers)],
        parallel=True,
    ).rename({'concat_dim': 'ensemble'})

# After the loop `ncfiles`, `hist_ens_numbers`, `variables` and `exceptcv` hold the
# spChl_SURF values; a later cell reads `ncfiles[-1]`.
tchl_hist_ds = chl_hist_opened['diatChl_SURF']
zchl_hist_ds = chl_hist_opened['diazChl_SURF']
spchl_hist_ds = chl_hist_opened['spChl_SURF']


In [87]:
## read in chlorophyll for 2090-2099
# (variable name, file-name token handed to read_in); the three variables are
# read in this order with identical settings.
chl_proj_sources = (
    ('diatChl_SURF', '.diatChl_SURF.'),
    ('diazChl_SURF', '.diazChl_SURF.'),
    ('spChl_SURF', '.spChl_SURF.'),
)
chl_proj_opened = {}
for chl_name, chl_token in chl_proj_sources:
    variables = [chl_name]
    exceptcv = ['time', 'nlat', 'nlon', 'z_t', 'TAREA', *variables]
    chl_proj_opened[chl_name] = read_in(
        chl_token,
        exceptcv,
        domain='ocn/',
        freq='day_1/',
        stream='h*',
        chunks=dict(nlat=192, nlon=160, time=365),
    )

# After the loop `variables` and `exceptcv` hold the spChl_SURF values.
tchl_proj_ds = chl_proj_opened['diatChl_SURF']
zchl_proj_ds = chl_proj_opened['diazChl_SURF']
spchl_proj_ds = chl_proj_opened['spChl_SURF']


In [109]:
# Take TLAT / TLONG from the last file in `ncfiles` (at this point the list built
# by the most recent cell that assigned it). The file is opened once, both fields
# are loaded into memory, and the handle is closed when the `with` block ends.
with xr.open_dataset(ncfiles[-1]) as ocean_grid_ds:
    TLAT = ocean_grid_ds.TLAT.load()
    TLONG = ocean_grid_ds.TLONG.load()

In [93]:
# Total surface chlorophyll = diatChl_SURF + diazChl_SURF + spChl_SURF.
chl_hist = (
    tchl_hist_ds.diatChl_SURF
    + zchl_hist_ds.diazChl_SURF
    + spchl_hist_ds.spChl_SURF
)
# NOTE(review): the projection window starts on 2090-01-02 and ends on 2100-01-01,
# presumably because of how the ocean files stamp each day — confirm.
chl_proj_window = slice('2090-01-02', '2100-01-01')
chl_proj = (
    tchl_proj_ds.diatChl_SURF.sel(time=chl_proj_window)
    + zchl_proj_ds.diazChl_SURF.sel(time=chl_proj_window)
    + spchl_proj_ds.spChl_SURF.sel(time=chl_proj_window)
)

In [97]:
# Histogram weights: TAREA expanded to the shape of the chlorophyll arrays and
# chunked along time exactly like them (axis 1 of .chunks).
tarea_hist = tchl_hist_ds.TAREA.broadcast_like(chl_hist).chunk(
    dict(time=chl_hist.chunks[1]))

tarea_proj_window = tchl_proj_ds.TAREA.sel(time=slice('2090-01-02', '2100-01-01'))
tarea_proj = tarea_proj_window.broadcast_like(chl_proj).chunk(
    dict(time=chl_proj.chunks[1]))

In [None]:
# Southern Ocean band: grid cells with -60 <= TLAT <= -40. Cells outside the band
# become NaN, and rows/columns that lie entirely outside it are dropped.
southern_ocean = (TLAT >= -60) & (TLAT <= -40)
chl_hist_SO = chl_hist.where(southern_ocean, drop=True)
chl_proj_SO = chl_proj.where(southern_ocean, drop=True)
tarea_hist_SO = tarea_hist.where(southern_ocean, drop=True)
tarea_proj_SO = tarea_proj.where(southern_ocean, drop=True)

In [162]:
# TAREA-weighted PDF of historical Southern Ocean chlorophyll: bin edges 0, 0.2, ..., 20.
chl_edges_hist = np.arange(0, 20.2, 0.2)
h_hist_chl_SO_raw, bins_hist_chl_SO_raw = np.histogram(
    chl_hist_SO,
    bins=chl_edges_hist,
    weights=tarea_hist_SO,
    density=True,
)

In [163]:
# TAREA-weighted PDF of projected Southern Ocean chlorophyll: bin edges 0, 0.2, ..., 20.
chl_edges_proj = np.arange(0, 20.2, 0.2)
h_proj_chl_SO_raw, bins_proj_chl_SO_raw = np.histogram(
    chl_proj_SO,
    bins=chl_edges_proj,
    weights=tarea_proj_SO,
    density=True,
)

In [165]:
# One row per bin: bin centre (right edge minus half of the 0.2-wide bin),
# historical density, projected density.
chl_bin_centres = bins_hist_chl_SO_raw[1:] - 0.1
chl_pdf_table = pd.DataFrame(
    data=np.column_stack((chl_bin_centres, h_hist_chl_SO_raw, h_proj_chl_SO_raw)),
    columns=['bins', 'h_hist', 'h_proj'],
)
chl_pdf_table.to_csv('path_csv_file', index=False)
