# CESM2 Large Ensemble 2 Investiagtion

Daily files are located under `/glade/campaign/cgd/cesm/CESM2-LE/timeseries/atm/proc/tseries/day_1/WSPDSRFAV/*` on the NCAR server.

__Variables analyzed__
- `WSPDSRFAV`: Horizontal total wind speed average at the surface [$m \ s^{-1}$]
- _WSPDSRFAV anomaly_: artificially constructed according to $x_i-\bar{x}_{\text{time}}$

In [1]:
import matplotlib.pyplot as plt
from matplotlib import colors
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.ticker import LatitudeFormatter, LongitudeFormatter
import nc_time_axis
import numpy as np
import pandas as pd
import pwlf
import xarray as xr
import cf_xarray as cfxr   # to use cf_xarray attributes
import regionmask
from glob import glob
import scienceplots
%matplotlib inline
plt.style.use(["nature", "notebook"])

xr.set_options(keep_attrs=True)
%load_ext rich
from rich import print  # pretty printing
from tqdm import tqdm  # progress bar
import warnings  # deal with warnings

# To access collection
import dask
import intake
from dask_jobqueue import PBSCluster
from dask.distributed import Client

# Play nice with CMIP6 data
import xclim.ensembles as ensembles

## Helper functions

In [2]:
def mask_data(data, map, regions: None, drop=False, reverse=False):
    """Mask xarray data based on region names

    Args:
        data (xarray dataset): xarray dataset to mask
        map (regionmask): regionmask object
        regions (list or None): list of region names to mask. if None, all regions are taken. Defaults to None.
        drop (bool, optional): Whether to drop when masking. Defaults to False.
        reverse (bool, optional): Whether to mask the inverse of the regions. Defaults to False.

    Returns:
        xarray dataset: masked dataset
    """
    # Load the region mask
    mask = map.mask(data.cf['lat'], data.cf['lon'])
    # Extract keys for the region
    id_dict = map.region_ids
    # Get region names
    if regions is None:
        names = id_dict.keys()
    else:
        # Coerce region names to uppercase
        regions = [region.upper() for region in regions]
        names = [name for name in id_dict.keys() if str(name).upper() in regions]
        assert len(names) == len(regions), 'Not enough regions found'
    # Get the key for the regions
    keys = [id_dict[name] for name in names]
    # Apply the mask to the data
    if reverse:
        masked_data = data.where(~mask.isin(keys), drop=drop)
    else:
        masked_data = data.where(mask.isin(keys), drop=drop)
    return masked_data

## Spin up Dask cluster

In [None]:
# Create our NCAR Cluster - which uses PBSCluster under the hood
num_jobs = 5
cluster = PBSCluster(
    job_name = 'valencig-dask-hpc',
    cores = 4,
    memory = '10GiB',
    local_directory = '/glade/u/home/valencig/spilled/',
    log_directory = '/glade/u/home/valencig/worker-logs/',
    queue = 'casper',
    walltime = '02:00:00', # Change wall time if needed
    interface = 'ext'
)


# Spin up workers
cluster.scale(jobs=num_jobs)

# Assign the cluster to our Client
client = Client(cluster)

# Block progress until workers have spawned
client.wait_for_workers(num_jobs)
client

## Create ensemble

### Get simulation names

In [3]:
# Get list of all files
all_files = glob('/glade/campaign/cgd/cesm/CESM2-LE/timeseries/atm/proc/tseries/day_1/WSPDSRFAV/*')
# Simulation runs
simulations = set([f.split('.')[2] for f in all_files])
simulations

[1m{[0m[32m'BHISTcmip6'[0m, [32m'BSSP370cmip6'[0m, [32m'BSSP370smbb'[0m, [32m'BHISTsmbb'[0m[1m}[0m

### Load one simulation

In [4]:
# Extract all member numbers
sim_files = [f for f in all_files if f.split('.')[2] == 'BSSP370smbb']
members = list(set([f.split('.')[4] for f in sim_files]))
# Create list to store datasets for each member
sim_list = []
for member in tqdm(members, desc="Loading members..."):
    # Get files for this member
    m_files = [f for f in sim_files if f.split('.')[4] == member]
    # Load dataset
    ds = xr.open_mfdataset(m_files)
    # Remove 2100 (data extends to 2099)
    sliced = ds.sel(time=slice(None, '2099'))
    # Expand dimension name
    sim_list.append(sliced.WSPDSRFAV)
simulation = ensembles.create_ensemble(sim_list, realizations=members)
yearly = simulation.groupby('time.year').mean()

Loading members...: 100%|██████████| 14/14 [01:13<00:00,  5.26s/it]


In [None]:
# Masking regions
land_region = regionmask.defined_regions.natural_earth_v5_0_0.land_110  # Land has value 0
countries = regionmask.defined_regions.natural_earth_v5_0_0.countries_110

# mask to land
land = mask_data(yearly, land_region, ['land'], drop=True)
# Eliminate the planet of Hoth
land = mask_data(land, countries, ['greenland'], reverse=True, drop=True)
land = land.where((land['lat']>-59).compute(), drop=True)  # antarctica
land = land.where((land['lat']<70).compute(), drop=True)  # Northern canada
land = land.compute()

In [None]:
land['WSPDSRFAV'].mean(['lat', 'lon']).plot(hue='realization')

## Kill Dask Cluster

In [None]:
client.shutdown()