In [1]:
from glob import glob
import xarray as xr
import cf_xarray # use cf-xarray so that we can use CF attributes
import pandas as pd
import matplotlib.pyplot as plt
import regionmask
from xclim.indices.stats import frequency_analysis
import cartopy.crs as ccrs
import nc_time_axis
import numpy as np
import warnings
# To access collection
import dask
import intake
from dask_jobqueue import PBSCluster
from dask.distributed import Client, LocalCluster, futures_of
from dask.diagnostics import ProgressBar
from tqdm import tqdm 
import regionmask

## Spin up Dask cluster

In [2]:
# Launch a PBS-backed Dask cluster (dask_jobqueue.PBSCluster), one worker per PBS job
num_jobs = 10

# Per-job settings gathered in one place so they are easy to tune
worker_options = dict(
    job_name='valencig-dask-hpc',
    cores=1,
    memory='10GiB',
    processes=1,
    local_directory='/glade/u/home/valencig/spilled/',
    log_directory='/glade/u/home/valencig/worker-logs/',
    resource_spec='select=1:ncpus=1:mem=15GB',
    queue='casper',
    walltime='02:00:00',  # Change wall time if needed
    interface='ext',
)
cluster = PBSCluster(**worker_options)

# Ask the scheduler for the worker jobs
cluster.scale(jobs=num_jobs)

# Attach a client to the cluster
client = Client(cluster)

# Do not continue until all requested workers have connected
client.wait_for_workers(num_jobs)

In [3]:
# Display the client summary (connection info, dashboard link, worker list)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/8787/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/8787/status,Workers: 10
Total threads: 10,Total memory: 100.00 GiB

0,1
Comm: tcp://128.117.208.112:38279,Workers: 10
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/8787/status,Total threads: 10
Started: Just now,Total memory: 100.00 GiB

0,1
Comm: tcp://128.117.208.76:45167,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/39359/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.76:37577,
Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-lyc5ea0m,Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-lyc5ea0m
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.98 MiB,Spilled bytes: 0 B
Read bytes: 1.07 MiB,Write bytes: 1.32 MiB

0,1
Comm: tcp://128.117.208.76:33421,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/43489/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.76:45483,
Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-_sx4acq2,Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-_sx4acq2
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.98 MiB,Spilled bytes: 0 B
Read bytes: 108.26 MiB,Write bytes: 147.84 kiB

0,1
Comm: tcp://128.117.208.69:42679,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/32833/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.69:41281,
Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-w8em4kwq,Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-w8em4kwq
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 65.04 MiB,Spilled bytes: 0 B
Read bytes: 13.72 MiB,Write bytes: 644.51 kiB

0,1
Comm: tcp://128.117.208.76:46759,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/33415/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.76:40879,
Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-2_zwy9oz,Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-2_zwy9oz
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.97 MiB,Spilled bytes: 0 B
Read bytes: 3.82 MiB,Write bytes: 8.20 MiB

0,1
Comm: tcp://128.117.208.102:46671,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/43507/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.102:34999,
Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-r1j2c4eg,Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-r1j2c4eg
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 131.79 MiB,Spilled bytes: 0 B
Read bytes: 16.14 MiB,Write bytes: 106.07 kiB

0,1
Comm: tcp://128.117.208.76:44745,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/39083/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.76:41397,
Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-j7gk65vn,Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-j7gk65vn
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.93 MiB,Spilled bytes: 0 B
Read bytes: 2.32 GiB,Write bytes: 1.83 MiB

0,1
Comm: tcp://128.117.208.69:39781,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/37907/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.69:33995,
Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-85te1mxg,Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-85te1mxg
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.97 MiB,Spilled bytes: 0 B
Read bytes: 6.85 MiB,Write bytes: 1.57 MiB

0,1
Comm: tcp://128.117.208.69:40713,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/33601/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.69:39089,
Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-v1rvw1jr,Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-v1rvw1jr
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.85 MiB,Spilled bytes: 0 B
Read bytes: 8.29 MiB,Write bytes: 1.79 MiB

0,1
Comm: tcp://128.117.208.69:41629,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/34485/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.69:46125,
Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-qltfzlk3,Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-qltfzlk3
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.92 MiB,Spilled bytes: 0 B
Read bytes: 586.89 kiB,Write bytes: 832.79 kiB

0,1
Comm: tcp://128.117.208.76:43007,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/41291/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.76:46413,
Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-cj44t0tw,Local directory: /glade/u/home/valencig/spilled/dask-scratch-space/worker-cj44t0tw
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.90 MiB,Spilled bytes: 0 B
Read bytes: 276.71 MiB,Write bytes: 218.86 kiB


In [4]:
# Retrieve the logs gathered from the cluster, e.g. to debug workers that fail to start
cluster.get_logs()

### [Commands for managing dask workers](https://arc.ucar.edu/knowledge_base/68878389)


In [5]:
# See the workers in the job scheduler (lists the PBS jobs owned by the current user)
!qstat -u $USER

# Kill all running or pending jobs (disabled; uncomment to run)
# !qdel `qselect -u $USER`

                                                            Req'd  Req'd   Elap
Job ID          Username Queue    Jobname    SessID NDS TSK Memory Time  S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
9375000.casper* valencig htc      STDIN       18541   1   1   10gb 02:00 R 00:03
9375025.casper* valencig htc      valencig-*  19315   1   1   15gb 02:00 R 00:00
9375026.casper* valencig htc      valencig-*  30738   1   1   15gb 02:00 R 00:00
9375027.casper* valencig htc      valencig-*  30753   1   1   15gb 02:00 R 00:00
9375028.casper* valencig htc      valencig-*  30777   1   1   15gb 02:00 R 00:00
9375029.casper* valencig htc      valencig-*  30801   1   1   15gb 02:00 R 00:00
9375030.casper* valencig htc      valencig-*  22809   1   1   15gb 02:00 R 00:00
9375031.casper* valencig htc      valencig-*  22830   1   1   15gb 02:00 R 00:00
9375032.casper* valencig htc      valencig-*  22860   1   1   15gb 02:00 R 00:00
9375033.casper* valencig htc  

## Read in the catalog

In [6]:
# Disabled alternative: a local copy of the CESM2-LE catalog
# '/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cesm2-le.json'.
# Comment out "options": null in aggregation_controls.aggregations.0 of that copy in order to get intake-esm to work.
# NOTE(review): the original note called the copy 'cesm.json' but the call below opens 'cesm2.json' — confirm the file name.
# cat = intake.open_esm_datastore('cesm2.json')
# Active catalog: the CMIP6 collection on GLADE; the bare `cat` on the last line displays its summary.
cat = intake.open_esm_datastore('/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json')
cat

Unnamed: 0,unique
activity_id,17
institution_id,35
source_id,81
experiment_id,130
member_id,421
table_id,36
variable_id,1075
grid_label,12
dcpp_init_year,59
version,597


## Querying for desired variable

https://www.cesm.ucar.edu/community-projects/lens/data-sets

NA-CORDEX variable list --> https://na-cordex.org/variable-list.html

CMIP6 variable list --> https://clipc-services.ceda.ac.uk/dreq/mipVars.html

Also --> https://wcrp-cmip.github.io/CMIP6_CVs/docs/CMIP6_experiment_id.html

In [7]:
# List the distinct variable ids in the catalog that match the wind patterns
cat.search(variable_id=['wind*', 'Wind*']).df.variable_id.unique()

array(['sfcWind', 'sfcWindmax', 'sndmasswindrif'], dtype=object)

## Query and subset data catalog

Overview found [here](https://www2.cesm.ucar.edu/projects/CMIP6/):

ScenarioMIP: "Will provide multi-model climate projections based on alternative scenarios of future emissions and land use changes produced with integrated assessment models. The design consists of eight alternative 21st century scenarios plus one large initial condition ensemble and a set of long-term extensions. Climate model projections will facilitate integrated studies of climate change as well as address targeted scientific questions."

Citation: O'Neill, B. C., Tebaldi, C., van Vuuren, D.P., Eyring, V., Friedlingstein, P., Hurtt, G., Knutti, R., Kriegler, E., Lamarque, J.-F., Lowe, J., Meehl, G.A., Moss, R., Riahi, K., and Sanderson, B. M. 2016. The Scenario Model Intercomparison Project (ScenarioMIP) for CMIP6. Geosci. Model Dev., 9, 3461-3482.

In [8]:
# Catalog query: daily near-surface wind from CESM2 for every ScenarioMIP SSP experiment
sfcwind_query = dict(
    variable_id='sfcWind',  # near surface wind
    source_id='CESM2',
    experiment_id='ssp*',  # use experiment_id='historical' instead for all historical forcings
    table_id='day',  # day is highest resolution
    activity_id='ScenarioMIP',
)
cesm2 = cat.search(**sfcwind_query)

In [9]:
# Show one row per dataset key in the subset, with the attributes that make up each key
cesm2.keys_info()

Unnamed: 0_level_0,activity_id,institution_id,source_id,experiment_id,table_id,grid_label
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ScenarioMIP.NCAR.CESM2.ssp126.day.gn,ScenarioMIP,NCAR,CESM2,ssp126,day,gn
ScenarioMIP.NCAR.CESM2.ssp245.day.gn,ScenarioMIP,NCAR,CESM2,ssp245,day,gn
ScenarioMIP.NCAR.CESM2.ssp370.day.gn,ScenarioMIP,NCAR,CESM2,ssp370,day,gn
ScenarioMIP.NCAR.CESM2.ssp585.day.gn,ScenarioMIP,NCAR,CESM2,ssp585,day,gn


## Read in using ```.to_dataset_dict()```

https://stackoverflow.com/questions/67813208/xarray-open-mfdataset-doesnt-work-if-dask-distributed-client-has-been-created

In [10]:
# Load every dataset in the subset into a dict keyed by catalog key,
# with dask's `array.slicing.split_large_chunks` option enabled for the duration of the load.
split_chunks_option = {'array.slicing.split_large_chunks': True}
with dask.config.set(split_chunks_option):
    dsets = cesm2.to_dataset_dict()


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


Let’s take a look at the keys - these are defined by the `groupby` attributes in the catalog. The groupby attributes in this case are:

https://www2.cesm.ucar.edu/experiments/cesm1.2/GLE/GLENS_output_fields/

`component.experiment.stream.forcing_variant.control_branch_year.variable`

- Component - which component this output is from (ex. atm represents the atmosphere)
- Experiment - which experiment this is from, in this case, this is `ssp370` which is one of the CMIP6 future experiments
- Stream - which stream this output is from, in this case, this is `cam.h1`, which represents daily output
- Control Branch Year - which year the ensemble branched off from, these are described within the [CESM2-LE documentation page](https://www.cesm.ucar.edu/community-projects/lens2)
- Variable - which variable you are working with


component = atm (atmosphere), lnd (land), ocn (ocean), ice

frequency = monthly, daily, or hourly6

experiment = historical (1850 to 2015) or ssp370 (2015 to 2100)

forcing_variant = the biomass forcing variant, cmip6 (the default in the cmip6 runs) or smbb (smoothed biomass burning)

variable = one of the variable names listed in the tables below

## Process Data

Time period for historical data is `1978` till `2014`.

Return levels are estimated using the annual-maximum method with a Gumbel distribution, following [Palutikof et al., 1999](https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1017/s1350482799001103). Sourced from [this paper](https://meetingorganizer.copernicus.org/ECSS2019/ECSS2019-218-3.pdf) presented at the European Conference on Severe Storms (2019).

In [28]:
def _fifty_year_return_level(da, start, end, label):
    """Estimate the 50-year return level of ``da`` for one forecast window.

    Selects ``time`` between ``start`` and ``end`` (year strings), runs
    ``frequency_analysis`` on yearly maxima with a right-skewed Gumbel
    distribution, and tags the result with a length-1 ``forecast`` dimension
    whose value is ``label``.
    """
    fitted = frequency_analysis(
        da.sel(time=slice(start, end)),
        t=50,
        dist="gumbel_r",
        mode="max",
        freq="YS",
    )
    return (
        fitted.sel(return_period=50)
        .assign_coords({'forecast': label})
        .expand_dims('forecast')
    )


def subset_ds(ds, task):
    """Mask ``ds.sfcWind`` to the US states (minus Hawaii and Alaska) and reduce it.

    Parameters
    ----------
    ds : dataset with ``lon``, ``lat``, ``time`` and a ``sfcWind`` variable.
    task : str
        One of:
        - ``'mean'``: mean over each ``'1Y'`` resampling bin.
        - ``'anomaly'``: deviation from the mean over the whole selected
          period, averaged over each ``'1Y'`` resampling bin.
        - ``'50yrmax'``: 50-year return level for 2021-2040 (``'near'``),
          2041-2060 (``'mid'``) and 2081-2100 (``'far'``), combined along a
          ``forecast`` coordinate.

    Returns
    -------
    The reduced data for the requested task.

    Raises
    ------
    ValueError
        If ``task`` is not one of the supported names. Previously an unknown
        task fell through every branch and failed with ``UnboundLocalError``
        only after the masking work had been set up.
    """
    valid_tasks = ('mean', 'anomaly', '50yrmax')
    if task not in valid_tasks:
        raise ValueError(
            f"Unknown task {task!r}; expected one of {valid_tasks}"
        )

    states = regionmask.defined_regions.natural_earth_v5_0_0.us_states_50
    # Hawaii and Alaska are not included in the mask
    excluded = states.map_keys(['Hawaii', 'Alaska'])
    good_keys = [k for k in states.regions.keys() if k not in excluded]
    mask = states.mask(ds.lon, ds.lat).isin(good_keys)

    # Original author's note: "Last time (2100) is wonky"; the slice is kept as it was.
    da = ds.where(mask, drop=True).sfcWind.sel(time=slice('2015', '2100'))

    if task == 'mean':
        return da.resample(time='1Y').mean(dim='time')

    if task == 'anomaly':
        # Anomaly is x - x_mean, then averaged on a yearly basis
        return (da - da.mean('time')).resample(time='1Y').mean('time')

    # task == '50yrmax'
    windows = (
        ('2021', '2040', 'near'),
        ('2041', '2060', 'mid'),
        ('2081', '2100', 'far'),
    )
    levels = [
        _fifty_year_return_level(da, start, end, label)
        for start, end, label in windows
    ]
    return xr.combine_by_coords(levels)

Tasks are `mean`, `anomaly`, or `50yrmax`:
- Mean: yearly mean
- Anomaly: yearly anomaly
- 50yrmax: Gets 50 year max estimates for 2021–2040 (the “near-term”), 2041–2060 (the “mid-term”), and 2081–2100 (the “long-term”)

In [30]:
# Run each requested task for every dataset and write one NetCDF file per (dataset, task)
tasks = ('50yrmax', )
for task in tasks:
    print(f'Running task: {task}')
    for key in tqdm(list(dsets), desc='Processing Data...'):
        # Re-chunk along time (author's note: aims for approx 100 mb chunks)
        ds = dsets[key].chunk({'time': 365})
        da = subset_ds(ds, task=task).persist()
        # Output file name is "<dataset key>.<task>.nc"
        out_path = '/glade/u/home/valencig/wind-trend-analysis/data/' + key + '.' + task + '.nc'
        da.compute().to_netcdf(out_path)

Running task: 50yrmax


Processing Data...: 100%|██████████| 4/4 [01:02<00:00, 15.57s/it]


## Restart dask cluster

In [None]:
#client.restart()

## Close dask cluster

In [None]:
# client.shutdown()