In [1]:
from glob import glob
import xarray as xr
import cf_xarray # use cf-xarray so that we can use CF attributes
import pandas as pd
import matplotlib.pyplot as plt
import regionmask
import cartopy.crs as ccrs
import nc_time_axis
import numpy as np
import warnings
# To access collection
import dask
import intake
# from distributed import Client
from dask_jobqueue import PBSCluster
from dask.distributed import Client, LocalCluster, futures_of
from dask.diagnostics import ProgressBar
# from dask.distributed import progress
from tqdm import tqdm 
import regionmask
from xclim.indices.stats import frequency_analysis


# from ncar_jobqueue import NCARCluster
# https://ncar.github.io/esds/faq/#xarray-and-dask

# USE SCIENCEPLOTS

Following this tutorial --> https://ncar.github.io/esds/posts/2021/intake-cesm2-le-glade-example/

also --> https://ncar.github.io/esds/posts/2021/intake_esm_dask/

## Spin up Dask cluster

Useful thread --> https://github.com/dask/dask-jobqueue/issues/216

Also --> https://github.com/NCAR/ncar-jobqueue/issues/101

https://github.com/NCAR/dask-tutorial/blob/main/notebooks/05-dask-hpc.ipynb

The NCAR Dask YAML config is located at `~/.config/dask` --> changed `interface` to `ext`

In [3]:
# Number of PBS jobs to request; reused below when waiting for the workers
num_jobs = 1

# Build the cluster directly with PBSCluster (what the NCAR cluster uses under the hood)
cluster = PBSCluster(
    queue='casper',
    walltime='02:00:00',  # Change wall time if needed
    resource_spec='select=1:ncpus=1:mem=15GB',
    job_name='valencig-dask-hpc',
    cores=1,
    processes=1,
    memory='10GiB',
    interface='ext',
    local_directory='/glade/u/home/valencig/spilled/',
    log_directory='/glade/u/home/valencig/worker-logs/',
)

# Request the jobs from the scheduler
cluster.scale(jobs=num_jobs)

# Attach a client to the cluster, then block until the workers have spawned
client = Client(cluster)
client.wait_for_workers(num_jobs)

In [3]:
# Show the client summary as the cell output
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/8787/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/8787/status,Workers: 10
Total threads: 10,Total memory: 100.00 GiB

0,1
Comm: tcp://128.117.208.82:34383,Workers: 10
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/8787/status,Total threads: 10
Started: Just now,Total memory: 100.00 GiB

0,1
Comm: tcp://128.117.208.76:44309,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/43273/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.76:38123,
Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-bbh2gion,Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-bbh2gion
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.93 MiB,Spilled bytes: 0 B
Read bytes: 20.79 MiB,Write bytes: 4.56 MiB

0,1
Comm: tcp://128.117.208.76:40813,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/34901/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.76:35067,
Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-ojwykalw,Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-ojwykalw
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.94 MiB,Spilled bytes: 0 B
Read bytes: 26.89 MiB,Write bytes: 13.57 MiB

0,1
Comm: tcp://128.117.208.78:44371,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/37639/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.78:46359,
Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-lwtw93q4,Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-lwtw93q4
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 19.5%,Last seen: Just now
Memory usage: 132.80 MiB,Spilled bytes: 0 B
Read bytes: 166.57 MiB,Write bytes: 4.03 MiB

0,1
Comm: tcp://128.117.208.57:40335,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/36119/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.57:40039,
Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-uw70rv3b,Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-uw70rv3b
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 134.93 MiB,Spilled bytes: 0 B
Read bytes: 400.94 kiB,Write bytes: 897.48 kiB

0,1
Comm: tcp://128.117.208.84:42703,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/36719/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.84:44053,
Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-yymb4ze7,Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-yymb4ze7
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 136.25 MiB,Spilled bytes: 0 B
Read bytes: 655.43 MiB,Write bytes: 841.77 MiB

0,1
Comm: tcp://128.117.208.76:45411,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/33289/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.76:46661,
Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-uwallqso,Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-uwallqso
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.88 MiB,Spilled bytes: 0 B
Read bytes: 19.81 MiB,Write bytes: 13.82 MiB

0,1
Comm: tcp://128.117.208.76:42007,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/38285/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.76:38151,
Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-kmta8cdf,Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-kmta8cdf
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 0.0%,Last seen: Just now
Memory usage: 64.81 MiB,Spilled bytes: 0 B
Read bytes: 11.65 MiB,Write bytes: 10.81 MiB

0,1
Comm: tcp://128.117.208.84:43013,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/44791/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.84:34973,
Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-hwaox3uk,Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-hwaox3uk
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 136.32 MiB,Spilled bytes: 0 B
Read bytes: 653.26 MiB,Write bytes: 850.02 MiB

0,1
Comm: tcp://128.117.208.75:42323,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/42461/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.75:40721,
Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-vsw1jmq5,Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-vsw1jmq5
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 4.0%,Last seen: Just now
Memory usage: 133.88 MiB,Spilled bytes: 0 B
Read bytes: 16.55 kiB,Write bytes: 8.95 kiB

0,1
Comm: tcp://128.117.208.75:38321,Total threads: 1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/valencig/proxy/39439/status,Memory: 10.00 GiB
Nanny: tcp://128.117.208.75:36259,
Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-kq6e52_g,Local directory: /glade/derecho/scratch/valencig/tmp/dask-scratch-space/worker-kq6e52_g
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 134.05 MiB,Spilled bytes: 0 B
Read bytes: 16.56 kiB,Write bytes: 10.00 kiB


In [4]:
# Retrieve the logs through the cluster object; the result is shown as the cell output
cluster.get_logs()

### Commands for managing dask workers

https://arc.ucar.edu/knowledge_base/68878389

In [None]:
# See the workers in the job scheduler (query restricted to $USER)
!qstat -u $USER

# Kill all running or pending jobs selected for $USER.
# NOTE(review): left commented out — presumably so it is not triggered by accident; uncomment to use.
# !qdel `qselect -u $USER`

## Read in the catalog

In [5]:
# Disabled alternative: a local copy of
# '/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cesm2-le.json'.
# NOTE(review): the original note names the copy 'cesm.json' while the disabled line below
# opens 'cesm2.json' — confirm which filename is correct.
# Comment out "options": null in aggregation_controls.aggregations.0 in order to get intake-esm to work
# cat = intake.open_esm_datastore('cesm2.json')
# Active catalog: the glade-cmip6.json datastore
cat = intake.open_esm_datastore('/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json')
# Display the catalog summary as the cell output
cat

Unnamed: 0,unique
activity_id,17
institution_id,35
source_id,81
experiment_id,130
member_id,421
table_id,36
variable_id,1075
grid_label,12
dcpp_init_year,59
version,597


## Querying for desired variable

https://www.cesm.ucar.edu/community-projects/lens/data-sets

CMIP6 variable list --> https://na-cordex.org/variable-list.html

Also --> https://wcrp-cmip.github.io/CMIP6_CVs/docs/CMIP6_experiment_id.html

In [6]:
# Disabled variant that filters on component/long_name instead of variable_id
# (NOTE(review): presumably written for a catalog with those columns — confirm before re-enabling):
# cat.search(component='atm', long_name=['wind*', 'Wind*']).df.long_name.unique()
# List the distinct variable_id values matching the wind*/Wind* patterns
cat.search(variable_id=['wind*', 'Wind*']).df.variable_id.unique()

array(['sfcWind', 'sfcWindmax', 'sndmasswindrif'], dtype=object)

## Query and subset data catalog

Overview found [here](https://www2.cesm.ucar.edu/projects/CMIP6/):

ScenarioMIP: "Will provide multi-model climate projections based on alternative scenarios of future emissions and land use changes produced with integrated assessment models. The design consists of eight alternative 21st century scenarios plus one large initial condition ensemble and a set of long-term extensions. Climate model projections will facilitate integrated studies of climate change as well as address targeted scientific questions."

Citation: O'Neill, B. C., Tebaldi, C., van Vuuren, D.P., Eyring, V., Friedlingstein, P., Hurtt, G., Knutti, R., Kriegler, E., Lamarque, J.-F., Lowe, J., Meehl, G.A., Moss, R., Riahi, K., and Sanderson, B. M. 2016. The Scenario Model Intercomparison Project (ScenarioMIP) for CMIP6. Geosci. Model Dev., 9, 3461-3482.

In [7]:
# Daily near-surface wind from CESM2 for every ScenarioMIP experiment matching "ssp*".
# (Swap in experiment_id='historical' to query the all-historical-forcings run instead.)
cesm2 = cat.search(
    activity_id='ScenarioMIP',
    source_id='CESM2',
    experiment_id='ssp*',
    table_id='day',  # day is highest resolution
    variable_id='sfcWind',  # near surface wind
)

In [8]:
# Tabulate the dataset keys produced by the search (one row per key)
cesm2.keys_info()

Unnamed: 0_level_0,activity_id,institution_id,source_id,experiment_id,table_id,grid_label
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ScenarioMIP.NCAR.CESM2.ssp126.day.gn,ScenarioMIP,NCAR,CESM2,ssp126,day,gn
ScenarioMIP.NCAR.CESM2.ssp245.day.gn,ScenarioMIP,NCAR,CESM2,ssp245,day,gn
ScenarioMIP.NCAR.CESM2.ssp370.day.gn,ScenarioMIP,NCAR,CESM2,ssp370,day,gn
ScenarioMIP.NCAR.CESM2.ssp585.day.gn,ScenarioMIP,NCAR,CESM2,ssp585,day,gn


## Read in using ```.to_dataset_dict()```

https://stackoverflow.com/questions/67813208/xarray-open-mfdataset-doesnt-work-if-dask-distributed-client-has-been-created

In [9]:
# Load every catalog entry into a dict of datasets while the
# 'array.slicing.split_large_chunks' dask option is switched on.
with dask.config.set({'array.slicing.split_large_chunks': True}):
    dsets = cesm2.to_dataset_dict()


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


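Let’s take a look at the keys - these are defined by the `groupby` attributes in the catalog. For the CMIP6 catalog opened in this notebook, the keys follow the pattern printed above (`activity_id.institution_id.source_id.experiment_id.table_id.grid_label`). The description below applies to the CESM2-LE catalog instead, where the groupby attributes are: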

https://www2.cesm.ucar.edu/experiments/cesm1.2/GLE/GLENS_output_fields/

`component.experiment.stream.forcing_variant.control_branch_year.variable`

- Component - which component this output is from (ex. atm represents the atmosphere)
- Experiment - which experiment this is from, in this case, this is `ssp370` which is one of the CMIP6 future experiments
- Stream - which stream this output is from, in this case, this is `cam.h1`, which represents daily output
- Control Branch Year - which year the ensemble branched off from, these are described within the [CESM2-LE documentation page](https://www.cesm.ucar.edu/community-projects/lens2)
- Variable - which variable you are working with


component = atm (atmosphere), lnd (land), ocn (ocean), ice

frequency = monthly, daily, or hourly6

experiment = historical (1850 to 2015) or ssp370 (2015 to 2100)

forcing_variant = the biomass forcing variant, cmip6 (the default in the cmip6 runs) or smbb (smoothed biomass burning)

variable = one of the variable names listed in the tables below

## Process Data

Like Shen (2022) we only take the first model member.

"Note that only the first ensemble member (index of “r1i1p1f1”) is selected from each model"

For extreme wind speeds, refer to [this site](https://www.wind-pioneers.com/extreme-wind-analysis-at-a-wind-turbine-site/). If the Weibull distribution shape factor $k$ can be determined then the [EWTS II](https://openwind.ul-renewables.com/ewts.html) algorithm works.

In [16]:
# !!! SWITCH BACK TO 2100 WHEN USING SSP DATA !!!

def subset_ds(ds, task, start='1978', end='2100'):
    """Reduce ``sfcWind`` over the US states (minus Hawaii and Alaska) to yearly values.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset exposing ``sfcWind``, ``lon``, ``lat`` and a ``member_id``
        dimension. Only the first member (``member_id`` index 0) is used.
    task : {'mean', 'anomaly'}
        ``'mean'``    -> yearly mean of ``sfcWind``.
        ``'anomaly'`` -> yearly mean of ``sfcWind`` minus its mean over the
        selected time window.
    start, end : str, optional
        Bounds handed to ``.sel(time=slice(start, end))``. The defaults
        reproduce the previously hard-coded window ('1978' to '2100').

    Returns
    -------
    xarray.DataArray
        The masked, yearly-resampled result for the requested task.

    Raises
    ------
    ValueError
        If ``task`` is not one of the supported names.
    """
    # Validate first: previously an unknown task only failed at the very end
    # (with an UnboundLocalError on ``result``), after the masking work was done.
    if task not in ('mean', 'anomaly'):
        raise ValueError(
            f"Unknown task {task!r}; expected 'mean' or 'anomaly'"
        )

    states = regionmask.defined_regions.natural_earth_v5_0_0.us_states_50
    # Hawaii and Alaska are not included in the mask.
    # Resolve their region keys once instead of once per region.
    excluded = set(states.map_keys(['Hawaii', 'Alaska']))
    good_keys = [k for k in states.regions.keys() if k not in excluded]
    mask = states.mask(ds.lon, ds.lat).isin(good_keys)

    # Pick the variable, member and time window *before* masking so that
    # ``where`` only touches sfcWind rather than every variable in the dataset.
    # Original note: the last time (2100) is wonky.
    da = (
        ds.sfcWind
        .isel(member_id=0)
        .sel(time=slice(start, end))
        .where(mask, drop=True)
    )

    if task == 'mean':
        return da.resample(time='1Y').mean(dim='time')

    # Anomaly is x - x_mean, then averaged on a yearly basis
    return (da - da.mean('time')).resample(time='1Y').mean('time')

In [17]:
# TODO (open issue): the mask still needs to be limited to land data only
task = 'mean'
print(f'Running task: {task}')
for key, ds in tqdm(dsets.items(), desc='Processing Data...'):
    # Rechunk along time to get into approx 100 mb chunks
    ds = ds.chunk({'time': 365})
    da = subset_ds(ds, task=task).persist()
    # One NetCDF file per dataset key and task
    out_path = f'/glade/u/home/valencig/wind-trend-analysis/data/{key}.{task}.nc'
    da.compute().to_netcdf(out_path)

Running task: mean


Processing Data...:   0%|          | 0/4 [00:15<?, ?it/s]


## Restart dask cluster

In [None]:
# Ask the client to restart the Dask cluster
client.restart()

## Close dask cluster

In [4]:
# Shut the Dask cluster down through the client
client.shutdown()