# Building a CMIP5 Collection Catalog

This notebook demonstrates how to generate a CMIP5 collection catalog using `intake-esm` for the CMIP5 datasets residing on NCAR's GLADE file system.

In [1]:
# Import packages: the Dask distributed client, the dask-jobqueue SLURM
# cluster class, and os for reading environment variables.
from dask.distributed import Client
import dask
from dask_jobqueue import SLURMCluster
import os
# Project/account name read from the PBS_ACCOUNT environment variable; it is
# passed as the `project` argument when the cluster is created.
# os.environ[...] raises KeyError if the variable is not set.
PROJECT = os.environ["PBS_ACCOUNT"]

In [2]:
# Launch a dask-jobqueue SLURM cluster so that catalog generation can run in
# parallel. Each job is requested with 6 worker processes, 12 cores and 10GB
# of memory.
cluster = SLURMCluster(
    project=PROJECT,
    processes=6,
    cores=12,
    memory="10GB",
    # Extra shell lines for the job script: select a UTF-8 locale and clear
    # LD_LIBRARY_PATH.
    env_extra=[
        'export LANG="en_US.utf8"',
        'export LANGUAGE="en_US.utf8"',
        'export LC_ALL="en_US.utf8"',
        'export LD_LIBRARY_PATH=""',
    ],
)

In [3]:
# Enable adaptive scaling with a lower bound of 12 and an upper bound of 24
# (presumably counted in workers -- the client summary reports 12 workers).
cluster.adapt(minimum=12, maximum=24)

<distributed.deploy.adaptive.Adaptive at 0x2ae4cffab6a0>

In [5]:
# List the current user's SLURM jobs to confirm that the Dask worker jobs are running.
!squeue -u $USER

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           2235786       dav dask-wor  abanihi  R       0:06      1 casper03
           2235787       dav dask-wor  abanihi  R       0:06      1 casper02
           2235596       dav     srun  abanihi  R      27:49      1 casper05


In [6]:
# Connect a Dask client to the cluster so that subsequent work is scheduled on it.
client = Client(cluster)

In [7]:
# Display the client summary: scheduler address, dashboard link, and cluster resources.
client

0,1
Client  Scheduler: tcp://10.12.205.14:36282  Dashboard: http://10.12.205.14:8787/status,Cluster  Workers: 12  Cores: 24  Memory: 20.04 GB


In [8]:
import intake

To build a CMIP collection catalog, you need a YAML input file that describes the collection and where its data live.

For example, `cmip_collection_input.yml`:

```yaml
name: cmip5
collection_type: cmip
overwriting_existing: true
include_cache_dir : true
data_sources:
  root_dir:
    name: GLADE
    loc_type: posix
    direct_access: True
    urlpath: /glade/collections/cmip/cmip5
```

In [10]:
%time col = intake.open_esm_metadatastore(collection_input_file="cmip_collection_input.yml", collection_type="cmip")

None
Persisting cmip5 at : /glade/u/home/abanihi/.intake_esm/database_directory/cmip/cmip5.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87631 entries, 0 to 87630
Data columns (total 11 columns):
ensemble         87631 non-null object
experiment       87631 non-null object
file_basename    87631 non-null object
file_fullpath    87631 non-null object
frequency        87631 non-null object
institution      87631 non-null object
model            87631 non-null object
realm            87631 non-null object
files_dirname    87631 non-null object
variable         87631 non-null object
version          87631 non-null object
dtypes: object(11)
memory usage: 7.4+ MB
CPU times: user 23.2 s, sys: 2.45 s, total: 25.6 s
Wall time: 1min 34s


In [11]:
# Preview the first rows of the catalog DataFrame.
col.df.head()

Unnamed: 0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,files_dirname,variable,version
0,r1i1p1,rcp85,ua_Amon_GFDL-CM3_rcp85_r1i1p1_209601-210012.nc,/glade/collections/cmip/cmip5/output1/NOAA-GFD...,mon,NOAA-GFDL,GFDL-CM3,atmos,/glade/collections/cmip/cmip5/output1/NOAA-GFD...,ua,v0
1,r4i1p1,rcp85,ua_Amon_CanESM2_rcp85_r4i1p1_200601-210012.nc,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,mon,CCCma,CanESM2,atmos,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,ua,v0
2,r2i1p1,rcp85,ua_Amon_CanESM2_rcp85_r2i1p1_200601-210012.nc,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,mon,CCCma,CanESM2,atmos,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,ua,v0
3,r5i1p1,rcp85,ua_Amon_CanESM2_rcp85_r5i1p1_200601-210012.nc,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,mon,CCCma,CanESM2,atmos,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,ua,v0
4,r3i1p1,rcp85,ua_Amon_CanESM2_rcp85_r3i1p1_200601-210012.nc,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,mon,CCCma,CanESM2,atmos,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,ua,v0


In [12]:
# Count the distinct climate models present in the catalog.
col.df["model"].nunique()

55

In [13]:
# List the distinct realms present in the catalog.
col.df["realm"].unique()

array(['atmos', 'aerosol', 'seaIce', 'landIce', 'land', 'ocean',
       'ocnBgchem'], dtype=object)

In [14]:
# Count the distinct variables present in the catalog.
col.df["variable"].nunique()

454

In [15]:
# Preview the last rows of the catalog DataFrame.
col.df.tail()

Unnamed: 0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,files_dirname,variable,version
87626,r1i1p1,rcp85,ua_day_ACCESS1-0_rcp85_r1i1p1_20960101-2100123...,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,day,CSIRO-BOM,ACCESS1-0,atmos,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,ua,v4
87627,r1i1p1,rcp85,tasmin_day_ACCESS1-0_rcp85_r1i1p1_20810101-210...,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,day,CSIRO-BOM,ACCESS1-0,atmos,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,tasmin,v4
87628,r1i1p1,historical,va_day_ACCESS1-0_historical_r1i1p1_20050101-20...,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,day,CSIRO-BOM,ACCESS1-0,atmos,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,va,v4
87629,r1i1p1,rcp85,tasmax_day_ACCESS1-0_rcp85_r1i1p1_20810101-210...,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,day,CSIRO-BOM,ACCESS1-0,atmos,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,tasmax,v4
87630,r1i1p1,historical,ua_day_ACCESS1-0_historical_r1i1p1_20050101-20...,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,day,CSIRO-BOM,ACCESS1-0,atmos,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,ua,v4


In [16]:
%time len(col.df)  # Total number of files in the generated CMIP5 catalog

CPU times: user 15 µs, sys: 0 ns, total: 15 µs
Wall time: 18.1 µs


87631

In [17]:
# For each model, count the distinct values found in every catalog column.
col.df.groupby('model').nunique()

Unnamed: 0_level_0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,files_dirname,variable,version
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ACCESS1-0,3,3,190,190,4,1,1,4,190,59,11
ACCESS1-3,3,6,213,213,4,1,1,3,213,53,17
ACCESS1.0,2,3,3,3,1,1,1,1,3,1,1
BNU-ESM,1,6,79,79,4,1,1,6,79,43,3
CCSM4,87,44,55336,55336,5,1,1,5,55336,190,95
CESM1-BGC,15,12,3804,3804,4,1,1,6,3804,219,22
CESM1-CAM5,37,16,7075,7075,5,1,1,6,7075,181,52
CESM1-FASTCHEM,5,2,633,633,3,1,1,5,633,152,6
CESM1-WACCM,12,5,3214,3214,3,1,1,5,3214,156,19
CMCC-CESM,1,2,73,73,2,1,1,5,73,46,6


In [18]:
# Load the watermark IPython extension, which provides the %watermark magic.
%load_ext watermark

In [19]:
# Record imported package versions, Python/IPython versions, machine details,
# host name, last-updated date and git hash for the environment that ran this notebook.
%watermark --iversion -g -h -m -v -u -d

dask   1.1.1
intake 0.4.1
last updated: 2019-03-01 

CPython 3.6.7
IPython 7.1.1

compiler   : GCC 7.3.0
system     : Linux
release    : 3.10.0-693.21.1.el7.x86_64
machine    : x86_64
processor  : x86_64
CPU cores  : 72
interpreter: 64bit
host name  : casper05
Git hash   : 7bc09705408cff3a1dadf2f5cd15ccb79b53c958
