# Climate Data Store Download Demo

This notebook illustrates how to use the CDS API to download the AgERA5 dataset in parallel.

To run this notebook you must open it on a development node and supply the path to a kernel where the cdsapi package is installed. Either create your own conda environment and Jupyter kernel, or use the one at /work/hpc/users/kerrie/UN_FAO/cdsapi/share/jupyter

NOTE: it's better to do this with a .py script and run it at the command line. If you are downloading many files, the output from the API calls will eventually crash this notebook.

In [None]:
import cdsapi
import numpy as np
import dask
import os
import glob
import subprocess
import sys

In [None]:
# directory to save downloads in
# NOTE: keep the trailing '/' -- file paths are built from this by plain
# string concatenation
data_dir='/work/hpc/datasets/unfao_sera/temporary/agERA5/'

# create the directory (and any missing parents); exist_ok=True makes this
# a no-op when it already exists, so no separate existence check is needed
os.makedirs(data_dir,exist_ok=True)

In [None]:
# Build the request settings shared by every api call.
#
# Goal: daily maximum temperature for a few years. The api only allows
# about 1 year per request due to size constraints, so the year is left
# out here and added separately for each request.

# zero-padded month ('01'..'12') and day ('01'..'31') strings,
# and the years 2014..2023 as strings
months = [format(m, '02d') for m in range(1, 13)]
days = [format(d, '02d') for d in range(1, 32)]
years = [str(y) for y in range(2014, 2024)]

# dataset name and the request fields that are the same for every year
dataset = 'sis-agrometeorological-indicators'
base_dict = {
    'variable': '2m_temperature',
    'statistic': 'day_time_maximum',
    'month': months,
    'day': days,
    'format': 'tgz',
    'version': '1_1',
}
# label used at the start of each downloaded file name
outvar = 'tmax'

# show the generated lists, separated by divider lines
print(years, '--------', months, '--------', days, sep='\n')

In [None]:
# the api call itself, wrapped as a dask delayed function
# so that many requests can be downloaded in parallel

@dask.delayed
def get_1yr_daily(ds,y,vd,dd,vo):
    """Request one year of daily data from the CDS api and save it as a .tar.gz.

    ds -- dataset name passed to the api
    y  -- year as a string; only used to build the output file name
    vd -- request dictionary passed to the api
    dd -- output directory; joined by plain concatenation, so it should
          end with a path separator
    vo -- label placed at the start of the output file name
    """
    client = cdsapi.Client()
    # output file: <dd><vo>_AgERA5_daily_<y>.tar.gz
    target = dd + vo + '_AgERA5_daily_' + y + '.tar.gz'
    client.retrieve(ds, vd, target)

In [None]:
# build a dask delayed task list, one task per year
tasklist=[]
for year in years:
    # make a fresh request dictionary for this year: the shared settings
    # plus the 'year' entry. Assigning base_dict directly would only alias
    # it, so every iteration would overwrite 'year' in the one shared dict
    # (and leave a stray 'year' key behind in base_dict).
    vardict=dict(base_dict, year=year)
    tasklist.append(get_1yr_daily(dataset,year,vardict,data_dir,outvar))

In [None]:
# one variable is requested with a separate call per year,
# so n years should give n tasks
task_count = len(tasklist)
print(task_count)

# peek at the first task to see what one looks like (a dask delayed object)
tasklist[0]

In [None]:
# let er rip
# hand every delayed task to dask.compute; tasklist is unpacked so each
# task is passed as its own argument
dask.compute(*tasklist)

In [None]:
# unzip/untar into directories by year
for year in years:
    print(year)
    # per-year extraction directory; exist_ok=True makes this a no-op
    # when the directory already exists
    year_dir=data_dir+year
    os.makedirs(year_dir,exist_ok=True)
    # find the downloaded archive for this year; outvar is the same label
    # the download step put at the start of the file name
    matches=glob.glob(data_dir+outvar+'_AgERA5_daily_'+year+'.tar*')
    if not matches:
        # sys.exit takes a single argument, so pass one message string
        sys.exit('problem with '+year)
    filename=matches[0]
    # untar into the yearly directory; both paths are built from data_dir,
    # so there is no need to change the working directory first
    subprocess.run(['tar', 'xf', filename, '-C', year_dir],check=True, text=True)

In [None]:
# remove the tar files since we don't need them any more
# (outvar is the same label the download step put at the start of the file name)
files=glob.glob(data_dir+outvar+'_AgERA5_daily_*.tar*')
for f in files:
    # delete directly from Python rather than spawning an rm process per file
    os.remove(f)