# Set up

In [1]:
# for computing
import numpy as np
import xarray as xr # for reading/writing netcdf
# import dask.array as da
import dask
import pandas as pd # only used for date times

# convenience things
from time import time as timer
import glob # for system ls
from natsort import natsorted # for alphabetical sorting

# for plotting
import matplotlib.pyplot as plt

In [2]:
# ---- run settings ----
# year of data to process
year=2020
# variable names used for the input files, and the names used for the output files
# (only the wind entry differs: 10m wind in, 2m wind out)
varnames=['Precip','Srad','Tmax-2m','Tmin-2m','Vapr','Wind-10m']
outvars=['Precip','Srad','Tmax-2m','Tmin-2m','Vapr','Wind-2m']
# month numbers 1..12
months=np.arange(1,13)

# ---- locations ----
# your notebook directory location, change per user
# repo_dir='/work/hpc/users/kerrie/UN_FAO/repos/py_AEZ_data_prep/'

# web link to this notebook, same for everyone
nb_link='https://github.com/kerriegeil/pyAEZ_data_prep/blob/main/global/03_create_pyaez_daily.ipynb'

# your data directory locations, same for everyone
# ddev_dir='/work/hpc/datasets/un_fao/gaez/global_1980/dailydev/netcdf/'
# mon_dir='/work/hpc/datasets/un_fao/pyaez/global_1980/monthly/netcdf/'
temp_dir='/work/hpc/datasets/un_fao/gaez_v5_intermediate/'

# this is an output directory, change per user or overwrite files here
# if you change it, make sure to create the directory (mkdir) before running the script
# day_dir='/work/hpc/datasets/un_fao/pyaez/global_daily/netcdf/'

# ---- everything below, same for everyone ----
# coordinate metadata and encoding for writing netcdf files
timeattrs={
    'standard_name':'time',
    'long_name':'time',
    'axis':'T',
}
time_encoding={
    'calendar':'standard',
    'units':'days since 1900-01-01 00:00:00',
    '_FillValue':None,
}
latattrs={
    'standard_name':'latitude',
    'long_name':'latitude',
    'units':'degrees_north',
    'axis':'Y',
}
lat_encoding={'_FillValue':None}
lonattrs={
    'standard_name':'longitude',
    'long_name':'longitude',
    'units':'degrees_east',
    'axis':'X',
}
lon_encoding={'_FillValue':None}

# provenance strings stored as global attributes on every output dataset
# source_data=[ddev_dir+'*_DailyDev_'+str(year)+'_5m.nc',mon_dir+'*_monthly_'+str(year)+'_5m.nc']
source_code=nb_link
source_data=[temp_dir+' DailyDev and Monthly nc files']

In [3]:
# daily deviations
# ddfiles=natsorted(glob.glob(ddev_dir+'*_DailyDev_'+str(year)+'_*.nc'))
ddfiles=natsorted(glob.glob(temp_dir+'netcdf/*_DailyDev_'+str(year)+'_*.nc'))

# monthly data
# mfiles=natsorted(glob.glob(mon_dir+'*_monthly_'+str(year)+'_*.nc'))
mfiles=natsorted(glob.glob(temp_dir+'netcdf/*_Monthly_'+str(year)+'_*.nc'))

# the functions below pick files by position (ddfiles[iv], mfiles[iv]) and read
# varnames[iv] from them, so fail early if a file is missing, duplicated or out of order
for flist,label in ((ddfiles,'DailyDev'),(mfiles,'Monthly')):
    found=[f.split('/')[-1].split('_')[0] for f in flist]
    assert found==varnames, label+' files do not line up with varnames: '+str(found)

ddfiles,mfiles

(['/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Precip_DailyDev_2020_5m.nc',
  '/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Srad_DailyDev_2020_5m.nc',
  '/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Tmax-2m_DailyDev_2020_5m.nc',
  '/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Tmin-2m_DailyDev_2020_5m.nc',
  '/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Vapr_DailyDev_2020_5m.nc',
  '/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Wind-10m_DailyDev_2020_5m.nc'],
 ['/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Precip_Monthly_2020_5m.nc',
  '/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Srad_Monthly_2020_5m.nc',
  '/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Tmax-2m_Monthly_2020_5m.nc',
  '/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Tmin-2m_Monthly_2020_5m.nc',
  '/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Vapr_Monthly_2020_5m.nc',
  '/work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Win

# Define functions

In [13]:
# function to calculate daily values as daily_value = daily_deviation + monthly_mean

def calc_daily_ds(iv):
    """Build the daily dataset for one variable as daily_deviation + monthly_mean.

    Parameters
    ----------
    iv : int
        Position of the variable in the notebook-level lists ``varnames``,
        ``ddfiles`` (daily deviation files) and ``mfiles`` (monthly files).

    Returns
    -------
    xarray.Dataset
        Dataset holding the single daily variable ``varnames[iv]``, carrying the
        monthly variable's attributes (minus ``description``) and the global
        attributes ``source_data`` and ``source_code``.

    Notes
    -----
    Both files are opened with dask chunks, so the sum is only set up here; the
    elapsed time printed below covers building the task graph, not the actual
    computation, which happens when the result is computed or written.
    """
    chunks={'time':-1,'lat':200,'lon':-1}

    # get daily dev data
    var_prime=xr.open_dataset(ddfiles[iv],chunks=chunks)[varnames[iv]]
    # get monthly data
    var_mean=xr.open_dataset(mfiles[iv],chunks=chunks)[varnames[iv]]
    # save variable metadata: work on a copy so the opened variable's attrs are not
    # edited, and don't fail when the file has no 'description' attribute
    varattrs=dict(var_mean.attrs)
    varattrs.pop('description',None)
    # rework metadata for the groupby below: label the monthly steps 1..12 on a
    # 'month' dimension so they align with the 'time.month' groups
    var_mean=var_mean.rename({'time':'month'})
    var_mean['month']=months

    print('computing daily values...')
    start=timer()
    # calc daily
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        var_daily=var_prime.groupby('time.month')+var_mean
    tasktime = timer()-start
    print('complete in',tasktime,'s')

    # clean up variable metadata (drop_vars replaces the deprecated drop)
    var_daily=var_daily.drop_vars('month')
    var_daily.attrs=varattrs

    # convert array to dataset and set global attributes
    ds=var_daily.to_dataset()
    ds=ds.assign_attrs({'source_data':source_data,'source_code':source_code})

    return ds

In [11]:
# function to calculate daily_value = fraction_of_monthly_total * monthly_total

def calc_daily_precip_ds(iv):
    """Build the daily dataset for one variable as fraction_of_monthly_total * monthly_total.

    Unlike ``calc_daily_ds`` this loads both inputs fully into memory and the
    product is computed immediately.

    Parameters
    ----------
    iv : int
        Position of the variable in the notebook-level lists ``varnames``,
        ``ddfiles`` (daily fraction files) and ``mfiles`` (monthly files).

    Returns
    -------
    xarray.Dataset
        Dataset holding the single daily variable ``varnames[iv]``, carrying the
        monthly variable's attributes (minus ``description``) and the global
        attributes ``source_data`` and ``source_code``.
    """
    print('loading data...')
    start=timer()
    # get daily dev data; the data is loaded inside the with-block so the file
    # handle can be released straight away
    with xr.open_dataset(ddfiles[iv]) as dd_ds:
        var_frac=dd_ds[varnames[iv]].load()
    # get monthly data
    with xr.open_dataset(mfiles[iv]) as mon_ds:
        var_acc=mon_ds[varnames[iv]].load()
    # save variable metadata: work on a copy and don't fail when the file has no
    # 'description' attribute
    varattrs=dict(var_acc.attrs)
    varattrs.pop('description',None)
    # rework metadata for the groupby below: label the monthly steps 1..12 on a
    # 'month' dimension so they align with the 'time.month' groups
    var_acc=var_acc.rename({'time':'month'})
    var_acc['month']=months
    tasktime = timer()-start
    print('complete in',tasktime,'s')

    print('computing daily values...')
    start=timer()
    # calc daily
    var_daily=var_frac.groupby('time.month')*var_acc  # times here instead of add
    tasktime = timer()-start
    print('complete in',tasktime,'s')

    # clean up variable metadata (drop_vars replaces the deprecated drop)
    var_daily=var_daily.drop_vars('month')
    var_daily.attrs=varattrs

    # convert array to dataset and set global attributes
    ds=var_daily.to_dataset()
    ds=ds.assign_attrs({'source_data':source_data,'source_code':source_code})

    return ds

In [6]:
# This function writes the datafile

def write_nc(ds,var,var_encoding):
    """Write ``ds`` to ``<temp_dir>netcdf/<var>_daily_<year>_5m.nc`` and print the elapsed time.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset to write; the encoding refers to its ``lat``, ``lon``, ``time``
        and ``var`` entries.
    var : str
        Name of the data variable in ``ds``; also used as the file name prefix.
    var_encoding : dict
        netcdf encoding for the data variable.
    """
    outfile=''.join([temp_dir,'netcdf/',var,'_daily_',str(year),'_5m.nc'])
    print('writing netcdf file to',outfile)

    start=timer()
    # coordinate encodings come from the notebook setup cell
    encoding={
        'lat':lat_encoding,
        'lon':lon_encoding,
        'time':time_encoding,
        var:var_encoding,
    }
    ds.to_netcdf(outfile,encoding=encoding)
    print('complete in',timer()-start,'s')

# Main Code

### Precip

In [None]:
# expect about 1 min run time

# precip is the first variable; it uses the fraction * monthly total function
iv=0
ds=calc_daily_precip_ds(iv)

# label the units, then plot the first day of the year as a quick look
precip_daily=ds[varnames[iv]]
precip_daily.attrs['units']='mm/day'
precip_daily.sel(time=str(year)+'-01-01').plot()


In [None]:
# expect about 3 min run time

# write the precip dataset as compressed float32
var_encoding=dict(zlib=True,dtype='float32')
write_nc(ds,outvars[iv],var_encoding)

### Srad

In [None]:
# expect about 4 min run time

# calc daily vals for solar radiation
iv=1
srad_name=varnames[iv]
ds=calc_daily_ds(iv)

# Convert J/m2/day to W/m2
# the division returns a variable without attributes, so hold on to them
# (with the new units) and put them back afterwards
s_per_day=86400
attrs=ds[srad_name].attrs
attrs['units']='W/m2'
ds[srad_name]=ds[srad_name]/s_per_day
ds[srad_name].attrs=attrs

# write file as compressed float32
var_encoding=dict(zlib=True,dtype='float32')
write_nc(ds,outvars[iv],var_encoding)

### Tmax-2m

In [8]:
# expect about 4 min run time

# daily maximum temperature: deviation + monthly mean, no unit conversion
iv=2
ds=calc_daily_ds(iv)

# write file as compressed float32
var_encoding=dict(zlib=True,dtype='float32')
write_nc(ds,outvars[iv],var_encoding)

computing daily values...


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]


complete in 46.214075803756714 s
writing netcdf file to /work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Tmax-2m_daily_2020_5m.nc
complete in 124.96423006057739 s


In [None]:
# step-by-step look at the Tmax-2m inputs: daily deviations opened without
# dask chunks (the .load() is left commented out, so the values are not forced into memory here)
iv=2
var_prime=xr.open_dataset(ddfiles[iv])[varnames[iv]]#.load()
var_prime

In [None]:
# same daily deviations, opened with the chunking used in calc_daily_ds
# (full time and lon axes, 200 latitudes per chunk)
iv=2
var_prime=xr.open_dataset(ddfiles[iv],chunks={'time':-1,'lat':200,'lon':-1})[varnames[iv]]
var_prime

In [None]:
# monthly data for the same variable, opened with the same chunking
# (iv must already be set by one of the cells above)
var_mean=xr.open_dataset(mfiles[iv],chunks={'time':-1,'lat':200,'lon':-1})[varnames[iv]]
var_mean

In [None]:
# save variable metadata: work on a copy so var_mean's own attrs are not edited,
# and don't fail when there is no 'description' attribute
varattrs=dict(var_mean.attrs)
varattrs.pop('description',None)
# rework metadata for the groupby below
var_mean=var_mean.rename({'time':'month'})
var_mean['month']=months
var_mean

In [None]:
# add each month's mean to that month's daily deviations and pull the result into memory
start=timer()
var_daily=(var_prime.groupby('time.month')+var_mean).compute()
tasktime=timer()-start
print('complete in',tasktime,'s')
var_daily

In [None]:
# clean up variable metadata (drop_vars replaces the deprecated drop)
var_daily=var_daily.drop_vars('month')
var_daily.attrs=varattrs

# convert array to dataset and set global attributes
ds=var_daily.to_dataset()
ds=ds.assign_attrs({'source_data':source_data,'source_code':source_code})
ds

In [None]:
# shared contour levels so the three panels are directly comparable
levels=[-45,-40,-35,-30,-25,-20,-15,-10,-5,0,5,10,15,20,25,30,35,40,45]

# first time step of: the computed daily values, the monthly data, the daily deviations
panels=[
    (ds['Tmax-2m'],'2020-01-01'),
    (var_mean,'monthly mean Jan2020'),
    (var_prime,'2020-01-01 deviation'),
]

fig=plt.figure(figsize=(20,4))
plt.suptitle('Tmax-2m')

for ipanel,(field,label) in enumerate(panels,start=1):
    ax=fig.add_subplot(1,3,ipanel)
    field[0,:,:].plot(ax=ax,levels=levels,extend='both')
    plt.title(label)

plt.show()

# keep a copy of the figure alongside the intermediate data
figname=temp_dir+'plots/Tmax-2m_Jan-day-mean-dev_'+str(year)+'.png'
fig.savefig(figname,bbox_inches='tight')
plt.close(fig)

### Tmin-2m

In [14]:
# daily minimum temperature
# this cell only sets up the dask-backed dataset (see the chunked repr it displays);
# the file is computed and written by the following cell

# calc daily vals
iv=3
ds=calc_daily_ds(iv)
ds
# the write_nc call used for the other variables is replaced here by the
# explicit to_netcdf(..., compute=False) in the following cell

computing daily values...
complete in 0.005510091781616211 s


Unnamed: 0,Array,Chunk
Bytes,10.60 GiB,1.18 GiB
Shape,"(366, 1800, 4320)","(366, 200, 4320)"
Dask graph,9 chunks in 6 graph layers,9 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.60 GiB 1.18 GiB Shape (366, 1800, 4320) (366, 200, 4320) Dask graph 9 chunks in 6 graph layers Data type float32 numpy.ndarray",4320  1800  366,

Unnamed: 0,Array,Chunk
Bytes,10.60 GiB,1.18 GiB
Shape,"(366, 1800, 4320)","(366, 200, 4320)"
Dask graph,9 chunks in 6 graph layers,9 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [17]:
# compute and write file, expect about 7 min
var_encoding=dict(zlib=True,dtype='float32')
outfile=''.join([temp_dir,'netcdf/',varnames[iv],'_daily_',str(year),'_5m.nc'])
print('writing',outfile)

start=timer()
# compute=False returns a delayed write; the work happens in .compute() below
write_job=ds.to_netcdf(
    outfile,
    encoding={'lat':lat_encoding,'lon':lon_encoding,'time':time_encoding,varnames[iv]:var_encoding},
    compute=False,
)
write_job.compute()
tasktime=timer()-start

writing /work/hpc/datasets/un_fao/gaez_v5_intermediate/netcdf/Tmin-2m_daily_2020_5m.nc


In [18]:
# elapsed seconds stored in tasktime by the most recently run timed cell
tasktime


168.62278985977173

### Vapr --> Rhum

In [None]:
# expect about 4 min run time

# daily vapour pressure is written first; the Rhum cells below read this file back
iv=4
ds=calc_daily_ds(iv)

# write file as compressed float32
var_encoding=dict(zlib=True,dtype='float32')
write_nc(ds,outvars[iv],var_encoding)

In [None]:
# for Rhum
# the inputs needed to compute Rhum do not fit in memory together,
# so they are opened in chunks and processed in parallel

print('lazy loading data...')
start=timer()

# full time axis per chunk, tiled in lat/lon: 16 total chunks for each variable
chunks={'time':-1,'lat':540,'lon':1080}

# the daily files written by the cells above
daily_path=temp_dir+'netcdf/{}_daily_'+str(year)+'_5m.nc'

# daily vapr, lazily scaled by 0.1
vapr=xr.open_dataset(daily_path.format('Vapr'),chunks=chunks)['Vapr']*0.1

# daily tmax and tmin
tmax=xr.open_dataset(daily_path.format('Tmax-2m'),chunks=chunks)['Tmax-2m']
tmin=xr.open_dataset(daily_path.format('Tmin-2m'),chunks=chunks)['Tmin-2m']

tasktime=timer()-start
print('complete in',tasktime,'s')

In [None]:
print('lazy compute...')
start=timer()

# saturation vapor pressure: mean of exp(17.27*T/(T+237.3)) evaluated at tmax and tmin
# tmax/tmin are opened with dask chunks in the cell above, so this presumably only
# builds the expression and the timing below does not include the real computation
# NOTE(review): the FAO-56 form of this expression has a leading 0.6108 (kPa) factor
# that is absent here -- confirm whether that is intended given the units of vapr*0.1
vapr_sat=0.5*( np.exp((17.27*tmax)/(tmax+237.3)) + np.exp((17.27*tmin)/(tmin+237.3)) )

tasktime = timer()-start
print('complete in',tasktime,'s')

In [None]:
print('more lazy compute...')
start=timer()

# relative humidity in percent: actual over saturation vapor pressure
Rhum=(100*(vapr/vapr_sat)).rename('Rhum')

# fix up metadata
attrs={'standard_name':Rhum.name,'long_name':'relative humidity','units':'%'}
Rhum.attrs=attrs

# wrap the array in a dataset carrying the provenance global attributes
ds=Rhum.to_dataset().assign_attrs({'source_data':source_data,'source_code':source_code})

tasktime=timer()-start
print('complete in',tasktime,'s')

In [None]:
# compute and write file, expect about 7 min

start=timer()

var_encoding = {'zlib':True,'dtype':'float32'}
var=Rhum.name

# write next to the other daily files under temp_dir; day_dir is commented out
# in the setup cell, so referring to it here raised a NameError
outfile=temp_dir+'netcdf/'+var+'_daily_'+str(year)+'_5m.nc'
print('writing',outfile)

# compute=False returns a delayed write; the work happens in .compute() below
write_job=ds.to_netcdf(outfile,
            encoding={'lat':lat_encoding,'lon':lon_encoding,'time':time_encoding,var:var_encoding},
            compute=False)
write_job.compute()

tasktime = timer()-start
print('complete in',tasktime,'s')

### Wind-10m --> Wind-2m

In [None]:
# expect about 4 min run time

# calc daily vals
iv=5
ds=calc_daily_ds(iv)

print('interpolating...')
start=timer()
# interp from 10m to 2m height
z=10
z_adjust=4.87/(np.log(67.8*z-5.42))
# arithmetic on a Dataset does not keep its attributes by default, so carry the
# global attributes (source_data, source_code) over to the scaled dataset explicitly
ds=(ds*z_adjust).assign_attrs(ds.attrs)

# fix metadata
attrs={'standard_name':outvars[iv],'long_name':'2m Wind Speed','units':'m/s'}
ds=ds.rename({varnames[iv]:outvars[iv]})
ds[outvars[iv]].attrs=attrs

tasktime = timer()-start
print('complete in',tasktime,'s')


# write file
var_encoding = {'zlib':True,'dtype':'float32'}
write_nc(ds,outvars[iv],var_encoding)

# Check that everything looks ok

In [None]:
# the daily files are written under temp_dir+'netcdf/'; day_dir is commented out
# in the setup cell, so referring to it here raised a NameError
dfiles=natsorted(glob.glob(temp_dir+'netcdf/*_daily_'+str(year)+'_*.nc'))
dfiles

In [None]:
# plot the same day from every daily file as a quick visual check
# the date has to fall inside the processed year, so build it from `year`
ptime=str(year)+'-01-03'
for f in dfiles:
    # variable name is the part of the file name before the first underscore
    vname=f.split('/')[-1].split('_')[0]
    var=xr.open_dataset(f)[vname].sel(time=ptime)

    fig=plt.figure(figsize=(4,4))
    ax=fig.add_subplot(111)

    var.plot(ax=ax)
    plt.title(vname+', '+ptime)
    plt.show()
    # release the figure so they do not pile up across the loop
    plt.close(fig)


### Looks like there is an issue with Rhum. I suspect it is because the Vapr, Tmin-2m, and/or Tmax-2m rst monthly files are slightly different from the tifs Gunther is using.

In [None]:
# pick the Rhum file by name rather than by its position in the sorted list
rhum_file=[f for f in dfiles if f.split('/')[-1].startswith('Rhum_')][0]
rhum=xr.open_dataset(rhum_file)['Rhum'].load()
rhum.min(),rhum.max()

In [None]:
# one colour per interval between the levels below, so values under 0 % and
# over 100 % stand out from the valid 0-100 % range
cmap=['deepskyblue','whitesmoke','indigo','tomato']

fig=plt.figure(figsize=(10,2))
ax=plt.subplot(111)

# the date has to fall inside the processed year, so build it from `year`
t=str(year)+'-01-03'
# lat=slice(80,45) selects a northern band; assumes lat is stored north to south -- TODO confirm
rhum.sel(time=t,lat=slice(80,45)).plot(ax=ax,levels=[-11,0,100,105,215],colors=cmap)
plt.title(t)
plt.show()