
# Setup

In [1]:
# for computing
import numpy as np
import xarray as xr # for reading/writing netcdf
# import dask.array as da
# import dask
import pandas as pd # only used for date times

# convenience things
from time import time, sleep
import os
import glob # for system commands
# from natsort import natsorted # for alphabetical sorting

# for plotting
import matplotlib.pyplot as plt
# from dask_jobqueue import SLURMCluster
# from dask.distributed import Client

In [None]:
# # contain worker logs in their own folder
# homedir = os.environ['HOME']
# daskpath=os.path.join(homedir, "dask-worker-space-can-be-deleted")

# try: 
#     os.mkdir(daskpath) 
# except OSError as error: 
#     print(error) 

In [2]:
# your notebook directory location, change per user
# repo_dir='/work/hpc/users/kerrie/UN_FAO/repos/py_AEZ_data_prep/'

# link to where this notebook lives on github, same for everyone
# (stored in the output file's 'source_code' global attribute)
nb_link='https://github.com/kerriegeil/pyAEZ_data_prep/blob/main/global/01_dat_to_nc.ipynb'

# data directory locations, same for everyone
# NOTE: paths are built by plain string concatenation further down, so the
# trailing '/' on each directory is required
# maskfile and static_dir are only referenced by commented-out cells in this notebook
maskfile = 'ALOSmask5m_fill.rst'
static_dir = '/work/hpc/datasets/un_fao/gaez_v5/land/'
# temp_dir holds the input dat files (dat/), the mask file (static/) and receives the output netcdf
temp_dir = '/work/hpc/datasets/un_fao/gaez_v5_intermediate/'
# data_dir='/work/hpc/datasets/un_fao/'

# personal_dir='/work/hpc/users/kerrie/UN_FAO/temp_data/'

# your output directory location, change per user
# to not overwrite data files you need to change these to a location 
# under your /work/hpc/users/username directory
# and make sure the directories exist before running the script (mkdir)
# out_dir_static=data_dir+'gaez/static/'
# out_dir_time=data_dir+'gaez/global_1980/dailydev/netcdf/'
# out_dir_static=data_dir+'gaez_v5_2023OCT/static/'
# out_dir_time=data_dir+'gaez_v5_2023OCT/global_2020/hist/daily5m/netcdf/'
# out_dir_static=personal_dir
# out_dir_time=personal_dir

# the ALOSmask file that has matching grid to your pxv/dat data, same for everyone
# maskfile=data_dir+'pyaez/static/rst/ALOSmask5m_fill.rst'
# maskfile=data_dir+'gaez_v5_2023OCT/static/ALOSmask5m_fill.rst'

# provenance strings written to the output file's global attributes, same for everyone
source_dirs = [
    'on MSU HPC2 /gri/projects/UN_FAO/',
    'on Orion /work/hpc/datasets/un_fao/gaez_v5_intermediate/dat/',
]

# file/variable info, same for everyone
year = 2020        # earlier runs used 1980
fillval = -9999.   # value used for missing in the dat files

# scale and units info from the file "UnitScaleFactors.txt" from Gunther
# keys are the variable names used in the dat file names
varinfo = {
    'Precip': dict(scale_factor=1.e-5, units='unitless (mm/mm)', long_name='precipitation fraction'),
    'Srad': dict(scale_factor=1000., units='J/m2/day', long_name='surface short wave radiation'),
    'Tmax-2m': dict(scale_factor=0.01, units='degrees C', long_name='2m maximum air temperature'),
    'Tmin-2m': dict(scale_factor=0.01, units='degrees C', long_name='2m minimum air temperature'),
    'Vapr': dict(scale_factor=0.01, units='hPa', long_name='vapor pressure'),
    'Wind-10m': dict(scale_factor=0.001, units='m/s', long_name='10m wind speed'),
}

# coordinate metadata for output data files
timeattrs = dict(standard_name='time', long_name='time', axis='T')
latattrs = dict(standard_name='latitude', long_name='latitude', units='degrees_north', axis='Y')
lonattrs = dict(standard_name='longitude', long_name='longitude', units='degrees_east', axis='X')

# encoding info for writing netcdf files
time_encoding = dict(calendar='standard', units='days since 1900-01-01 00:00:00', _FillValue=None)
lat_encoding = dict(_FillValue=None)
lon_encoding = dict(_FillValue=None)
var_encoding = dict(zlib=True, dtype='float32')

# Read data from dat file

In [3]:
# choose a variable
# must be one of the keys of varinfo (it is used to look up the scale factor,
# units and long_name) and is also the leading part of the dat file name
varname='Srad'

In [4]:
# get dat file
# the wildcard stands for the part of the file name between "AgERA5_" and the year
pattern=temp_dir+'dat/'+varname+'365_AgERA5_*_'+str(year)+'_5m.dat'

# glob returns matches in arbitrary order, so sort to make the choice repeatable
matches=sorted(glob.glob(pattern))

# fail with a readable message instead of a bare IndexError when nothing matches
if not matches:
    raise FileNotFoundError('no dat file found matching '+pattern)

infile=matches[0]
infile

'/work/hpc/datasets/un_fao/gaez_v5_intermediate/dat/Srad365_AgERA5_Hist_2020_5m.dat'

In [5]:
# get each line from dat file as a string and remove carriage returns
# the with-block closes the file handle as soon as the text has been read
with open(infile) as datfile:
    temp=datfile.read().splitlines()

# grab the lines with lat/lons (every other line, starting with the first)
ilatilon=temp[0::2]

# grab the lines with the data (every other line, starting with the second)
data=temp[1::2]

In [6]:
# the first two whitespace-separated fields of every lat/lon line are read as
# integers and stored as int16 numpy arrays (first field -> ilat, second -> ilon)
ilat=np.array([int(fields[0]) for fields in map(str.split,ilatilon)]).astype('int16')
ilon=np.array([int(fields[1]) for fields in map(str.split,ilatilon)]).astype('int16')

# put data in a numpy array too, takes 30-60s
# NOTE(review): the recorded output for Srad shows a minimum of -27299, which is
# below the -9999 fill value -- confirm the raw values fit in (signed) int16
data2D=np.loadtxt(data,dtype='int16')
nt=data2D.shape[1]

print(f'variable = {varname}')
print('data dimensions:')
print(f'{data2D.shape[0]} rows (each row represents a different grid cell)')
print(f'{nt} cols (each col represents a day of the year)')
print(f'data min max before scaling: {data2D.min()} {data2D.max()}')

variable = Srad
data dimensions:
2295358 rows (each row represents a different grid cell)
366 cols (each col represents a day of the year)
data min max before scaling: -27299 21907


In [8]:
# per Gunther, the number of grids in each pxv file with data present should be 2287408
# this number comes from the number of grids=1 in the ALOSmask (2295358) minus 
# some additional grids that Gunther has set to the missing value -9999 in the pxv files
# some files have more grids with data, some files have less grids with data
# we will fix this by applying a mask later

# how many grids have data for the chosen variable?
# count directly instead of building full-size 0/1 integer arrays with np.where,
# and compare against fillval from the setup cell rather than a hard-coded -9999
npresent=np.count_nonzero(data2D!=fillval)
# dividing the number of non-fill values by the number of days gives an average
# per-day count, so the result is fractional when a row is only partly filled
ngrids=npresent/nt

# how many grids are set to the missing value?
nmissing=(data2D.size-npresent)/nt

# print some info
print(ngrids,'grids with data present, per Gunther this should be 2287408')
print(nmissing,'grids set to fill value')

2287396.1174863386 grids with data present, per Gunther this should be 2287408
7961.882513661202 grids set to fill value


In [17]:
# flag is True when the count of grids with data differs from the expected 2287408
flag=bool(ngrids != 2287408.)
if flag:
    print('incorrect number of data points found, mask will be applied')

incorrect number of data points found, mask will be applied


# Get grid info from mask file

In [9]:
# open the mask file and take the grid description from it:
# the spatial reference, the grid shape and float32 lat/lon coordinate values
mask=xr.open_dataset(temp_dir+'static/mask_2287408_5m.nc')
spatial_ref=mask.spatial_ref
nlats,nlons=mask['mask'].shape
lats,lons=(mask['mask'][dim].data.astype('float32') for dim in ('lat','lon'))

In [10]:
# daily time axis running from Jan 1 to Dec 31 of the configured year
# NOTE: this rebinds the name `time`, which the setup cell imported from the time module
time=pd.date_range(start=f'{year}-01-01',end=f'{year}-12-31',freq='D')

In [None]:
# # get the lats and lons from the ALOSmask5m_fill.rst file
# mask=xr.open_dataset(static_dir+maskfile,engine='rasterio').squeeze()['band_data'] 
# mask=mask.drop('band')

# mask

In [None]:
# # how many non-zero grids in the ALOS mask file?
# # more than the 2287408 number
# test=xr.where(mask>0,1,0)
# test=xr.where(mask.y<-60,0,test) # eliminate Antarctica
# test.data.sum()

In [11]:
# # save coordinate information for creating netcdf files
# nlats,nlons=mask.shape
# lats=mask.y.data.astype('float32')
# lons=mask.x.data.astype('float32')

# # create a time dimension
# time=pd.date_range(str(year)+'-01-01',str(year)+'-12-31',freq='D')

# # # if leap day is deleted out of the dataset, fix up time to match
# # if len(time) != nt:
# #     time=time[~(time==str(year)+'-02-29')]

# report the size of the global array that will be built
print(f'global data dimensions: {nlats} latitudes by {nlons} longitudes by {nt} days')

global data dimensions: 1800 latitudes by 4320 longitudes by 366 days


we now have the metadata needed (lat, lon, time info) to create an xarray data array to store global data

# Start a compute cluster with dask distributed 

if you started jupyter notebook session with many nodes/cores, use LocalCluster below and cluster.scale(number) appropriately for how many nodes/cores you have running. THIS METHOD IS RECOMMENDED DUE TO HIGH USAGE ON ORION WHICH CAN LEAVE SLURMCLUSTER WORKERS IN THE QUEUE

If you started this jupyter notebook session on 1 node with few cores, shut down this session and open a new session on 6 nodes, 120 cores (6 nodes * 20 cores per node) and also use the --exclusive flag under additional slurm parameters. Then re-open this notebook to run it.

Notes

- setting the cluster to 1 worker = a full node of cores (20), then scaling up is the way to go

- the default settings with less cores/threads per worker and more workers takes much longer to compute

- for LocalCluster: don't start/scale the cluster more than once. If you need to start a new one, restart the whole kernel first and re-run the notebook

In [None]:
# # # DON'T RUN THIS CELL MORE THAN ONCE, SKIP WHEN PROCESSING ADDITIONAL VARIABLES
# # # don't run this cell more than once unless you restart the jupyter kernel first and then rerun the notebook up to this point

# # # for LocalCluster
# # from dask.distributed import Client,LocalCluster
# # cluster=LocalCluster(n_workers=1,threads_per_worker=20) # define your compute cluster (does not start any computing)

# # ############################################################
# # # ADJUST THIS NUMBER
# # ############################################################
# # # the following assumes you opened your jupyter session on multiple full nodes e.g. x nodes and x*20 cores
# # nodes=6 # adjust this to however many nodes you opened your session with
# # ############################################################

# # # start and scale up your local computing cluster
# # client=Client(cluster)  # connect to your compute cluster
# # cluster.scale(nodes-1)  # scale up your compute cluster to more nodes
# # sleep(2)
# # client # click on "Cluster Info" below to see how many workers and threads are running in the local cluster

# cluster = SLURMCluster(
#     queue='bigmem',
#     account="191000-nf0001",
#     processes=1,
#     cores=40,
#     memory='300GB',
#     walltime="01:00:00",
#     log_directory=daskpath)

# client=Client(cluster)
# nworkers=6
# cluster.scale(nworkers)

# # now we wait for the workers to start
# # you should see nworkers under the cluster info section (click on it when it appears)
# client.wait_for_workers(nworkers,timeout=300)
# client

In [None]:
# # if workers didn't show up above run this cell. workers should show up under "Local Cluster" below
# sleep(10)
# client
# print(cluster.job_script())

# Use dask delayed parallel computing to put data onto global grid

loop thru each latitude to create a chunk of the global data array, where grid cells without data are filled with nan

inside the loop we issue calls to functions that do the heavier computing tasks

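we delay those functions to create a dask task graph of length nlats (one task per latitude row: 1800 for the grid loaded above, 2160 for a full pole-to-pole 5-arcminute grid) and then set off all the tasks to compute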

In [None]:
# # function to build a chunk of data for grids where data is present

# @dask.delayed
# def build_full_lat(ixs,data,y,x,t,fv,sf):
#     # create nan array of shape (all days, 1 lat, all lons)
#     arr=xr.DataArray(np.nan,
#                      dims=['time','lat','lon'],
#                      coords={'time':('time',time),'lat':('lat',y),'lon':('lon',x)}).astype('float32')
    
#     for i,ix in enumerate(ixs):
#         arr[:,0,ix]=data[i,:]
    
#     # below only works when the fill value is not in the range of data
#     # only do this for precip so we can save the mask
#     # for everything else, instead of converting -9999 to nan here,
#     # scale everything including the fill value and apply the mask at the end
# #     arr=arr.where(arr!=fv)  # replace all -9999 with nan
#     arr=arr*sf # apply scale factor
    
#     return arr

In [None]:
# # function to build a chunk of data for grids where data is not present

# @dask.delayed
# def build_empty_lat(y,x,t):
#     # create nan array of shape (all days, 1 lat, all lons)    
#     arr=xr.DataArray(np.nan,
#                      dims=['time','lat','lon'],
#                      coords={'time':('time',time),'lat':('lat',y),'lon':('lon',x)}).astype('float32')
#     return arr

In [12]:
# @dask.delayed
def build_full_lat(ixs,data,y,x,t):
    """Build one latitude chunk of the global array, filled where data exist.

    Parameters
    ----------
    ixs : 1-D sequence of int
        zero-based longitude index for each row of ``data``
    data : 2-D array, shape (len(ixs), len(t))
        one row per grid cell, one column per time step
    y : 1-D sequence
        latitude value(s) of this chunk; only its length is used
    x : 1-D sequence
        longitude values; only its length is used
    t : sequence
        time axis; only its length is used (previously the time length was
        taken from the notebook-level variable ``nt`` and ``t`` was ignored)

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(t), len(y), len(x)) that is NaN everywhere
        except at ``[:, 0, ixs]``, which holds the rows of ``data``
    """
    # start from an all-NaN chunk of shape (all days, 1 lat, all lons)
    arr=np.full((len(t),len(y),len(x)),np.nan,dtype='float32')
    # place every grid cell's time series in one vectorized assignment
    # (intp index dtype keeps an empty ixs valid as an index)
    arr[:,0,np.asarray(ixs,dtype=np.intp)]=np.asarray(data).T
    return arr

def build_empty_lat(y,x,t):
    """Build one all-NaN latitude chunk for latitudes without any data.

    Parameters
    ----------
    y : 1-D sequence
        latitude value(s) of this chunk; only its length is used
    x : 1-D sequence
        longitude values; only its length is used
    t : sequence
        time axis; only its length is used (previously the time length was
        taken from the notebook-level variable ``nt`` and ``t`` was ignored)

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(t), len(y), len(x)) filled with NaN
    """
    return np.full((len(t),len(y),len(x)),np.nan,dtype='float32')

In [13]:
# build one (time, 1 lat, all lons) chunk per latitude row and collect them in order
arr_list=[]
for iy in range(nlats):
    if iy%500==0: print('processing iy = ',iy,'of',nlats)
    indices=np.where(ilat==iy+1)[0] # find which data rows apply to this latitude (ilat is 1-based)
    # test the NUMBER of matching rows: np.any(indices) looks at the index values,
    # so a latitude whose only matching row is row 0 would wrongly be treated as empty
    if indices.size>0:
        # ilon is 1-based, the array column index is 0-based
        result=build_full_lat((ilon[indices]-1),data2D[indices,:],lats[iy:iy+1],lons,time)
    else:
        result=build_empty_lat(lats[iy:iy+1],lons,time)
    arr_list.append(result)

processing iy =  0 of 1800
processing iy =  500 of 1800
processing iy =  1000 of 1800
processing iy =  1500 of 1800


In [14]:
# join the per-latitude chunks along axis 1 (the latitude axis) into one
# (time, lat, lon) float32 array; the shape is displayed as a check
bignp=np.concatenate(arr_list,axis=1,dtype='float32')
bignp.shape


(366, 1800, 4320)

In [37]:
# wrap the global numpy array in a labelled DataArray
# copy=False: bignp is already float32, so the dtype guarantee is kept without
# duplicating the whole array in memory
bigarr=xr.DataArray(bignp,name=varname,
                 dims=['time','lat','lon'],
                 coords={'time':('time',time),'lat':('lat',lats),'lon':('lon',lons)}).astype('float32',copy=False)
# bigarr

In [29]:
# mask=mask.rename({'y':'lat','x':'lon'})
# mask['lon']=mask.lon.data.astype('float32')
# mask['lat']=mask.lat.data.astype('float32')
# mask=xr.where(mask>0,1,0)
# mask


In [38]:
# NOTE: this cell overwrites bigarr with its scaled version, so running it a
# second time without rebuilding bigarr applies the scale factor twice
bigarr=bigarr*varinfo[varname]['scale_factor'] # apply scale factor
bigarr

In [39]:
# keep values only where the 'mask' variable of the mask dataset equals 1;
# everything else becomes nan
# bigarr=xr.where(mask.mask==1,bigarr,np.nan) 
bigarr=bigarr.where(mask.mask==1)
# bigarr=bigarr.transpose('time','lat','lon') # change dataset back to array with proper ordering  
# bigarr.name=varname # change var name back to variable
bigarr

In [40]:
# remove the spatial_ref coordinate from the data array
# drop_vars replaces the deprecated .drop; errors='ignore' lets the cell be
# re-run after spatial_ref has already been removed
bigarr=bigarr.drop_vars('spatial_ref',errors='ignore')
bigarr

In [None]:
# `mask` is the Dataset opened from the mask file, so compare its 'mask' variable:
# passing the whole Dataset to xr.where returns a Dataset instead of a DataArray
bigarr=xr.where(mask['mask']==1,bigarr,np.nan) # xr.where reorders dimensions so we have to fix below
bigarr=bigarr.transpose('time','lat','lon') # restore (time, lat, lon) ordering

In [None]:
bigarr.dtype

In [None]:
# NOTE(review): an earlier cell already multiplies bigarr by the scale factor,
# so running this cell afterwards scales the data a second time. The fill-value
# comparison below would then run on already-scaled values -- confirm whether
# this cell is still needed in the current workflow.
bigarr=bigarr.where(bigarr!=fillval)  # replace all -9999 with nan
# print(bigarr.min().data,bigarr.max().data)
bigarr=bigarr*varinfo[varname]['scale_factor'] # apply scale factor
# print(bigarr.min().data,bigarr.max().data)

In [None]:
# attributes for the data variable, looked up from varinfo for the chosen variable
varattrs=dict(standard_name=varname,
              long_name=varinfo[varname]['long_name'],
              units=varinfo[varname]['units'])

# attach name and attributes to the array and its coordinates
bigarr.name=varname
bigarr.attrs=varattrs
bigarr['lat'].attrs=latattrs
bigarr['lon'].attrs=lonattrs
bigarr['time'].attrs=timeattrs

# convert to a dataset, add the spatial reference from the mask file as a
# coordinate and record provenance in the global attributes
ds=(bigarr.to_dataset()
          .assign_coords({'spatial_ref':mask.spatial_ref})
          .assign_attrs({'source_data':source_dirs,
                         'source_code':nb_link}))

print(f'bigarr is {bigarr.nbytes/1E9} GB')

ds

In [None]:
bigarr[0,:,:].plot()

In [None]:
# writing with compression, will take a few minutes
# the data variable in ds is named after varname, so its encoding is keyed by
# varname (varnames and v are not defined in this notebook)
ds.to_netcdf(temp_dir+varname+'_DailyDev_'+str(year)+'_5m.nc',
            encoding={'lat':lat_encoding,
                      'lon':lon_encoding,
                      'time':time_encoding,
                      varname:var_encoding})

now we write a normal not-delayed loop to call the delayed functions and collect all the delayed tasks into a list called tasklist

In [None]:
tasklist=[] # one entry per latitude row

# data2D=dask.delayed(data2D) this really slows things down and I don't know why

# parallelize by latitude (latitude loop)
for iy in range(nlats):
    
    indices=np.where(ilat==iy+1)[0] # find which data rows apply to this latitude
    
    # if there is any data present at this latitude call build_full_lat function
    # (test the number of matching rows: np.any(indices) looks at the index values
    # and would treat a latitude whose only matching row is row 0 as empty)
    if indices.size>0:
        # the active build_full_lat definition takes 5 arguments and does not
        # scale, so fill value and scale factor are no longer passed here
        result=build_full_lat((ilon[indices]-1),data2D[indices,:],lats[iy:iy+1],lons,time)
    # if there is no data present at this latitude call build_empty_lat function
    else:
        result=build_empty_lat(lats[iy:iy+1],lons,time)
    tasklist.append(result) # collect results in latitude order

In [None]:
len(tasklist)

dask.compute starts the parallel computing and pulls all the results down from workers into a list of arrays

note: the use of *tasklist means output will be a tuple with one array per latitude row (length nlats; 1800 for the grid loaded above, 2160 for a full pole-to-pole grid); without * the output will have length 1

In [None]:
# ignore the warning

# NOTE(review): `import dask` is commented out in the setup cell and the
# @dask.delayed decorator on build_full_lat is commented out as well, so as the
# notebook stands `dask` is undefined here -- confirm whether this cell is still used
# on 5 or 6 nodes expect a few minutes run time, maybe 4-8 min
output=dask.compute(*tasklist) # execute all the delayed compute tasks across our local cluster
output[0] # look at 1 array in the list

now concat all the arrays into a single large array

In [None]:
# join the per-latitude results along the 'lat' dimension
# NOTE(review): xr.concat needs xarray objects; the active build_* functions
# return plain numpy arrays -- confirm this cell matches the current workflow
bigarr=xr.concat(output,dim='lat')
bigarr

# create or apply mask

In [None]:
# # the ALOS mask provided doesn't match where there is and is not data present in the pxv/dat files
# # save a new mask from the precip pxv file
# if v==0:
#     newmask=xr.where(np.isfinite(bigarr[0,:,:]),1,0) # 2160x4320 mask of 0&1
#     # assign metadata
#     newmask.name='mask'
#     newmask=newmask.drop('time')
#     newmask['lat'].attrs=latattrs
#     newmask['lon'].attrs=lonattrs
#     newmask.attrs={'description':'binary admin mask created using ALOSmask5m_fill.rst and the precip pxv file'}
#     newmask=newmask.to_dataset()
#     newmask=newmask.assign_coords({'spatial_ref':mask.spatial_ref})
#     # write mask netcdf file
#     newmask.to_netcdf(out_dir_static+'mask_2287408_5m.nc',
#                 encoding={'lat':lat_encoding,'lon':lon_encoding,'mask':var_encoding})  
# else:
# if variable is not precip
# load and apply the new mask
# newmask=xr.open_dataset(static_dir+'mask_2287408_5m.nc') # read the mask file
# newmask=xr.open_dataset(temp_dir+'static/'+'mask_2287408_5m.nc') # read the mask file

# read the full mask file (a leftover .isel(lat=slice(500,1000)) restricted the
# mask to 500 latitude rows, which does not match the full global array)
newmask=xr.open_dataset(temp_dir+'static/'+'mask_2287408_5m.nc')
# newmask is a Dataset, so the result is a Dataset whose 'mask' variable holds the masked data
bigarr=xr.where(newmask==1,bigarr,np.nan) # xr.where reorders dimensions so we have to fix below
bigarr=bigarr['mask'].transpose('time','lat','lon') # change dataset back to array with proper ordering

In [None]:
bigarr

In [None]:
# NOTE: this cell repeats the metadata/dataset cell that appears earlier in the notebook
# variable/coordinate metadata, looked up from varinfo for the chosen variable
varattrs={'standard_name':varname,'long_name':varinfo[varname]['long_name'],'units':varinfo[varname]['units']}

# assign metadata (sets name/attrs on bigarr and its coordinates in place)
bigarr.name=varname
bigarr.attrs=varattrs
bigarr['lat'].attrs=latattrs
bigarr['lon'].attrs=lonattrs
bigarr['time'].attrs=timeattrs

# array to dataset, with the spatial reference from the mask file as a coordinate
# and the provenance strings as global attributes
ds=bigarr.to_dataset()
ds=ds.assign_coords({'spatial_ref':mask.spatial_ref})
ds=ds.assign_attrs({'source_data':source_dirs,
                    'source_code':nb_link})

print('bigarr is',bigarr.nbytes/1E9,'GB')

ds

let's take a look at the data

In [None]:
import matplotlib.pyplot as plt

# plot June 1 of the configured year (previously hard-coded and edited by hand per year)
t=str(year)+'-06-01'

fig,ax=plt.subplots(figsize=(8,8))

# regional subset: lat 50 to 23, lon -90 to -65
# the lat slice runs high-to-low: assumes the lat coordinate is stored north-to-south -- TODO confirm
bigarr.sel(lat=slice(50,23),lon=slice(-90,-65),time=t).plot(ax=ax)
ax.set_title(varname+' dev '+t,fontsize='xx-large')


In [None]:
bigarr[0,:,:].plot()

In [None]:
newmask.mask.plot()

# Write netcdf file with compression



In [None]:
# writing with compression, will take a few minutes
# the data variable in ds is named after varname, so its encoding is keyed by
# varname (varnames and v are not defined in this notebook)
ds.to_netcdf(temp_dir+varname+'_DailyDev_'+str(year)+'_5m.nc',
            encoding={'lat':lat_encoding,
                      'lon':lon_encoding,
                      'time':time_encoding,
                      varname:var_encoding})

#### process all other variables using the appropriate cells above, then proceed to below

# Check netcdf files

double checking that all files have 2287408 grids with data

In [None]:
# all output files for this year; sorted because glob returns matches in arbitrary order
files=sorted(glob.glob(temp_dir+'*_DailyDev_'+str(year)+'_5m.nc'))

In [None]:
for f in files:
    # the with-block closes each file again once it has been checked
    with xr.open_dataset(f) as dcheck:  # read a data file
        vn=list(dcheck.data_vars)[0]  # get variable name (first data variable)
        has_data=np.isfinite(dcheck[vn].isel(time=0))  # select one timestep and flag grids with finite values
        ncells=int(has_data.sum())  # count how many grids have data
    print(vn,'has',ncells,'grids with data')

In [None]:
# the cells that create `client` are commented out above, so only shut the
# cluster down when a client was actually started in this session
if 'client' in globals():
    client.shutdown()