In [1]:
import numpy as np
# import glob
# import rioxarray as rio
import xarray as xr
# from natsort import natsorted 
import matplotlib.pyplot as plt
import dask
import pandas as pd 
# from dask_jobqueue import SLURMCluster
# from dask.distributed import Client
# from time import sleep
import os


In [2]:
# ---- paths ----------------------------------------------------------------
# directory for writing intermediate files (dat, netcdf, extra masks etc)
temp_dir = "/work/hpc/datasets/un_fao/gaez_v5_intermediate/"

# # directory for writing the final npy and tif inputs for pyaez
# output_dir="/work/hpc/datasets/un_fao/pyaez/global_"

# source rasters for the admin mask and the elevation
ALOSmaskfile = '/work/hpc/datasets/un_fao/gaez_v5/land/ALOSmask5m_fill.rst'
ALOSelevfile = '/work/hpc/datasets/un_fao/gaez_v5/land/ALOSdem5m_fill.rst'

# dat files whose data coverage defines the monthly and daily masks
monthlyfileformask = '/work/hpc/datasets/un_fao/gaez_v5_intermediate/dat/Precip_AgERA5_Hist_2020_5m.dat'
dailyfileformask = '/work/hpc/datasets/un_fao/gaez_v5_intermediate/dat/Precip365_AgERA5_Hist_2020_5m.dat'

# value that marks missing data in the dat files
fillval = -9999.

# link recorded in the global attributes of every file written below
nb_link = 'https://github.com/kerriegeil/pyAEZ_data_prep/blob/main/global/hpc_shell_workflow/01_create_all_masks.ipynb'

# ---- metadata for writing netcdf files ---------------------------------------
latattrs = {
    'standard_name': 'latitude',
    'long_name': 'latitude',
    'units': 'degrees_north',
    'axis': 'Y',
}
lonattrs = {
    'standard_name': 'longitude',
    'long_name': 'longitude',
    'units': 'degrees_east',
    'axis': 'X',
}
varattrs = {
    'standard_name': 'mask',
    'long_name': 'mask',
    'units': 'unitless',
    'description': 'binary admin mask of 0 and 1, where 1 indicates cell with data',
}
elevattrs = {
    'standard_name': 'elev',
    'long_name': 'elevation',
    'units': 'm',
}

# ---- encoding for writing netcdf files ---------------------------------------
# coordinates are written without a _FillValue attribute
lat_encoding = {'_FillValue': None}
lon_encoding = {'_FillValue': None}
# data variables are zlib-compressed; masks as int32, elevation as float32
var_encoding = {'zlib': True, 'dtype': 'int32'}
elev_encoding = {'zlib': True, 'dtype': 'float32'}


# Create binary admin masks

There are 3 different masks in this data: 


<br><br>
1) the mask that is supposed to apply to everything is ALOSmask5m_fill.rst with 2,295,358 grids of data

2) the mask I create from the monthly mean pxv files. These files have 2,268,708 grids with data + 26,650 additional grids set to the missing value. These two numbers combined equal the number of grids with data in mask (1)

3) the mask I create from the daily dev pxv files. These files have 2,287,408 grids with data + 7,950 additional grids set to the missing value. These two numbers combined equal the number of grids with data in mask (1)

<br><br>


Moving forward I use the mask with the fewest grids with data (2) to mask all other files.


Here we create a netcdf for each of these masks. This allows for easy comparison of where there is and isn't data present across the different file types.

#### we also create elevation files to match each mask

### We start with (1) the mask provided in ALOSmask5m_fill.rst

In [3]:
vname = 'mask'
infile = ALOSmaskfile

# open the raster, collapse length-1 dimensions and discard the band coordinate
ds = xr.open_dataset(infile, engine='rasterio').squeeze()
del ds.coords['band']

# store the spatial coordinates as float32 when they are present
for dim in ('y', 'x'):
    if dim in list(ds.coords):
        ds[dim] = ds[dim].astype('float32')

# rename variable and dimensions
ds = ds.rename({'x': 'lon', 'y': 'lat', 'band_data': vname})

# create binary mask: 1 where the raster is positive, 0 elsewhere,
# then force everything south of 60S to 0 (eliminate antarctica)
binary_mask = xr.where(ds[vname] > 0, 1, 0)
binary_mask = xr.where(ds.lat < -60, 0, binary_mask)
ds[vname] = binary_mask.astype('int32')

# variable/coordinate metadata
ds[vname].attrs = varattrs
ds['lat'].attrs = latattrs
ds['lon'].attrs = lonattrs

# global attributes
ds = ds.assign_attrs({'source_data': infile,
                      'source_code': nb_link})

# slice off Antarctica (the slice runs from 90 down to -60)
ds = ds.sel(lat=slice(90, -60))

ds

In [4]:
# test for values other than 0,1
print(np.unique(ds[vname]))

# test for correct mask
ngrids=ds[vname].data.sum()
print(ngrids)


[0 1]
2295358


the ALOS mask with Antarctica set to zero has 2295358 grids equal to 1

In [None]:
# write to file; the grid count is part of the file name
out_path = temp_dir+'static/'+vname+'_'+str(ngrids)+'_5m.nc'
mask_encoding = {'lat': lat_encoding,
                 'lon': lon_encoding,
                 vname: var_encoding}
ds.to_netcdf(out_path, encoding=mask_encoding)

In [5]:
# save grid info for other masks
lats = ds['lat'].data
lons = ds['lon'].data
nlats = len(ds['lat'])
nlons = len(ds['lon'])

# keep the spatial_ref coordinate so it can be attached to the other masks
spatial_ref = ds['spatial_ref']
spatial_ref

In [6]:
# keep the ALOS mask (1) for masking the elevation later
maskALOS = ds['mask']

### Next, we create a mask from where data is present in the monthly mean dat file (2)

In [None]:
# # for rst files

# vname='mask'
# infile=monthlyfileformask

# ds=xr.open_dataset(infile,engine='rasterio').squeeze() 
# del ds.coords['band']

# if 'y' in list(ds.coords): 
#     ds['y']=ds['y'].astype('float32')
# if 'x' in list(ds.coords):
#     ds['x']=ds['x'].astype('float32') 

# # rename variable and dimensions
# ds=ds.rename({'x':'lon','y':'lat','band_data':vname})

# # create binary mask
# ds[vname]=xr.where(ds[vname]>=0,1,0)
# ds[vname]=xr.where(ds.lat<-60,0,ds[vname]) # eliminate antarctica
# ds[vname]=ds[vname].astype('int32')

# # variable/coordinate metadata
# varattrs={'standard_name':vname,'long_name':'mask','units':'unitless','description':'binary administrative mask'}
# ds[vname].attrs=varattrs
# ds['lat'].attrs=latattrs
# ds['lon'].attrs=lonattrs

# # global attributes
# ds=ds.assign_attrs({'source_data':infile,
#                     'source_code':nb_link})

# ds

In [None]:
# # test for values other than 0,1
# print(np.unique(ds[vname]))

# # test for correct mask
# ngrids=ds[vname].data.sum()
# print(ngrids)

In [7]:
# for dat files

vname='mask'
infile=monthlyfileformask

# parse dat file into 2D numpy array
# The file is read as alternating lines: an index line (first two tokens are
# parsed as ilat and ilon) followed by a line of data values.
# Use a context manager so the file handle is closed as soon as it is read.
with open(infile) as f:
    temp=f.read().splitlines() # get each line from dat file as a string and remove carriage returns
ilatilon=temp[0::2] # grab the lines with lat/lons (every other line)
data=temp[1::2] # grab the lines with the data (every other line)

# get each string lat/lon as integer and put it in an numpy array
ilat=np.array([int(i.split()[0]) for i in ilatilon]).astype('int16') 
ilon=np.array([int(i.split()[1]) for i in ilatilon]).astype('int16') 

# put data in a numpy array too: one row per grid, one column per time step
data2D=np.loadtxt(data,dtype='float32')
nt=data2D.shape[1]

# how many grids have data (excluding the missing value)?  
# (count of non-fill values divided by the number of time steps)
nomissing=np.where(data2D==fillval,0,1)
ngrids=nomissing.sum()/nt

# how many grids are set to the missing value?
missing=np.where(data2D==fillval,1,0)
nmissing=missing.sum()/nt

# print some info
print(ngrids,nmissing)
print(data2D.min(),data2D.max())

2268708.0 26650.0
-9999.0 4785.29


In [8]:
# create a time dimension: one stamp at the start of each month of 2020
time = pd.date_range(start='2020-01-01', end='2020-12-31', freq='MS')
# time

In [9]:
def build_full_lat(ixs,data,y,x,t):
    """Build one latitude band of the global grid and fill in the known cells.

    Parameters
    ----------
    ixs : sequence of int
        0-based longitude indices; row ``i`` of ``data`` is written to
        column ``ixs[i]``.
    data : 2D array, shape (len(ixs), len(t))
        One row of values per listed grid cell, one column per time step.
    y : sequence
        Latitude value(s) of the band; only its length is used, and data is
        written to the first row.
    x : sequence
        Longitude values; only its length is used.
    t : sequence
        Time axis; its length sets the size of the first dimension.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(t), len(y), len(x)), NaN everywhere
        except the cells listed in ``ixs``.

    Raises
    ------
    ValueError
        If ``data`` is not 2D with ``len(t)`` columns.
    """
    # size the time axis from the argument rather than from a notebook global
    nsteps=len(t)
    data=np.asarray(data)
    if data.ndim!=2 or data.shape[1]!=nsteps:
        raise ValueError('data must have shape (len(ixs), len(t)); got '
                         +str(data.shape)+' for len(t)='+str(nsteps))

    # create nan array of shape (all time steps, 1 lat, all lons)
    arr=np.full((nsteps,len(y),len(x)),np.nan,dtype='float32')

    # fill every listed longitude column in one assignment
    # (data is (cells, time); the target slice is (time, cells))
    arr[:,0,np.asarray(ixs,dtype=np.intp)]=data.T
    return arr

def build_empty_lat(y,x,t):
    """Build an all-NaN latitude band for a latitude that has no data rows.

    Parameters
    ----------
    y : sequence
        Latitude value(s) of the band; only its length is used.
    x : sequence
        Longitude values; only its length is used.
    t : sequence
        Time axis; its length sets the size of the first dimension.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(t), len(y), len(x)) filled with NaN.
    """
    # size the time axis from the argument rather than from a notebook global
    return np.full((len(t),len(y),len(x)),np.nan,dtype='float32')

In [10]:
# build the global grid one latitude band at a time
arr_list=[]

for iy in range(nlats):
    if iy%500==0: print('processing iy = ',iy,'of',nlats)
    # find which data rows apply to this latitude (ilat is matched against iy+1)
    indices=np.where(ilat==iy+1)[0]
    # test how many rows matched, not their truthiness: np.any(indices) is
    # False when the only matching row is row 0, which would drop that row
    if indices.size>0:
        result=build_full_lat((ilon[indices]-1),data2D[indices,:],lats[iy:iy+1],lons,time)
    else:
        result=build_empty_lat(lats[iy:iy+1],lons,time)
    arr_list.append(result)

processing iy =  0 of 1800
processing iy =  500 of 1800
processing iy =  1000 of 1800
processing iy =  1500 of 1800


In [11]:
# concat numpy arrays together along the lat axis (axis 1)
bignp = np.concatenate(arr_list, axis=1)

# convert to xarray for easier manipulation
grid_coords = {'time': ('time', time),
               'lat': ('lat', lats),
               'lon': ('lon', lons)}
bigarr = xr.DataArray(bignp, dims=['time', 'lat', 'lon'], coords=grid_coords)
bigarr = bigarr.astype('float32')

In [12]:
# slice out antarctica: keep latitudes from 90 down to -60
bigarr = bigarr.sel({'lat': slice(90, -60)})

In [13]:
# see how many grids have data present or missing value
is_fill = (bigarr == fillval)            # cells holding the missing value
not_finite = ~np.isfinite(bigarr)        # cells that were never filled (nan)

nomissing = np.where(is_fill | not_finite, 0, 1)  # where data is present
ngrids = nomissing.sum()/nt

missing = np.where(is_fill, 1, 0)  # where data is the missing value
nmissing = missing.sum()/nt

print('number of grids with data:', ngrids)
print('number of grid set to missing:', nmissing)
print('these numbers combined equals the number of grids=1 in the ALOS mask:', ngrids+nmissing)
print(bigarr.min().data, bigarr.max().data)

number of grids with data: 2268708.0
number of grid set to missing: 26650.0
these numbers combined equals the number of grids=1 in the ALOS mask: 2295358.0
-9999.0 4785.29


In [14]:
bigarr=xr.where(bigarr!=fillval,bigarr,np.nan)  # replace all -9999 with nan
bigarr=xr.where(np.isfinite(bigarr),1,0)  # convert to a 0,1 mask

ds=bigarr[0,:,:].to_dataset(name=vname)  # subset in time (3D-->2D) and convert to xarray dataset
# drop_vars replaces the deprecated Dataset.drop for removing a variable/coordinate
ds=ds.drop_vars('time')
ds

In [15]:
# attach the spatial_ref coordinate saved from the ALOS mask
ds = ds.assign_coords(spatial_ref=spatial_ref)

# variable/coordinate metadata
ds['lon'].attrs = lonattrs
ds['lat'].attrs = latattrs
ds[vname].attrs = varattrs
ds

In [16]:
# test for values other than 0,1
mask_values = np.unique(ds[vname])
print(mask_values)

# test for correct mask: the sum of a 0/1 mask is the number of cells set to 1
ngrids = ds[vname].data.sum()
print(ngrids)

[0 1]
2268708


In [None]:
# ds[vname].plot()

In [None]:
# write to file; the grid count is part of the file name
out_path = temp_dir+'static/'+vname+'_'+str(ngrids)+'_5m.nc'
mask_encoding = {'lat': lat_encoding,
                 'lon': lon_encoding,
                 vname: var_encoding}
ds.to_netcdf(out_path, encoding=mask_encoding)

In [17]:
# keep the monthly mask (2) for masking the elevation later
maskMON = ds['mask']

### Next, we create a mask from where data is present in the precip daily dev pxv file (3) 

In [18]:
vname='mask'
infile=dailyfileformask

# parse dat file into 2D numpy array
# The file is read as alternating lines: an index line (first two tokens are
# parsed as ilat and ilon) followed by a line of data values.
# Use a context manager so the file handle is closed as soon as it is read.
with open(infile) as f:
    temp=f.read().splitlines() # get each line from dat file as a string and remove carriage returns
ilatilon=temp[0::2] # grab the lines with lat/lons (every other line)
data=temp[1::2] # grab the lines with the data (every other line)

# get each string lat/lon as integer and put it in an numpy array
ilat=np.array([int(i.split()[0]) for i in ilatilon]).astype('int16') 
ilon=np.array([int(i.split()[1]) for i in ilatilon]).astype('int16') 

# put data in a numpy array too, takes 30-60s
# (one row per grid, one column per time step)
data2D=np.loadtxt(data,dtype='int16')
nt=data2D.shape[1]

# how many grids have data (excluding the missing value)?  
# (count of non-fill values divided by the number of time steps)
nomissing=np.where(data2D==fillval,0,1)
ngrids=nomissing.sum()/nt


# how many grids are set to the missing value?
missing=np.where(data2D==fillval,1,0)
nmissing=missing.sum()/nt

# print some info
print('in dat file:',ngrids,'grids with data present')
print('in dat file:',nmissing,'grids set to fill value')

in dat file: 2287408.0 grids with data present
in dat file: 7950.0 grids set to fill value


In [19]:
# create a time dimension: one stamp per calendar day of 2020
time = pd.date_range(start='2020-01-01', end='2020-12-31', freq='D')

In [20]:
# functions to build global data grid
def build_full_lat(ixs,data,y,x,t):
    """Build one latitude band of the global grid and fill in the known cells.

    Parameters
    ----------
    ixs : sequence of int
        0-based longitude indices; row ``i`` of ``data`` is written to
        column ``ixs[i]``.
    data : 2D array, shape (len(ixs), len(t))
        One row of values per listed grid cell, one column per time step.
    y : sequence
        Latitude value(s) of the band; only its length is used, and data is
        written to the first row.
    x : sequence
        Longitude values; only its length is used.
    t : sequence
        Time axis; its length sets the size of the first dimension.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(t), len(y), len(x)), NaN everywhere
        except the cells listed in ``ixs``.

    Raises
    ------
    ValueError
        If ``data`` is not 2D with ``len(t)`` columns.
    """
    # size the time axis from the argument rather than from a notebook global
    nsteps=len(t)
    data=np.asarray(data)
    if data.ndim!=2 or data.shape[1]!=nsteps:
        raise ValueError('data must have shape (len(ixs), len(t)); got '
                         +str(data.shape)+' for len(t)='+str(nsteps))

    # create nan array of shape (all time steps, 1 lat, all lons)
    arr=np.full((nsteps,len(y),len(x)),np.nan,dtype='float32')

    # fill every listed longitude column in one assignment
    # (data is (cells, time); the target slice is (time, cells))
    arr[:,0,np.asarray(ixs,dtype=np.intp)]=data.T
    return arr

def build_empty_lat(y,x,t):
    """Build an all-NaN latitude band for a latitude that has no data rows.

    Parameters
    ----------
    y : sequence
        Latitude value(s) of the band; only its length is used.
    x : sequence
        Longitude values; only its length is used.
    t : sequence
        Time axis; its length sets the size of the first dimension.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(t), len(y), len(x)) filled with NaN.
    """
    # size the time axis from the argument rather than from a notebook global
    return np.full((len(t),len(y),len(x)),np.nan,dtype='float32')

In [21]:
# build global data, one latitude band at a time
arr_list=[]
for iy in range(nlats):
#     if iy%500==0: print('processing iy = ',iy,'of',nlats)
    # find which data rows apply to this latitude (ilat is matched against iy+1)
    indices=np.where(ilat==iy+1)[0]
    # test how many rows matched, not their truthiness: np.any(indices) is
    # False when the only matching row is row 0, which would drop that row
    if indices.size>0:
        result=build_full_lat((ilon[indices]-1),data2D[indices,:],lats[iy:iy+1],lons,time)
    else:
        result=build_empty_lat(lats[iy:iy+1],lons,time)
    arr_list.append(result)

In [22]:
# concat numpy arrays together along the lat axis (axis 1)
bignp = np.concatenate(arr_list, axis=1)

# convert to xarray for easier manipulation
grid_coords = {'time': ('time', time),
               'lat': ('lat', lats),
               'lon': ('lon', lons)}
bigarr = xr.DataArray(bignp, dims=['time', 'lat', 'lon'], coords=grid_coords)
bigarr = bigarr.astype('float32')

# slice out antarctica: keep latitudes from 90 down to -60
bigarr = bigarr.sel({'lat': slice(90, -60)})

In [23]:
# see how many grids have data present or missing value
# (use the shared fillval constant instead of repeating the literal -9999.)
nomissing=np.where((bigarr==fillval)|(~np.isfinite(bigarr)),0,1) # where data is present
ngrids=nomissing.sum()/nt

missing=np.where(bigarr==fillval,1,0) # where data is the missing value
nmissing=missing.sum()/nt

print('for mask, number of grids with data:',ngrids)
print('for mask, number of grid set to missing:',nmissing)
print('these numbers combined equals the number of grids=1 in the ALOS mask:',ngrids+nmissing)

for mask, number of grids with data: 2287408.0
for mask, number of grid set to missing: 7950.0
these numbers combined equals the number of grids=1 in the ALOS mask: 2295358.0


In [24]:
# replace the -9999 fill value with nan
not_fill = (bigarr != fillval)
bigarr = bigarr.where(not_fill)

# convert to a 0,1 mask: 1 where a finite value remains, 0 elsewhere
has_data = np.isfinite(bigarr)
bigarr = xr.where(has_data, 1, 0)

In [25]:
# convert to dataset
ds=bigarr[0,:,:].to_dataset(name=vname)  # subset in time (3D-->2D) and convert to xarray dataset
# drop_vars replaces the deprecated Dataset.drop for removing a variable/coordinate
ds=ds.drop_vars('time')
# variable/coordinate metadata
ds=ds.assign_coords({'spatial_ref':spatial_ref})    
ds[vname].attrs=varattrs
ds['lat'].attrs=latattrs
ds['lon'].attrs=lonattrs

In [26]:
# checks
mask_values = np.unique(ds[vname])   # should only contain 0 and 1
ngrids = ds[vname].data.sum()        # test for correct mask application
print('mask contains values', mask_values, 'and has', ngrids, 'grids with data')

mask contains values [0 1] and has 2287408 grids with data


In [None]:
# write to file; the grid count is part of the file name
out_mask = temp_dir+'static/'+vname+'_'+str(ngrids)+'_5m.nc'
print('writing', out_mask)
mask_encoding = {'lat': lat_encoding,
                 'lon': lon_encoding,
                 vname: var_encoding}
ds.to_netcdf(out_mask, encoding=mask_encoding)

In [27]:
# keep the daily mask (3) for masking the elevation later
maskDAY = ds['mask']

#### we now have 3 masks to work with at /work/hpc/datasets/un_fao/gaez_v5_intermediate/static/

1) mask_2295358_5m.nc (from ALOSmask5m_fill.rst)

2) mask_2268708_5m.nc (from Precip_AgERA5_Hist_2020_5m.dat)

3) mask_2287408_5m.nc (from Precip365_AgERA5_Hist_2020_5m.dat)

# Last we write elevation netcdf

In [47]:
vname = 'elev'
infile = ALOSelevfile

# open the raster, collapse length-1 dimensions and discard the band coordinate
ds = xr.open_dataset(infile, engine='rasterio').squeeze()
del ds.coords['band']

# store the spatial coordinates as float32 when they are present
for dim in ('y', 'x'):
    if dim in list(ds.coords):
        ds[dim] = ds[dim].astype('float32')

# rename variable and dimensions
ds = ds.rename({'x': 'lon', 'y': 'lat', 'band_data': vname})

# the elevation values are kept as read: no binary mask is built in this cell

# variable/coordinate metadata
ds[vname].attrs = elevattrs
ds['lat'].attrs = latattrs
ds['lon'].attrs = lonattrs

# global attributes
ds = ds.assign_attrs({'source_data': infile,
                      'source_code': nb_link})

# slice off Antarctica (the slice runs from 90 down to -60)
ds = ds.sel(lat=slice(90, -60))

ds

In [48]:
# range of the elevation values
(ds['elev'].min().data, ds['elev'].max().data)

(array(-415., dtype=float32), array(6498., dtype=float32))

In [49]:
# test for correct mask: count the cells holding a finite elevation
has_elev = np.isfinite(ds['elev'])
elevmask = xr.where(has_elev, 1, 0)
ngrids = elevmask.data.sum()
print(ngrids)

2295682


### elev file to match mask (1)

In [50]:
# elevation kept only where mask (1) equals 1; `ds` itself is left unchanged
dsALOS = ds.assign(elev=ds['elev'].where(maskALOS == 1))
dsALOS

In [51]:
# test for correct mask: count the cells holding a finite elevation
has_elev = np.isfinite(dsALOS['elev'])
elevmask = xr.where(has_elev, 1, 0)
ngrids = elevmask.data.sum()
print(ngrids)

2295358


In [33]:
# write to file
out_mask=temp_dir+'static/'+vname+'_'+str(ngrids)+'_5m.nc'
print('writing',out_mask)        
# write the elevation masked with mask (1); `ds` still holds the unmasked
# elevation and would not match the grid count in the file name
dsALOS.to_netcdf(out_mask,
                 encoding={'lat':lat_encoding,
                           'lon':lon_encoding,
                           vname:elev_encoding})

writing /work/hpc/datasets/un_fao/gaez_v5_intermediate/static/elev_2295358_5m.nc


### elev file to match mask (2)

In [52]:
# elevation kept only where mask (2) equals 1; `ds` itself is left unchanged
dsMON = ds.assign(elev=ds['elev'].where(maskMON == 1))
dsMON

In [53]:
# test for correct mask: count the cells holding a finite elevation
has_elev = np.isfinite(dsMON['elev'])
elevmask = xr.where(has_elev, 1, 0)
ngrids = elevmask.data.sum()
print(ngrids)

2268708


In [54]:
# write to file
out_mask=temp_dir+'static/'+vname+'_'+str(ngrids)+'_5m.nc'
print('writing',out_mask)        
# write the elevation masked with mask (2); `ds` still holds the unmasked
# elevation and would not match the grid count in the file name
dsMON.to_netcdf(out_mask,
                encoding={'lat':lat_encoding,
                          'lon':lon_encoding,
                          vname:elev_encoding})

writing /work/hpc/datasets/un_fao/gaez_v5_intermediate/static/elev_2268708_5m.nc


### elev file to match mask (3)

In [55]:
# elevation kept only where mask (3) equals 1; `ds` itself is left unchanged
dsDAY = ds.assign(elev=ds['elev'].where(maskDAY == 1))
dsDAY

In [56]:
# test for correct mask: count the cells holding a finite elevation
has_elev = np.isfinite(dsDAY['elev'])
elevmask = xr.where(has_elev, 1, 0)
ngrids = elevmask.data.sum()
print(ngrids)

2287408


In [46]:
# write to file
out_mask=temp_dir+'static/'+vname+'_'+str(ngrids)+'_5m.nc'
print('writing',out_mask)        
# write the elevation masked with mask (3); `ds` still holds the unmasked
# elevation and would not match the grid count in the file name
dsDAY.to_netcdf(out_mask,
                encoding={'lat':lat_encoding,
                          'lon':lon_encoding,
                          vname:elev_encoding})

writing /work/hpc/datasets/un_fao/gaez_v5_intermediate/static/elev_2287408_5m.nc
