# Set up

In [1]:
import glob # for system ls
from natsort import natsorted # for alphabetical sorting

import xarray as xr # for reading netcdf
# import numpy as np
import dask.array as da
import dask
import rioxarray as rio # for writing tif

import os

In [2]:
# Shared dataset roots on HPC Orion.
_daily_root = '/work/hpc/datasets/un_fao/pyaez/global_1980/daily/'
_static_root = '/work/hpc/datasets/un_fao/pyaez/static/'

# Input directories: these are the same for everyone on HPC Orion.
data_nc = _daily_root + 'netcdf/'
data_static = _static_root + 'netcdf/'

# Output directories: change these per user, otherwise files already at these
# locations are overwritten. If you change them, make sure the directories
# exist (mkdir) before running the script.
data_npy = _daily_root + 'npy/'
data_tif = _static_root + 'tif/'

# Variable names as they appear in the netcdf file names and variable keys.
varnames = ['Precip', 'Rhum', 'Srad', 'Tmax-2m', 'Tmin-2m', 'Wind-2m']
# Names of the 365-day (leap day removed) outputs: same names with a '365' suffix.
varnames365 = [name + '365' for name in varnames]

# Main Code

# netcdf to npy

In [3]:
# Sizing notes:
#   - each full 3D array is ~12.7GB in total
#   - a chunk has to fit into memory (~9GB per single core)
#   - any chunk size below ~9GB should work; 8 chunks --> ~1.5GB per chunk
chunks = dict(time=-1, lat=450, lon=2160)  # 8 chunks

# Open one variable with these chunks and display the resulting dask array
# so the chunk sizes can be inspected before running the full conversion.
test = (
    xr.open_dataset(data_nc + 'Tmin-2m_daily_1980_5m.nc', chunks=chunks)['Tmin-2m']
    .sel(lat=slice(90, -60.))
    .transpose('lat', 'lon', 'time')
    .data
)
test

Unnamed: 0,Array,Chunk
Bytes,10.60 GiB,1.33 GiB
Shape,"(1800, 4320, 366)","(450, 2160, 366)"
Dask graph,8 chunks in 4 graph layers,8 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 10.60 GiB 1.33 GiB Shape (1800, 4320, 366) (450, 2160, 366) Dask graph 8 chunks in 4 graph layers Data type float32 numpy.ndarray",366  4320  1800,

Unnamed: 0,Array,Chunk
Bytes,10.60 GiB,1.33 GiB
Shape,"(1800, 4320, 366)","(450, 2160, 366)"
Dask graph,8 chunks in 4 graph layers,8 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [5]:
# # this cell outputs all 366 days of data, skip for now

# del test

# # do the nc to npy conversion for each variable
# # we are also subsetting the global array (to eliminate Antarctica, where all grid cells are nan)
# # expect about 1 min run time per variable

# for var in varnames:
#     # get nc file name
#     f = natsorted(glob.glob(data_nc+var+'*_5m.nc'))[0]
        
#     if f:
#         # read netcdf data into a dask array of numpy array chunks
#         print('reading',f)
#         data = xr.open_dataset(f,chunks=chunks)[var].sel(lat=slice(90,-60.)).transpose('lat','lon','time').data        

#         # set up dir for writing npy
#         out_dir=data_npy+var+'/'
#         isExist = os.path.exists(out_dir)
#         if not isExist:
#             os.makedirs(out_dir)
#         # write npy data
#         print('writing to',out_dir+'0.npy')     
#         da.to_npy_stack(out_dir,data,axis=2)          
#     else:
#         print('no file',f)

In [4]:
# to output files without the leap day run this
#
# Converts every daily netcdf variable to a npy stack. For each variable this cell:
#   1. finds the matching 5m netcdf file in data_nc,
#   2. subsets the global array to lat 90..-60 (to eliminate Antarctica, where all grids are nan),
#   3. drops the leap day so that 365 total days are written,
#   4. writes the (lat, lon, time) array as a npy stack under data_npy/<name>365/.
# expect about 1 min run time per variable

# Release the array created by the chunk-size check. Tolerate it being absent so
# this cell can be re-run (or run without the check cell) without a NameError.
try:
    del test
except NameError:
    pass

dropdate='1980-02-29'

with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    for var_in,var_out in zip(varnames,varnames365):
        # get nc file name; test for matches BEFORE indexing, otherwise a missing
        # file raises IndexError and the 'no file' message can never be printed
        pattern = data_nc+var_in+'*_5m.nc'
        matches = natsorted(glob.glob(pattern))
        if not matches:
            print('no file',pattern)
            continue
        f = matches[0]

        # read netcdf data into a dask array of numpy array chunks
        print('reading',f)
        data = (
            xr.open_dataset(f,chunks=chunks)[var_in]
            .sel(lat=slice(90,-60.))
            .drop_sel(time=dropdate)
            .transpose('lat','lon','time')
            .data
        )

        # set up dir for writing npy (exist_ok replaces the separate existence check)
        out_dir=data_npy+var_out+'/'
        os.makedirs(out_dir,exist_ok=True)

        # write npy data, stacked along the time axis
        print('writing to',out_dir+'0.npy')
        da.to_npy_stack(out_dir,data,axis=2)

reading /work/hpc/datasets/un_fao/pyaez/global_1980/daily/netcdf/Precip_daily_1980_5m.nc
writing to /work/hpc/datasets/un_fao/pyaez/global_1980/daily/npy/Precip365/0.npy
reading /work/hpc/datasets/un_fao/pyaez/global_1980/daily/netcdf/Rhum_daily_1980_5m.nc
writing to /work/hpc/datasets/un_fao/pyaez/global_1980/daily/npy/Rhum365/0.npy
reading /work/hpc/datasets/un_fao/pyaez/global_1980/daily/netcdf/Srad_daily_1980_5m.nc
writing to /work/hpc/datasets/un_fao/pyaez/global_1980/daily/npy/Srad365/0.npy
reading /work/hpc/datasets/un_fao/pyaez/global_1980/daily/netcdf/Tmax-2m_daily_1980_5m.nc
writing to /work/hpc/datasets/un_fao/pyaez/global_1980/daily/npy/Tmax-2m365/0.npy
reading /work/hpc/datasets/un_fao/pyaez/global_1980/daily/netcdf/Tmin-2m_daily_1980_5m.nc
writing to /work/hpc/datasets/un_fao/pyaez/global_1980/daily/npy/Tmin-2m365/0.npy
reading /work/hpc/datasets/un_fao/pyaez/global_1980/daily/netcdf/Wind-2m_daily_1980_5m.nc
writing to /work/hpc/datasets/un_fao/pyaez/global_1980/daily/npy

# netcdf mask and elevation to tif

### mask

In [4]:
# get file path and file name of the mask netcdf
# Matches are sorted so the selected file is deterministic, and an empty match
# fails with a message naming the pattern instead of a bare IndexError.
pattern = data_static+'mask_*_5m.nc'
matches = natsorted(glob.glob(pattern))
if not matches:
    raise FileNotFoundError('no mask file matching '+pattern)
f = matches[0]

# file name without the directory part
filename=os.path.basename(f)
print(f)
print(filename)

/work/hpc/datasets/un_fao/pyaez/static/netcdf/mask_2268708_5m.nc
mask_2268708_5m.nc


In [12]:
print('reading',f)

# load mask from netcdf file, remove antarctica (keep y from 90 down to -60)
data = rio.open_rasterio(f)['mask'].sel(y=slice(90,-60.))

# create the output directory if it is missing, so no manual mkdir is needed
os.makedirs(data_tif,exist_ok=True)

# write file: same base name as the input, with the extension replaced by .tif
outfile=data_tif+os.path.splitext(filename)[0]+'.tif'
print('writing',outfile)
data.rio.to_raster(outfile)

reading /work/hpc/datasets/un_fao/pyaez/static/netcdf/mask_2268708_5m.nc


### elevation

In [13]:
# get file path and file name of the elevation netcdf
# Matches are sorted so the selected file is deterministic, and an empty match
# fails with a message naming the pattern instead of a bare IndexError.
pattern = data_static+'Elevation_*_5m.nc'
matches = natsorted(glob.glob(pattern))
if not matches:
    raise FileNotFoundError('no elevation file matching '+pattern)
f = matches[0]

# file name without the directory part
filename=os.path.basename(f)
print(f)
print(filename)

/work/hpc/datasets/un_fao/pyaez/static/netcdf/Elevation_2268708_5m.nc
Elevation_2268708_5m.nc


In [14]:
print('reading',f)

# load elevation from netcdf file, remove antarctica (keep y from 90 down to -60)
data = rio.open_rasterio(f)['Elevation'].sel(y=slice(90,-60.))

# create the output directory if it is missing, so no manual mkdir is needed
os.makedirs(data_tif,exist_ok=True)

# write file: same base name as the input, with the extension replaced by .tif
outfile=data_tif+os.path.splitext(filename)[0]+'.tif'
print('writing',outfile)
data.rio.to_raster(outfile)

reading /work/hpc/datasets/un_fao/pyaez/static/netcdf/Elevation_2268708_5m.nc
writing /work/hpc/datasets/un_fao/pyaez/static/tif/Elevation_2268708_5m.tif
