# Set up

In [1]:
import glob # for system ls
from natsort import natsorted # for alphabetical sorting

import xarray as xr # for reading netcdf
# import numpy as np
import dask.array as da
import rioxarray as rio # for writing tif

import os

In [2]:
# Dataset locations on HPC Orion; these paths are identical for every user of the cluster.

data_static = '/work/hpc/datasets/un_fao/pyaez/static/netcdf/'
data_tif = '/work/hpc/datasets/un_fao/pyaez/global_1980/daily/tif/'
data_npy = '/work/hpc/datasets/un_fao/pyaez/global_1980/daily/npy/'
data_nc = '/work/hpc/datasets/un_fao/pyaez/global_1980/daily/netcdf/'

# Variable names used as file-name prefixes when locating the netcdf inputs.
varnames = [
    'Precip',
    'Rhum',
    'Srad',
    'Tmax-2m',
    'Tmin-2m',
    'Wind-2m',
    'mask',
    'Elevation',
]

# Main Code

# netcdf to npy

In [8]:
# A full 3D (lat, lon, time) array is ~12.7GB, which is more than a single core's
# memory (~9GB), so the data is read lazily in chunks.
# Anything below ~9GB per chunk should work; splitting lat and lon as below gives
# 8 chunks of roughly 1.5GB each, with the whole time axis kept in every chunk (-1).
chunks = {
    'time': -1,
    'lat': 540,
    'lon': 2160,
}  # 8 chunks

# Preview the resulting dask array (shape, chunk shape, chunk size) for one variable.
test = (
    xr.open_dataset(data_nc + 'Tmin-2m_daily_1980_5m.nc', chunks=chunks)['Tmin-2m']
    .transpose('lat', 'lon', 'time')
    .data
)
test

Unnamed: 0,Array,Chunk
Bytes,12.72 GiB,1.59 GiB
Shape,"(2160, 4320, 366)","(540, 2160, 366)"
Dask graph,8 chunks in 3 graph layers,8 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 12.72 GiB 1.59 GiB Shape (2160, 4320, 366) (540, 2160, 366) Dask graph 8 chunks in 3 graph layers Data type float32 numpy.ndarray",366  4320  2160,

Unnamed: 0,Array,Chunk
Bytes,12.72 GiB,1.59 GiB
Shape,"(2160, 4320, 366)","(540, 2160, 366)"
Dask graph,8 chunks in 3 graph layers,8 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [9]:
# Drop the preview array created in the previous cell.
# Guarded so that re-running this cell (when `test` is already gone) does not raise NameError.
try:
    del test
except NameError:
    pass

# do the nc to npy conversion for each variable
# NOTE: the [0:1] slice limits this run to the first entry of varnames;
# widen the slice to convert the remaining variables.
for var in varnames[0:1]:
    # the 2D static fields live in a different directory than the daily 3D fields
    is_static = var in ['mask','Elevation']
    src_dir = data_static if is_static else data_nc

    # get nc file name; check for an empty match list *before* indexing,
    # otherwise a missing file raises IndexError instead of being reported
    pattern = src_dir+var+'*_5m.nc'
    matches = natsorted(glob.glob(pattern))
    if not matches:
        print('no file',pattern)
        continue
    f = matches[0]

    # read netcdf data into a dask array of numpy array chunks
    print('reading',f)
    if is_static:
        # for the 2D arrays mask and elevation we just use a single chunk (chunks=-1)
        data = xr.open_dataset(f,chunks=-1)[var].data
        stack_axis = 0
    else:
        # 3D arrays are chunked and reordered to (lat, lon, time)
        data = xr.open_dataset(f,chunks=chunks)[var].transpose('lat','lon','time').data
        stack_axis = 2

    # set up dir for writing npy (exist_ok avoids the separate existence check)
    out_dir=data_npy+var+'/'
    os.makedirs(out_dir,exist_ok=True)

    # write npy data
    # NOTE(review): to_npy_stack writes one file per block along `axis`; the message
    # below assumes a single block along that axis (hence '0.npy') - confirm if chunks change
    print('writing to',out_dir+'0.npy')
    da.to_npy_stack(out_dir,data,axis=stack_axis)

reading /work/hpc/datasets/un_fao/pyaez/global_1980/daily/netcdf/Precip_daily_1980_5m.nc
writing to /work/hpc/datasets/un_fao/pyaez/global_1980/daily/npy/Precip/0.npy


# netcdf mask and elevation to tif

### mask

In [None]:
# locate the mask netcdf file: keep the first match as the full path,
# and the part after the last '/' as the bare file name
mask_matches = glob.glob(data_static+'mask_*_5m.nc')
f = mask_matches[0]
filename = f.rsplit('/', 1)[-1]
print(f, filename, sep='\n')

In [None]:
print('reading',f)

# pull the 'mask' variable out of the netcdf file
data = xr.open_dataset(f)['mask']

# keep only cells equal to 1; every other cell (NaN included) becomes NaN ...
data = data.where(data==1)
# ... which is then replaced by the -999 fill value ...
data = data.fillna(-999)
# ... and the result is stored as 16-bit integers
data = data.astype('int16')

# build the output path: same file name with the trailing 3 characters ('.nc') swapped for '.tif'
tif_dir = '/work/hpc/datasets/un_fao/pyaez/static/tif/'
stem = filename[:-3]
outfile = tif_dir + stem + '.tif'

print('writing',outfile)
data.rio.to_raster(outfile)

### elevation

In [11]:
# locate the elevation netcdf file: keep the first match as the full path,
# and the part after the last '/' as the bare file name
elevation_matches = glob.glob(data_static+'Elevation_*_5m.nc')
f = elevation_matches[0]
filename = f.rsplit('/', 1)[-1]
print(f, filename, sep='\n')

/work/hpc/datasets/un_fao/pyaez/static/netcdf/Elevation_2268708_5m.nc
Elevation_2268708_5m.nc


In [12]:
print('reading',f)

# load elevation from netcdf file
data = xr.open_dataset(f)['Elevation']

# replace nan with -999 and save as integer data
# (deliberately no .where(data==1) here: that filter keeps only cells whose value
#  is exactly 1, so on elevation it would turn nearly the whole raster into -999)
# note: astype('int16') drops any fractional part of the elevation values
data = data.fillna(-999).astype('int16')

# write file: same file name with the trailing '.nc' swapped for '.tif'
outfile='/work/hpc/datasets/un_fao/pyaez/static/tif/'+filename[:-3]+'.tif'
print('writing',outfile)
data.rio.to_raster(outfile)

reading /work/hpc/datasets/un_fao/pyaez/static/netcdf/Elevation_2268708_5m.nc
writing /work/hpc/datasets/un_fao/pyaez/static/tif/Elevation_2268708_5m.tif
