# Set up

In [2]:
import glob # for system ls
from natsort import natsorted # for alphabetical sorting

import xarray as xr # for reading netcdf
# import numpy as np
import dask.array as da
import dask
import rioxarray as rio # for writing tif

import os

In [3]:
# Input locations: shared paths that are identical for every user on HPC Orion.
data_nc = "/work/hpc/datasets/un_fao/pyaez/china_8110/daily/netcdf/"
data_static = "/work/hpc/datasets/un_fao/pyaez/china_static/netcdf/"

# Output locations: point these somewhere else per user, otherwise existing
# files at these paths are overwritten. If you change them, create the
# directories first (mkdir) before running the notebook.
data_npy = "/work/hpc/datasets/un_fao/pyaez/china_8110/daily/npy/"
data_tif = "/work/hpc/datasets/un_fao/pyaez/china_static/tif/"

# Variable names used for the input netCDF files (varnames) and the matching
# output directory names (varnames365). The two lists are paired by position,
# so keep them in the same order and of the same length.
varnames = [
    "prcp",
    "relh",
    "srad",
    "tmax",
    "tmin",
    "wspd",
]
varnames365 = [
    "Precip365",
    "Rhum365",
    "Srad365",
    "Tmax-2m365",
    "Tmin-2m365",
    "Wind-2m365",
]

# Tag that appears at the end of the input netCDF file names.
timetag = "8110"

# Main Code

# netcdf to npy

In [4]:
# Chunking used when opening the daily netCDF files.
# chunks=-1 loads each variable as a single chunk. The array inspected below is
# ~442 MiB (365 x 428 x 741, float32) in one chunk, so no splitting is needed.
# For a larger grid that does not fit into memory (earlier note: ~9GB per
# single core), switch to an explicit chunk dict instead, for example:
# chunks = {'time': -1, 'lat': 450, 'lon': 2160}
chunks = -1

# look at the size of the chunks for one representative variable;
# the file name is built from `timetag` so it follows the same naming as the
# conversion loop
test = (
    xr.open_dataset(data_nc + 'tmin_daily_' + timetag + '.nc', chunks=chunks)['tmin']
    .sel(lat=slice(90, -60.))
    .data
)
test

Unnamed: 0,Array,Chunk
Bytes,441.59 MiB,441.59 MiB
Shape,"(365, 428, 741)","(365, 428, 741)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 441.59 MiB 441.59 MiB Shape (365, 428, 741) (365, 428, 741) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",741  428  365,

Unnamed: 0,Array,Chunk
Bytes,441.59 MiB,441.59 MiB
Shape,"(365, 428, 741)","(365, 428, 741)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [5]:
# # this cell outputs all 366 days of data, skip for now

# del test

# # do the nc to npy conversion for each variable
# # we are also subsetting the global array (to eliminate Antarctica where all grids are nan)
# # expect about 1 min run time per variable

# for var in varnames:
#     # get nc file name
#     f = natsorted(glob.glob(data_nc+var+'*_5m.nc'))[0]
        
#     if f:
#         # read netcdf data into a dask array of numpy array chunks
#         print('reading',f)
#         data = xr.open_dataset(f,chunks=chunks)[var].sel(lat=slice(90,-60.)).transpose('lat','lon','time').data        

#         # set up dir for writing npy
#         out_dir=data_npy+var+'/'
#         isExist = os.path.exists(out_dir)
#         if not isExist:
#             os.makedirs(out_dir)
#         # write npy data
#         print('writing to',out_dir+'0.npy')     
#         da.to_npy_stack(out_dir,data,axis=2)          
#     else:
#         print('no file',f)

In [13]:
# Convert each daily netCDF variable to a stack of .npy files (365-day output).
#
# For every (input name, output name) pair this cell
#   1. finds the netCDF file matching <data_nc><var_in>*<timetag>.nc,
#   2. keeps latitudes between 90 and -60 (eliminates Antarctica, where all
#      grids are nan),
#   3. reorders the dimensions to (lat, lon, doy),
#   4. writes the result with dask's to_npy_stack into <data_npy><var_out>/.
#
# NOTE(review): no leap day is dropped in this cell; the inputs are presumably
# already 365 days long along 'doy' - confirm against the source files.
# expect about 1 min run time per variable

# `test` only exists when the chunk-inspection cell was run; release it if it
# is there so that re-running this cell does not fail with a NameError.
try:
    del test
except NameError:
    pass

with dask.config.set({'array.slicing.split_large_chunks': False}):
    for var_in, var_out in zip(varnames, varnames365):
        # get nc file name; check for "no match" before indexing, otherwise
        # [0] raises IndexError and the 'no file' message is never printed
        pattern = data_nc + var_in + '*' + timetag + '.nc'
        matches = natsorted(glob.glob(pattern))
        if not matches:
            print('no file', pattern)
            continue
        f = matches[0]

        # read netcdf data into a dask array of numpy array chunks; the
        # dataset stays open until the stack has been written, then is closed
        print('reading', f)
        with xr.open_dataset(f, chunks=chunks) as ds:
            data = ds[var_in].sel(lat=slice(90, -60.)).transpose('lat', 'lon', 'doy').data

            # set up dir for writing npy
            out_dir = data_npy + var_out + '/'
            os.makedirs(out_dir, exist_ok=True)

            # write npy data, stacked along the last (doy) axis
            print('writing to', out_dir + '0.npy')
            da.to_npy_stack(out_dir, data, axis=2)

reading /work/hpc/datasets/un_fao/pyaez/china_8110/daily/netcdf/prcp_daily_8110.nc
writing to /work/hpc/datasets/un_fao/pyaez/china_8110/daily/npy/Precip365/0.npy
reading /work/hpc/datasets/un_fao/pyaez/china_8110/daily/netcdf/relh_daily_8110.nc
writing to /work/hpc/datasets/un_fao/pyaez/china_8110/daily/npy/Rhum365/0.npy
reading /work/hpc/datasets/un_fao/pyaez/china_8110/daily/netcdf/srad_daily_8110.nc
writing to /work/hpc/datasets/un_fao/pyaez/china_8110/daily/npy/Srad365/0.npy
reading /work/hpc/datasets/un_fao/pyaez/china_8110/daily/netcdf/tmax_daily_8110.nc
writing to /work/hpc/datasets/un_fao/pyaez/china_8110/daily/npy/Tmax-2m365/0.npy
reading /work/hpc/datasets/un_fao/pyaez/china_8110/daily/netcdf/tmin_daily_8110.nc
writing to /work/hpc/datasets/un_fao/pyaez/china_8110/daily/npy/Tmin-2m365/0.npy
reading /work/hpc/datasets/un_fao/pyaez/china_8110/daily/netcdf/wspd_daily_8110.nc
writing to /work/hpc/datasets/un_fao/pyaez/china_8110/daily/npy/Wind-2m365/0.npy


# netcdf mask and elevation to tif

### mask

In [5]:
# get file path and file name of the original mask netCDF
# (first match of mask_old*.nc in data_static, in natural sort order)
mask_pattern = data_static + 'mask_old*.nc'
mask_matches = natsorted(glob.glob(mask_pattern))
if not mask_matches:
    # report the pattern that failed instead of a bare IndexError from [0]
    raise FileNotFoundError('no file matching ' + mask_pattern)
f = mask_matches[0]
filename = os.path.basename(f)
print(f)
print(filename)

/work/hpc/datasets/un_fao/pyaez/china_static/netcdf/mask_old.nc
mask_old.nc


In [23]:
# Open a reference mask dataset from a hard-coded path in the static directory
# and display it.
# NOTE(review): presumably opened only to obtain its `spatial_ref`, which the
# following cell stores in `spatialref` - confirm. `data` is reassigned by the
# mask conversion cell further down.
data=xr.open_dataset('/work/hpc/datasets/un_fao/pyaez/static/netcdf/mask_2268708_5m.nc')
data

In [26]:
# Keep the `spatial_ref` of the currently opened dataset in `spatialref`;
# the mask and elevation cells below attach it to their arrays.
# (alternative lookup that was tried: data.coords['spatial_ref'].attrs['spatial_ref'])
spatialref=data.spatial_ref
spatialref

In [35]:
# Rewrite the mask as an int32 netCDF (mask.nc) and as a GeoTIFF (mask.tif).
# Depends on `f` (path found by the mask lookup cell) and on `spatialref`
# (taken from the reference dataset in an earlier cell).
print('reading', f)

# coordinate encodings: do not write a _FillValue for lat/lon
lat_encoding = {'_FillValue': None}
lon_encoding = {'_FillValue': None}
# NOTE: var_encoding is not used by this cell (the mask is written as int32);
# it is kept unchanged in case other cells rely on it.
var_encoding = {'zlib': True, 'dtype': 'float32'}
mask_encoding = {'zlib': True, 'dtype': 'int32'}

# load the mask from the netcdf file as int32; everything is pulled into
# memory inside the `with` so the input file is closed afterwards
with xr.open_dataset(f) as ds:
    data = ds['mask'].astype('int32').load()

# Copy this file's GeoTransform onto the shared reference spatial_ref, then
# replace the file's own spatial_ref coordinate with the reference one.
# NOTE: this changes `spatialref` in place; the elevation cell re-uses it.
gt = data.coords['spatial_ref'].attrs['GeoTransform']
spatialref.attrs['GeoTransform'] = gt
del data.coords['spatial_ref']
data.coords['spatial_ref'] = spatialref

# write the cleaned-up mask back to netcdf next to the original
data.to_netcdf(
    data_static + 'mask.nc',
    encoding={
        'lat': lat_encoding,
        'lon': lon_encoding,
        'mask': mask_encoding,
    },
)

# write file
outfile = data_tif + 'mask.tif'
print('writing', outfile)
data.rio.to_raster(outfile)



reading /work/hpc/datasets/un_fao/pyaez/china_static/netcdf/mask_old.nc
writing /work/hpc/datasets/un_fao/pyaez/china_static/tif/mask.tif


In [34]:
# Display the current `data` object for inspection.
# NOTE(review): the recorded output of this cell is an IndexError and its
# execution count precedes the mask conversion cell - confirm it is still needed.
data

IndexError: too many indices

### elevation

In [64]:
# get file path and file name of the original elevation netCDF
# (first match of elev_old*.nc in data_static, in natural sort order)
elev_pattern = data_static + 'elev_old*.nc'
elev_matches = natsorted(glob.glob(elev_pattern))
if not elev_matches:
    # report the pattern that failed instead of a bare IndexError from [0]
    raise FileNotFoundError('no file matching ' + elev_pattern)
f = elev_matches[0]
filename = os.path.basename(f)
print(f)
print(filename)

/work/hpc/datasets/un_fao/pyaez/china_static/netcdf/elev_old.nc
elev_old.nc


In [68]:
# Write the elevation grid as a GeoTIFF (elev.tif).
# Depends on `f` (path found by the elevation lookup cell) and on `spatialref`.
# NOTE(review): `spatialref` is modified in place by the mask conversion cell
# (it stores a GeoTransform on it), so this cell presumably expects the mask
# cell to have run first - confirm that this georeferencing is intended here.
print('reading', f)

# load elevation from the netcdf file as int32; everything is pulled into
# memory inside the `with` so the input file is closed afterwards
with xr.open_dataset(f) as ds:
    data = ds['elev'].astype('int32').load()

# attach the shared reference spatial_ref coordinate
data.coords['spatial_ref'] = spatialref

# write file
outfile = data_tif + 'elev.tif'
print('writing', outfile)
data.rio.to_raster(outfile)

reading /work/hpc/datasets/un_fao/pyaez/china_static/netcdf/elev_old.nc
writing /work/hpc/datasets/un_fao/pyaez/china_static/tif/elev.tif
