Note: you cannot process srad or wind on its own; each must be processed together with precip. This is because the srad and wind values written to the pxv files span the fill value (-9999), so we need a mask built from a different variable to correctly identify which srad/wind grid cells should be scaled and which are masked. I've chosen precip to create that additional mask.

In [1]:
import numpy as np
from calendar import isleap
import xarray as xr
import dask.array as da
import dask
import rioxarray as rio
import pandas as pd
import os
import matplotlib.pyplot as plt
from time import time as timer

import logging
logging.captureWarnings(True)

In [2]:
###############################################
# facts about the pxv inputs that must be known up front (from Gunther)
###############################################

# 1) shape (ny,nx) of the global grid including Antarctica; this is the shape of the ALOS mask
ny_global = 2160
nx_global = 4320

# 2) shape (ny,nx) of the global grid excluding Antarctica; ny corresponds to IRmax in Gunther's fortran program
ny = 1800
nx = nx_global

# 3) on-disk data types of the pxv files
dtype_d = np.int16    # daily data: 2 byte integers
dtype_m = np.float32  # monthly data: 4 byte floats

# 4) fill values used in the pxv files
fillval_d = -9999    # daily
fillval_m = -9999.0  # monthly

# 5) number of data points in a pxv file, fill-value points included (number of lines in file)
npts = 2295358

# 6) number of data points in a pxv file holding valid data
# needed to tell masked points from real ones when the data range spans the fillval,
# which is a problem for srad
npts_valid_d = 2287408  # daily files
npts_valid_m = 2268708  # monthly files

# 7) scale factors that bring the pxv values into the units of the table below,
# in alphabetical order by variable name
scale_factors = [0.0001, 1000., 0.01, 0.01, 0.01, 0.001]
# two-variable subset: scale_factors=[0.0001,0.01]

# Variable	Monthly data	Daily deviations/distr.	Scale factor
# Precip	     mm/day	         %_of_month×100	       0.0001
# Srad	       J/m2/day	            kJ/m2/day	        1000.
# Tmax	         °C	                 °C×100	            0.01
# Tmin	         °C	                 °C×100	            0.01
# Vapr	         hPa	                Pa	            0.01
# Wind	        m/sec	              mm/sec	        0.001

# 8) number of decimal places the daily output variables should have
output_trunc = [4, 2, 3, 3, 2, 3]
# two-variable subset: output_trunc=[4,2]
# output_trunc=[4,2]

In [3]:
# remaining run configuration

# pxv inputs: pieces used to assemble the pxv file paths
pxv_basedir = '/work/hpc/datasets/un_fao/gaez_v5/clim/AgERA5/Hist/'
dataset = 'AgERA5'
experiment = 'Hist'
pxvsuf = '_5m.pxv'
connector = '_'
dailytag = '365'
sep = '/'
# directory name and file-name prefix of each variable, index-aligned
pxvdirnames = ['prec', 'srad', 'tmax', 'tmin', 'vapr', 'wind']
varnames = ['Precip', 'Srad', 'Tmax-2m', 'Tmin-2m', 'Vapr', 'Wind-10m']
# two-variable subset:
# pxvdirnames=['prec','vapr']
# varnames=['Precip','Vapr']

# raster inputs: mask file and the names of its spatial dimensions
maskfile = '/work/hpc/datasets/un_fao/gaez_v5/land/ALOSmask5m_fill.rst'
ydimname = 'y'
xdimname = 'x'

# outputs: root directory of the npy stacks
out_basedir = '/work/hpc/datasets/un_fao/pyaez/inputs/global/daily365_npy/'

# years to process
# years=[str(year) for year in np.arange(1980,1990)]
# years=np.arange(1980,1990)
years = np.arange(2020, 2022)
nmonths = 12

# chunking for xarray objects (by dim name) and for dask arrays (by position)
# chunks={'time':-1,'y':-1,'x':216}
xrchunks = {'time': -1, 'y': -1, 'x': 54}
dachunks = (-1, 54, -1)
# yyyy=2020
# varind=0

# First, get the mask

In [4]:
### load the mask from the rst file as a 2D (y,x) array, build its stacked 1D
### counterpart, and count the unmasked cells for comparison with the pxv data

# open the maskfile and keep only the first ny rows (no Antarctica) so the mask has shape (y:1800,x:4320)
ds = xr.open_dataset(maskfile, engine='rasterio')
ds = ds.isel(y=slice(0, ny)).squeeze()
del ds.coords['band']

# store the spatial coordinates as float32
ds[xdimname] = ds[xdimname].astype(np.float32)
ds[ydimname] = ds[ydimname].astype(np.float32)

# 0/1 mask: 1 wherever the raster value is positive
mask2D = xr.where(ds.band_data > 0, 1, 0).astype('int8')

# collapse (y,x) into a single 'space' dimension: 1800*4320 = 7776000 points
mask1D = mask2D.stack(space=[ydimname, xdimname])
# boolean selector of the points that are not masked out
inds_data = (mask1D == 1)

# number of unmasked points; the processing loop asserts this against the pxv point count
npts_mask = int(mask2D.sum().data)
print('total data points in mask file', npts_mask,'expecting',npts)

total data points in mask file 2295358 expecting 2295358


In [5]:
# function to call with dask delayed
def data_to_nd_array(i,inds,arr1D,pxv,arr2D):
    """Scatter one time slice of pxv values onto the 2D grid.

    Called through dask.delayed, once per time slice.

    Parameters
    ----------
    i : index of the time slice; not used inside the function.
    inds : selector of the positions in ``arr1D`` that receive data.
    arr1D : stacked 1D placeholder; modified in place.
    pxv : values of one time slice, with a singleton time dimension.
    arr2D : 2D placeholder; modified in place.

    Returns
    -------
    A copy of ``arr2D`` after it has been filled.
    """
    slice_values = pxv.squeeze()   # drop the singleton time dimension
    arr1D[inds] = slice_values     # fill only the selected positions
    gridded = arr1D.unstack()      # 1D space -> 2D grid
    arr2D[:, :] = gridded
    return arr2D.copy()

# Main processing loop: for every year and every variable, read the monthly
# and daily pxv files, combine them into daily values and write npy stacks.
for yyyy in years:
    start_time=timer()  # wall-clock start for this year
    print('########################################################################')
    print('################################',yyyy,'################################')
    print('########################################################################')

    for varind, var in enumerate(varnames):
        print('*****************************************')
        print('*************** Processing',var,'***************')    
        print('*****************************************')
        ######################################################################################################
        #  STEP 1: Translate monthly means from PXV to xarray data structures    
        ######################################################################################################
        print('################################ STEP1: PROCESSING MONTHLY PXV ################################')
        pxvfile_m=pxv_basedir+pxvdirnames[varind]+sep+var+connector+dataset+connector+experiment+connector+str(yyyy)+pxvsuf
        filename=pxvfile_m.split(sep)[-1]

        # read file to 1D array
        # monthly files have more data in them than we need so we subset the read with count=
        with open(pxvfile_m,'rb') as f:
            array1D_m=np.fromfile(f,dtype=dtype_m,count=nmonths*npts)

        # limit precision here?

        # work out how many grid points were read, using integer arithmetic
        nvals=array1D_m.shape[0]                 # total number of data values
        npts_int,leftover=divmod(nvals,nmonths)  # grid points in the file, and values that don't fill a whole point

        # a non-zero remainder means the file was read incorrectly
        assert leftover==0, f"reading pxv file {filename} with incorrect number of months: {nmonths}"
        # check that number of grids found in the file is the number expected, if not, coordinate with Gunther
        assert npts_int==npts, f"pxv file {filename} has {npts_int} total data points, expecting {npts} total data points"
        print('total data points in pxv file',npts_int,'expecting',npts)

        # check that number of grids found in pxv file is the same as the number of grids=1 in the mask file
        assert npts_int==npts_mask, f"npts in pxv is {npts_int}, npts in mask is {npts_mask}"

        array2D_m=array1D_m.reshape(npts,nmonths) # reshape to (npoints,nmonths)

        # find out if data value range spans across the fillvalue
        # (only reported for the monthly data)
        flag_m=bool(array2D_m.min() < fillval_m)

        # check data values
        print('data min/max values',array2D_m.min(),array2D_m.max())
        print('apply extra mask?',flag_m)

        # set up for putting data on full grid
        empty1D_m=mask1D.copy().astype(dtype_m)  # placeholder array for 1D space 
        empty1D_m.rio.write_nodata(fillval_m,inplace=True)
        empty1D_m[:]=fillval_m

        empty2D_m=mask2D.copy().astype(dtype_m)  # placeholder array for 2D grid 
        empty2D_m.rio.write_nodata(fillval_m,inplace=True)
        empty2D_m[:,:]=fillval_m  

        # dask parallel computing 
        # first convert the pxv data to a chunked dask array, 1 month per chunk 
        # and save to list of delayed dask objects
        pxv_delay=da.from_array(array2D_m,chunks=(-1,1)).to_delayed().ravel() 

        # build a list of computational tasks to be executed later
        # each task gets its own copies of the placeholders because the function fills them in place
        task_list=[dask.delayed(data_to_nd_array)(imonth,inds_data,empty1D_m.copy(),pxvdata,empty2D_m.copy()) for imonth,pxvdata in enumerate(pxv_delay)] 
        assert len(task_list)==nmonths, f'{len(task_list)} tasks in list, should be {nmonths}' # double check we've got 1 task per month of data

        # execute all computations
        print('putting 1D data on a 2D grid...')
        result_chunks_m=dask.compute(*task_list)

        # concatenate the resulting monthly chunks along a new time dimension
        print('concatenating...')
        data3D_m=xr.concat(result_chunks_m,dim='time')

        # replace fillval with nan
        print('adding nans...')
        data3D_m=xr.where(data3D_m==fillval_m,np.nan,data3D_m)

        # check we have the correct number of non-missing data points (average count per month)
        data_mask_m=xr.where(np.isnan(data3D_m.data),0,1)  
        ngrids_data_m=int(data_mask_m.sum()/data_mask_m.shape[0])
        print('total number of non-missing data points',ngrids_data_m,'expecting',npts_valid_m)
        assert ngrids_data_m==npts_valid_m, f'data mask creation issue. found {ngrids_data_m} valid data points (non missing), expecting {npts_valid_m}'

        # visual check January
        # figure=plt.figure(figsize=(6,4))
        # data3D_m.isel(time=0).plot()
        # plt.title(varnames[varind]+' data from monthly mean pxv, Jan '+str(yyyy))
        # plt.tight_layout()
        # plt.show()

        # free the intermediates, including the monthly mask which is only needed for the check above
        del array1D_m,pxvfile_m,filename,nvals,npts_int,leftover,array2D_m,flag_m,empty1D_m,empty2D_m,pxv_delay,task_list,result_chunks_m,data_mask_m
        ######################################################################################################
        # END STEP 1
        #####################################################################################################

        ######################################################################################################
        # STEP 2: Translate daily deviations from PXV to scaled xarray data structures
        ######################################################################################################
        ### get the data from the pxv into an array of 2 dims (space,time)
        print('################################ STEP 2: PROCESSING DAILY DEV PXV ################################')
        pxvfile=pxv_basedir+pxvdirnames[varind]+sep+varnames[varind]+dailytag+connector+dataset+connector+experiment+connector+str(yyyy)+pxvsuf
        filename=pxvfile.split(sep)[-1]

        # read entire file into 1D array
        with open(pxvfile,'rb') as f:
            array1D_d=np.fromfile(f,dtype=dtype_d)            

        # work out how many grid points were read, using integer arithmetic
        nvals=array1D_d.shape[0]               # total number of data values
        ndays=366 if isleap(yyyy) else 365     # number of days of data at each grid point
        npts_int,leftover=divmod(nvals,ndays)  # grid points in the file, and values that don't fill a whole point

        # a non-zero remainder means the file was read with the wrong number of days
        assert leftover==0, f"reading pxv file {filename} with incorrect number of days: {ndays}"
        # check that number of grids found in the file is the number expected, if not, coordinate with Gunther
        assert npts_int==npts, f"pxv file {filename} has {npts_int} total data points, expecting {npts} total data points"
        print('total data points in pxv file',npts_int,'expecting',npts)

        # check that number of grids found in pxv file is the same as the number of grids=1 in the mask file
        assert npts_int==npts_mask, f"npts in pxv is {npts_int}, npts in mask is {npts_mask}"

        array2D_d=array1D_d.reshape(npts,ndays) # reshape to (npoints,ndays)

        # find out if data value range spans across the fillvalue
        # if it does, we'll need to apply an extra mask later
        flag_d=bool(array2D_d.min() < fillval_d)

        # check data values
        print('data min/max values before scaling:',array2D_d.min(),array2D_d.max())
        print('apply extra mask?',flag_d)

        # set up for putting data on full grid
        empty1D_d=mask1D.copy().astype(dtype_d)  # placeholder array for 1D space 
        empty1D_d.rio.write_nodata(fillval_d,inplace=True)
        empty1D_d[:]=fillval_d

        empty2D_d=mask2D.copy().astype(dtype_d)  # placeholder array for 2D grid 
        empty2D_d.rio.write_nodata(fillval_d,inplace=True)
        empty2D_d[:,:]=fillval_d

        # dask parallel computing 
        # first convert the pxv data to a chunked dask array, 1 day per chunk 
        # and save to list of delayed dask objects
        pxv_delay=da.from_array(array2D_d,chunks=(-1,1)).to_delayed().ravel() 

        # build a list of computational tasks to be executed later
        # each task gets its own copies of the placeholders because the function fills them in place
        task_list=[dask.delayed(data_to_nd_array)(iday,inds_data,empty1D_d.copy(),pxvdata,empty2D_d.copy()) for iday,pxvdata in enumerate(pxv_delay)] 
        assert len(task_list)==ndays, f'{len(task_list)} tasks in list, should be {ndays}' # double check we've got 1 task per day of data

        # execute all computations
        print('putting 1D data on a 2D grid...')
        result_chunks_d=dask.compute(*task_list)

        # concatenate the resulting daily chunks along a new time dimension
        print('concatenating...')
        data3D_d=xr.concat(result_chunks_d,dim='time')

        # change out the fill value to nans
        # variables whose valid range includes the -9999 fillval need the extra Precip-derived mask instead

        print('changing dtype...')      
        # we need to force float32 if we want to have nans in the output
        data3D_d=data3D_d.astype(np.float32)

        # if valid data range includes fillval (srad, wind), scale then additional masking
        if flag_d:
            print('processing pxv variable with data values that span over the fillval')
            # the extra mask is only created while processing Precip, so Precip must already
            # have been processed; fail with a clear message instead of a bare NameError
            try:
                extra_mask=data_mask_d
            except NameError:
                raise RuntimeError(varnames[varind]+' spans the fill value and needs the mask derived from Precip; process Precip first') from None
            # a mask left over from a year with a different number of days must not be reused
            assert extra_mask.shape==data3D_d.shape, f'Precip-derived mask has shape {extra_mask.shape}, data has shape {data3D_d.shape}; process Precip for {yyyy} first'
            print('scaling...')
            data3D_d=data3D_d*scale_factors[varind]    
            print('applying additional mask...')
            data3D_d=xr.where(extra_mask,data3D_d,np.nan)
            # verify that the masking worked
            valid_arr=np.where(np.isnan(data3D_d.data[14,:,:]),0,1) # pick one day to verify  
            nvalid=valid_arr.sum()
            print('total number of non-missing data points',nvalid,'expecting',npts_valid_d)
            assert nvalid==npts_valid_d, f'data mask application issue. found {nvalid} valid data points (non missing), expecting {npts_valid_d}'    
            del valid_arr,nvalid,extra_mask
        # if valid data range doesn't include fillval (precip,tmin,tmax,vapr), convert fillval to nan then scale
        else:
            print('adding nans...')
            data3D_d=xr.where(data3D_d==fillval_d,np.nan,data3D_d)
            print('scaling...')
            data3D_d=data3D_d*scale_factors[varind]

            if varnames[varind] =='Precip':
                print('computing additional mask...')
                # boolean mask (True = valid data); kept for the variables that span the fillval
                data_mask_d=~np.isnan(data3D_d.data)
                # average number of valid points per day
                ngrids_data_d=int(data_mask_d.sum()/data_mask_d.shape[0])
                print('total number of non-missing data points',ngrids_data_d,'expecting',npts_valid_d)
                assert ngrids_data_d==npts_valid_d, f'data mask creation issue. found {ngrids_data_d} valid data points (non missing), expecting {npts_valid_d}'

        print('data min/max values after scaling:',data3D_d.min().data,data3D_d.max().data)
                
        # figure=plt.figure(figsize=(6,4))
        # data3D_d.isel(time=14).plot()
        # plt.title(varnames[varind]+' scaled data from daily dev pxv, 15 Jan '+str(yyyy))
        # plt.tight_layout()
        # plt.show()
        del array1D_d,pxvfile,filename,nvals,npts_int,leftover,array2D_d,flag_d,empty1D_d,empty2D_d,pxv_delay,task_list,result_chunks_d
        ######################################################################################################
        # END STEP 2
        ######################################################################################################

        ######################################################################################################
        # STEP 3: Create daily data from monthly means and daily deviations
        ######################################################################################################
        print('################################ STEP 3: CREATING DAILY DATA FROM MONTHLY MEAN AND DAILY DEV ################################')
        # years outside 1980-2024 are stamped with the placeholder year 1900
        # NOTE(review): 1900 is not a leap year, so a leap-year input outside 1980-2024 would have
        # 366 daily slices but only 365 time stamps - confirm such years are never processed
        stamp_year=str(yyyy) if 1980<=yyyy<=2024 else '1900'
        time_m=pd.date_range(stamp_year+'-01-01',stamp_year+'-12-31',freq='MS')
        time_d=pd.date_range(stamp_year+'-01-01',stamp_year+'-12-31',freq='D')

        data3D_m=data3D_m.assign_coords(time=("time",time_m))
        data3D_d=data3D_d.assign_coords(time=("time",time_d))

        is_precip=varnames[varind]=='Precip'
        if is_precip:
            print('computing daily values for Precip...')
        else:
            print('computing daily values for',varnames[varind],'...')

        # monthly values get a 'month' coordinate 1..12 so they line up with the daily groupby
        monthly=data3D_m.chunk(xrchunks).rename({'time':'month'})
        monthly['month']=np.arange(12)+1
        daily=data3D_d.chunk(xrchunks)

        with dask.config.set(**{'array.slicing.split_large_chunks': False}):
            by_month=daily.groupby('time.month')
            if is_precip:
                # daily precip is stored as a fraction of the monthly value: multiply
                var_daily=by_month*monthly
            else:
                # all other variables are stored as deviations from the monthly mean: add
                var_daily=by_month+monthly
        del monthly, daily, by_month, is_precip

        attrs={}
        newvarname=varnames[varind]
        if varnames[varind] == 'Srad':
            print('changing units to W/m2...')
            attrs['units']='W/m2'

            # Convert J/m2/day to W/m2
            s_per_day=86400
            var_daily=var_daily/s_per_day
            var_daily.attrs=attrs    

        if varnames[varind] == 'Wind-10m':
            # interp from 10m to 2m height
            print('interpolating wind to 2m...')
            z=10
            z_adjust=4.87/(np.log(67.8*z-5.42))
            var_daily=var_daily*z_adjust

            # fix metadata
            newvarname='Wind-2m'
            attrs={'standard_name':newvarname,'long_name':'2m Wind Speed','units':'m/s'}
            var_daily.attrs=attrs   

        # drop Feb 29 when the time axis has one, so every year is written with the same number of days
        # the check is made on the time axis itself rather than by catching whatever drop_sel raises
        dropdate=stamp_year+'-02-29'
        has_leap_day=bool(((time_d.month==2)&(time_d.day==29)).any())

        with dask.config.set(**{'array.slicing.split_large_chunks': False}):
            if has_leap_day:
                data_out = var_daily.drop_sel(time=dropdate).transpose('y','x','time')
                print('dropping date',dropdate)
            else:
                data_out = var_daily.transpose('y','x','time')    
        del dropdate, has_leap_day, stamp_year

        # limit precision here?
        # data_out=np.trunc(data_out*10**output_trunc[varind])/(10**output_trunc[varind]) 

        print('min/max values of daily data:',data_out.min().compute().data,data_out.max().compute().data)
        print('data_out dtype:',data_out.dtype)
        # figure=plt.figure(figsize=(6,4))
        # data_out.isel(time=14).plot()
        # plt.title(varnames[varind]+' daily data for input to pyaez, 15 Jan '+str(yyyy))    
        # plt.tight_layout()
        # plt.show()
        del var_daily, data3D_m, data3D_d, time_m, time_d
        ######################################################################################################
        # END STEP 3
        ######################################################################################################

        ######################################################################################################
        # STEP 4: write out npy file
        ######################################################################################################
        print('################################ STEP 4: WRITING DATA FILES ################################')
        # wind is written under its 2m name, every other variable under its own name
        out_dir=out_basedir+str(yyyy)+sep+newvarname+sep

        # set up dir for writing npy; exist_ok avoids the check-then-create race
        os.makedirs(out_dir,exist_ok=True)

        print('writing stack to',out_dir+'...')  
        # print(data_out)
        # data_out=data_out.rechunk(dachunks)
        # stack along axis 1, which is x after the (y,x,time) transpose
        da.to_npy_stack(out_dir,data_out.data,axis=1)            
        del out_dir, data_out
        print('done with',var)
        print('####################################################################################')
        ######################################################################################################
        # END STEP 4
        ######################################################################################################   

    print('*****************************************')
    print('*************** Processing Rhum ***************')    
    print('*****************************************')
    ######################################################################################################
    # STEP 1: create relative humidity
    ######################################################################################################
    print('################################ STEP RH1: RELATIVE HUMIDITY CALC ################################')
    # read back the stacks written for this year by the variable loop
    vapr=da.from_npy_stack(out_basedir+str(yyyy)+'/Vapr/').rechunk(dachunks)*0.1 # hPa-->kPa
    tmax=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmax-2m/').rechunk(dachunks)
    tmin=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmin-2m/').rechunk(dachunks)

    # truncate all three inputs to 3 decimal places
    vapr=(np.trunc(vapr*10**3)/(10**3))
    tmax=(np.trunc(tmax*10**3)/(10**3))
    # the result used to be assigned to a misspelled name, so tmin was never truncated
    tmin=(np.trunc(tmin*10**3)/(10**3))
    
    print('lazy calc...')
    # saturation vapour pressure as the mean of the values at tmax and tmin
    # NOTE(review): the FAO-56 saturation vapour pressure formula has a leading 0.6108 kPa
    # coefficient that is absent here - confirm whether the omission is intended
    vapr_sat=(0.5*( np.exp((17.27*tmax)/(tmax+237.3)) + np.exp((17.27*tmin)/(tmin+237.3)) )) # kPa
    Rhum=(vapr/vapr_sat) # fraction, not percent
    Rhum=np.trunc(Rhum*10**4)/(10**4) # truncate to 4 decimal places

    ######################################################################################################
    # STEP 2: write out npy file
    ######################################################################################################
    print('################################ STEP RH2: WRITING DATA FILE ################################')
    out_dir=out_basedir+str(yyyy)+sep+'Rhum'+sep

    # set up dir for writing npy; exist_ok avoids the check-then-create race
    os.makedirs(out_dir,exist_ok=True)

    print('computing and writing stack to',out_dir+'...')     
    da.to_npy_stack(out_dir,Rhum,axis=1)   
    
    del out_dir, Rhum, vapr, tmax, tmin, vapr_sat
    print('done with Rhum')
    print('####################################################################################')        
    # elapsed wall-clock time for the whole year, in minutes
    task_time=(timer()-start_time)/60.
    print('DONE',yyyy,'IN',task_time,'MINUTES')

########################################################################
################################ 2020 ################################
########################################################################
*****************************************
*************** Processing Precip ***************
*****************************************
################################ STEP1: PROCESSING MONTHLY PXV ################################
total data points in pxv file 2295358 expecting 2295358
data min/max values -9999.0 4785.29
apply extra mask? False
putting 1D data on a 2D grid...
concatenating...
adding nans...
total number of non-missing data points 2268708 expecting 2268708
################################ STEP 2: PROCESSING DAILY DEV PXV ################################
total data points in pxv file 2295358 expecting 2295358
data min/max values before scaling: -9999 10000
apply extra mask? False
putting 1D data on a 2D grid...
concatenating...
changing dtype...
adding nans..

In [None]:
# year used by the interactive Rhum experiments; a string here, paths are built with str(yyyy)
yyyy='2020'

In [None]:
%%time
# single chunk with truncation of everything

vapr=da.from_npy_stack(out_basedir+str(yyyy)+'/Vapr/')*0.1 # hPa-->kPa
tmax=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmax-2m/')
tmin=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmin-2m/')

vapr=(np.trunc(vapr*10**3)/(10**3))
tmax=(np.trunc(tmax*10**3)/(10**3))
tmix=(np.trunc(tmin*10**3)/(10**3))

vapr_sat=(0.5*( np.exp((17.27*tmax)/(tmax+237.3)) + np.exp((17.27*tmin)/(tmin+237.3)) ))
Rhum=(vapr/vapr_sat)
Rhum=np.trunc(Rhum*10**4)/(10**4)
Rhum=Rhum.compute()

print(np.nanmin(Rhum),np.nanmax(Rhum),np.nanmin(Rhum[:,:,14]),np.nanmax(Rhum[:,:,14]))
del vapr, tmax, tmin, vapr_sat, Rhum

In [None]:
%%time
# 80 chunks with truncation of everything

vapr=da.from_npy_stack(out_basedir+str(yyyy)+'/Vapr/').rechunk(dachunks)*0.1 # hPa-->kPa
tmax=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmax-2m/').rechunk(dachunks)
tmin=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmin-2m/').rechunk(dachunks)

vapr=(np.trunc(vapr*10**3)/(10**3))
tmax=(np.trunc(tmax*10**3)/(10**3))
tmix=(np.trunc(tmin*10**3)/(10**3))

vapr_sat=(0.5*( np.exp((17.27*tmax)/(tmax+237.3)) + np.exp((17.27*tmin)/(tmin+237.3)) ))
Rhum=(vapr/vapr_sat)
Rhum=np.trunc(Rhum*10**4)/(10**4)
Rhum=Rhum.compute()

print(np.nanmin(Rhum),np.nanmax(Rhum),np.nanmin(Rhum[:,:,14]),np.nanmax(Rhum[:,:,14]))
del vapr, tmax, tmin, vapr_sat, Rhum

In [None]:
%%time
# 80 chunks with truncation of everything

vapr=da.from_npy_stack(out_basedir+str(yyyy)+'/Vapr/').astype(np.float16)*0.1 # hPa-->kPa
tmax=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmax-2m/').astype(np.float16)
tmin=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmin-2m/').astype(np.float16)

vapr=(np.trunc(vapr*10**3)/(10**3))
tmax=(np.trunc(tmax*10**3)/(10**3))
tmix=(np.trunc(tmin*10**3)/(10**3))

vapr_sat=(0.5*( np.exp((17.27*tmax)/(tmax+237.3)) + np.exp((17.27*tmin)/(tmin+237.3)) ))
Rhum=(vapr/vapr_sat)
Rhum=np.trunc(Rhum*10**4)/(10**4)
Rhum=Rhum.compute()

print(np.nanmin(Rhum),np.nanmax(Rhum),np.nanmin(Rhum[:,:,14]),np.nanmax(Rhum[:,:,14]))
del vapr, tmax, tmin, vapr_sat, Rhum

In [None]:
%%time
# experiment: stacks as stored (no rechunk), only the final Rhum is truncated

# load the three stacks; vapr is multiplied by 0.1 on load
vapr=da.from_npy_stack(out_basedir+str(yyyy)+'/Vapr/')*0.1
tmax=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmax-2m/')
tmin=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmin-2m/')

# mean of the exponential terms evaluated at tmax and tmin
vapr_sat=(0.5*( np.exp((17.27*tmax)/(tmax+237.3)) + np.exp((17.27*tmin)/(tmin+237.3)) ))
Rhum=(vapr/vapr_sat)
Rhum=np.trunc(Rhum*10**4)/(10**4)  # truncate to 4 decimal places
Rhum=Rhum.compute()                # materialize the lazy result

# overall min/max, then min/max of the slice at index 14 on the last axis
print(np.nanmin(Rhum),np.nanmax(Rhum),np.nanmin(Rhum[:,:,14]),np.nanmax(Rhum[:,:,14]))
del vapr, tmax, tmin, vapr_sat, Rhum

In [None]:
%%time
# experiment: inputs cast to float16 (no rechunk), only the final Rhum is truncated

# vapr is scaled by 0.1 before the cast to float16
vapr=(da.from_npy_stack(out_basedir+str(yyyy)+'/Vapr/')*0.1).astype(np.float16)
tmax=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmax-2m/').astype(np.float16)
tmin=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmin-2m/').astype(np.float16)

# mean of the exponential terms evaluated at tmax and tmin
vapr_sat=(0.5*( np.exp((17.27*tmax)/(tmax+237.3)) + np.exp((17.27*tmin)/(tmin+237.3)) ))
Rhum=(vapr/vapr_sat)
Rhum=np.trunc(Rhum*10**4)/(10**4)  # truncate to 4 decimal places
Rhum=Rhum.compute()                # materialize the lazy result

# overall min/max, then min/max of the slice at index 14 on the last axis
print(np.nanmin(Rhum),np.nanmax(Rhum),np.nanmin(Rhum[:,:,14]),np.nanmax(Rhum[:,:,14]))
del vapr, tmax, tmin, vapr_sat, Rhum

In [None]:
# load the three stacks lazily, with the rechunk calls commented out
# NOTE: the *0.1 hPa-->kPa conversion sits inside the trailing comment on the vapr line,
# so vapr is loaded unscaled here
vapr=da.from_npy_stack(out_basedir+str(yyyy)+'/Vapr/')#.rechunk(dachunks)*0.1 # hPa-->kPa
tmax=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmax-2m/')#.rechunk(dachunks)
tmin=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmin-2m/')#.rechunk(dachunks)
# display the dask array summary
vapr

In [None]:
%%time
vapr=(np.trunc(vapr*10**3)/(10**3)).compute()
tmax=(np.trunc(tmax*10**3)/(10**3))
tmix=(np.trunc(tmin*10**3)/(10**3))

In [None]:
%%time
# vapr_sat is computed eagerly here, then Rhum is derived from it and truncated to 4 decimals
vapr_sat=(0.5*( np.exp((17.27*tmax)/(tmax+237.3)) + np.exp((17.27*tmin)/(tmin+237.3)) )).compute()
Rhum=(vapr/vapr_sat)
Rhum=np.trunc(Rhum*10**4)/(10**4)
# print('min/max:',np.nanmin(data_out).compute(),np.nanmax(data_out).compute())
# display the result
Rhum

In [None]:
%%time
# NOTE(review): .compute() exists only on dask arrays; if vapr and vapr_sat were both already
# computed in earlier cells, Rhum is presumably a numpy array and this call fails - confirm cell order
print('calling compute on Rhum')
Rhum=Rhum.compute()

In [None]:
# overall min/max of Rhum, then min/max of the slice at index 14 on the last axis
np.nanmin(Rhum),np.nanmax(Rhum),np.nanmin(Rhum[:,:,14]),np.nanmax(Rhum[:,:,14])

In [None]:
# image plot of the Rhum slice at index 14 on the last axis, without interpolation
figure=plt.figure(figsize=(20,5))
plt.imshow(Rhum[:,:,14],interpolation='none')#,vmin=-1,vmax=1)
plt.colorbar(shrink=0.9)
plt.title('kerrie Rhum')    
plt.show()


In [None]:
# experiment: Rhum for one year with the inputs persisted and vapr_sat truncated to 2 decimals
yyyy='2020'
print('*****************************************')
print('*************** Processing Rhum ***************')    
print('*****************************************')
######################################################################################################
# STEP 1: create relative humidity
######################################################################################################
print('################################ STEP RH1: RELATIVE HUMIDITY CALC ################################')
# load, rechunk and persist the three input stacks
vapr=da.from_npy_stack(out_basedir+str(yyyy)+'/Vapr/').rechunk(dachunks).persist()
tmax=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmax-2m/').rechunk(dachunks).persist()
tmin=da.from_npy_stack(out_basedir+str(yyyy)+'/Tmin-2m/').rechunk(dachunks).persist()

# print('getting the min/max...')
print('lazy calc...')
vapr=vapr*0.1 # hPa-->kPa
vapr_sat=0.5*( np.exp((17.27*tmax)/(tmax+237.3)) + np.exp((17.27*tmin)/(tmin+237.3)) ) # kPa
# truncate vapr_sat to 2 decimal places before dividing
# NOTE(review): values below 0.01 truncate to 0 here, which would make the division below
# produce inf/nan - confirm whether such values occur
vapr_sat=np.trunc(vapr_sat*10**2)/(10**2)
Rhum=(vapr/vapr_sat)
# data_out=(np.trunc(Rhum*10**4)/(10**4))#.compute()
# # print('min/max:',np.nanmin(data_out).compute(),np.nanmax(data_out).compute())

# the write step is disabled in this experiment
# ######################################################################################################
# # STEP 2: write out npy file
# ######################################################################################################
# print('################################ STEP RH2: WRITING DATA FILE ################################')
# out_dir=out_basedir+str(yyyy)+sep+'Rhum'+sep

# # set up dir for writing npy
# isExist = os.path.exists(out_dir)
# if not isExist:
#     os.makedirs(out_dir)

# print('computing and writing to',out_dir+'0.npy...')     
# da.to_npy_stack(out_dir,data_out,axis=2)            
# del out_dir, data_out
# print('done with Rhum')
# print('####################################################################################')




In [None]:
# display Rhum
Rhum

In [None]:
# materialize Rhum in memory (assumes Rhum is still a dask array at this point)
Rhum=Rhum.compute()

In [None]:
# nan-aware min and max of Rhum
np.nanmin(Rhum),np.nanmax(Rhum)

In [None]:
# nan-aware min and max of vapr, each computed separately
np.nanmin(vapr).compute(),np.nanmax(vapr).compute()

In [None]:
# nan-aware min and max of vapr_sat, each computed separately
np.nanmin(vapr_sat).compute(),np.nanmax(vapr_sat).compute()

In [None]:
# truncate vapr_sat to 2 decimal places, compute it, and display the result
vapr_sat=(np.trunc(vapr_sat*10**2)/(10**2)).compute()
vapr_sat

In [None]:
# nan-aware min and max of vapr_sat (no .compute(): assumes vapr_sat is already in memory)
np.nanmin(vapr_sat),np.nanmax(vapr_sat)

In [None]:
# compute vapr, rebuild Rhum from it and vapr_sat, and show its nan-aware min and max
vapr=vapr.compute()
Rhum=(vapr/vapr_sat)
np.nanmin(Rhum),np.nanmax(Rhum)

In [None]:
# open the Rhum stack written to disk and take the slice at index 0 on the last axis
rhtest=da.from_npy_stack(out_basedir+str(yyyy)+'/Rhum/')[:,:,0]
rhtest

In [None]:
# load vapr cast to float16 and rechunked to dachunks, then display the array summary
vapr=da.from_npy_stack(out_basedir+str(yyyy)+'/Vapr/').astype(np.float16).rechunk(dachunks)
vapr

In [None]:
# all values along the last axis at index (0,0)
vapr[0,0,:].compute()

In [None]:
# plot where the stored Rhum is negative vs non-negative for one date
# NOTE(review): idoys, kerrie_path and dates are not defined in the visible part of this
# notebook - presumably set in another cell; confirm before running
ind=0
figure=plt.figure(figsize=(20,5))
idoy=idoys[ind]

# load the whole Rhum stack into memory and report its overall range
kerrie_file=kerrie_path+'Rhum/'
kdata=da.from_npy_stack(kerrie_file).compute()
minmax=(np.nanmin(kdata),np.nanmax(kdata))
print('all data minmax:',minmax)
# narrow to the selected index on the last axis and report that range
kdata=kdata[:,:,idoy]
minmax=(np.nanmin(kdata),np.nanmax(kdata))
print('single date minmax:',minmax)

# collapse values to their sign: >=0 becomes 1, <0 becomes -1 (nan is left as nan)
kdata=np.where(kdata>=0,1,kdata)
kdata=np.where(kdata<0,-1,kdata)

plt.imshow(kdata,interpolation='none',vmin=-1,vmax=1)
plt.colorbar(shrink=0.9)
plt.title('kerrie Rhum '+dates[ind]+' positive vs negative')    
plt.show()

In [None]:
plt.imshow(vapr[:,:,14],interpolation='none')
plt.colorbar(shrink=0.7)
plt.show()

In [None]:
np.nanmin(vapr).compute(),np.nanmax(vapr).compute()

In [None]:
np.finfo(np.float16)


In [None]:
np.finfo(np.float32)

In [None]:
np.finfo(np.float32).min,np.finfo(np.float32).max


In [None]:
for yyyy in years:
    print(yyyy)

In [None]:
# Quick-look check of every output stack: for each year/variable, open the npy
# stack lazily, report its shape and plot a single slice along the last axis.
for yyyy in years:
    for var in varnames:
        indir=out_basedir+str(yyyy)+sep+var+sep
        print(indir)

        # if the stack can't be opened, report it and move on to the next variable
        # so we never plot `data` left over from a previous iteration
        try:
            data=da.from_npy_stack(indir)
        except Exception as err:
            print('skipping',indir,'- could not open npy stack:',repr(err))
            continue
        print(indir,data.shape)

        # plot one slice (index 275 on the last axis); a failure here is reported
        # instead of silently ignored, and the figure is always released
        figure=plt.figure(figsize=(20,8))
        try:
            plt.imshow(data[:,:,275],interpolation='none')
            plt.colorbar(shrink=0.7)
            plt.show()
        except Exception as err:
            print('could not plot',indir,':',repr(err))
        finally:
            plt.close(figure)
        

In [None]:
# # make sure this numpy method yields data exactly the same as the fortran method 
# # monthly
# for yyyy in years[0:1]:
#     print('########################################################################')
#     print('################################',yyyy,'################################')
#     print('########################################################################')

#     for var,pdir in zip(varnames[:],pxvdirnames[:]):
#         print('Processing',var)    

#         # Fortran method: get data from dat files
#         datfile='/work/hpc/datasets/un_fao/gaez_v5_intermediate/dat/'+var+'_AgERA5_Hist_'+str(yyyy)+'_5m.dat'
#         print('dat file:',datfile)
#         datstrings=open(datfile).read().splitlines() 
#         dat=datstrings[1::2] # grab the lines with the data (every other line)
#         data_dat=np.loadtxt(dat,dtype='float32')
        
#         # Numpy method: get data directly from pxv file
#         pxvfile=pxv_basedir+pdir+sep+var+connector+dataset+connector+experiment+connector+str(yyyy)+pxvsuf
#         print('pxv file:',pxvfile)
#         filename=pxvfile.split(sep)[-1]
#         nmonths=12            
#         with open(pxvfile,'rb') as f:
#             array1D=np.fromfile(f,dtype=dtype_m,count=nmonths*npts)
#         array2D=array1D.reshape(npts,nmonths) # reshape
        
#         decs=3
#         data_dat=np.trunc(data_dat*10**decs)/(10**decs)
#         array2D=np.trunc(array2D*10**decs)/(10**decs)     
        
#         print('comparing read methods....')
#         for p in np.arange(data_dat.shape[0]):
#             unique=np.unique(data_dat[p,:]-array2D[p,:])
#             assert len(unique)==1,f'not equal at index point {p}'

In [None]:
data_dat[p,:]

In [None]:
array2D[p,:]

In [None]:
# # make sure this numpy method yields data exactly the same as the fortran method 
# # daily dev
# for yyyy in years[0:1]:
#     print('########################################################################')
#     print('################################',yyyy,'################################')
#     print('########################################################################')

#     for var,pdir in zip(varnames[2:3],pxvdirnames[2:3]):
#         print('Processing',var)    

#         # Fortran method: get data from dat files
#         datfile='/work/hpc/datasets/un_fao/gaez_v5_intermediate/dat/'+var+'365_AgERA5_Hist_'+str(yyyy)+'_5m.dat'
#         print('dat file:',datfile)
#         datstrings=open(datfile).read().splitlines() 
#         dat=datstrings[1::2] # grab the lines with the data (every other line)
#         data_dat=np.loadtxt(dat,dtype='int16')
        
#         # Numpy method: get data directly from pxv file
#         pxvfile=pxv_basedir+pdir+sep+var+dailytag+connector+dataset+connector+experiment+connector+str(yyyy)+pxvsuf
#         print('pxv file:',pxvfile)
#         filename=pxvfile.split(sep)[-1]
#         with open(pxvfile,'rb') as f:
#             array1D=np.fromfile(f,dtype=dtype_d)
#         ndays=366 if isleap(yyyy) else 365            
#         array2D=array1D.reshape(npts,ndays) # reshape

   
        
#         print('comparing read methods....')
#         for p in np.arange(data_dat.shape[0]):
#             unique=np.unique(data_dat[p,:]-array2D[p,:])
#             assert len(unique)==1,f'not equal at index point {p}'

In [None]:
unique


# Translate monthly means from PXV to xarray data structures

In [None]:
# Read the monthly pxv file for the selected variable and year into a numpy
# array of shape (npts,nmonths) and sanity-check it against the expected grid size.
pxvfile_m=pxv_basedir+pxvdirnames[varind]+sep+varnames[varind]+connector+dataset+connector+experiment+connector+str(yyyy)+pxvsuf
filename=pxvfile_m.split(sep)[-1]

nmonths=12

# read file to 1D array
# monthly files have more data in them than we need so we subset the read with count=
with open(pxvfile_m,'rb') as f:
    array1D_m=np.fromfile(f,dtype=dtype_m,count=nmonths*npts)

# limit precision here?
# decs=4
# array1D=np.trunc(array1D*10**decs)/(10**decs)

# reshape the array to (npoints,nmonths)
nvals=array1D_m.shape[0]             # total number of data values
npts_flt=nvals/nmonths               # number of grid points in the file, float format
npts_int=nvals//nmonths              # number of whole grid points in the file

# check that the values divide evenly into months, if not, the file was read incorrectly
assert nvals%nmonths==0, f"reading pxv file {filename} with incorrect number of months: {nmonths}"
# check that number of grids found in the file is the number expected, if not, coordinate with Gunther
assert npts_int==npts, f"pxv file {filename} has {npts_int} total data points, expecting {npts} total data points"
print('nvalues in pxv file',nvals)
print('nmonths in pxv file',nmonths)
print('total data points in pxv file',npts_int,'expecting',npts)

# check that number of grids found in pxv file is the same as the number of grids=1 in the mask file
npts_mask=int(mask2D.sum().data)
print('total data points in mask file', npts_mask,'expecting',npts)
assert npts_int==npts_mask, f"npts in pxv is {npts_int}, npts in mask is {npts_mask}"

array2D_m=array1D_m.reshape(npts,nmonths) # reshape

# find out if data value range spans across the fillvalue
# if it does, we'll need to apply an extra mask later
flag_m=bool(array2D_m.min() < fillval_m)

# check data values
print('shape of numpy data array',array2D_m.shape)
print('data min/max values',array2D_m.min(),array2D_m.max())
print('apply extra mask?',flag_m)

In [None]:
# # make sure this numpy method reads data exactly the same as the fortran method 
# temp=open('/work/hpc/datasets/un_fao/gaez_v5_intermediate/dat/Tmax-2m_AgERA5_Hist_2020_5m.dat').read().splitlines() 
# dat=temp[1::2] # grab the lines with the data (every other line)
# data_dat=np.loadtxt(dat,dtype=dtype_m)

# decs=3
# data_dat=np.trunc(data_dat*10**decs)/(10**decs)
# array1D=np.trunc(array1D*10**decs)/(10**decs)

# for p in np.arange(data_dat.shape[0]):
#     ind1=p*12
#     ind2=ind1+12
#     unique=np.unique(data_dat[p,:]-array1D[ind1:ind2])
#     assert len(unique)==1,f'not equal at index point {p}'

now we need to match each space point of the pxv data to a grid box on the mask

first we need to take each month of pxv data from 2295358 points to the full grid of 7776000 space points in 1D

then we need to reshape the 7776000 space points to 2D with shape (y:1800,x:4320)

In [None]:
# Build the fill-valued templates that each month of pxv data is scattered into.
mask1D=mask2D.stack(space=[ydimname,xdimname]) # collapse mask to 1D: 1800*4320 = 7776000 points
inds_data=mask1D==1  # keep track of which points are not masked out

# 1D template: same stacked layout as the mask, monthly dtype, every point set to the fill value
empty1D_m=mask1D.copy().astype(dtype_m)  # placeholder array for 1D space 
empty1D_m.rio.write_nodata(fillval_m,inplace=True)  # register the fill value as nodata via rioxarray
empty1D_m[:]=fillval_m

# 2D template: same grid as the mask, monthly dtype, every point set to the fill value
empty2D_m=mask2D.copy().astype(dtype_m)  # placeholder array for 2D grid 
empty2D_m.rio.write_nodata(fillval_m,inplace=True)  # register the fill value as nodata via rioxarray
empty2D_m[:,:]=fillval_m

In [None]:
# function to call with dask delayed
def data_to_nd_array(i,inds,arr1D,pxv,arr2D):
    """Scatter one time step of pxv point data onto the 2D grid.

    Every delayed task built from this function receives the same template
    objects, so the templates are copied and never written in place. Writing
    into the shared templates would let tasks running at the same time
    overwrite each other's data before it is unstacked.

    Parameters
    ----------
    i : position of the time step in the task list (not used in the body,
        kept so existing calls keep working)
    inds : selector over the stacked 1D space marking the points that receive
        pxv data
    arr1D : fill-valued template on the stacked 1D space (left unmodified)
    pxv : pxv data for a single time step, with a singleton time dimension
    arr2D : fill-valued template on the 2D grid (left unmodified)

    Returns
    -------
    A new array shaped like arr2D holding the pxv values at the selected
    points and the template's values everywhere else.
    """
    flat=arr1D.copy()          # private 1D buffer for this task
    flat[inds]=pxv.squeeze()   # remove singleton dim (time step)
    grid=arr2D.copy()          # private 2D buffer, keeps the template's coords/attrs
    grid[:,:]=flat.unstack()   # put 1D data onto the 2D grid
    return grid

In [None]:
%%time
# dask parallel computing: put each month of pxv point data onto the 2D grid

# first convert the pxv data to a chunked dask array, 1 month (column) per chunk,
# and save to a list of delayed dask objects
pxv_delay=da.from_array(array2D_m,chunks=(-1,1)).to_delayed().ravel() 

# build a list of computational tasks to be executed later
task_list=[dask.delayed(data_to_nd_array)(imonth,inds_data,empty1D_m,pxvdata,empty2D_m) for imonth,pxvdata in enumerate(pxv_delay)] 
assert len(task_list)==nmonths, f'{len(task_list)} tasks in list, should be {nmonths}' # double check we've got 1 task per month of data

# execute all computations
print('putting 1D data on a 2D grid...')
result_chunks_m=dask.compute(*task_list)

# concatenate the resulting monthly chunks along a new time dimension
print('concatenating...')
data3D_m=xr.concat(result_chunks_m,dim='time')

# replace fillval with nan
print('adding nans...')
data3D_m=xr.where(data3D_m==fillval_m,np.nan,data3D_m)

# check we have the correct number of non-missing data points
# (count of non-nan values divided by the length of the first (time) axis)
data_mask_m=xr.where(np.isnan(data3D_m.data),0,1)  
ngrids_data_m=int(data_mask_m.sum()/data_mask_m.shape[0])
print('total number of non-missing data points',ngrids_data_m,'expecting',npts_valid_m)
assert ngrids_data_m==npts_valid_m, f'data mask creation issue. found {ngrids_data_m} valid data points (non missing), expecting {npts_valid_m}' # double check the number of valid grid points matches what's expected

In [None]:
# visual check January
data3D_m.isel(time=0).plot()

# Translate daily deviations from PXV to scaled xarray data structures

In [None]:
### get the data from the pxv into an array of 2 dims (space,time)
pxvfile=pxv_basedir+pxvdirnames[varind]+sep+varnames[varind]+dailytag+connector+dataset+connector+experiment+connector+str(yyyy)+pxvsuf
filename=pxvfile.split(sep)[-1]

# read entire file into 1D array
with open(pxvfile,'rb') as f:
    array1D_d=np.fromfile(f,dtype=dtype_d)
    
# reshape the array to (npoints,ndays)
nvals=array1D_d.shape[0]           # total number of data values
ndays=366 if isleap(yyyy) else 365 # number of days of data at each grid point
npts_flt=nvals/ndays               # number of grid points in the file, float format
npts_int=nvals//ndays              # number of whole grid points in the file

# check that the values divide evenly into days, if not, the file was read incorrectly
assert nvals%ndays==0, f"reading pxv file {filename} with incorrect number of days: {ndays}"
# check that number of grids found in the file is the number expected, if not, coordinate with Gunther
assert npts_int==npts, f"pxv file {filename} has {npts_int} total data points, expecting {npts} total data points"
print('nvalues in pxv file',nvals)
print('ndays in pxv file',ndays)
print('total data points in pxv file',npts_int,'expecting',npts)

# check that number of grids found in pxv file is the same as the number of grids=1 in the mask file
npts_mask=int(mask2D.sum().data)
print('total data points in mask file', npts_mask,'expecting',npts)
assert npts_int==npts_mask, f"npts in pxv is {npts_int}, npts in mask is {npts_mask}"

array2D_d=array1D_d.reshape(npts,ndays) # reshape

# find out if data value range spans across the fillvalue
# if it does, we'll need to apply an extra mask later
flag_d=bool(array2D_d.min() < fillval_d)

# check data values
print('shape of numpy data array',array2D_d.shape)
print('data min/max values',array2D_d.min(),array2D_d.max())
print('apply extra mask?',flag_d)
# array2D[0,:]   

In [None]:
# # make sure this numpy method reads data exactly the same as the fortran method 
# temp=open('/work/hpc/datasets/un_fao/gaez_v5_intermediate/dat/Tmax-2m365_AgERA5_Hist_2020_5m.dat').read().splitlines() 
# dat=temp[1::2] # grab the lines with the data (every other line)
# data_dat=np.loadtxt(dat,dtype='int16')
# for p in np.arange(array2D.shape[0]):
#     unique=np.unique(data_dat[p,:]-array2D[p,:])
#     assert len(unique)==1,f'not equal at index point {p}'

now we need to match each space point of the pxv data to a grid box on the mask

first we need to take each day of pxv data from 2295358 points to the full grid of 7776000 space points in 1D

then we need to reshape the 7776000 space points to 2D with shape (y:1800,x:4320)

In [None]:
# Build the fill-valued templates that each day of pxv data is scattered into.
mask1D=mask2D.stack(space=[ydimname,xdimname]) # collapse mask to 1D: 1800*4320 = 7776000 points
inds_data=mask1D==1  # keep track of which points are not masked out

# 1D template: same stacked layout as the mask, daily dtype, every point set to the fill value
empty1D_d=mask1D.copy().astype(dtype_d)  # placeholder array for 1D space 
empty1D_d.rio.write_nodata(fillval_d,inplace=True)  # register the fill value as nodata via rioxarray
empty1D_d[:]=fillval_d

# 2D template: same grid as the mask, daily dtype, every point set to the fill value
empty2D_d=mask2D.copy().astype(dtype_d)  # placeholder array for 2D grid 
empty2D_d.rio.write_nodata(fillval_d,inplace=True)  # register the fill value as nodata via rioxarray
empty2D_d[:,:]=fillval_d

In [None]:
# function to call with dask delayed
def data_to_nd_array(i,inds,arr1D,pxv,arr2D):
    """Scatter one time step of pxv point data onto the 2D grid.

    Every delayed task built from this function receives the same template
    objects, so the templates are copied and never written in place. Writing
    into the shared templates would let tasks running at the same time
    overwrite each other's data before it is unstacked.

    Parameters
    ----------
    i : position of the time step in the task list (not used in the body,
        kept so existing calls keep working)
    inds : selector over the stacked 1D space marking the points that receive
        pxv data
    arr1D : fill-valued template on the stacked 1D space (left unmodified)
    pxv : pxv data for a single time step, with a singleton time dimension
    arr2D : fill-valued template on the 2D grid (left unmodified)

    Returns
    -------
    A new array shaped like arr2D holding the pxv values at the selected
    points and the template's values everywhere else.
    """
    flat=arr1D.copy()          # private 1D buffer for this task
    flat[inds]=pxv.squeeze()   # remove singleton dim (day)
    grid=arr2D.copy()          # private 2D buffer, keeps the template's coords/attrs
    grid[:,:]=flat.unstack()   # put 1D data onto the 2D grid
    return grid

In [None]:
%%time
# dask parallel computing: put each day of pxv point data onto the 2D grid

# first convert the pxv data to a chunked dask array, 1 day (column) per chunk,
# and save to a list of delayed dask objects
pxv_delay=da.from_array(array2D_d,chunks=(-1,1)).to_delayed().ravel() 

# build a list of computational tasks to be executed later
task_list=[dask.delayed(data_to_nd_array)(iday,inds_data,empty1D_d,pxvdata,empty2D_d) for iday,pxvdata in enumerate(pxv_delay)] 
assert len(task_list)==ndays, f'{len(task_list)} tasks in list, should be {ndays}' # double check we've got 1 task per day of data

# execute all computations
print('putting 1D data on a 2D grid...')
result_chunks_d=dask.compute(*task_list)

# concatenate the resulting daily chunks along a new time dimension
print('concatenating...')
data3D_d=xr.concat(result_chunks_d,dim='time')

In [None]:
%%time
# change out the fill value to nans and apply the scale factor
# this is where we need to apply an extra mask if the valid range of the data includes the -9999 fillval
# which is the case for srad and wind (see the note at the top of this notebook)

# convert to float so missing points can be represented as nan
print('changing dtype...')
data3D_d=data3D_d.astype(np.float32)

# if valid data range includes fillval
if flag_d:
    print('processing pxv variable with data values that span over the fillval')
    print('scaling...')
    data3D_d=data3D_d*scale_factors[varind]    
    print('applying additional mask...')
    # data_mask is only created in the Precip branch below, so Precip must
    # already have been processed in this session before reaching this branch
    data3D_d=xr.where(data_mask,data3D_d,np.nan)
    # verify that the masking worked
    valid_arr=np.where(np.isnan(data3D_d.data[15,:,:]),0,1) # pick one day to verify  
    nvalid=valid_arr.sum()
    print('total number of non-missing data points',nvalid,'expecting',npts_valid_d)
    assert nvalid==npts_valid_d, f'data mask application issue. found {nvalid} valid data points (non missing), expecting {npts_valid_d}'    

# if valid data range doesn't include fillval
else:
    print('adding nans...')
    data3D_d=xr.where(data3D_d==fillval_d,np.nan,data3D_d)
    print('scaling...')
    data3D_d=data3D_d*scale_factors[varind]

    # Precip supplies the valid-data mask (1=valid, 0=missing) reused by the branch above
    if varnames[varind] =='Precip':
        print('computing additional mask...')
        data_mask=xr.where(np.isnan(data3D_d.data),0,1)  
        # count of valid values divided by the length of the first (time) axis
        ngrids_data_d=int(data_mask.sum()/data_mask.shape[0])
        print('total number of non-missing data points',ngrids_data_d,'expecting',npts_valid_d)
        assert ngrids_data_d==npts_valid_d, f'data mask creation issue. found {ngrids_data_d} valid data points (non missing), expecting {npts_valid_d}' # double check the number of valid grid points matches what's expected

In [None]:
data3D_d.isel(time=15).plot()

# Create daily data from monthly means and daily deviations

In [None]:
chunks={'time':-1,'y':450,'x':2160}
# chunks=(-1,450,2160)

In [None]:
# build the monthly (month-start) and daily time coordinates:
# real calendar dates when yyyy is within 1980-2024, otherwise a placeholder 1900 calendar
calendar_year=str(yyyy) if 1980<=yyyy<=2024 else '1900'
first_day=calendar_year+'-01-01'
last_day=calendar_year+'-12-31'
time_m=pd.date_range(first_day,last_day,freq='MS')
time_d=pd.date_range(first_day,last_day,freq='D')

# time_m,time_d

In [None]:
# assign time metadata to monthly and daily data
data3D_m=data3D_m.assign_coords(time=("time",time_m))
data3D_d=data3D_d.assign_coords(time=("time",time_d))

In [None]:
# data3D_m

In [None]:
# var_mean=da.from_array(data3D_m,chunks=chunks)
# var_daily=da.from_array(data3D_d.data,chunks=chunks)
# var_mean=data3D_m.chunk(chunks)
# var_prime=data3D_d.chunk(chunks)
# var_mean

In [None]:
# var_prime=data3D_d
# var_mean=data3D_m



# Combine the monthly values with the daily values to get the final daily data.
# Each daily value is paired with the monthly value of its calendar month
# through groupby('time.month') arithmetic against a 'month' coordinate of 1..12.
if varnames[varind]=='Precip':
    # Precip: daily value * monthly value
    # NOTE(review): presumably the daily data are fractions of the monthly accumulation
    # (going by the variable names) - confirm
    print('computing daily values for Precip')
    var_acc=data3D_m.chunk(chunks)
    var_frac=data3D_d.chunk(chunks)
    
    # relabel the monthly time axis as month numbers 1-12 so it lines up with time.month
    var_acc=var_acc.rename({'time':'month'})
    months=np.arange(12)+1
    var_acc['month']=months
    
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        var_daily=var_frac.groupby('time.month')*var_acc  # times here instead of add
else:
    # all other variables: daily deviation + monthly mean
    print('computing daily values for',varnames[varind])
    var_mean=data3D_m.chunk(chunks)
    var_prime=data3D_d.chunk(chunks)
    
    # relabel the monthly time axis as month numbers 1-12 so it lines up with time.month
    var_mean=var_mean.rename({'time':'month'})
    months=np.arange(12)+1
    var_mean['month']=months
        
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        var_daily=var_prime.groupby('time.month') + var_mean
    
# var_daily

In [None]:
# Variable-specific unit conversions and metadata for the daily data.
attrs={}
newvarname=varnames[varind]  # output variable name; only changed for wind below
if varnames[varind] == 'Srad':
    print('changing units to W/m2')
    # attrs=ds[varname].attrs
    attrs['units']='W/m2'
    
    # Convert J/m2/day to W/m2
    s_per_day=86400
    var_daily=var_daily/s_per_day
    var_daily.attrs=attrs    
    
if varnames[varind] == 'Wind-10m':
    # interp from 10m to 2m height
    print('interpolating wind to 2m')
    z=10  # measurement height in m
    # NOTE(review): constants look like the FAO-56 logarithmic wind profile conversion - confirm
    z_adjust=4.87/(np.log(67.8*z-5.42))
    var_daily=var_daily*z_adjust
    
    # fix metadata
    newvarname='Wind-2m'
    attrs={'standard_name':newvarname,'long_name':'2m Wind Speed','units':'m/s'}
    # ds=ds.rename({varname:newvarname})
    var_daily.attrs=attrs    
    # varname=newvarname

In [None]:
var_daily

In [None]:
# Drop Feb 29 when it is present in the daily time axis, then reorder the
# dims to (y,x,time). The leap day is looked up in the time coordinate itself,
# so years without a Feb 29 simply skip the drop instead of raising.
time_index=var_daily.indexes['time']
leap_days=time_index[(time_index.month==2)&(time_index.day==29)]
with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    if len(leap_days)>0:
        print('dropping date',leap_days[0].strftime('%Y-%m-%d'))
        data_out = var_daily.drop_sel(time=leap_days)
    else:
        data_out = var_daily
    data_out = data_out.transpose('y','x','time')#.data
data_out

In [None]:
# %%time
# data_out=data_out.compute()
# data_out

In [None]:
# import matplotlib.pyplot as plt
# plt.imshow(data_out[:,:,15],interpolation='none')
# plt.colorbar()
data_out.isel(time=15).plot()

In [None]:
np.nanmin(data_out.data).compute(),np.nanmax(data_out.data).compute()

In [None]:
%%time
# Write the final (y,x,time) data to an npy stack under the testing output directory.
# NOTE(review): the directory is named from varnames[varind], so newvarname
# (e.g. 'Wind-2m') set in the unit-conversion cell is not used here - confirm this is intended
out_dir=out_basedir+'testing/'+str(yyyy)+sep+varnames[varind]+sep

# set up dir for writing npy (no error if it already exists)
os.makedirs(out_dir,exist_ok=True)
            
print('writing to',out_dir+'0.npy')     
da.to_npy_stack(out_dir,data_out.data,axis=2)    