In [12]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import xarray as xr
import numpy as np
import os
from glob import glob
#import climpredNEW.climpred 
#from climpredNEW.climpred.options import OPTIONS
from mpl_toolkits.basemap import Basemap
from numpy import meshgrid
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
import matplotlib.colors as mcolors
import cartopy.feature as cfeature
import itertools
import cartopy.crs as ccrs
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter, LatitudeLocator
import matplotlib.ticker as mticker
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, TwoSlopeNorm
import pandas as pd
import math
from scipy.stats import percentileofscore as pos
from scipy.stats import rankdata
from datetime import datetime
import datetime as dt
from multiprocessing import Pool
from numba import njit,prange
import numba


from function import preprocessUtils as putils
from function import masks
from function import verifications
from function import funs as f
from function import conf
from function import loadbias
from function import percentile as per
from function import caseUtils as cutils
from function import masks


'''For looping through the GEFS and ECMWF data for percentile distributions'''
date_index_start = 210 
# [0,15,30,45,60,75,90,105,120,135,150,165,180,195,210,225,240,255,270,285,300,315,330,345,360]

global n_processes,day_to_grab
n_processes = day_to_grab = 10

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# This script will only look at the testing period and include the bias corrected subseasonal reforecasts

## Only do this for GLEAM observations

In [5]:
#Set script parameters

start_obs = '2000-01-01' #Beginning of observation period for analysis. We actually have data starting from 1999 so that we could have a 7-day rolling mean applied to the data and have up to 12 weeks lags for RZSM
end_obs = '2020-12-31' #end of observations for ERA5 and GLEAM. We actually needed data through 2020-02-15 since we have an initialization on 2019-12-25
start_testing = '2018-01-01' #Beginning of testing period
end_testing = '2019-12-31'
train_end_string = '2015-12-30' #last string date for training
train_end = 2015 #last year of training dates

#For creation of percentiles. Get +/- all days from observation within the window
window = 45


#Unet final experiment name (week 5)
experiment_name='EX29_regular_RZSM'

region_name = 'CONUS'


global mask, mask_anom
mask,mask_anom = masks.load_mask_vals(region_name) #Load the mask xarry and mask numpy files. Values of 1 = land 



In [3]:
# #template for adding data
# global template,template_ECM
template = xr.open_mfdataset(f'{conf.gefsv12_data}/{region_name}/baseline_RZSM_anomaly/soilw_bgrnd*',combine='nested',concat_dim=['S'],).isel(L=[6,13,20,27,34]).sel(S=slice(start_testing,end_testing)).load()
                                                                                                                                                                     
# # ECMWF anomaly
# ecm_anom = f'{conf.ecmwf_data}/{region_name}/baseline_RZSM_anomaly'
# template_ECM = xr.open_mfdataset(f'{ecm_anom}/soilw_bgrnd*',combine='nested',concat_dim=['S'],).sel(L=[6,13,20,27,34]).sel(S=slice(start_testing,end_testing)).load()


In [152]:
ecmwf_bias_corrected = xr.open_dataset(conf.return_bias_corrected_anomaly('CONUS', 'ECMWF')).isel(lead=[6,13,20,27,34]).rename({'init':'S','member':'M','lead': 'L','lat':'Y','lon':'X'})
gefs_bias_corrected = xr.open_dataset(conf.return_bias_corrected_anomaly('CONUS', 'GEFSv12')).isel(lead=[6,13,20,27,34]).rename({'init':'S','member':'M','lead': 'L','lat':'Y','lon':'X'})

In [33]:
def Observations_open_file_restrict_to_CONUS_apply_7day_rolling_mean(path,start_obs,end_obs,CONUS_mask):
    open_file = f.restrict_to_CONUS_bounding_box(xr.open_dataset(path),CONUS_mask)
    try:
        open_file = open_file.drop('time_bnds')
    except ValueError:
        open_file = open_file
    open_file = open_file.rolling(time=7, min_periods=7,center=False).mean().sel(time = slice(start_obs,end_obs))
    return(open_file)

def Reforecast_open_file_restrict_to_CONUS_apply_7day_rolling_mean(path,CONUS_mask):
    return(f.restrict_to_CONUS_bounding_box(xr.open_mfdataset(path).rolling(L=7, min_periods=7,center=False).mean(),CONUS_mask))
    

In [179]:
def create_data_julian_dates_ECMWF(reforecast_file):
    dim_order = ['S','M','L','Y','X']
    
    new_save_dir = f'{conf.ecmwf_data}/{region_name}/soilw_bgrnd/RZSM_anomaly_with_julian_dates_testing_distribution'
    os.system(f'mkdir -p {new_save_dir}')
    
    #Make the julian dates for easier processing
    for idx,date in enumerate(reforecast_file.S.values):
        # break
        date_out = f'{pd.to_datetime(date).year}-{pd.to_datetime(date).month:02}-{pd.to_datetime(date).day:02}'
        out_name = f'{new_save_dir}/soilw_bgrnd_{date_out}.nc'
        
        if os.path.exists(out_name):
            pass
        else:
            single_file = reforecast_file.sel(S=date).expand_dims({'S': 1}).transpose(*dim_order)
            date_list = [single_file.S.values + np.timedelta64(i, 'D') for i in single_file.L.values]

            julian_dates = [datestdtojd(pd.to_datetime(i).year[0], pd.to_datetime(i).month[0], pd.to_datetime(i).day[0]) for i in date_list]
            single_file['L'] = julian_dates
            single_file.to_netcdf(out_name)

    print('Loading the ECMWF julian day anomaly dataset')
    
    return(xr.open_mfdataset(f'{new_save_dir}/soil*',combine='nested',concat_dim=['S']))

In [180]:
def datestdtojd (before_year, before_month, before_day):
    # convert dates to julian date to allow for window selection
    fmt='%Y-%m-%d %H:%M:%S'
    target_date = str(datetime(before_year, before_month, before_day))
    sdtdate = datetime.strptime(target_date, fmt)
    sdtdate = sdtdate.timetuple()
    jdate = sdtdate.tm_yday
    return(jdate)

def jdtodatestd (jdate):
    #convert julian date to datetime 
    fmt = '%Y%j'
    datestd = datetime.datetime.strptime(jdate, fmt).date()
    return(datestd)

# Create the percentile dataset first. Need to have the percentiles from within a +/- 45 day window of the observation day over all years

In [58]:
def window_select(file,time,X,Y,window):
    #Grab all time samples from within the window size
    #Test 
    # file = obs_RZSM_anomaly
    # time = time
    # window=45

    before_month =  pd.to_datetime(time - np.timedelta64(45,'D')).month
    before_day = pd.to_datetime(time - np.timedelta64(45,'D')).day
    before_year = pd.to_datetime(time - np.timedelta64(45,'D')).year
    
    julian_date_before =  datestdtojd (before_year, before_month, before_day)
    
    after_month =  pd.to_datetime(time + np.timedelta64(45,'D')).month
    after_day = pd.to_datetime(time + np.timedelta64(45,'D')).day
    after_year =  pd.to_datetime(time + np.timedelta64(45,'D')).year
    
    julian_date_after =  datestdtojd (after_year, after_month, after_day)
    
    #Now we have created the before and after julian dates, now subset all data
    if julian_date_before < julian_date_after:
        selected_data = file.sel(time=(file['day_of_year'] > julian_date_before) & (file['day_of_year'] < julian_date_after)).isel(longitude=X,latitude=Y)[putils.xarray_varname(file)].values
    else:
        selected_data = file.sel(time=(file['day_of_year'] > julian_date_before) | (file['day_of_year'] < julian_date_after)).isel(longitude=X,latitude=Y)[putils.xarray_varname(file)].values
    
    #Remove any bad data points
    selected_data = selected_data[selected_data != 0]
    selected_data = selected_data[~np.isnan(selected_data)]
    
    return(selected_data)



def window_select_OBS(file,time,window):
    #Grab all time samples from within the window size
    #Test 
    # file = obs_RZSM_anomaly
    # time = time
    # window=45

    before_month =  pd.to_datetime(time - np.timedelta64(45,'D')).month
    before_day = pd.to_datetime(time - np.timedelta64(45,'D')).day
    before_year = pd.to_datetime(time - np.timedelta64(45,'D')).year
    
    julian_date_before =  datestdtojd (before_year, before_month, before_day)
    
    after_month =  pd.to_datetime(time + np.timedelta64(45,'D')).month
    after_day = pd.to_datetime(time + np.timedelta64(45,'D')).day
    after_year =  pd.to_datetime(time + np.timedelta64(45,'D')).year
    
    julian_date_after =  datestdtojd (after_year, after_month, after_day)

    #Now we have created the before and after julian dates, now subset all data
    if julian_date_before < julian_date_after:
        selected_data = file.sel(time=(file['day_of_year'] > julian_date_before) & (file['day_of_year'] < julian_date_after))[putils.xarray_varname(file)].values
    else:
        selected_data = file.sel(time=(file['day_of_year'] > julian_date_before) | (file['day_of_year'] < julian_date_after))[putils.xarray_varname(file)].values

    return(selected_data)
    

# Now convert the observation percentiles to the same format as GEFSv12 reforecast

In [49]:
dates = gefs_bias_corrected.init.values
init_date_list = [pd.to_datetime(i) for i in dates]
# init_date_list = sorted([i.split('_')[-1][:-3] for i in dates])

In [74]:
global percentile_ranges
percentile_ranges = [5,10,20,33,66,80,90,95]

save_percentile_observations = f'{conf.gleam_data}/{region_name}/anomaly_percentile_RZSM_full_distribution_with_different_thresholds_testing_distribution.nc4'

if os.path.exists(save_percentile_observations):
    obs_RZSM_percentile = xr.open_dataset(save_percentile_observations).load()
    obs_RZSM_percentile.close()
else:
    obs_RZSM_anomaly = xr.open_dataset(f'{conf.gleam_data}/{region_name}/RZSM_anomaly.nc').sel(time=slice(start_testing,'2020-03-30')).rename({'SMsurf':'RZSM'}).load()
    obs_RZSM_anomaly 
    # Calculate day of the year (Julian day)
    obs_RZSM_anomaly['day_of_year'] = obs_RZSM_anomaly['time'].dt.dayofyear

    #Convert to array 
    # obs_RZSM_arr = np.array(obs_RZSM_anomaly.RZSM.values)

    obs_RZSM_percentile = obs_RZSM_anomaly.copy(deep=True)


    
    for i in percentile_ranges:
        obs_RZSM_percentile[f'{i}th_percentile'] = xr.DataArray(np.empty(obs_RZSM_percentile.RZSM.shape, dtype='float'), dims=obs_RZSM_percentile.RZSM.dims)
    # Assuming obs_RZSM_percentile is an xarray Dataset
    # if 'serialized_percentiles' not in obs_RZSM_percentile:
    #     obs_RZSM_percentile['serialized_percentiles'] = xr.DataArray(np.empty(obs_RZSM_percentile.RZSM.shape, dtype='object'), dims=obs_RZSM_percentile.RZSM.dims)

    del obs_RZSM_percentile['RZSM']
    del obs_RZSM_percentile['day_of_year']
    
    #Find percentile of score for each grid cell
    

    for idx,time in enumerate(obs_RZSM_anomaly.time.values[0:367]):
        '''We only want to do the first year. Then we will add back the values later for other days'''
        print(f'Working on date index {idx} out of 367')
        
        all_values = window_select_OBS(obs_RZSM_anomaly,time,window)
        all_values.shape

        all_value_percentiles = {i:np.nanpercentile(all_values,i,axis=0) for i in percentile_ranges}
        
        for i in percentile_ranges:
            obs_RZSM_percentile[f'{i}th_percentile'][idx,:,:] = all_value_percentiles[i]
    

    #Now we need to add the data from each day of the first year to the next years

    #First get the base values for the 366 days that we computed
    time_dates = obs_RZSM_anomaly.time.values[0:367]

    dict_base = {}
    for date_ in time_dates:
        month_day = f'{pd.to_datetime(date_).month:02}-{pd.to_datetime(date_).day:02}'
        dict_base[month_day] = {}
        for i in percentile_ranges:
            # break
            dict_base[month_day][f'{i}th_percentile'] = obs_RZSM_percentile[f'{i}th_percentile'].sel(time=date_).values

    #Now we need to add everything to each dataset for each date to have all the values
    for idx,date_ in enumerate(obs_RZSM_percentile.time.values):
        # break
        month_day = f'{pd.to_datetime(date_).month:02}-{pd.to_datetime(date_).day:02}'

        for i in percentile_ranges:
            if month_day == '02-29':
                pass
            else:
                obs_RZSM_percentile[f'{i}th_percentile'][idx,:,:] =  dict_base[month_day][f'{i}th_percentile']

    obs_RZSM_percentile = obs_RZSM_percentile.astype(np.float32)
    
    obs_RZSM_percentile.to_netcdf(save_percentile_observations)

   

In [80]:


save_percentile_of_score_observations = f'{conf.gleam_data}/{region_name}/anomaly_percentile_of_score_RZSM_testing_distribution.nc4'
obs_RZSM_anomaly = xr.open_dataset(f'{conf.gleam_data}/{region_name}/RZSM_anomaly.nc').sel(time=slice(start_testing,'2020-03-30')).rename({'SMsurf':'RZSM'}).load()
obs_RZSM_anomaly 

# Calculate day of the year (Julian day)
obs_RZSM_anomaly['day_of_year'] = obs_RZSM_anomaly['time'].dt.dayofyear
obs_RZSM_output = obs_RZSM_anomaly.copy(deep=True)
obs_RZSM_output.RZSM[:,:,:] = np.nan

obs_RZSM_pos = np.empty(shape=obs_RZSM_anomaly.RZSM.shape)
obs_RZSM_pos[:,:,:] = np.nan

obs_RZSM_anomaly_ARR = obs_RZSM_anomaly.RZSM.values

In [11]:
try:
    obs_RZSM_output = xr.open_dataset(save_percentile_of_score_observations)
except FileNotFoundError:
    for idx,time in enumerate(obs_RZSM_anomaly.time.values):
        # break
        a = window_select_OBS(obs_RZSM_anomaly,time,window) #All of the values within the windows
    
        print(f'Working on day {idx} out of {len(obs_RZSM_anomaly.time.values)}')
        for Y in range(obs_RZSM_anomaly.latitude.shape[0]):
    
            for X in range(obs_RZSM_anomaly.longitude.shape[0]):
                if mask_anom[Y,X] == 1:
                    obs_RZSM_pos[idx,Y,X] =  pos(a[:,Y,X],obs_RZSM_anomaly_ARR[idx,Y,X])
    
    obs_RZSM_output.RZSM[:,:,:] = obs_RZSM_pos
    
    obs_RZSM_output.to_netcdf(save_percentile_of_score_observations)



In [107]:

def convert_OBS_percentiles_to_SubX_format(_date):  
# for _date in init_date_list:
    # break
    # _date=init_date_list[0]
    
    ref_dir =f'{conf.gefsv12_data}/{region_name}/soilw_bgrnd' #Just use a single reference directory to serve as the template for file creation
    save_dir = f'{conf.gleam_data}/{region_name}/RZSM_percentile_reformat_testing_distribution'
    print('Test stop 1')
    os.system(f'mkdir -p {save_dir}')
    
    
    '''We are going to create new leads that are different than reforecast. The reasoning for this is that we want the actual weekly lags (and 1 day lag) and this will
    assist with future predictions within the deep learning model'''
    print('Test stop 2')
    #Grab a single SubX to use as the template. Doesn't matter if it is the same variable or not or the same date
    fcst_file = glob(f'{ref_dir}/*2000-01-05*')[0]
    fcst_file = xr.open_dataset(fcst_file).load() #(1, 11, 35, 48, 96)
    print('Test stop 3')

    if region_name == 'CONUS':
        new_X_coords = [i+360 if i < 0 else i for i in fcst_file.X.values]
        fcst_file = fcst_file.assign_coords({'X':new_X_coords})
    print('Test stop 4')
    fcst_file = putils.restrict_to_bounding_box(fcst_file,mask)
    print('Test stop 5')
    #Create a file to overwrite
    out_file = xr.zeros_like(fcst_file)

    obs_file_name = f'RZSM_percentile_reformat_{_date}.nc4'
    save_file = f'{save_dir}/{obs_file_name}'
    print('Test stop 6')
    if os.path.exists(save_file):
        pass
    else:
        print(f'Working on initialized day {_date} to find values integrating with SubX models, leads, & coordinates and saving data into {save_dir}.')

        for idx,i_lead in enumerate(fcst_file.L.values):
            # break

            date_val = pd.to_datetime(pd.to_datetime(_date) + dt.timedelta(days=int(i_lead)+0)) #Adding +1 may be suitable for other forecasts which predict the next day. But GEFSv12 predicts lead 0 as 12 UTC on the same date it is initialized
            #But be careful if you adapt this code to a new script. We are looking backwards in time from the first date.

            date_val = f'{date_val.year}-{date_val.month:02}-{date_val.day:02}'

            out_file[putils.xarray_varname(out_file)][0,:, idx, :, :] = \
                obs_file[putils.xarray_varname(obs_file)].sel(time = date_val).values

        var_OUT = xr.Dataset(
            data_vars = dict(
                RZSM_percentile = (['S','M','L','Y','X'],    out_file[list(out_file.keys())[0]].values),
            ),
            coords = dict(
                S = np.atleast_1d(_date),
                X = out_file.X.values,
                Y = out_file.Y.values,
                L = list(out_file.L.values[:]),
                M = out_file.M.values,

            ),
            attrs = dict(
                Description = f'RZSM percentiles values on the exact same date and grid \
                cell as EMC reforecast data. 7-day rolling mean already applied.'),
        )                    
        var_OUT.to_netcdf(save_file)

    return(0)

global obs_file
obs_file = xr.open_dataset(f'{conf.gleam_data}/{region_name}/anomaly_percentile_of_score_RZSM_testing_distribution.nc4').drop('day_of_year').load()



####### RUN FUNCTION #######
for _date in init_date_list:
    
    convert_OBS_percentiles_to_SubX_format(_date)



Test stop 1
Test stop 2
Test stop 3
Test stop 4
Test stop 5
Test stop 6
Working on initialized day 2018-01-03 00:00:00 to find values integrating with SubX models, leads, & coordinates and saving data into /glade/work/klesinger/FD_RZSM_deep_learning/Data/reanalysis/GLEAM/CONUS/RZSM_percentile_reformat_testing_distribution.
Test stop 1
Test stop 2
Test stop 3
Test stop 4
Test stop 5
Test stop 6
Working on initialized day 2018-01-10 00:00:00 to find values integrating with SubX models, leads, & coordinates and saving data into /glade/work/klesinger/FD_RZSM_deep_learning/Data/reanalysis/GLEAM/CONUS/RZSM_percentile_reformat_testing_distribution.
Test stop 1
Test stop 2
Test stop 3
Test stop 4
Test stop 5
Test stop 6
Working on initialized day 2018-01-17 00:00:00 to find values integrating with SubX models, leads, & coordinates and saving data into /glade/work/klesinger/FD_RZSM_deep_learning/Data/reanalysis/GLEAM/CONUS/RZSM_percentile_reformat_testing_distribution.
Test stop 1
Test stop 2
T

# Create a new RZSM reforecast set of data which has the julian dates only and not the lead times

## Create dataset from predictions for each week

In [114]:
def window_select_reforecast_by_realization(file,time,window,add_window):
    #Grab all time samples from within the window size
    #Test 
    # file = obs_RZSM_anomaly
    # time = time
    # window=45

    before_month =  pd.to_datetime(time - np.timedelta64(45+add_window,'D')).month
    before_day = pd.to_datetime(time - np.timedelta64(45+add_window,'D')).day
    before_year = pd.to_datetime(time - np.timedelta64(45+add_window,'D')).year
    
    julian_date_before =  datestdtojd (before_year, before_month, before_day)
    
    after_month =  pd.to_datetime(time + np.timedelta64(45+add_window,'D')).month
    after_day = pd.to_datetime(time + np.timedelta64(45+add_window,'D')).day
    after_year =  pd.to_datetime(time + np.timedelta64(45+add_window,'D')).year
    
    julian_date_after =  datestdtojd (after_year, after_month, after_day)
    
    #Now we have created the before and after julian dates, now subset all data
    #We are adding 15 becuase otherwise we have to load each day individually and it takes a long time, this should shorten the waiting time.
    if julian_date_before < julian_date_after:
        selected_data = file.sel(L=(file['L'] > julian_date_before) & (file['L'] < julian_date_after)).load()
    else:
        selected_data = file.sel(L=(file['L'] > julian_date_before) | (file['L'] < julian_date_after)).load()
    
    
    return(selected_data)

In [115]:
def window_select_reforecast(file,time,window):
    #Grab all time samples from within the window size
    #Test 
    # file = obs_RZSM_anomaly
    # time = time
    # window=45

    before_month =  pd.to_datetime(time - np.timedelta64(45,'D')).month
    before_day = pd.to_datetime(time - np.timedelta64(45,'D')).day
    before_year = pd.to_datetime(time - np.timedelta64(45,'D')).year
    
    julian_date_before =  datestdtojd (before_year, before_month, before_day)
    
    after_month =  pd.to_datetime(time + np.timedelta64(45,'D')).month
    after_day = pd.to_datetime(time + np.timedelta64(45,'D')).day
    after_year =  pd.to_datetime(time + np.timedelta64(45,'D')).year
    
    julian_date_after =  datestdtojd (after_year, after_month, after_day)
    
    #Now we have created the before and after julian dates, now subset all data
    #We are adding 15 becuase otherwise we have to load each day individually and it takes a long time, this should shorten the waiting time.
    if julian_date_before < julian_date_after:
        selected_data = file.sel(L=(file['L'] > julian_date_before) & (file['L'] < julian_date_after)).load()
    else:
        selected_data = file.sel(L=(file['L'] > julian_date_before) | (file['L'] < julian_date_after)).load()
    
    
    return(selected_data)

In [116]:
def return_julian_before_after(time):
    #Grab all time samples from within the window size
    #Test 
    # file = obs_RZSM_anomaly
    # time = time
    # window=45

    before_month =  pd.to_datetime(time - np.timedelta64(45,'D')).month
    before_day = pd.to_datetime(time - np.timedelta64(45,'D')).day
    before_year = pd.to_datetime(time - np.timedelta64(45,'D')).year
    
    julian_date_before =  datestdtojd (before_year, before_month, before_day)
    
    after_month =  pd.to_datetime(time + np.timedelta64(45,'D')).month
    after_day = pd.to_datetime(time + np.timedelta64(45,'D')).day
    after_year =  pd.to_datetime(time + np.timedelta64(45,'D')).year
    
    julian_date_after =  datestdtojd (after_year, after_month, after_day)
 
    
    return(julian_date_before,julian_date_after)

In [117]:
def create_final_day_list(month_day):
    final_dates = []
    for i in month_day:
        if i not in final_dates:
            final_dates.append(i)
        else:
            pass
    return(final_dates)

In [118]:
# def percentile_of_score_by_ensemble_mean(anomaly_file, julian_file, percentile_output,save_dir):
     
    
#     #Test 
#     # anomaly_file = base_reforecast_anomaly
#     # julian_file = base_reforecast_anomaly_julian
#     # save_dir = 'Data/GEFSv12_reforecast/soilw_bgrnd/percentiles_baseline'
    
#     #first find out what the unique dates in the file
#     all_dates = anomaly_file.S.values
    
#     month_day = sorted([f'{pd.to_datetime(i).month:02}-{pd.to_datetime(i).day:02}' for i in all_dates])
    
#     final_dates = []
#     for i in month_day:
#         if i not in final_dates:
#             final_dates.append(i)
#         else:
#             pass
    
#     for i in final_dates:
#         dates = i.split('-')
#         month = int(dates[0])
#         day = int(dates[1])
#         # break
#         #Now loop through each of the month_days and create the distribution
#         run_dates = anomaly_file.sel(S=(anomaly_file['S.month'] == month) & (anomaly_file['S.day'] == day ))
        
#         #Now check if files exists
#         saved_dates1 = run_dates.S.values
#         saved_dates2 = [f'RZSM_percentiles_MEM_{pd.to_datetime(i).year}-{pd.to_datetime(i).month:02}-{pd.to_datetime(i).day:02}.nc' for i in saved_dates1]
        
#         completed_or_not = []
#         for i in saved_dates2:
#             if os.path.exists(f'{save_dir}/{i}'):
#                 completed_or_not.append(True)
#             else:
#                 completed_or_not.append(False)
        
#         if len(completed_or_not) == sum(completed_or_not):
#             #All files are already completed
#             pass
#         else:
            
#             print(f'Starting percentiles on:')
#             print(saved_dates2)
#             def run_selected_window_to_save_memory():
#                 #first choose a single date for the window
#                 single_date = saved_dates1[0]

#                 #Grab the distribution
#                 selected_window = window_select_reforecast(file=julian_file, time=single_date,window=window)

#                 #Now loop through each of the actual files and create the percentile distribution
#                 for idx,date_run in enumerate(saved_dates1):
#                     # break
#                     file_run  = anomaly_file.sel(S=date_run)

#                     for X in range(anomaly_file.X.shape[0]):
#                         for Y in range(anomaly_file.Y.shape[0]):
#                             all_leads = anomaly_file.isel(X=X,Y=Y).sel(S=date_run).RZSM.values
#                             all_values = selected_window.isel(X=X,Y=Y).RZSM.values.flatten()
#                             all_values = all_values[~np.isnan(all_values)]
#                             all_values = all_values[all_values != 0]
#                             #percentile of score
#                             percentile = pos(all_values, all_leads)

#                             #Now add back to the dataset
#                             #get index
#                             index = int(np.where(anomaly_file['S'] == pd.to_datetime(date_run))[0])
#                             percentile_output.RZSM[index,:, Y, X] = percentile
                    
#                     percentile_output.sel(S=date_run).to_netcdf(f'{save_dir}/{saved_dates2[idx]}')
                
#                 del selected_window
                
#                 return(0)
            
#             #Run function
#             run_selected_window_to_save_memory()


In [157]:
def create_reforecast_with_predictions(experiment_name):
    #Load previous predictions from experiments to create a percentile distribution
    temp_cp = template.copy(deep=True)

    for idx,lead in enumerate([1,2,3,4]):
        day_num = (lead*7)-1

        test = verifications.reverse_min_max_scaling(np.load(f'predictions/{region_name}/Wk{lead}_testing/Wk{lead}_testing_{experiment_name}.npy')[-1,:,:,:,0], region_name, day_num,'GEFSv12',2019)
        test = np.reshape(test,(test.shape[0]//11,11,test.shape[1],test.shape[2]))

        #Add data to file
        temp_cp.RZSM[0:,:,idx,:,:] = test
    
    return(temp_cp)

In [120]:
def create_data_julian_dates(experiment_list):
    dim_order = ['S','M','L','Y','X']
    lead = int(list(experiment_list.keys())[0])
    day_num = (lead*7)-1
    
    print('Saving the anomaly prediction without julian dates')
    no_julian_dir = f'predictions/{region_name}/anomaly_no_julian_dates_bias_corrected_testing_distribution'
    os.system(f'mkdir -p {no_julian_dir}')
    reforecast_RZSM = create_reforecast_with_predictions(experiment_list,day_num)
    
    for date in reforecast_RZSM.S.values:
        # break
        save_no_julian_file = f'{no_julian_dir}/{experiment_name}_{pd.to_datetime(date).year}-{pd.to_datetime(date).month:02}-{pd.to_datetime(date).day:02}.nc'
        if os.path.exists(save_no_julian_file):
            pass
        else:
            reforecast_RZSM.sel(S=date).expand_dims({'S': 1}).transpose(*dim_order).to_netcdf(save_no_julian_file)

    new_save_dir = 'predictions/anomaly_experiment_RZSM_julian_dates_bias_corrected'
    os.system(f'mkdir -p {new_save_dir}')
    
    #Make the julian dates for easier processing
    for idx,date in enumerate(reforecast_RZSM.S.values):
        # break
        date_out = f'{pd.to_datetime(date).year}-{pd.to_datetime(date).month:02}-{pd.to_datetime(date).day:02}'
        out_name = f'{new_save_dir}/RZSM_{experiment_name}_{date_out}.nc'
        
        if os.path.exists(out_name):
            pass
        else:
            single_file = reforecast_RZSM.sel(S=date).expand_dims({'S': 1}).transpose(*dim_order)
            date_list = [single_file.S.values + np.timedelta64(i, 'D') for i in single_file.L.values]

            julian_dates = [datestdtojd(pd.to_datetime(i).year[0], pd.to_datetime(i).month[0], pd.to_datetime(i).day[0]) for i in date_list]
            single_file['L'] = julian_dates
            single_file.to_netcdf(out_name)
            
    return('Completed dataset setup')

In [121]:
def percentile_of_score_by_realization_GEFS(anomaly_file, julian_file, save_dir, MEM_or_by_model):
    percentile_output = template.copy(deep=True)
    os.system(f'mkdir -p {save_dir}')
    
    if MEM_or_by_model == 'MEM':
        RZSM_name = 'soilw_bgrnd_percentiles_MEM'
        anomaly_file = anomaly_file.mean(dim='M')
        julian_file = julian_file.mean(dim='M')
        percentile_output = percentile_output.mean(dim='M')
    else:
        RZSM_name = 'soilw_bgrnd_percentiles'

    #first find out what the unique dates in the file
    all_dates = anomaly_file.S.values
    
    month_day = sorted([f'{pd.to_datetime(i).month:02}-{pd.to_datetime(i).day:02}' for i in all_dates])
    final_dates = create_final_day_list(month_day)
    final_dates.reverse()
    
    #run through several days at a time because it's a slow process to keep re-loading the same file when they are only 1 julian day apart
    total_days = len(final_dates)
    day_to_grab = 15
    
    for date_index in range(0, total_days, day_to_grab):
        # date_index=0
        # break
        non_completed_dates = per.get_non_completed_days(final_dates,date_index,day_to_grab, anomaly_file, RZSM_name, save_dir)
        print(f'Non completed dates:')
        print(non_completed_dates) 
           
        #Now loop through all the non-completed days
        def run_subset(date_to_run,selected_window):
            dattt = date_to_run.split('-')
            m = int(dattt[0])
            d = int(dattt[1])
        
            run_dates_final = anomaly_file.sel(S=(anomaly_file['S.month'] == m) & (anomaly_file['S.day'] == d))

            #Create the actual names of the files
            saved_dates1 = run_dates_final.S.values
            saved_dates2 = [f'{RZSM_name}_{pd.to_datetime(i).year}-{pd.to_datetime(i).month:02}-{pd.to_datetime(i).day:02}.nc' for i in saved_dates1]

            print(f'Starting percentiles on:')
            print(saved_dates2)
            #Now get the julian dates to properly subset
            julian_date_before,julian_date_after = return_julian_before_after(saved_dates1[0])

            #grab subset of julian dates
            if julian_date_before < julian_date_after:
                julian_date_before_subset = selected_window.sel(L=(selected_window['L'] > julian_date_before) & (selected_window['L'] < julian_date_after))
            else:
                julian_date_before_subset = selected_window.sel(L=(selected_window['L'] > julian_date_before) | (selected_window['L'] < julian_date_after))

            #Now loop through each of the actual files and create the percentile distribution

            # index = int(np.where(anomaly_file['S'] == pd.to_datetime(date_run))[0])
            if MEM_or_by_model == 'model':
                '''For this, we want to include the entire distribution when comparing with each realization. Each realization
                is not considered its own model'''
                all_leads1 = anomaly_file.RZSM.values
                all_leads1.shape
                'Perform the operation over all inits and realizations for each coordinate'''
                for X in range(anomaly_file.X.shape[0]):
                    for Y in range(anomaly_file.Y.shape[0]):
                        print(f'X: {X}, Y: {Y}')
                        if mask_anom[Y,X]==0 :
                            pass
                        else:
                            all_leads = all_leads1[:,:,:,Y,X]
                            all_values = julian_date_before_subset.isel(Y=Y,X=X).RZSM.values.flatten()
                            all_values = all_values[~np.isnan(all_values)]
                            all_values = all_values[all_values != 0]
                            #percentile of score
                            percentile = pos(all_values, all_leads)
    
                            #Now add back to the dataset
                            #get index
                            
                            percentile_output.RZSM[:,:,:, Y, X] = percentile
                
                for idx,date_run in enumerate(saved_dates1):
                    percentile_output.sel(S=date_run).to_netcdf(f'{save_dir}/{saved_dates2[idx]}')

            del julian_date_before_subset

            return(0)

        def run_selected_window_to_save_memory(non_completed_dates):
            #first get the first date to only have to load all the files once
            dates_init = non_completed_dates[0].split('-')
            month_init = int(dates_init[0])
            day_init = int(dates_init[1])

            #Select the anomaly file dates just to have it to access later dates
            run_dates = anomaly_file.sel(S=(anomaly_file['S.month'] == month_init) & (anomaly_file['S.day'] == day_init))
            saved_dates1 = run_dates.S.values

            #first choose a single date for the window
            single_date = saved_dates1[0]

            #Grab the distribution (getting even days 15 after)
            selected_window = window_select_reforecast_by_realization(file=julian_file, time=single_date,window=window,add_window=day_to_grab)
            selected_window = selected_window.load()
            
            #Now loop through each of the non_completed dates
            for date_to_run in non_completed_dates:
                # break
                # date_to_run = '12-31'
                run_subset(date_to_run,selected_window)

            del selected_window
            return(0)

        #Run function
        if len(non_completed_dates) > 0 :
            run_selected_window_to_save_memory(non_completed_dates)


In [122]:
def percentileofscoreFUNC(a, score):
    # a,score = all_values, all_leads1[:,Y,X]
    """Compute the percentile rank of a score relative to a list of scores.

    A `percentileofscore` of, for example, 80% means that 80% of the
    scores in `a` are below the given score. In the case of gaps or
    ties, the exact definition depends on the optional keyword, `kind`.

    Parameters
    ----------
    a : array_like
        A 1-D array to which `score` is compared.
    score : array_like
        Scores to compute percentiles for.

    Returns
    -------
    pcos : float
        Percentile-position of score (0-100) relative to `a`.

    See Also
    --------
    numpy.percentile
    scipy.stats.scoreatpercentile, scipy.stats.rankdata

    Examples
    --------
    Three-quarters of the given values lie below a given score:

    >>> import numpy as np
    >>> from scipy import stats
    >>> stats.percentileofscore([1, 2, 3, 4], 3)
    75.0

    With multiple matches, note how the scores of the two matches, 0.6
    and 0.8 respectively, are averaged:

    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3)
    70.0

    Only 2/5 values are strictly less than 3:

    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
    40.0

    But 4/5 values are less than or equal to 3:

    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
    80.0

    The average between the weak and the strict scores is:

    >>> stats.percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
    60.0

    Score arrays (of any dimensionality) are supported:

    >>> stats.percentileofscore([1, 2, 3, 3, 4], [2, 3])
    array([40., 70.])

    The inputs can be infinite:

    >>> stats.percentileofscore([-np.inf, 0, 1, np.inf], [1, 2, np.inf])
    array([75., 75., 100.])

    If `a` is empty, then the resulting percentiles are all `nan`:

    >>> stats.percentileofscore([], [1, 2])
    array([nan, nan])
    """
    # a,score = all_values, all_leads1[:,Y,X]
    score_out = score.copy()
    
    good_index = [idx for idx,_ in enumerate(score) if ~np.isnan(_)]
    #remove missing values
    score2 = score[good_index]
    
    a = np.asarray(a)
    n = len(a)
    score2 = np.asarray(score2)
    score2 = score2[..., None]
    
    def count(x):
        return np.count_nonzero(x,-1)

        # Main computations/logic

    left = count(a < score2)
    right = count(a <= score2)
    plus1 = left < right
    perct = (left + right + plus1) * (50.0 / n)

    #Now add back to original score
    score[good_index] = perct
    return score



In [123]:
def step_1_return_info(anomaly_file,julian_file,save_dir,MEM_or_by_model):

    percentile_output = anomaly_file.copy(deep=True)
    os.system(f'mkdir -p {save_dir}')
    
    if MEM_or_by_model == 'MEM':
        RZSM_name = 'soilw_bgrnd_percentiles_MEM'
        anomaly_file = anomaly_file.mean(dim='M')
        julian_file = julian_file.mean(dim='M')
        percentile_output = percentile_output.mean(dim='M')
    else:
        RZSM_name = 'soilw_bgrnd_percentiles'

    #first find out what the unique dates in the file
    all_dates = anomaly_file.S.values
    
    month_day = sorted([f'{pd.to_datetime(i).month:02}-{pd.to_datetime(i).day:02}' for i in all_dates])
    final_dates = create_final_day_list(month_day)
    final_dates.reverse()
    
    #run through several days at a time because it's a slow process to keep re-loading the same file when they are only 1 julian day apart
    total_days = len(final_dates)

    return(total_days,final_dates, RZSM_name)

In [124]:
def step_1_return_info_UNET_experiment(anomaly_file,julian_file,save_dir,MEM_or_by_model):

    percentile_output = anomaly_file.copy(deep=True)
    os.system(f'mkdir -p {save_dir}')
    
    if MEM_or_by_model == 'MEM':
        RZSM_name = 'soilw_bgrnd_percentiles_MEM'
        anomaly_file = anomaly_file.mean(dim='M')
        percentile_output = percentile_output.mean(dim='M')
    else:
        RZSM_name = 'soilw_bgrnd_percentiles'

    #first find out what the unique dates in the file
    all_dates = anomaly_file.S.values
    
    month_day = sorted([f'{pd.to_datetime(i).month:02}-{pd.to_datetime(i).day:02}' for i in all_dates])
    final_dates = create_final_day_list(month_day)
    final_dates.reverse()
    
    #run through several days at a time because it's a slow process to keep re-loading the same file when they are only 1 julian day apart
    total_days = len(final_dates)

    return(total_days,final_dates, RZSM_name)

In [125]:
def return_selected_window_to_save_memory(non_completed_dates):
    #first get the first date to only have to load all the files once
    dates_init = non_completed_dates[0].split('-')
    month_init = int(dates_init[0])
    day_init = int(dates_init[1])

    #Select the anomaly file dates just to have it to access later dates
    run_dates = anomaly_file.sel(S=(anomaly_file['S.month'] == month_init) & (anomaly_file['S.day'] == day_init))
    saved_dates1 = run_dates.S.values

    #first choose a single date for the window
    single_date = saved_dates1[0]

    #Grab the distribution (getting even days 15 after)
    selected_window = window_select_reforecast_by_realization(file=julian_file, time=single_date,window=window,add_window=day_to_grab)
    selected_window = selected_window.load()
    return(selected_window)


In [126]:
def return_selected_window_to_save_memory_ensemble_mean(non_completed_dates, anomaly_file, julian_file):
    #first get the first date to only have to load all the files once
    dates_init = non_completed_dates[0].split('-')
    month_init = int(dates_init[0])
    day_init = int(dates_init[1])

    #Select the anomaly file dates just to have it to access later dates
    run_dates = anomaly_file.sel(S=(anomaly_file['S.month'] == month_init) & (anomaly_file['S.day'] == day_init))
    saved_dates1 = run_dates.S.values

    #first choose a single date for the window
    single_date = saved_dates1[0]

    #Grab the distribution (getting even days 15 after)
    selected_window2 = window_select_reforecast_by_realization(file=julian_file, time=single_date,window=window,add_window=day_to_grab)
    selected_window2 = selected_window2.load()
    return(selected_window2)


In [127]:
#Now loop through all the non-completed days
def return_inputs(date_to_run,selected_window,anomaly_file):
    dattt = date_to_run.split('-')
    m = int(dattt[0])
    d = int(dattt[1])

    selected_window = xr.where(selected_window==0,np.nan,selected_window)
    run_dates_final = anomaly_file.sel(S=(anomaly_file['S.month'] == m) & (anomaly_file['S.day'] == d))
    run_dates_final = xr.where(run_dates_final == 0,np.nan,run_dates_final)
    #Create the actual names of the files
    saved_dates1 = run_dates_final.S.values
    saved_dates2 = [f'{RZSM_name}_{pd.to_datetime(i).year}-{pd.to_datetime(i).month:02}-{pd.to_datetime(i).day:02}.nc' for i in saved_dates1]

    print(f'Starting percentiles on:')
    print(saved_dates2)
    #Now get the julian dates to properly subset
    julian_date_before,julian_date_after = return_julian_before_after(saved_dates1[0])

    #grab subset of julian dates
    if julian_date_before < julian_date_after:
        julian_date_before_subset = selected_window.sel(L=(selected_window['L'] > julian_date_before) & (selected_window['L'] < julian_date_after))
    else:
        julian_date_before_subset = selected_window.sel(L=(selected_window['L'] > julian_date_before) | (selected_window['L'] < julian_date_after))

    julian_date_before_subset = julian_date_before_subset.load()
    return(julian_date_before_subset,selected_window,saved_dates2,saved_dates1)

In [128]:
def stack_models(anomaly_file):
    stack = anomaly_file.stack(init_mod_lead = ['S','M','L'])
        
    all_leads1 = stack.RZSM.values
    all_leads1 = np.where(all_leads1 == 0,np.nan,all_leads1)
    all_leads1.shape #(48, 96, 57365) #This data is the anomaly files which we need to rank against the other distribution

    return(stack,all_leads1)

In [129]:
def stack_models_ensemble_mean(anom):
    stack = anom.stack(init_mod_lead = ['S','L'])
        
    all_leads1 = stack.RZSM.values
    all_leads1 = np.where(all_leads1 == 0,np.nan,all_leads1)
    all_leads1.shape #(48, 96, 57365) #This data is the anomaly files which we need to rank against the other distribution

    return(stack,all_leads1)

In [130]:

def pos_func(all_values1, all_leads1, mask_anom,out_array):
    # X,Y=10,10
    # print("all_values1 shape:", all_values1.shape)
    # print("all_leads1 shape:", all_leads1.shape)
    # print("mask_anom shape:", mask_anom.shape)
    # print("out_array shape:", out_array.shape)
    # assert all_values1.shape[3:5] == mask_anom.shape, "Dimension mismatch between all_values1 and mask_anom"
    # assert all_leads1.shape[0:2] == mask_anom.shape, "Dimension mismatch between all_leads1 and mask_anom"
    # assert out_array.shape[0:2] == mask_anom.shape, "Dimension mismatch between out_array and mask_anom"
    
    for Y in range(all_leads1.shape[0]):
        for X in range(all_leads1.shape[1]):
            if mask_anom[Y,X] == 1:
                # print(f'Working on X: {X} and Y: {Y}')
                a = all_values1[:, :, :, Y, X].flatten() #(1021097,)
                a = a.flatten()
                a = a[~np.isnan(a)]
                a = a[a != 0]
                a.shape
                
                score = all_leads1[Y, X, :] #(57365,)
                good_index = ~np.isnan(score)
                score = score[good_index]
                # print("score shape:", score.shape)
                n = a.size #(1021097,)
                score = score[..., None] #(57365, 1)
                
                def count(x):
                    return np.count_nonzero(x, axis=-1)
    
                # print('Right before division')
                # print("a:", a.shape)
                # print("score shape:", score.shape)
                div = a < score
                # print("div shape:", div.shape)
                count_ = np.count_nonzero(div,axis=-1)
                
                # perct = count() * (100.0 / n) #Strict
                # print('Right before the ranking')
                
                perct = count_ * (100.0 / n) #weak
                out_array[Y,X,good_index] = perct
                
    print("Finished processing")
    return(out_array)


In [131]:

def pos_func_ensemble_mean(all_values1, all_leads1, mask_anom,out_array):
    # X,Y=10,10
    # print("all_values1 shape:", all_values1.shape)
    # print("all_leads1 shape:", all_leads1.shape)
    # print("mask_anom shape:", mask_anom.shape)
    # print("out_array shape:", out_array.shape)
    # assert all_values1.shape[3:5] == mask_anom.shape, "Dimension mismatch between all_values1 and mask_anom"
    # assert all_leads1.shape[0:2] == mask_anom.shape, "Dimension mismatch between all_leads1 and mask_anom"
    # assert out_array.shape[0:2] == mask_anom.shape, "Dimension mismatch between out_array and mask_anom"
    
    for Y in range(all_leads1.shape[0]):
        for X in range(all_leads1.shape[1]):
            if mask_anom[Y,X] == 1:
                # print(f'Working on X: {X} and Y: {Y}')
                a = all_values1[:, :, Y, X].flatten() #(1021097,)
                a = a.flatten()
                a = a[~np.isnan(a)]
                a = a[a != 0]
                a.shape
                
                score = all_leads1[Y, X, :] #(57365,)
                good_index = ~np.isnan(score)
                score = score[good_index]
                # print("score shape:", score.shape)
                n = a.size #(1021097,)
                score = score[..., None] #(57365, 1)
                
                def count(x):
                    return np.count_nonzero(x, axis=-1)
    
                # print('Right before division')
                # print("a:", a.shape)
                # print("score shape:", score.shape)
                div = a < score
                # print("div shape:", div.shape)
                count_ = np.count_nonzero(div,axis=-1)
                
                # perct = count() * (100.0 / n) #Strict
                # print('Right before the ranking')

                try:
                    perct = count_ * (100.0 / n) #weak
                except ZeroDivisionError:
                    perct = np.nan
                    
                out_array[Y,X,good_index] = perct
                
    print("Finished processing")
    return(out_array)


In [132]:

def rank_data_func(all_values1, all_leads1, mask_anom,out_array):
    # X,Y=10,10
    # print("all_values1 shape:", all_values1.shape)
    # print("all_leads1 shape:", all_leads1.shape)
    # print("mask_anom shape:", mask_anom.shape)
    # print("out_array shape:", out_array.shape)
    # assert all_values1.shape[3:5] == mask_anom.shape, "Dimension mismatch between all_values1 and mask_anom"
    # assert all_leads1.shape[0:2] == mask_anom.shape, "Dimension mismatch between all_leads1 and mask_anom"
    # assert out_array.shape[0:2] == mask_anom.shape, "Dimension mismatch between out_array and mask_anom"
    
    for Y in range(all_leads1.shape[0]):
        for X in range(all_leads1.shape[1]):
            if mask_anom[Y,X] == 1:
                # print(f'Working on X: {X} and Y: {Y}')
                a = all_values1[:, :, :, Y, X].flatten() #(1021097,)
                a = a.flatten()
                a = a[~np.isnan(a)]
                a = a[a != 0]
                a.shape
                
                score = all_leads1[Y, X, :] #(57365,)
                good_index = ~np.isnan(score)
                score = score[good_index]
                # print("score shape:", score.shape)
                n = a.size #(1021097,)
                score = score[..., None] #(57365, 1)
                
                def count(x):
                    return np.count_nonzero(x, axis=-1)
    
                # print('Right before division')
                # print("a:", a.shape)
                # print("score shape:", score.shape)
                div = a < score
                # print("div shape:", div.shape)
                count_ = np.count_nonzero(div,axis=-1)
                
                # perct = count() * (100.0 / n) #Strict
                # print('Right before the ranking')
                
                perct = count_ * (100.0 / n) #weak
                out_array[Y,X,good_index] = perct
                
    print("Finished processing")
    return(out_array)


In [133]:
#Now loop through each of the non_completed dates
def run_multi_dates(date_to_run,anomaly_file):
    # break
    # date_to_run = '12-31'
    julian_date_before_subset,selected_window2,saved_dates2,saved_dates1 = return_inputs(date_to_run,selected_window,anomaly_file)
    stack,all_leads1 = stack_models(anomaly_file)
    all_leads1.shape #(48, 96, 57365)
    
    all_values1 = julian_date_before_subset.RZSM.values
    all_values1.shape # (1043, 11, 89, 48, 96)
    
    # percentile_arr = pos_func(all_values1, all_leads1,mask_anom)
    percentile_arr = np.empty_like(all_leads1)
    percentile_arr.shape #(48, 96, 57365)
    
    percentile_arr = pos_func(all_values1, all_leads1,mask_anom,percentile_arr)
    
    stack_cp = stack.copy(deep=True)
    stack_cp.RZSM[:,:,:] = percentile_arr
    unstack = stack_cp.unstack('init_mod_lead').transpose('S','M','L','Y','X')
    unstack.RZSM[:,0,0,10,10].values
    
    for idx,date_run in enumerate(saved_dates1):
        unstack.sel(S=date_run).to_netcdf(f'{save_dir}/{saved_dates2[idx]}')
    return(f'Completed {date_to_run}')

In [134]:
def run_multi_dates_ensemble_mean(date_to_run,anomaly_file,selected_window,save_dir):
    # break
    # date_to_run = '01-01'
    julian_date_before_subset,selected_window2,saved_dates2,saved_dates1 = return_inputs(date_to_run,selected_window,anomaly_file)
    stack,all_leads1 = stack_models_ensemble_mean(anomaly_file)
    all_leads1.shape #(48, 96, 5215)
    
    all_values1 = julian_date_before_subset.RZSM.values
    all_values1.shape # (1043, 89, 48, 96)
    
    # percentile_arr = pos_func(all_values1, all_leads1,mask_anom)
    percentile_arr = np.empty_like(all_leads1)
    percentile_arr.shape #(48, 96, 5215)
    
    percentile_arr = pos_func_ensemble_mean(all_values1, all_leads1,mask_anom,percentile_arr)
    
    stack_cp = stack.copy(deep=True)
    stack_cp.RZSM[:,:,:] = percentile_arr
    unstack = stack_cp.unstack('init_mod_lead').transpose('S','L','Y','X')
    unstack.RZSM[:,0,10,10].values
    
    for idx,date_run in enumerate(saved_dates1):
        unstack.sel(S=date_run).to_netcdf(f'{save_dir}/{saved_dates2[idx]}')
    return(f'Completed {date_to_run}')

In [172]:
def run_multi_dates_ensemble_mean_UNET_EXPERIMENT(date_to_run,anomaly_file,selected_window,save_dir):
    # break
    # date_to_run = '12-31'
    anom = anomaly_file.mean(dim='M')
    
    julian_date_before_subset,selected_window2,saved_dates2,saved_dates1 = return_inputs(date_to_run,selected_window,anomaly_file)
    stack,all_leads1 = stack_models_ensemble_mean(anom)
    all_leads1.shape #(48, 96, 5215)
    
    all_values1 = julian_date_before_subset.RZSM.values
    all_values1.shape # (1043, 89, 48, 96)
    
    # percentile_arr = pos_func(all_values1, all_leads1,mask_anom)
    percentile_arr = np.empty_like(all_leads1)
    percentile_arr.shape #(48, 96, 5215)
    
    percentile_arr = pos_func_ensemble_mean(all_values1, all_leads1,mask_anom,percentile_arr)
    
    stack_cp = stack.copy(deep=True)
    stack_cp.RZSM[:,:,:] = percentile_arr
    unstack = stack_cp.unstack('init_mod_lead').transpose('S','L','Y','X')
    unstack.RZSM[:,0,10,10].values
    
    for idx,date_run in enumerate(saved_dates1):
        unstack.sel(S=date_run).to_netcdf(f'{save_dir}/{saved_dates2[idx]}')
    return(f'Completed {date_to_run}')



# GEFSv12 ensemble mean

In [153]:

def run_GEFSv12_ensemble_mean(region_name):
    # GEFSv12
    # template
    julian_file = per.create_data_julian_dates_GEFS_testing_distribution(template,region_name).mean(dim='M')
    
    anomaly_file = gefs_bias_corrected.mean(dim='M')
    save_dir=f'{conf.gefsv12_data}/{region_name}/soilw_bgrnd/percentiles_MEM_testing_distribution'
    MEM_or_by_model='model'

    global RZSM_name
    total_days, final_dates, RZSM_name = step_1_return_info(anomaly_file,julian_file,save_dir,MEM_or_by_model)
    num_cpus_available = numba.config.NUMBA_NUM_THREADS
    print("Number of CPUs available for Numba:", num_cpus_available)
    
    
    for date_index in range(0, total_days, day_to_grab):
        # date_index=0
        # break
        non_completed_dates = per.get_non_completed_days(final_dates,date_index,day_to_grab, anomaly_file, RZSM_name, save_dir)
        print(f'Non completed dates:')
        print(non_completed_dates) 
        
        #Run function
        if len(non_completed_dates) > 0 :
            # global selected_window
            selected_window = return_selected_window_to_save_memory_ensemble_mean(non_completed_dates,anomaly_file,julian_file)                
    
            # p=Pool(n_processes)
            # p.map(run_multi_dates_ensemble_mean,non_completed_dates)
    
            for date_to_run in non_completed_dates:
                run_multi_dates_ensemble_mean(date_to_run,anomaly_file,selected_window,save_dir)



In [154]:
run_GEFSv12_ensemble_mean(region_name)


Loading the julian day anomaly dataset
Number of CPUs available for Numba: 1
Non completed dates:
['12-26', '12-25', '12-19', '12-18', '12-12', '12-11', '12-05', '12-04', '11-28', '11-27']
Starting percentiles on:
['soilw_bgrnd_percentiles_2018-12-26.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2019-12-25.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2018-12-19.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2019-12-18.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2018-12-12.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2019-12-11.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2018-12-05.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2019-12-04.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2018-11-28.nc']
Finished processing
Starting percentiles on:
['s

# UNET experiment ensemble mean create percentiles

In [173]:
anomaly_file = create_reforecast_with_predictions(experiment_name = 'EX29_regular_RZSM').load()
julian_file = per.create_data_julian_dates_UNET_experiment_testing_distribution(anomaly_file, 'EX29', region_name).mean(dim='M').load()

save_dir=f'predictions/{region_name}/UNET_percentiles_MEM_testing_distribution/EX29'
MEM_or_by_model='model'

total_days, final_dates, RZSM_name = step_1_return_info_UNET_experiment(anomaly_file,julian_file,save_dir,MEM_or_by_model)
num_cpus_available = numba.config.NUMBA_NUM_THREADS
print("Number of CPUs available for Numba:", num_cpus_available)


for date_index in range(0, total_days, day_to_grab):
    # date_index=0
    # break
    non_completed_dates = per.get_non_completed_days(final_dates,date_index,day_to_grab, anomaly_file, RZSM_name, save_dir)
    print(f'Non completed dates:')
    print(non_completed_dates) 
    
    #Run function
    if len(non_completed_dates) > 0:
        # global selected_window
        selected_window = return_selected_window_to_save_memory_ensemble_mean(non_completed_dates,anomaly_file,julian_file)                

        # p=Pool(n_processes)
        # p.map(run_multi_dates_ensemble_mean_UNET_EXPERIMENT,non_completed_dates)

        for date_to_run in non_completed_dates:
            run_multi_dates_ensemble_mean_UNET_EXPERIMENT(date_to_run, anomaly_file,selected_window,save_dir)

# stop

Loading the julian day anomaly dataset
Number of CPUs available for Numba: 1
Non completed dates:
['12-26', '12-25', '12-19', '12-18', '12-12', '12-11', '12-05', '12-04', '11-28', '11-27']
Starting percentiles on:
['soilw_bgrnd_percentiles_2018-12-26.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2019-12-25.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2018-12-19.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2019-12-18.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2018-12-12.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2019-12-11.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2018-12-05.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2019-12-04.nc']
Finished processing
Starting percentiles on:
['soilw_bgrnd_percentiles_2018-11-28.nc']
Finished processing
Starting percentiles on:
['s

# GEFSv12 indivdual forecast realization

In [None]:
# # GEFSv12
# template
# reforecast_anomaly_julian = per.create_data_julian_dates_GEFS(template)

# anomaly_file = template
# julian_file = reforecast_anomaly_julian
# save_dir=f'Data/GEFSv12_reforecast/soilw_bgrnd/percentiles'
# MEM_or_by_model='model'


# total_days, final_dates, RZSM_name = step_1_return_info(anomaly_file,julian_file,save_dir,MEM_or_by_model)
# num_cpus_available = numba.config.NUMBA_NUM_THREADS
# print("Number of CPUs available for Numba:", num_cpus_available)


# for date_index in range(0, total_days, day_to_grab):
#     # date_index=0
#     # break
#     non_completed_dates = per.get_non_completed_days(final_dates,date_index,day_to_grab, anomaly_file, RZSM_name, save_dir)
#     print(f'Non completed dates:')
#     print(non_completed_dates) 
    
#     #Run function
#     if len(non_completed_dates) > 0 :
#         global selected_window
#         selected_window = return_selected_window_to_save_memory(non_completed_dates)
        
#         p=Pool(n_processes)
#         p.map(run_multi_dates,non_completed_dates)



# ECMWF ensemble mean

In [181]:
# ECMWF
ecmwf_bias_corrected
julian_file = create_data_julian_dates_ECMWF(ecmwf_bias_corrected).mean(dim='M')

anomaly_file = ecmwf_bias_corrected.mean(dim='M')
save_dir=f'{conf.ecmwf_data}/{region_name}/soilw_bgrnd/percentiles_MEM_testing_distribution'
MEM_or_by_model='model'


total_days, final_dates, RZSM_name = step_1_return_info(anomaly_file,julian_file,save_dir,MEM_or_by_model)
num_cpus_available = numba.config.NUMBA_NUM_THREADS
print("Number of CPUs available for Numba:", num_cpus_available)

days_to_grab = n_processes #this should help with not needing so much memory

for date_index in range(0, total_days, day_to_grab):
    # date_index=0
    # break
    non_completed_dates = per.get_non_completed_days(final_dates,date_index,day_to_grab, anomaly_file, RZSM_name, save_dir)
    print(f'Non completed dates:')
    print(non_completed_dates) 
    
    #Run function
    if len(non_completed_dates) > 0 :
        # global selected_window
        selected_window = return_selected_window_to_save_memory_ensemble_mean(non_completed_dates,anomaly_file,julian_file)

        #Multi-process (split selected window in next function)
        # p=Pool(n_processes)
        # p.map(run_multi_dates_ensemble_mean,non_completed_dates)

        for date_to_run in non_completed_dates:
            run_multi_dates_ensemble_mean(date_to_run,anomaly_file,selected_window,save_dir)


Loading the ECMWF julian day anomaly dataset


NameError: name 'template_ECM' is not defined

# ECMWF individual realization

In [None]:
# # ECMWF
# template_ECM
# reforecast_anomaly_julian = create_data_julian_dates_ECMWF(template_ECM)

# anomaly_file = template_ECM
# julian_file = reforecast_anomaly_julian
# save_dir=f'Data/ECMWF/soilw_bgrnd_processed/CONUS/percentiles'
# MEM_or_by_model='model'


# total_days, final_dates, RZSM_name = step_1_return_info(anomaly_file,julian_file,save_dir,MEM_or_by_model)
# num_cpus_available = numba.config.NUMBA_NUM_THREADS
# print("Number of CPUs available for Numba:", num_cpus_available)

# days_to_grab = n_processes #this should help with not needing so much memory

# for date_index in range(0, total_days, day_to_grab):
#     # date_index=0
#     # break
#     non_completed_dates = per.get_non_completed_days(final_dates,date_index,day_to_grab, anomaly_file, RZSM_name, save_dir)
#     print(f'Non completed dates:')
#     print(non_completed_dates) 
    
#     #Run function
#     if len(non_completed_dates) > 0 :
#         global selected_window
#         selected_window = return_selected_window_to_save_memory_ensemble_mean(non_completed_dates)

#         #Multi-process (split selected window in next function)
#         p=Pool(n_processes)
#         p.map(run_multi_dates,non_completed_dates)


# Testing code beneath this block

In [None]:
#Create the UNET prediction dataset to do other functions
# experiment_list = ['EX10_RZSM','EX20_RZSM','EX24_regular_RZSM','EX20_RZSM','EX20_RZSM','EX26_RZSM'] #Best model outputs (multi-week)

#Set up experiment list as week lead: experiment name
experiment_list = {'3':'EX24_regular_RZSM'} #Best model outputs (single_week)


create_data_julian_dates(experiment_name=experiment_name)

stop

#Load the new experiment anomalies that have been reformatted
prediction_reforecast_anomaly = xr.open_mfdataset(f'predictions/no_julian_dates_bias_corrected/{experiment_name}*.nc').load()
reforecast_anomaly_julian = xr.open_mfdataset(f'predictions/experiment_RZSM_julian_dates_bias_corrected/RZSM_{experiment_name}*.nc',combine='nested',concat_dim=['S'])
percentile_output = prediction_reforecast_anomaly.copy(deep=True)

In [None]:
#Create model realization percentiles 1.) For each individual model
experiment_name = 'EX29'
percentile_of_score_by_realization(anomaly_file=prediction_reforecast_anomaly, julian_file=reforecast_anomaly_julian,
                                   percentile_output=percentile_output,save_dir=f'predictions/UNET/{region_name}/percentiles/{experiment_name}',MEM_or_by_model='model')



In [None]:
#Create model ensemble mean percentiles (MEM)
percentile_of_score_by_realization(anomaly_file=prediction_reforecast_anomaly, julian_file=reforecast_anomaly_julian,
                                   percentile_output=percentile_output,save_dir=f'predictions/UNET/percentiles/{experiment_name}',MEM_or_by_model='MEM')

In [None]:
def percentile_of_score_by_realization(anomaly_file, julian_file, percentile_output,save_dir,MEM_or_by_model):
     
    os.system(f'mkdir -p {save_dir}')
    
    if MEM_or_by_model == 'MEM':
        RZSM_name = 'RZSM_percentiles_MEM'
        anomaly_file = anomaly_file.mean(dim='M')
        julian_file = julian_file.mean(dim='M')
        percentile_output = percentile_output.mean(dim='M')
    else:
        RZSM_name = 'RZSM_percentiles'
    

    #Test 
    # anomaly_file = base_reforecast_anomaly
    # julian_file = base_reforecast_anomaly_julian
    # save_dir = 'Data/GEFSv12_reforecast/soilw_bgrnd/percentiles_baseline'
    
    #first find out what the unique dates in the file
    all_dates = anomaly_file.S.values
    
    month_day = sorted([f'{pd.to_datetime(i).month:02}-{pd.to_datetime(i).day:02}' for i in all_dates])
    final_dates = create_final_day_list(month_day)
    final_dates.reverse()
    
    #run through several days at a time because it's a slow process to keep re-loading the same file when they are only 1 julian day apart
    total_days = len(final_dates)
    day_to_grab = 15
    
    for date_index in range(0, total_days, day_to_grab):
        # break
        selected_days = final_dates[date_index:date_index + day_to_grab]
        dates_init = selected_days[0].split('-')
        month_init = int(dates_init[0])
        day_init = int(dates_init[1])
        
        non_completed_dates = []

        
        for j in selected_days:
            dates =j.split('-')
            month = int(dates[0])
            day = int(dates[1])
            # break
            
            #Now loop through each of the month_days and create the distribution
            run_dates = anomaly_file.sel(S=(anomaly_file['S.month'] == month) & (anomaly_file['S.day'] == day ))

            #Now check if files exists (don't run otherwise)
            saved_dates1 = run_dates.S.values
            saved_dates2 = [f'{RZSM_name}_{pd.to_datetime(i).year}-{pd.to_datetime(i).month:02}-{pd.to_datetime(i).day:02}.nc' for i in saved_dates1]

            completed_or_not = []
            for i in saved_dates2:
                if os.path.exists(f'{save_dir}/{i}'):
                    completed_or_not.append(True)
                else:
                    completed_or_not.append(False)
        
            if len(completed_or_not) == sum(completed_or_not):
                #All files are already completed
                pass
            else:
                non_completed_dates.append(j)
                
        print(f'Non completed dates:')
        print(non_completed_dates) 
           
        #Now loop through all the non-completed days
        def run_subset_of_selected_window(date_to_run,selected_window):
            # break
            dattt = date_to_run.split('-')
            m = int(dattt[0])
            d = int(dattt[1])

            run_dates_final = anomaly_file.sel(S=(anomaly_file['S.month'] == m) & (anomaly_file['S.day'] == d))

            #Create the actual names of the files
            saved_dates1 = run_dates_final.S.values
            saved_dates2 = [f'{RZSM_name}_{pd.to_datetime(i).year}-{pd.to_datetime(i).month:02}-{pd.to_datetime(i).day:02}.nc' for i in saved_dates1]

            print(f'Starting percentiles on:')
            print(saved_dates2)
            #Now get the julian dates to properly subset
            julian_date_before,julian_date_after = return_julian_before_after(saved_dates1[0])

            #grab subset of julian dates
            if julian_date_before < julian_date_after:
                julian_date_before_subset = selected_window.sel(L=(selected_window['L'] > julian_date_before) & (selected_window['L'] < julian_date_after))
            else:
                julian_date_before_subset = selected_window.sel(L=(selected_window['L'] > julian_date_before) | (selected_window['L'] < julian_date_after))

            #Now loop through each of the actual files and create the percentile distribution

            if MEM_or_by_model == 'model':

                for idx,date_run in enumerate(saved_dates1):
                    # break
                    file_run  = anomaly_file.sel(S=date_run)

                    for X in range(anomaly_file.X.shape[0]):
                        for Y in range(anomaly_file.Y.shape[0]):
                            for model in range(anomaly_file.M.shape[0]):

                                all_leads = anomaly_file.isel(X=X,Y=Y,M=model).sel(S=date_run).RZSM.values
                                all_values = julian_date_before_subset.isel(X=X,Y=Y,M=model).RZSM.values.flatten()
                                all_values = all_values[~np.isnan(all_values)]
                                all_values = all_values[all_values != 0]
                                #percentile of score
                                percentile = pos(all_values, all_leads)

                                #Now add back to the dataset
                                #get index
                                index = int(np.where(anomaly_file['S'] == pd.to_datetime(date_run))[0])
                                percentile_output.RZSM[index,model,:, Y, X] = percentile

                    percentile_output.sel(S=date_run).to_netcdf(f'{save_dir}/{saved_dates2[idx]}')

            elif MEM_or_by_model == 'MEM':
                for idx,date_run in enumerate(saved_dates1):
                    # break
                    file_run  = anomaly_file.sel(S=date_run)

                    for X in range(anomaly_file.X.shape[0]):
                        for Y in range(anomaly_file.Y.shape[0]):

                                all_leads = anomaly_file.isel(X=X,Y=Y).sel(S=date_run).RZSM.values
                                all_values = julian_date_before_subset.isel(X=X,Y=Y).RZSM.values.flatten()
                                all_values = all_values[~np.isnan(all_values)]
                                all_values = all_values[all_values != 0]
                                #percentile of score
                                percentile = pos(all_values, all_leads)

                                #Now add back to the dataset
                                #get index
                                index = int(np.where(anomaly_file['S'] == pd.to_datetime(date_run))[0])
                                percentile_output.RZSM[index,:, Y, X] = percentile

                    percentile_output.sel(S=date_run).to_netcdf(f'{save_dir}/{saved_dates2[idx]}')

            del julian_date_before_subset

            return(0)

        def run_selected_window_to_save_memory(non_completed_dates):
            #first get the first date to only have to load all the files once
            dates_init = non_completed_dates[0].split('-')
            month_init = int(dates_init[0])
            day_init = int(dates_init[1])

            #Select the anomaly file dates just to have it to access later dates
            run_dates = anomaly_file.sel(S=(anomaly_file['S.month'] == month_init) & (anomaly_file['S.day'] == day_init))
            saved_dates1 = run_dates.S.values

            #first choose a single date for the window
            single_date = saved_dates1[0]

            #Grab the distribution (getting even days 15 after)
            selected_window = window_select_reforecast_by_realization(file=julian_file, time=single_date,window=window,add_window=day_to_grab)

            #Now loop through each of the non_completed dates
            for date_to_run in non_completed_dates:
                run_subset_of_selected_window(date_to_run,selected_window)

            del selected_window
            return(0)

        #Run function
        if len(non_completed_dates) > 0 :
            run_selected_window_to_save_memory(non_completed_dates)


In [None]:

# @njit(parallel=True)
# def pos_func_numba(all_values1, all_leads1, mask_anom,out_array):
#     # X,Y=10,10
#     print("all_values1 shape:", all_values1.shape)
#     print("all_leads1 shape:", all_leads1.shape)
#     print("mask_anom shape:", mask_anom.shape)
#     print("out_array shape:", out_array.shape)
#     assert all_values1.shape[3:5] == mask_anom.shape, "Dimension mismatch between all_values1 and mask_anom"
#     assert all_leads1.shape[0:2] == mask_anom.shape, "Dimension mismatch between all_leads1 and mask_anom"
#     assert out_array.shape[0:2] == mask_anom.shape, "Dimension mismatch between out_array and mask_anom"
    
#     for Y in prange(all_leads1.shape[0]):
#         for X in prange(all_leads1.shape[1]):
#             if mask_anom[Y,X] == 1:
#                 print(f'Working on X: {X} and Y: {Y}')
#                 a = all_values1[:, :, :, Y, X].flatten() #(1021097,)
#                 a = a.flatten()
#                 a = a[~np.isnan(a)]
#                 a = a[a != 0]
#                 a.shape
                
#                 score = all_leads1[Y, X, :] #(57365,)
#                 good_index = ~np.isnan(score)
#                 score = score[good_index]
#                 print("score shape:", score.shape)
#                 n = a.size #(1021097,)
#                 score = score[..., None] #(57365, 1)
                
#                 def count(x):
#                     return np.count_nonzero(x, axis=-1)
    
#                 print('Right before division')
#                 print("a:", a.shape)
#                 print("score shape:", score.shape)
#                 div = a < score
#                 print("div shape:", div.shape)
#                 count_ = np.count_nonzero(div,axis=-1)
                
#                 # perct = count() * (100.0 / n) #Strict
#                 print('Right before the ranking')
                
#                 perct = count_ * (100.0 / n) #weak
#                 out_array[Y,X,good_index] = perct
#     print("Finished processing")
#     return(out_array)
    
# out_array = np.empty_like(all_leads1)
# out_array.shape #(48, 96, 57365)
# o_array = pos_func_numba(all_values1, all_leads1,mask_anom,out_array)


# np.nanmax(o_array)
# o_array[Y, X, :]

#             stack_cp = stack.copy(deep=True)
#             stack_cp.RZSM[:,:,:] = percentile_arr
#             unstack = stack_cp.unstack('init_mod_lead').transpose('S','M','L','Y','X')
#             unstack.RZSM[:,0,0,10,10].values

# # stack_cp = stack.copy(deep=True)
# # stack_cp.RZSM[:,:,:] = percentile_arr

# # unstack = stack_cp.unstack('init_mod_lead').transpose('S','M','L','Y','X')
# # unstack.RZSM[:,0,0,10,10].values

In [None]:
d