In [1]:

    
import xarray as xr
import os
import sys
from pathlib import Path
import xarray as xr
from glob import glob
import numpy as np
import pandas as pd
import datetime as dt


# Baseline anomalies are computed for RZSM only. The baseline anomaly file is saved separately.

In [2]:
# Get the current working directory
current_dir = os.getcwd()

# Navigate two directories up (where the other functions are located)
two_dirs_up = os.path.abspath(os.path.join(current_dir, '..', '..'))

# Make the project code importable. Each directory is appended only if it is
# not already on sys.path, so re-running this cell does not keep adding
# duplicate entries.
for _code_dir in (
    two_dirs_up,
    '/glade/work/klesinger/FD_RZSM_deep_learning',  # this is where the scripts are
):
    if _code_dir not in sys.path:
        sys.path.append(_code_dir)

In [3]:
# Load IPython's autoreload extension so edits to imported modules are picked
# up without restarting the kernel. Comments are kept on their own lines
# because a line magic treats the rest of its line as its argument.
%load_ext autoreload 
# Mode 2: reload all modules before executing each cell.
%autoreload 2
# NOTE(review): %reload_ext directly after %load_ext looks redundant - confirm before removing.
%reload_ext autoreload

# Project-local helpers; presumably resolved through the sys.path entries
# appended earlier in the notebook.
from function import preprocessUtils as putils
# NOTE(review): `masks` is not referenced in the visible cells - confirm it is still needed.
from function import masks

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def return_file_list(variable,region_name):
    """Return the sorted ECMWF reforecast netCDF paths for one variable and region.

    Parameters
    ----------
    variable : str
        Variable name. 'soilw_bgrnd' has its own sub-directory; every other
        variable is looked up by filename prefix inside 'temp_pwat_dewpoint'.
    region_name : str
        Name of the region sub-directory under the ECMWF reforecast folder.

    Returns
    -------
    list of str
        Matching file paths in ascending (lexicographic) order; empty when
        nothing matches.
    """
    region_root = f'/glade/work/klesinger/FD_RZSM_deep_learning/Data/reforecast/ECMWF/{region_name}'

    # Pick the glob pattern first, then do a single lookup.
    pattern = (
        f'{region_root}/soilw_bgrnd/*.nc'
        if variable == 'soilw_bgrnd'
        else f'{region_root}/temp_pwat_dewpoint/{variable}*.nc'
    )

    matches = glob(pattern)
    matches.sort()
    return matches

In [5]:
def open_file_and_move_data_by_lag(file_,num_days_lag, dt, date,variable):
    """Write a stand-in file for a missing initialization date from an earlier file.

    The data in ``file_`` are shifted by ``-num_days_lag`` positions along the
    ``L`` dimension, the ``S`` coordinate is replaced with ``dt``, and the
    result is written as ``{variable}_{date}.<ext>`` in the same directory as
    ``file_``.

    The output directory is taken from ``file_`` itself instead of from a
    notebook-global ``region_name``. For paths produced by ``return_file_list``
    this is the same directory, and the function no longer fails with a
    NameError when that global is not defined.

    Parameters
    ----------
    file_ : str
        Path of the existing netCDF file to copy from.
    num_days_lag : int
        Number of positions to shift the data toward lower ``L``. Positions
        vacated at the end of ``L`` are filled with xarray's default fill value
        (NaN).
    dt : datetime-like
        New value for the ``S`` coordinate. Note that inside this function the
        name hides the module-level ``datetime as dt`` import.
    date : str
        Date string used in the output file name.
    variable : str
        Variable name; used as the file-name prefix.

    Returns
    -------
    int
        Always 0.
    """
    source_dir, source_name = os.path.split(file_)

    # Reuse the source file's extension (the text after the last '.').
    extension = source_name.split(f'{variable}_')[-1].split('.')[-1]
    out_path = os.path.join(source_dir, f'{variable}_{date}.{extension}')

    # The context manager closes the source file handle even if writing fails.
    with xr.open_dataset(file_) as source:
        # Shift everything back within the leads by num_days_lag.
        shifted = source.shift(L=-num_days_lag)

        # Stamp the data with the initialization date that is being filled in.
        shifted['S'] = np.atleast_1d(dt)

        shifted.to_netcdf(out_path)

    return 0

In [6]:
def find_previous_date_from_GEFS_list(dt, ecmwf_datelist,ecmwf_files):
    """Find the available file whose date most closely precedes ``dt``.

    Despite the name, the candidates searched are the ECMWF dates/files that
    are passed in.

    Parameters
    ----------
    dt : datetime-like
        Target date (a missing initialization date).
    ecmwf_datelist : sequence of datetime-like
        Dates of the available files, index-aligned with ``ecmwf_files``.
        The list does not need to be sorted.
    ecmwf_files : sequence of str
        File paths, index-aligned with ``ecmwf_datelist``.

    Returns
    -------
    tuple
        ``(file_, num_days_lag)``: the chosen path and the whole number of
        days between its date and ``dt``.

    Raises
    ------
    IndexError
        If no date in ``ecmwf_datelist`` is at least one full day before ``dt``.
    """
    # Keep only dates at least one whole day before the target.
    earlier = [
        (idx, (dt - candidate).days)
        for idx, candidate in enumerate(ecmwf_datelist)
        if (dt - candidate).days > 0
    ]

    if not earlier:
        raise IndexError(
            f'No ECMWF file is dated at least one day before {dt}; cannot back-fill this date.'
        )

    # Choose the smallest lag explicitly rather than relying on list order.
    # Ties go to the highest index, which matches the previous "last element"
    # behaviour for an ascending list.
    idx, num_days_lag = min(earlier, key=lambda pair: (pair[1], -pair[0]))

    return (ecmwf_files[idx], num_days_lag)

In [7]:
def fix_any_missing_any_files(variable,region_name,mask,return_dataset,init_dates=None):
    """Ensure an ECMWF file exists for every GEFSv12 initialization date.

    Any initialization date without an ECMWF file is back-filled: the closest
    earlier ECMWF file is located, its data are shifted by the number of days
    between the two dates, and the result is saved under the missing date.

    Parameters
    ----------
    variable : str
        Variable name; also the file-name prefix used to parse dates.
    region_name : str
        Region whose ECMWF files are checked.
    mask : object
        Not used by this function; kept so existing callers keep working.
    return_dataset : bool
        When truthy, the matching files are opened with ``xr.open_mfdataset``
        and returned.
    init_dates : sequence of str, optional
        Initialization dates that must be present. Defaults to the notebook
        global ``init_dates_gefs`` so existing calls behave as before.

    Returns
    -------
    xarray.Dataset or int
        The combined dataset when ``return_dataset`` is truthy, otherwise 0.

    Raises
    ------
    AssertionError
        If the file count still differs from the number of initialization
        dates after back-filling.
    """
    if init_dates is None:
        # Fall back to the notebook-level list created by the driver cell.
        init_dates = init_dates_gefs

    def _date_key(path):
        # '<dir>/<variable>_<date>.nc' -> '<date>'
        return path.split(f'{variable}_')[-1].split('.nc')[0]

    # Set lookups replace the repeated scans of the full date list.
    wanted_dates = set(init_dates)

    ecmwf_files = return_file_list(variable,region_name)
    final_ecmwf_list = [path for path in ecmwf_files if _date_key(path) in wanted_dates]
    ecmwf_datelist = [pd.to_datetime(_date_key(path)) for path in ecmwf_files]

    print(f'Number of files in ECMWF list is {len(final_ecmwf_list)}. Number of files in GEFSv12 is {len(init_dates)}')

    if len(final_ecmwf_list) != len(init_dates):
        print(f'There are not the same number of dates between ECMWF and GEFSv12.Number of files in ECMWF list is {len(final_ecmwf_list)}. Number of files in GEFSv12 is {len(init_dates)}')

        available_dates = {_date_key(path) for path in final_ecmwf_list}
        missing_dates_ecmwf = [date for date in init_dates if date not in available_dates]

        for date in missing_dates_ecmwf:
            # Named `missing_dt` so the module-level `dt` (datetime) alias is not shadowed.
            missing_dt = pd.to_datetime(date)
            file_, num_days_lag = find_previous_date_from_GEFS_list(missing_dt, ecmwf_datelist, ecmwf_files)

            # Create the stand-in file by shifting the earlier forecast by the lag.
            open_file_and_move_data_by_lag(file_, num_days_lag, missing_dt, date, variable)

        # Re-scan the directory now that the stand-in files have been written.
        ecmwf_files = return_file_list(variable,region_name)
        final_ecmwf_list = [path for path in ecmwf_files if _date_key(path) in wanted_dates]
        assert len(final_ecmwf_list) == len(init_dates), f'There are not the same number of dates between ECMWF and GEFSv12.Number of files in ECMWF list is {len(final_ecmwf_list)}. Number of files in GEFSv12 is {len(init_dates)}'

    # Single exit point for both the "already complete" and "back-filled" cases.
    if return_dataset:
        return xr.open_mfdataset(final_ecmwf_list)
    return 0

In [10]:
def run_function_by_variable_and_region(variable,region_name,mask):
    """Build and save the baseline ECMWF anomaly for one variable and region.

    Steps: make sure every required initialization date has a file, open all
    files as one dataset, restrict CONUS data to the mask's bounding box,
    compute seasonal anomalies, average them over a trailing window along
    ``L``, and save the selected leads via
    ``putils.save_baseline_ECMWF_RZSM_anomaly``.

    Reads the notebook-global ``train_end`` and passes it to
    ``putils.create_seasonal_anomaly``.

    Parameters
    ----------
    variable : str
        Variable to process.
    region_name : str
        Region to process; only 'CONUS' is subset to the mask.
    mask : xarray object
        Mask whose ``X``/``Y`` coordinate values define the bounding box.

    Returns
    -------
    int
        Always 0.
    """
    # Trailing window length along L, and the L labels that are saved.
    # With min_periods equal to the window, the first windows are NaN until the
    # window is full (position 6) - presumably L is a daily lead index starting
    # at 0, so the saved leads are weekly means; TODO confirm.
    rolling_window = 7
    saved_leads = [6,13,20,27,34]

    ecm = fix_any_missing_any_files(variable,region_name,mask,True)

    # Just rename the coordinates
    if region_name == 'CONUS':
        new_lon_values = putils.create_new_X_coord_values(ecm)
        ecm_subset = ecm.assign_coords({'X':new_lon_values})
        ecm_subset = putils.restrict_to_bounding_box(ecm_subset, mask)
        if len(ecm_subset.Y.values) == 0:
            # The bounding-box helper selected no rows, so slice Y with the
            # bounds flipped. The dataset is rebuilt from `ecm` so the slice
            # starts from the untouched data.
            ecm_subset = ecm.assign_coords({'X':new_lon_values})
            ecm_subset = ecm_subset.sel(Y=slice(mask.Y.values[-1],mask.Y.values[0])).sel(X=slice(mask.X.values[0],mask.X.values[-1]))
    else:
        ecm_subset = ecm

    # Now construct anomalies; the returned climatology is not needed here.
    ecm_anom, _climatology = putils.create_seasonal_anomaly(ecm_subset, train_end)
    ecm_anom = ecm_anom.rolling(L=rolling_window, min_periods=rolling_window,center=False).mean().load()

    putils.save_baseline_ECMWF_RZSM_anomaly(ecm_anom.sel(L=saved_leads), region_name)

    return 0

# Now we have the same dates of files ready for pre-processing

In [24]:

# Settings shared by every region and variable. They are set once, before the
# loops, because neither depends on the loop variables. The helper functions
# read `train_end` and `init_dates_gefs` from the notebook namespace, so the
# names must stay as they are.
train_end = 2015

# Get the init dates from GEFSv12 (the CONUS files are used as the baseline for every region).
init_dates_gefs = putils.get_init_date_list('/glade/work/klesinger/FD_RZSM_deep_learning/Data/reforecast/GEFSv12/CONUS/soilw_bgrnd')

for region_name in ['australia', 'CONUS', 'china']:
    mask = putils.return_proper_mask_for_bounding(region_name)

    for variable in ['soilw_bgrnd','t2m','d2m','tcw']:
        if variable == 'soilw_bgrnd':
            # fix_any_missing_any_files is already called inside run_function_by_variable_and_region.
            # Only soilw_bgrnd is saved as a baseline anomaly because it is the output that gets verified.
            run_function_by_variable_and_region(variable,region_name,mask)
        else:
            # Other variables only need their missing dates back-filled.
            fix_any_missing_any_files(variable,region_name,mask,False)



Latitude values for mask is [50.  49.5 49.  48.5 48.  47.5 47.  46.5 46.  45.5 45.  44.5 44.  43.5
 43.  42.5 42.  41.5 41.  40.5 40.  39.5 39.  38.5 38.  37.5 37.  36.5
 36.  35.5 35.  34.5 34.  33.5 33.  32.5 32.  31.5 31.  30.5 30.  29.5
 29.  28.5 28.  27.5 27.  26.5]

Longitude values for mask is [238.  238.5 239.  239.5 240.  240.5 241.  241.5 242.  242.5 243.  243.5
 244.  244.5 245.  245.5 246.  246.5 247.  247.5 248.  248.5 249.  249.5
 250.  250.5 251.  251.5 252.  252.5 253.  253.5 254.  254.5 255.  255.5
 256.  256.5 257.  257.5 258.  258.5 259.  259.5 260.  260.5 261.  261.5
 262.  262.5 263.  263.5 264.  264.5 265.  265.5 266.  266.5 267.  267.5
 268.  268.5 269.  269.5 270.  270.5 271.  271.5 272.  272.5 273.  273.5
 274.  274.5 275.  275.5 276.  276.5 277.  277.5 278.  278.5 279.  279.5
 280.  280.5 281.  281.5 282.  282.5 283.  283.5 284.  284.5 285.  285.5]
Number of files in ECMWF list is 1042. Number of files in GEFSv12 is 1043
There are not the same number of date

In [None]:


# gef = xr.open_mfdataset(gefs_op_files)

In [None]:
# # Checking whether there are bad files
# for file in final_ecmwf_list:
#     a = xr.open_dataset(file)
#     print(a.S.values)
#     # print(f'{a.S.shape[0]}-{a.M.shape[0]}-{a.L.shape[0]}-{a.Y.shape[0]}-{a.X.shape[0]}')


In [None]:
#Find the dates that do exist
