In [1]:
%load_ext autoreload 
%autoreload 2
%reload_ext autoreload
    
import xarray as xr
import os
import sys
from pathlib import Path
import xarray as xr
from glob import glob
import numpy as np
import pandas as pd
import datetime as dt


# Only do this for RZSM. We are saving the baseline anomaly file separately.

In [2]:
# Directory the notebook kernel is currently running in.
current_dir = os.getcwd()

# Resolve the directory two levels above it (where the other functions are located).
two_dirs_up = os.path.abspath(
    os.path.join(current_dir, os.pardir, os.pardir)
)

# Make the modules in that directory importable from this notebook.
sys.path.append(two_dirs_up)

In [3]:
import preprocessUtils as putils
import masks

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def return_file_list(variable,region_name):
    """Return the sorted paths of the processed netCDF files for one variable and region.

    Parameters
    ----------
    variable : str
        Variable name. 'soilw_bgrnd' is looked up in its own directory; any
        other name is used as a file-name prefix in the shared
        'temp_pwat_dewpoint_processed' directory.
    region_name : str
        Name of the region sub-directory to search.

    Returns
    -------
    list of str
        Matching paths (relative to the current working directory), sorted.
    """
    if variable == 'soilw_bgrnd':
        # Soil moisture has a dedicated directory, so every .nc file in it matches.
        pattern = f'soilw_bgrnd_processed/{region_name}/*.nc'
    else:
        pattern = f'temp_pwat_dewpoint_processed/{region_name}/{variable}*.nc'

    matches = glob(pattern)
    matches.sort()
    return matches

In [5]:
def open_file_and_move_data_by_lag(file_,num_days_lag, dt, date,variable):
    """Create a stand-in file for a missing init date from an earlier forecast file.

    The data variables of `file_` are shifted `num_days_lag` positions toward
    the start of the lead dimension `L` (positions vacated at the end become
    NaN, per xarray's `Dataset.shift`), the init coordinate `S` is replaced by
    `dt`, and the result is written next to the source file as
    '<variable>_<date><ext>'.

    Parameters
    ----------
    file_ : str
        Path of the existing netCDF file to copy from.
    num_days_lag : int
        Number of positions to shift along `L`.
    dt : datetime-like
        New value for the `S` coordinate. (Inside this function the name
        shadows the notebook's `datetime as dt` alias.)
    date : str
        Date string used in the output file name.
    variable : str
        Variable prefix used in the output file name.

    Returns
    -------
    int
        Always 0.
    """
    # Write into the same directory as the source file and keep its extension,
    # whatever the depth or absoluteness of the incoming path.
    out_dir = os.path.dirname(file_)
    extension = os.path.splitext(file_)[1]
    out_path = os.path.join(out_dir, f'{variable}_{date}{extension}')

    # The context manager closes the source file handle once the copy is written.
    with xr.open_dataset(file_) as source:
        # Shift everything back within the leads by num_days_lag.
        shifted = source.shift(L=-num_days_lag)

        # Stamp the shifted data with the new init date.
        shifted['S'] = np.atleast_1d(dt)

        shifted.to_netcdf(out_path)

    return 0

In [6]:
def find_previous_date_from_GEFS_list(dt, ecmwf_datelist,ecmwf_files):
    """Find the file whose date is the closest one before `dt`.

    Parameters
    ----------
    dt : datetime-like
        Target date.
    ecmwf_datelist : sequence of datetime-like
        Date of each entry in `ecmwf_files` (same order, same length).
    ecmwf_files : sequence of str
        File paths corresponding to `ecmwf_datelist`.

    Returns
    -------
    tuple
        `(file_, num_days_lag)`: the chosen path and how many whole days it
        precedes `dt` by.

    Raises
    ------
    IndexError
        If no date in `ecmwf_datelist` is at least one whole day before `dt`.
    """
    # (index, lag in whole days) for every date at least one full day before dt.
    earlier = [
        (idx, (dt - candidate).days)
        for idx, candidate in enumerate(ecmwf_datelist)
        if (dt - candidate).days > 0
    ]
    if not earlier:
        raise IndexError(f'No file is dated at least one day before {dt}.')

    # Pick the smallest lag explicitly instead of relying on the list being
    # sorted; on a tie prefer the later index, as taking the last entry did.
    idx, num_days_lag = min(earlier, key=lambda pair: (pair[1], -pair[0]))

    return (ecmwf_files[idx], num_days_lag)

In [18]:
def fix_any_missing_any_files(variable,region_name,mask,return_dataset):
    """Back-fill ECMWF files so every GEFSv12 init date has a matching file.

    For each date in the global `init_dates_gefs` that has no ECMWF file, the
    closest earlier ECMWF file is shifted along its lead dimension and saved
    under the missing date (see `open_file_and_move_data_by_lag`). The file
    list is then rebuilt and checked against `init_dates_gefs`.

    Parameters
    ----------
    variable : str
        Variable name used as the file-name prefix ('<variable>_<date>.nc').
    region_name : str
        Region sub-directory to look in.
    mask : object
        Not used by this function; kept so existing calls keep working.
    return_dataset : bool
        If true, return the matching files opened as one dataset.

    Returns
    -------
    xarray.Dataset or int
        `xr.open_mfdataset` over the matching files when `return_dataset` is
        true, otherwise 0.

    Raises
    ------
    AssertionError
        If the number of matching files still differs from the number of
        GEFSv12 init dates after back-filling.

    Notes
    -----
    Reads the notebook-level global `init_dates_gefs`, which must be defined
    before this function is called.
    """
    def _date_string(path):
        # File names look like '<variable>_<date>.nc'; pull out the <date> part.
        return path.split(f'{variable}_')[-1].split('.nc')[0]

    # Set lookup instead of scanning the init-date list once per file.
    wanted_dates = set(init_dates_gefs)

    ecmwf_files = return_file_list(variable,region_name)
    final_ecmwf_list = [i for i in ecmwf_files if _date_string(i) in wanted_dates]

    print(f'Number of files in ECMWF list is {len(final_ecmwf_list)}. Number of files in GEFSv12 is {len(init_dates_gefs)}')

    if len(final_ecmwf_list) != len(init_dates_gefs):
        print(f'There are not the same number of dates between ECMWF and GEFSv12.Number of files in ECMWF list is {len(final_ecmwf_list)}. Number of files in GEFSv12 is {len(init_dates_gefs)}')

        # Dates of every available ECMWF file, in the same order as ecmwf_files.
        ecmwf_datelist = [pd.to_datetime(_date_string(i)) for i in ecmwf_files]

        avail_ecmwf = {_date_string(i) for i in final_ecmwf_list}
        missing_dates_ecmwf = [i for i in init_dates_gefs if i not in avail_ecmwf]

        for date in missing_dates_ecmwf:
            # Named init_time so the notebook's `datetime as dt` alias is not shadowed.
            init_time = pd.to_datetime(date)
            file_, num_days_lag = find_previous_date_from_GEFS_list(init_time, ecmwf_datelist, ecmwf_files)

            # Write a stand-in file for the missing date from the earlier forecast.
            open_file_and_move_data_by_lag(file_, num_days_lag, init_time, date, variable)

        # Rebuild the list now that the stand-in files exist and verify it is complete.
        ecmwf_files = return_file_list(variable,region_name)
        final_ecmwf_list = [i for i in ecmwf_files if _date_string(i) in wanted_dates]
        assert len(final_ecmwf_list) == len(init_dates_gefs), f'There are not the same number of dates between ECMWF and GEFSv12.Number of files in ECMWF list is {len(final_ecmwf_list)}. Number of files in GEFSv12 is {len(init_dates_gefs)}'

    if return_dataset:
        return xr.open_mfdataset(final_ecmwf_list)
    return 0

In [8]:
def run_function_by_variable_and_region(variable,region_name,mask):
    """Build and save the 7-step rolling-mean seasonal anomaly for one variable and region.

    Steps: make sure every GEFSv12 init date has an ECMWF file and open them
    all, restrict CONUS data to the mask's bounding box, compute seasonal
    anomalies with `putils.create_seasonal_anomaly`, average them over a
    7-step window along `L`, and save leads 6, 13, 20, 27 and 34 with
    `putils.save_baseline_ECMWF_RZSM_anomaly`.

    Parameters
    ----------
    variable : str
        Variable name passed to `fix_any_missing_any_files`.
    region_name : str
        Region name; 'CONUS' gets the longitude conversion and bounding box.
    mask : object
        Mask passed to `putils.restrict_to_bounding_box` for CONUS.

    Returns
    -------
    int
        Always 0.

    Notes
    -----
    Reads the notebook-level global `train_end`, which must be defined before
    this function is called.
    """
    ecm = fix_any_missing_any_files(variable,region_name,mask,True)

    if region_name == 'CONUS':
        # Move negative longitudes into the 0-360 range before applying the
        # bounding box (presumably the convention the mask uses - TODO confirm).
        new_lon_values = [i+360 if i < 0 else i for i in ecm.X.values]
        ecm_subset = ecm.assign_coords({'X':new_lon_values})
        ecm_subset = putils.restrict_to_bounding_box(ecm_subset, mask)
    else:
        ecm_subset = ecm

    # Sanity check: reading one grid point forces the values to be loaded, so a
    # broken file or an unexpectedly small grid fails here rather than later.
    ecm_subset[putils.xarray_varname(ecm_subset)][0,0,:,10,10].values

    # Now construct anomalies; the climatology returned alongside is not needed here.
    ecm_anom, _climatology_season = putils.create_seasonal_anomaly(ecm_subset, train_end)
    ecm_anom = ecm_anom.rolling(L=7, min_periods=7,center=False).mean().load()

    # Keep only leads 6, 13, 20, 27 and 34 of the rolling mean. To debug, check
    # e.g. ecm_anom.isel(S=0).sel(L=[6,13,20,27,34]) for missing values.
    putils.save_baseline_ECMWF_RZSM_anomaly(ecm_anom.sel(L=[6,13,20,27,34]), region_name)

    return 0

# Now we have the same dates of files ready for pre-processing

In [19]:

# Read as a global by run_function_by_variable_and_region, which passes it to
# putils.create_seasonal_anomaly.
train_end = 2015

# Init dates from GEFSv12 (the CONUS files are used as the baseline for every
# region). Read as a global by fix_any_missing_any_files. The listing does not
# depend on region or variable, so it is done once.
init_dates_gefs = putils.get_init_date_list('/glade/work/klesinger/FD_RZSM_deep_learning/Data/GEFSv12_reforecast/soilw_bgrnd')

for region_name in ['australia', 'CONUS', 'china']:
    # The mask depends only on the region, so build it once per region.
    mask = putils.return_proper_mask_for_bounding(region_name)

    for variable in ['t2m','d2m','tcw']:
        if variable == 'soilw_bgrnd':
            # Only reached if 'soilw_bgrnd' is added to the variable list above.
            run_function_by_variable_and_region(variable,region_name,mask)
        else:
            # For the other variables only the missing init dates are back-filled.
            fix_any_missing_any_files(variable,region_name,mask,False)
    


Latitude values for mask is [-13.  -13.5 -14.  -14.5 -15.  -15.5 -16.  -16.5 -17.  -17.5 -18.  -18.5
 -19.  -19.5 -20.  -20.5 -21.  -21.5 -22.  -22.5 -23.  -23.5 -24.  -24.5
 -25.  -25.5 -26.  -26.5 -27.  -27.5 -28.  -28.5 -29.  -29.5 -30.  -30.5
 -31.  -31.5 -32.  -32.5 -33.  -33.5 -34.  -34.5 -35.  -35.5 -36.  -36.5]

Longitude values for mask is [112.  112.5 113.  113.5 114.  114.5 115.  115.5 116.  116.5 117.  117.5
 118.  118.5 119.  119.5 120.  120.5 121.  121.5 122.  122.5 123.  123.5
 124.  124.5 125.  125.5 126.  126.5 127.  127.5 128.  128.5 129.  129.5
 130.  130.5 131.  131.5 132.  132.5 133.  133.5 134.  134.5 135.  135.5
 136.  136.5 137.  137.5 138.  138.5 139.  139.5 140.  140.5 141.  141.5
 142.  142.5 143.  143.5 144.  144.5 145.  145.5 146.  146.5 147.  147.5
 148.  148.5 149.  149.5 150.  150.5 151.  151.5 152.  152.5 153.  153.5
 154.  154.5 155.  155.5 156.  156.5 157.  157.5 158.  158.5 159.  159.5]
Number of files in ECMWF list is 1043. Number of files in GEFSv

In [None]:


# gef = xr.open_mfdataset(gefs_op_files)

In [None]:
# #checking if there are any bad files
# for file in final_ecmwf_list:
#     a = xr.open_dataset(file)
#     print(a.S.values)
#     # print(f'{a.S.shape[0]}-{a.M.shape[0]}-{a.L.shape[0]}-{a.Y.shape[0]}-{a.X.shape[0]}')


In [None]:
#Find the dates that do exist
