In [1]:
import enspp.bma as bma
import numpy as np
import pandas as pd
from wrfpywind import data_preprocess as pp
import xarray as xr

In [2]:
# Path to data: directory holding the ensemble NetCDF files (ensds_*.nc).
# The trailing slash is required because file names are appended to this
# string directly, with no path separator added.
datadir = '../data/'

In this data directory, there should be one data file per forecast initialization time, named with the initialization date followed by the day of the month on which the simulation ends. For example:
- ensds_20191201-05.nc
- ensds_20191202-06.nc
- ensds_20191203-07.nc
- etc. 


I think I will use 12/9 as the test case for the AGU, as that will give me 26 total days of testing data. However, for now, I'm going to use 12/8, as that's the last day that I've finished processing so far.

In [3]:
# Forecast initialization time, parsed directly into a pandas Timestamp
t_init = pd.to_datetime('2019-12-08')

In [4]:
# Decide how many days of data you would like to use.
# This value sets both how far before t_init the training window starts
# and how many daily forecast start dates are generated.
n_days = 7

In [5]:
# First training day: step back n_days calendar days from the initialization time
training_span = pd.DateOffset(days=n_days)
d1_training = t_init - training_span

In [6]:
# Daily forecast start dates: n_days consecutive days beginning at d1_training
start_dates = pd.date_range(start=d1_training, periods=n_days, freq='D')

# Matching end dates: each simulation is taken to end 4 days after it starts
sim_length = pd.DateOffset(days=4)
end_dates = start_dates + sim_length

In [7]:
# Load the buoy observations that are later paired with the ensemble data
# by the BMA training- and test-data formatters.
# NOTE(review): the data path is an absolute, machine-specific location and the
# date window is fixed to December 2019 -- both need editing to run this
# notebook elsewhere or for another forecast period.
buoy_data_path = '/share/mzhang/jas983/wrf_data/oshwind/wrfpywind/wrfpywind/data/nyserda_buoy/'
obs = pp.fmt_buoy_wspd(
    data_path=buoy_data_path,
    south_dates_str='20190904_20210207',
    north_dates_str='20190812_20210207',
    heights=[20, 40, 60, 80, 100, 120, 140, 160, 180, 200],
    start_date='12-01-2019',
    end_date='12-31-2019',
)

I will refer to the current day as day `D`. I retrain the BMA predictive parameters at every new initialization time (i.e., every day), and I fit the BMA predictive distributions using that same fit for every lead time in a given forecast.

In [8]:
# Build the BMA training set: one formatted DataFrame per training forecast,
# collected in start-date order and stacked row-wise in a single concat
# (avoids re-copying the growing frame on every iteration).
train_frames = []
for start_date, end_date in zip(start_dates, end_dates):
    # Files are named ensds_<start YYYYMMDD>-<end DD>.nc
    ens_path = f"{datadir}ensds_{start_date.strftime('%Y%m%d')}-{end_date.strftime('%d')}.nc"

    # Open the xarray Dataset containing wind speed data for the entire domain
    # (note that `extract_buoy_da` needs a `Dataset` object to work).
    # The context manager closes the file once this forecast has been
    # converted to a DataFrame, so file handles do not pile up across days.
    with xr.open_dataset(ens_path) as ensds:
        # Get data only at the buoy locations
        ensda = pp.extract_buoy_da(ensds, varname='wspd_wrf', locations=['south', 'north'])

        # Combine ensemble data and observations into a pd.DataFrame in the
        # training format; done inside the `with` so the data is read before close
        train_frames.append(bma.fmt_training_data(ensda, obs))

# Fail loudly instead of leaving `train_data` undefined (or stale from an
# earlier run of this cell) when there are no training forecasts.
if not train_frames:
    raise ValueError("No training forecasts to load: `start_dates` is empty (check `n_days`).")

train_data = pd.concat(train_frames, axis=0)

In [9]:
# Keep only rows whose time is strictly before the WRF initialization time,
# so nothing from t_init onward remains in the training set
before_init = train_data['Time'] < t_init
train_data = train_data.loc[before_init]

In [10]:
# And reset the index: drop=True discards the old row labels (no longer a
# clean 0..N-1 sequence after filtering) rather than keeping them as a column
train_data = train_data.reset_index(drop=True)

In [11]:
# Fit the BMA parameters on the assembled training set.
# `fit` is what gets passed to bma.get_crps to score the test forecast.
fit = bma.get_bma_fit(train_data)

Now, I will format the test data -- i.e., the simulation starting on `t_init`

In [12]:
# Build the test set from the forecast initialized at t_init
t_end = t_init + pd.DateOffset(days=4)
test_path = f"{datadir}ensds_{t_init.strftime('%Y%m%d')}-{t_end.strftime('%d')}.nc"

# The whole-domain wind speed file is opened as an xarray `Dataset`,
# because `extract_buoy_da` needs a `Dataset` object to work.
ensds = xr.open_dataset(test_path)

# Reduce the ensemble to the two buoy locations
ensda = pp.extract_buoy_da(ensds, varname='wspd_wrf', locations=['south', 'north'])

# Combine the ensemble data with the observations into a pd.DataFrame in the test format
test_data = bma.fmt_test_data(ensda, obs)

In [13]:
# Score the fitted BMA model on the test forecast with the CRPS
crps = bma.get_crps(fit, test_data, n_ens_members=5, gamma_bma=None)

# The mean CRPS is the single number used to judge how much training data is optimal
mean_crps = np.mean(crps)
mean_crps

1.6275686899042738