# Create Dataset for FMC Transfer Learning

This notebook uses the processed 1h, 10h, 100h, and 1000h fuel moisture data together with the processed weather data, so run the notebooks that produce those files first. Then see the `ode_fit_tutorial` notebook for an explanation of how the timelag ODE is used to interpolate to hourly-resolution data.

Steps:
1. Define train/val/test time periods
2. For each FM observation, get the 72 hours of preceding hourly weather data and construct ML input sequences
3. Construct corresponding target values in two scenarios: end-of-sequence observation and then hourly full sequence from reanalysis

## Setup

In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from src.utils import read_yml, time_range

In [None]:
# Output location reported in the summary cell at the end of this section.
# NOTE(review): presumably where the ML arrays are written — no write is
# visible in this part of the notebook; confirm.
output_dir = "data/ml_data"

In [None]:
# Processed weather table; its `utc` column is used to window the sequences.
weather = pd.read_excel("data/processed_data/dvdk_weather.xlsx")

# One processed fuel-moisture observation table per fuel class, read in the
# order 1h, 10h, 100h, 1000h.
fm1, fm10, fm100, fm1000 = (
    pd.read_excel(fm_path)
    for fm_path in (
        "data/processed_data/ok_1h.xlsx",
        "data/processed_data/ok_10h.xlsx",
        "data/processed_data/ok_100h.xlsx",
        "data/processed_data/ok_1000h.xlsx",
    )
)

# Model configuration: the "ode" section of the shared parameter file and the
# RNN parameter file (source of `features_list`).
ode_params = read_yml("etc/params_models.yaml", subkey="ode")
rnn_params = read_yml("models/params.yaml")

## Define Time Periods

Train: 1 full year starting from first date

Val / Test: even split of remaining times

NOTE: We choose 1 full year of training data so that the temporal relationships can, in principle, be learned. However, this structure does not leave a full year of test data. This is a data limitation: accuracy metrics will be conditional on the time period they are computed over.

In [None]:
# Pool the observation times of all four fuel classes so that every class
# shares one set of train/val/test boundaries.
dates = pd.concat([
    fm1["utc_rounded"],
    fm10["utc_rounded"],
    fm100["utc_rounded"],
    fm1000["utc_rounded"],
])
tmin, tmax = dates.min(), dates.max()

# Train window
# Start 72 hours after the first observation so that each training observation
# can look back over a full 72-hour weather sequence.
train_start = tmin + pd.DateOffset(hours=72)
# One year after the first observation, extended by a further 24 hours.
# NOTE(review): the 24 h extension was originally annotated as a spin-up
# period at the start of the record — confirm the intent.
train_end = tmin + pd.DateOffset(days=365) + pd.DateOffset(hours=24)

# Fail loudly when the record does not extend past the training window (this
# also catches an empty record, where tmax is NaT); otherwise the val/test
# windows below would silently come out empty or inverted.
if not tmax > train_end:
    raise ValueError(
        f"Latest FM observation ({tmax}) must fall after the end of the "
        f"training window ({train_end}); nothing is left for validation and test"
    )

train_times = time_range(train_start, train_end, freq="1h")

# Remaining window: everything after training, up to the last observation
remaining_start = train_end
remaining_end = tmax

# Midpoint of the remaining period; validation gets the first half and test
# the second half.
midpoint = remaining_start + (remaining_end - remaining_start) / 2

# Validation and test
val_start, val_end = remaining_start, midpoint
val_times = time_range(val_start, val_end, freq="1h")
test_start, test_end = midpoint, remaining_end
test_times = time_range(test_start, test_end, freq="1h")

print(f"Earliest FM Observation: {tmin}")
print(f"Latest FM Observation: {tmax}")
print(f"Train Period: {train_start} to {train_end}")
print(f"    N. Hours: {train_times.shape[0]}")
print(f"Val Period:   {val_start} to {val_end}")
print(f"    N. Hours: {val_times.shape[0]}")
print(f"Test Period:  {test_start} to {test_end}")
print(f"    N. Hours: {test_times.shape[0]}")

## Get Weather Data Arrays

In [None]:
# Feature names read from the RNN parameter file; the sequence-building cells
# reorder their columns to this list before converting to arrays.
features_list = rnn_params["features_list"]
print(features_list)
# Columns selected from the weather table for each sequence. The `hod_utc` and
# `doy_utc` columns are copied to `hod` / `doy` in the sequence-building cells
# before the columns are reordered to `features_list`.
features_list_ok = ['Ed', 'Ew', 'solar', 'wind', 'rain', 'hod_utc', 'doy_utc']

In [None]:
# Static spatial features from Slapout Mesonet station
# Static spatial features from Slapout Mesonet station.
# The same three constants are attached to every row of every weather sequence
# built in the cells below.
elev =  774  # NOTE(review): units not stated — presumably meters; confirm
lon = -100.261920  # NOTE(review): presumably decimal degrees; confirm
lat = 36.597490

### 1h Fuels

In [None]:
# Training observations for 1h fuels. The upper bound is exclusive because the
# validation window starts at the same instant (val_start is train_end); with
# an inclusive bound an observation stamped exactly at train_end would be
# placed in both the training and the validation split.
train1 = fm1[(fm1.utc_rounded >= train_start) & (fm1.utc_rounded < train_end)]
n_samples = train1.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
X1 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(train1["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    X1[i, :, :] = Xi

In [None]:
# Validation observations for 1h fuels. The upper bound is exclusive because
# the test window starts at the same instant (val_end and test_start are both
# the midpoint); with an inclusive bound an observation stamped exactly at the
# midpoint would be placed in both the validation and the test split.
val1 = fm1[(fm1.utc_rounded >= val_start) & (fm1.utc_rounded < val_end)]
n_samples = val1.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
V1 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(val1["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    V1[i, :, :] = Xi

In [None]:
# Test observations for 1h fuels. Both bounds are inclusive: test_end is the
# latest observation time, so an exclusive bound would drop that observation.
test1 = fm1[(fm1.utc_rounded >= test_start) & (fm1.utc_rounded <= test_end)]
n_samples = test1.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
T1 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(test1["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    T1[i, :, :] = Xi

### 10h Fuels

In [None]:
# Training observations for 10h fuels. The upper bound is exclusive because the
# validation window starts at the same instant (val_start is train_end); with
# an inclusive bound an observation stamped exactly at train_end would be
# placed in both the training and the validation split.
train10 = fm10[(fm10.utc_rounded >= train_start) & (fm10.utc_rounded < train_end)]
n_samples = train10.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
X10 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(train10["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    X10[i, :, :] = Xi

In [None]:
# Validation observations for 10h fuels. The upper bound is exclusive because
# the test window starts at the same instant (val_end and test_start are both
# the midpoint); with an inclusive bound an observation stamped exactly at the
# midpoint would be placed in both the validation and the test split.
val10 = fm10[(fm10.utc_rounded >= val_start) & (fm10.utc_rounded < val_end)]
n_samples = val10.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
V10 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(val10["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    V10[i, :, :] = Xi

In [None]:
# Test observations for 10h fuels. Both bounds are inclusive: test_end is the
# latest observation time, so an exclusive bound would drop that observation.
test10 = fm10[(fm10.utc_rounded >= test_start) & (fm10.utc_rounded <= test_end)]
n_samples = test10.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
T10 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(test10["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    T10[i, :, :] = Xi

### 100h Fuels

In [None]:
# Training observations for 100h fuels. The upper bound is exclusive because
# the validation window starts at the same instant (val_start is train_end);
# with an inclusive bound an observation stamped exactly at train_end would be
# placed in both the training and the validation split.
train100 = fm100[(fm100.utc_rounded >= train_start) & (fm100.utc_rounded < train_end)]
n_samples = train100.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
X100 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(train100["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    X100[i, :, :] = Xi

In [None]:
# Validation observations for 100h fuels. The upper bound is exclusive because
# the test window starts at the same instant (val_end and test_start are both
# the midpoint); with an inclusive bound an observation stamped exactly at the
# midpoint would be placed in both the validation and the test split.
val100 = fm100[(fm100.utc_rounded >= val_start) & (fm100.utc_rounded < val_end)]
n_samples = val100.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
V100 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(val100["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    V100[i, :, :] = Xi

In [None]:
# Test observations for 100h fuels. Both bounds are inclusive: test_end is the
# latest observation time, so an exclusive bound would drop that observation.
test100 = fm100[(fm100.utc_rounded >= test_start) & (fm100.utc_rounded <= test_end)]
n_samples = test100.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
T100 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(test100["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    T100[i, :, :] = Xi

### 1000h Fuels

In [None]:
# Training observations for 1000h fuels. The upper bound is exclusive because
# the validation window starts at the same instant (val_start is train_end);
# with an inclusive bound an observation stamped exactly at train_end would be
# placed in both the training and the validation split.
train1000 = fm1000[(fm1000.utc_rounded >= train_start) & (fm1000.utc_rounded < train_end)]
n_samples = train1000.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
X1000 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(train1000["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    X1000[i, :, :] = Xi

In [None]:
# Validation observations for 1000h fuels. The upper bound is exclusive because
# the test window starts at the same instant (val_end and test_start are both
# the midpoint); with an inclusive bound an observation stamped exactly at the
# midpoint would be placed in both the validation and the test split.
val1000 = fm1000[(fm1000.utc_rounded >= val_start) & (fm1000.utc_rounded < val_end)]
n_samples = val1000.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
V1000 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(val1000["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    V1000[i, :, :] = Xi

In [None]:
# Test observations for 1000h fuels. Both bounds are inclusive: test_end is the
# latest observation time, so an exclusive bound would drop that observation.
test1000 = fm1000[(fm1000.utc_rounded >= test_start) & (fm1000.utc_rounded <= test_end)]
n_samples = test1000.shape[0]
seq_length = 72  # hours of weather preceding (and including) each observation
n_feats = len(features_list)
# One (seq_length, n_feats) weather sequence per observation. Every slot is
# overwritten in the loop below, so no uninitialised value survives.
T1000 = np.empty((n_samples, seq_length, n_feats), dtype=float)

for i, ti in enumerate(test1000["utc_rounded"]):
    # Weather window is (ti - seq_length hours, ti]: it ends on the observation.
    t0 = ti - pd.DateOffset(hours=seq_length)
    in_window = (weather.utc > t0) & (weather.utc <= ti)
    # .assign returns a new frame, so the derived time columns and the static
    # site columns are never written into a slice of `weather` (which would
    # trigger pandas' SettingWithCopyWarning).
    wi = weather.loc[in_window, features_list_ok].assign(
        hod=lambda frame: frame["hod_utc"],
        doy=lambda frame: frame["doy_utc"],
        elev=elev,
        lon=lon,
        lat=lat,
    )
    wi = wi[features_list]  # enforce the column order given by features_list
    Xi = wi.to_numpy()
    # Safety check: a gap or duplicate in the hourly weather record changes the
    # row count, and a mismatch in features changes the column count.
    if Xi.shape != (seq_length, n_feats):
        raise ValueError(
            f"Bad shape at i={i} (t={ti}): {Xi.shape}, "
            f"expected {(seq_length, n_feats)}"
        )

    T1000[i, :, :] = Xi

## Summaries and Output

In [None]:
# Print the shape of every weather array next to the shape of its matching
# fuel-moisture column, so mismatched sample counts are easy to spot.
print("FMC Data Summary")
print(f"Output path: {output_dir}")

# One entry per fuel class: (title label, FM column name, splits), where each
# split is (split label, array name, weather array, frame name, FM frame).
# The names are carried as strings so each printed line still shows which
# variable the shape belongs to.
fuel_summaries = [
    ("1h", "fm1", [
        ("Train", "X1", X1, "train1", train1),
        ("Val", "V1", V1, "val1", val1),
        ("Test", "T1", T1, "test1", test1),
    ]),
    ("10h", "fm10", [
        ("Train", "X10", X10, "train10", train10),
        ("Val", "V10", V10, "val10", val10),
        ("Test", "T10", T10, "test10", test10),
    ]),
    ("100h", "fm100", [
        ("Train", "X100", X100, "train100", train100),
        ("Val", "V100", V100, "val100", val100),
        ("Test", "T100", T100, "test100", test100),
    ]),
    ("1000h", "fm1000", [
        ("Train", "X1000", X1000, "train1000", train1000),
        ("Val", "V1000", V1000, "val1000", val1000),
        ("Test", "T1000", T1000, "test1000", test1000),
    ]),
]

for fuel_label, fm_col, splits in fuel_summaries:
    print("~"*50)
    print(f"{fuel_label} Fuels Summary")
    for split_label, arr_name, arr, frame_name, frame in splits:
        print(f"    {split_label} Data:")
        print(f"         Weather Data: {arr_name}.shape={arr.shape!r}")
        print(f"         FM Data: {frame_name}['{fm_col}'].shape={frame[fm_col].shape!r}")