# Model Tutorial: Baseline Methods

The focus of this research project is the effect of custom loss functions on forecasting wildfire rate of spread. The intent is not to optimize each machine learning model to make a state-of-the-art fuel moisture forecasting tool. However, we will compare the models to two baseline methods: a physics-based model using a Kalman filter for data assimilation and a simple climatology method. The purpose of the comparison to baseline methods is to make sure that the machine learning methods are producing reasonably accurate forecasts and thus to ensure that conclusions drawn on the effect of the custom loss functions are meaningful. This notebook explains the two baseline methods for fuel moisture modeling and demonstrates how to deploy them.

## Climatology

### Description

In meteorology, it is a common practice to compare models to a "climatology", or a simple statistical average of past weather. Schreck et al. (2023) compare their machine learning models of fuel moisture to...

## Physics-Based Method

The current fuel moisture model within WRF-SFIRE is a simple ODE based on the physical processes of drying and wetting. The model assimilates data via the Kalman filter, a Bayesian-inspired technique for reconciling a deterministic model with observed data.

## Setup

In [None]:
import sys
sys.path.append('../src')
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from datetime import timedelta
# Local modules
from fmda_models import run_augmented_kf
from data_funcs import train_test_split_spacetime
from metrics import ros_3wind, rmse
import reproducibility

## Read Data

In [None]:
# Load the data frame stored in the local pickle file.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
df = pd.read_pickle("../data/raws_df.pkl")
# Drop every row that has a missing value in any column (not only missing fm)
df = df.dropna()

## Split Data

In [None]:
# Slide a 30-day window over the data, moving the window start by `tdelta`
# days on every pass and printing the iteration number and start time.
t = df.index.min()
i = 0
tdelta = 2 # number of days to shift train/test period
# NOTE(review): the loop stops 300 days before the last timestamp although the
# window built below is only 30 days long -- confirm that 300 is intended.
while t <= (df.index.max() - timedelta(days = 300)):
    print("~"*50)
    print(f"Iteration: {i}")
    print(f"t: {t}")
    # Build train/test from 30 day period after t
    # (df_temp is overwritten on each pass, so after the loop it holds only
    # the last window)
    df_temp = df[
        (df.index >= t) & (df.index < (t + timedelta(days=30)))
    ]

    # Move the window start forward by tdelta days; presumably this keeps the
    # 2-day test periods of consecutive windows from overlapping -- confirm
    i+= 1 # iteration counter
    t = t + timedelta(days=tdelta)

In [None]:
df_temp.shape

In [None]:
df_temp.STID.unique()

In [None]:
# Pick one station ID and keep only its rows from the last 30-day window
st = 'TR563'
df2 = df_temp[df_temp.STID == st]
# Displayed as (rows, columns)
df2.shape

In [None]:
30*24

In [None]:
def df_to_dict(d):
    """Convert a data frame into the dictionary of arrays passed to the model.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain the columns 'fm', 'Ed', 'Ew' and 'rain'.

    Returns
    -------
    dict
        Each of the four columns as a NumPy array under its own name, plus
        the frame's index as a NumPy array under the key 'times'.
    """
    column_names = ('fm', 'Ed', 'Ew', 'rain')
    arrays = {name: d[name].to_numpy() for name in column_names}
    arrays['times'] = d.index.to_numpy()
    return arrays

def fit_and_eval_kf(df, test_hours = 24*2, expected_hours = 30*24):
    """Run the augmented Kalman filter on one frame and score its forecast.

    The last ``test_hours`` rows of ``df`` are held out; the model output on
    those rows is compared with the observed ``fm`` values, both directly and
    after passing each through ``ros_3wind``. Both errors are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single station with the columns 'fm', 'Ed', 'Ew' and
        'rain' (assumed to be one row per hour -- TODO confirm). If a 'STID'
        column is present, its first value labels the printed results.
    test_hours : int, optional
        Number of trailing rows held out for testing. Default is 48.
    expected_hours : int, optional
        Required number of rows. Frames of any other length are skipped.
        Default is 720 (30 * 24).

    Returns
    -------
    tuple or None
        ``(err1, err2)``: RMSE on fm and RMSE on ``ros_3wind`` of fm, or
        ``None`` when ``df`` does not have exactly ``expected_hours`` rows.

    Raises
    ------
    ValueError
        If ``test_hours`` is not strictly between 0 and the number of rows.
    """
    hours = df.shape[0]
    # `df.shape` is a (rows, columns) tuple, so it never equals an integer;
    # the row count is what has to match.
    if hours != expected_hours:
        return None
    if not 0 < test_hours < hours:
        raise ValueError(
            f"test_hours must be between 1 and {hours - 1}, got {test_hours}"
        )

    # Use the frame that was passed in, not a notebook-level global.
    dat = df_to_dict(df)
    h2 = hours - test_hours
    m, _ = run_augmented_kf(dat, h2=h2, hours=hours)

    test_inds = np.arange(h2, hours)
    preds = m[test_inds]
    y_test = dat['fm'][test_inds]
    err1 = rmse(preds, y_test)
    err2 = rmse(ros_3wind(preds), ros_3wind(y_test))

    # Label the output with the station found in the data itself so the
    # message cannot disagree with the frame being evaluated.
    station = df['STID'].iloc[0] if 'STID' in df.columns else 'unknown'
    print(f"Test RMSE for {station}: {err1}")
    print(f"Test ROS RMSE for {station}: {err2}")

    return err1, err2

In [None]:
# Convert the selected station's rows into the dictionary of arrays built by df_to_dict
dat = df_to_dict(df2)

In [None]:
dat['rain'][0:100]

In [None]:
# Total number of rows, and the row index h2 at which the last 24*2 rows begin
hours = len(dat['fm'])
h2 = hours - 24*2
# NOTE(review): presumably run_augmented_kf uses observations up to h2 and
# forecasts from h2 to `hours` -- confirm in fmda_models
m, E = run_augmented_kf(dat, h2=h2, hours = len(dat['fm']))

In [None]:
# Row positions of the first h2 rows (training span) and of the remaining rows (test span)
train_inds = np.arange(0, h2)
test_inds = np.arange(h2, hours)

In [None]:
# Model output and observed fm restricted to the test span
preds = m[test_inds]
y_test = dat['fm'][test_inds]

In [None]:
# Observed fm (first line) against the model output (second line) over the test span
plt.plot(y_test)
plt.plot(preds)

In [None]:
# Test-span RMSE on fm itself, then on the ros_3wind transform of fm
# (presumably a rate-of-spread calculation -- see the metrics module)
print(f"Test RMSE for {st}: {rmse(preds, y_test)}")
print(f"Test ROS RMSE for {st}: {rmse(ros_3wind(preds), ros_3wind(y_test))}")

In [None]:
dat['times'][0:10]

In [None]:
# Full observed series against time, with the model output overlaid
# separately for the training span and the test span
plt.plot(dat['times'], dat['fm'], label = "Observed FMC")
# Rotate the x tick labels by 90 degrees
plt.tick_params(axis='x', rotation=90)
plt.plot(dat['times'][train_inds], m[train_inds], label="Training")
plt.plot(dat['times'][test_inds], m[test_inds], label="Prediction")
plt.legend()

In [None]:
# Set seed for reproducibility
reproducibility.set_seed(123)

# Create Data
# NOTE(review): this rebinds y_test, which an earlier cell set to the observed
# fm over the Kalman filter test span
X_train, X_test, y_train, y_test = train_test_split_spacetime(df)

In [None]:
# Format as dictionaries to run through model
st = 'CHAC2'
df2 = df[df.STID == st]
# Build the model input with the shared helper instead of repeating its body;
# the helper also adds the frame's index under the key 'times'.
dat = df_to_dict(df2)

In [None]:
np.nanmax(dat['rain'])

In [None]:
# NOTE(review): rows with missing values were already dropped right after
# loading, so this repeat looks redundant -- confirm before removing
df = df.dropna()
# Integer position of the row with the largest rain value
df.rain.argmax()

In [None]:
df.iloc[524704:524708]

In [None]:
# Run the model on the whole station record; this rebinds preds and E.
# NOTE(review): h2=6000 is hard-coded -- presumably the number of rows used
# for training; confirm it suits this station's record length
preds, E = run_augmented_kf(dat, h2=6000, hours = dat["fm"].shape[0])

In [None]:
# Row positions before and from 6000 onward, matching the h2=6000 passed to the model
train_inds = np.arange(0, 6000)
test_inds = np.arange(6000, dat["fm"].shape[0])

In [None]:
# plt.plot(df2.date, df2.fm, label = "Observed FM")
# preds was computed from the single-station frame df2, so the x-axis dates
# must come from df2 as well; the full frame df mixes rows from every station.
plt.plot(df2.date.iloc[train_inds], preds[train_inds], label= "Train")
plt.plot(df2.date.iloc[test_inds], preds[test_inds], label= "Test")
# Mark the first test row, i.e. the boundary actually used for train_inds and
# test_inds (the length of y_train comes from an unrelated split of df).
plt.axvline(df2.date.iloc[test_inds[0]], color= 'k', linestyle='dashed')
plt.legend()
plt.grid()

In [None]:
# RMSE over the rows from 6000 onward (the same rows as test_inds), first on
# fm itself and then on the ros_3wind transform of fm
print(f"RMSE Test: {rmse(preds[test_inds], dat['fm'][6000:])}")
print(f"RMSE ROS Test: {rmse(ros_3wind(preds[test_inds]), ros_3wind(dat['fm'][6000:]))}")