# Test Loss Functions on Driest Month

The notebook `2_loss` was meant for getting a statistical estimate of the prediction RMSE for the various loss functions. This notebook is meant to examine specific cases on a more granular level. The goal is to identify particularly dry and particularly wet periods and see how the loss functions behave. It is hypothesized that the weighted loss functions will perform better in very dry conditions when FMC is in the critical range with respect to ROS.

## Setup

In [None]:
from scipy import stats
import sys
sys.path.append('src')
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import yaml
import pickle
import os.path as osp
from datetime import timedelta
import matplotlib.pyplot as plt
# Local modules
from data_funcs import train_test_split_spacetime
from fmda_models import LM, XGB, RF
from metrics import ros_0wind, ros_3wind, rmse
from utils import initialize_models, create_exp_function, loss_setup
import reproducibility

In [None]:
# Read the pickled dataframe and drop every row that has a missing value
df = pd.read_pickle("data/raws_df.pkl").dropna()

## Find Time Periods

We search for the period with the highest average ROS. We expect this period to be where the weighted loss functions perform the best.

In [None]:
# Loop over same time periods used in main analysis
# Slide a 30-day window across the data in steps of `tdelta` days and save,
# for each window, its start time, average FMC and average ROS.
fms = []  # mean of the `fm` column per window
rs = []   # mean of ros_3wind(fm) per window
ts = []   # start time of each window
tdelta = 2 # number of days to shift train/test period
window = timedelta(days=30)
# Latest start time that still leaves a full window inside the data range
t_last = df.index.max() - window
t = df.index.min()
i = 0
while t <= t_last:
    t_end = t + window
    print("~"*50)
    ts.append(t)
    print(f"Time Period: {i}")
    print(f"Time Period Start: {t}")
    print(f"Time Period End: {t_end}")
    # Observations in the half-open interval [t, t_end)
    df_temp = df[(df.index >= t) & (df.index < t_end)]

    # Extract info: compute each summary once, then print and store it
    mean_fm = df_temp.fm.mean()
    print(f"Mean FMC: {mean_fm}")
    fms.append(mean_fm)
    mean_ros = ros_3wind(df_temp.fm).mean()
    print(f"Mean ROS: {mean_ros}")
    rs.append(mean_ros)

    # Advance the window start by `tdelta` days. Consecutive windows overlap,
    # because the step is shorter than the 30-day window length.
    i += 1 # iteration counter
    t = t + timedelta(days=tdelta)

In [None]:
# Side-by-side histograms of the per-period mean FMC and mean ROS
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# First histogram: in a histogram the measured quantity lies on the x-axis and
# the y-axis is a count, so the quantity/unit label goes on x.
ax1.hist(fms, bins=20, color='skyblue', edgecolor='black')
ax1.set_title('A: Mean FMC by 30-day Time Period')
ax1.set_xlabel("FMC (%)")
ax1.set_ylabel("Number of Time Periods")

# Second histogram
ax2.hist(rs, bins=20, color='salmon', edgecolor='black')
ax2.set_title('B: Mean ROS by 30-day Time Period')
ax2.set_xlabel("ROS (m/s)")
ax2.set_ylabel("Number of Time Periods")

# Save through the figure handle instead of pyplot's implicit current figure
fig.savefig('outputs/dry.png')

The time periods with the highest average ROS correspond to the lowest average FMC, and vice versa. The wettest period (highest FMC and lowest ROS) was in January to February. The driest period (lowest FMC and highest ROS) was in October to November. In 2023, rainfall was very low in October and November: https://www.weather.gov/media/bou/2023DenverClimateSummary.pdf

In [None]:
# Report which 30-day periods are the wettest and the driest
period_len = timedelta(days=30)
fm_means = np.array(fms)
ros_means = np.array(rs)
rule = "~"*50

print(f"Total Time Periods: {len(fms)}")

# Wettest period: highest mean FMC (shown next to the lowest mean ROS)
print(rule)
wettest = np.argmax(fm_means)
print(f"Max FMC Period: {wettest}")
print(f"Min ROS Period: {np.argmin(ros_means)}")
wet_range = (str(ts[wettest]), str(ts[wettest] + period_len))
print(f"Time Range: {wet_range}")

# Driest period: lowest mean FMC (shown next to the highest mean ROS)
print(rule)
driest = np.argmin(fm_means)
print(f"Min FMC Period: {driest}")
print(f"Max ROS Period: {np.argmax(ros_means)}")
dry_range = (str(ts[driest]), str(ts[driest] + period_len))
print(f"Time Range: {dry_range}")

## Modeling Setup

Loss functions will be standard MSE, the ROS based on 3m/s wind, and the weighted loss function with $\omega = 0.0367$, which performed the best in the main analysis.

In [None]:
# Parse the model hyperparameter file
with open('models/params.yaml', 'r') as params_file:
    params = yaml.safe_load(params_file)

# Display the 'xgb' entry as the cell output
params['xgb']

In [None]:
# Set up the models and the loss-function dictionary with a single weight parameter
omega = np.array([0.0367])
models, loss_dict = loss_setup(params, ws=omega)

### Model Driest

In [None]:
# Start time of the 30-day period with the lowest mean FMC
driest_idx = np.argmin(np.array(fms))
t = ts[driest_idx]

# Columns selected from the train/test frames before fitting
cols = ['Ed', 'rain', 'wind', 'solar', 'hour', 'doy', 'lat', 'lon', 'elev']

# number of times to repeat test/train
nreps = 10

In [None]:
# Initialize residuals entry of output dictionary
for l in loss_dict:
    for mod in loss_dict[l]['errs']:
        loss_dict[l]['errs'][mod]['residuals'] = []
        loss_dict[l]['errs'][mod]['y_test'] = []

In [None]:
# Seed once up front so the sequence of train/test splits below is repeatable
reproducibility.set_seed(42)

# End of the 30-day window starting at t; identical for every repetition
t_end = t + timedelta(days=30)

for i in range(0, nreps):
    print("~"*75)
    print(f"Iteration: {i}")
    # Take a fresh slice of the selected window for each repetition
    df_temp = df[(df.index >= t) & (df.index < t_end)]
    X_train, X_test, y_train, y_test = train_test_split_spacetime(
        df_temp,
        test_days = 2,
        spatial_test_frac = 0.2,
        verbose = True
    )
    X_train = X_train[cols]
    X_test = X_test[cols]
    # Run models
    # Reinitialize models dictionary to prevent multiple fitting iterations
    # NOTE(review): models are re-created once per repetition but re-fit for each
    # loss function below; this presumes fit() starts from scratch - confirm.
    models = initialize_models(params)
    for l in loss_dict:
        print("~"*50)
        print(f"Running models for loss func: {l}")
        # A loss without a weight function is fit unweighted
        w_func = loss_dict[l]['w_func']
        weights = w_func(y_train) if w_func is not None else None
        for mod in models:
            print(f"Fitting {mod}")
            models[mod].fit(X_train, y_train, weights)
            preds = models[mod].predict(X_test)
            mod_errs = loss_dict[l]["errs"][mod]
            mod_errs["t"].append(t)
            # Compute each metric once and reuse it for storage and printing
            rmse_fm = rmse(preds, y_test)
            mod_errs["rmse_test"].append(rmse_fm)
            rmse_ros = rmse(ros_3wind(preds), ros_3wind(y_test))
            mod_errs["rmse_test_ROS"].append(rmse_ros)
            print(f"Test RMSE for {mod}: {rmse_fm}")
            print(f"Test ROS RMSE for {mod}: {rmse_ros}")
            # Collect Data for Residual Plot
            mod_errs['residuals'].append((y_test-preds).to_numpy())
            mod_errs['y_test'].append(y_test.to_numpy())

In [None]:
def _errs_to_frame(loss_name, model_name, metric):
    """Build a long-format frame from one (loss, model) error record.

    Columns are 'RMSE' (the list stored under `metric`), 'Loss', 'Model'
    and 't' (the list stored under 't' of the same record).
    """
    record = loss_dict[loss_name]["errs"][model_name]
    values = record[metric]
    return pd.DataFrame({
        'RMSE': values,
        'Loss': [loss_name] * len(values),
        'Model': [model_name] * len(values),
        't': record['t'],
    })


# One frame per (loss, model) pair: FMC errors and ROS errors kept separately
results_fm = []
results_ros = []
for l in loss_dict:
    for mod in loss_dict[l]["errs"]:
        results_fm.append(_errs_to_frame(l, mod, 'rmse_test'))
        results_ros.append(_errs_to_frame(l, mod, 'rmse_test_ROS'))

In [None]:
# Stack the per-(loss, model) frames into a single table for each error type
results_fm, results_ros = pd.concat(results_fm), pd.concat(results_ros)

In [None]:
# Write both result dataframes to the outputs directory as pickles
for fname, frame in (
    ("results_fm_dry.pkl", results_fm),
    ("results_ros_dry.pkl", results_ros),
):
    with open(osp.join("outputs", fname), 'wb') as handle:
        pickle.dump(frame, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Mean / min / max of the FMC RMSE for each loss function (all models pooled)
rmse_stats = {
    'Mean': ('RMSE', 'mean'),
    'Min': ('RMSE', 'min'),
    'Max': ('RMSE', 'max'),
}
results_fm.groupby(['Loss'], sort=False).agg(**rmse_stats)

In [None]:
# Same per-loss FMC RMSE summary, printed as LaTeX table source
fm_by_loss = results_fm.groupby(['Loss'], sort=False).agg(
    Mean=('RMSE', 'mean'),
    Min=('RMSE', 'min'),
    Max=('RMSE', 'max'),
)
print(fm_by_loss.to_latex())

In [None]:
# FMC RMSE summary broken down by loss function and model
(
    results_fm
    .groupby(['Loss', 'Model'], sort=False)
    .agg(Mean=('RMSE', 'mean'), Min=('RMSE', 'min'), Max=('RMSE', 'max'))
)

In [None]:
# Show floats in scientific notation from here on (this display option is global
# and stays in effect for later cells)
pd.set_option('display.float_format', '{:.3e}'.format)

# Mean / min / max of the ROS RMSE for each loss function (all models pooled)
(
    results_ros
    .groupby(['Loss'], sort=False)
    .agg(Mean=('RMSE', 'mean'), Min=('RMSE', 'min'), Max=('RMSE', 'max'))
)

In [None]:
# Per-loss ROS RMSE summary, printed as LaTeX table source
ros_by_loss = results_ros.groupby(['Loss'], sort=False).agg(
    Mean=('RMSE', 'mean'),
    Min=('RMSE', 'min'),
    Max=('RMSE', 'max'),
)
print(ros_by_loss.to_latex())

In [None]:
# Related-samples t-test on ROS RMSE: MSE loss vs. the exp_0.0367 weighted loss
ros_rmse_mse = results_ros.loc[results_ros.Loss == "MSE", "RMSE"]
ros_rmse_exp = results_ros.loc[results_ros.Loss == "exp_0.0367", "RMSE"]
stats.ttest_rel(ros_rmse_mse, ros_rmse_exp)

In [None]:
# Related-samples t-test on ROS RMSE: MSE loss vs. the ROS loss
ros_rmse_mse = results_ros.loc[results_ros.Loss == "MSE", "RMSE"]
ros_rmse_ros = results_ros.loc[results_ros.Loss == "ROS", "RMSE"]
stats.ttest_rel(ros_rmse_mse, ros_rmse_ros)

In [None]:
# Percent Error Reduction in ROS
# Mean ROS RMSE under the MSE loss is the baseline for both comparisons
baseline = results_ros[results_ros.Loss == 'MSE'].RMSE.mean()

exp_mean = results_ros[results_ros.Loss == 'exp_0.0367'].RMSE.mean()
pct_exp = 100*(baseline - exp_mean)/baseline
print(f"Percent Reduction in RMSE for ROS from Best Exp. Loss: {pct_exp}")

ros_mean = results_ros[results_ros.Loss == 'ROS'].RMSE.mean()
pct_ros = 100*(baseline - ros_mean)/baseline
print(f"Percent Reduction in RMSE for ROS from ROS Loss: {pct_ros}")

In [None]:
# ROS RMSE summary broken down by loss function and model
ros_rmse_stats = {
    'Mean': ('RMSE', 'mean'),
    'Min': ('RMSE', 'min'),
    'Max': ('RMSE', 'max'),
}
results_ros.groupby(['Loss', 'Model'], sort=False).agg(**ros_rmse_stats)