# Test Loss Functions on Multiple Models

In [None]:
import sys
sys.path.append('src')
import pandas as pd
import numpy as np
import xgboost as xg
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import yaml
import matplotlib.pyplot as plt
# Local modules
from data_funcs import train_test_split_spacetime
from fmda_models import LM, XGB, RF
from metrics import ros, rmse
import reproducibility

## Read Data

In [None]:
# Load the pickled dataframe and drop every row that contains a missing value.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle("data/raws_df.pkl").dropna()
df.columns

In [None]:
# Display the dataframe after rows with missing values have been dropped.
df

## Setup Models

In [None]:
# Parse the YAML parameter file; its per-model entries are passed to the
# model constructors below.
with open('models/params.yaml') as params_file:
    params = yaml.safe_load(params_file)

params

In [None]:
def initialize_models():
    """Return a new dict of model objects keyed by short name.

    Each model is built from the matching entry of the module-level
    ``params`` dictionary, in the order 'xgb', 'lm', 'rf'.
    """
    constructors = (('xgb', XGB), ('lm', LM), ('rf', RF))
    return {name: build(params[name]) for name, build in constructors}

models = initialize_models()

In [None]:
def create_exp_function(w):
    """Build a weighting function for a fixed rate ``w``.

    The returned callable takes ``y_train`` and evaluates
    ``exp(-w * y_train)`` using TensorFlow ops.
    """
    def exp_function(y_train):
        exponent = tf.multiply(-w, y_train)
        return tf.exp(exponent)
    return exp_function

## Function test:
# fun = create_exp_function(.05)
# fun(y_train = np.array([1,2,3]))

In [None]:
def loss_setup(ws=None):
    """Build the dictionary of loss configurations keyed by loss name.

    Every value is a dict with a single key ``'w_func'``: either ``None`` or
    a callable that the analysis loop applies to the training response to
    obtain observation weights.

    Args:
        ws: Optional iterable of float omega values. For each value ``w`` an
            entry named ``exp_{w}`` is added whose weight function comes from
            ``create_exp_function(w)``.

    Returns:
        A dict containing ``'rss'`` (no weight function), then one ``exp_*``
        entry per element of ``ws`` in the order given, then ``'ros'`` (which
        uses ``ros`` as its weight function).

    Raises:
        TypeError: If an element of ``ws`` is not a float.
    """
    # Unweighted baseline
    loss = {'rss': {'w_func': None}}
    # Add one exponential-weighting entry per omega in the input list
    if ws is not None:
        for w in ws:
            # Raise explicitly rather than assert: assert statements are
            # removed under `python -O`, which would skip this validation.
            if not isinstance(w, float):
                raise TypeError(
                    f"omega weights must be floats, got {type(w).__name__}: {w!r}"
                )
            loss[f"exp_{w}"] = {'w_func': create_exp_function(w)}
    loss["ros"] = {'w_func': ros}
    return loss

In [None]:
# Ten evenly spaced omega values from 0.01 to 0.25, rounded to 4 decimals.
raw_grid = np.linspace(0.01, .25, 10)
weight_grid = np.round(raw_grid, 4)
print(f"Grid of Omega Weights: {weight_grid}")
loss_dict = loss_setup(ws=weight_grid)
loss_dict

## Run Analysis

In [None]:
# Columns kept from the split feature frames before fitting.
cols = ["Ed", "Ew", "rain", "wind", "solar", "hour", "doy", "lat", "lon"]

# Distinct monthly periods present in the dataframe index.
month_year = df.index.to_period('M').unique()
print(month_year)

reproducibility.set_seed(42)

# Trial run of the space-time split on one period only (the second one);
# the loop over every period is in a later cell.
my = month_year[1]
month, year = my.month, my.year
print(f"Splitting data for month: {my}")
in_period = (df.index.month == month) & (df.index.year == year)
df_temp = df[in_period]
print(f"Total observations: {df_temp.shape}")
split = train_test_split_spacetime(
    df_temp, test_days=2, spatial_test_frac=0.2, verbose=True
)
X_train, X_test, y_train, y_test = split
X_train, X_test = X_train[cols], X_test[cols]

For each loss function and each model, we collect two arrays of errors on the test set: one with the RMSE on the test fuel moisture observations, and another with the RMSE on the same observations transformed to ROS.

In [None]:
# Attach to every loss entry an "errs" dict that holds, for each model, two
# empty lists: test RMSE and test RMSE on the ROS-transformed values.
for loss_cfg in loss_dict.values():
    loss_cfg["errs"] = {
        mod: {"rmse_test": [], "rmse_test_ROS": []}
        for mod in models
    }

In [None]:
# For every monthly period: split the data in space and time, then fit each
# model under each loss configuration and record the test errors.

# Get unique month and year combos in the data
month_year = df.index.to_period('M').unique()
print(month_year)
reproducibility.set_seed(42)
for my in month_year:
    print("~"*80)
    month = my.month
    year = my.year
    print(f"Splitting data for month: {my}")
    df_temp = df[(df.index.month == month) & (df.index.year == year)]
    print(f"Total observations: {df_temp.shape}")
    X_train, X_test, y_train, y_test = train_test_split_spacetime(
        df_temp,
        test_days = 2,
        spatial_test_frac = 0.2,
        verbose = True
    )
    X_train = X_train[cols]
    X_test = X_test[cols]
    # The ROS transform of the test response does not depend on the loss or
    # the model, so compute it once per month.
    # NOTE(review): assumes `ros` and `rmse` are deterministic functions of
    # their inputs with no side effects -- confirm.
    y_test_ros = ros(y_test)
    for loss_name, loss_cfg in loss_dict.items():
        print("~"*50)
        print(f"Running models for loss func: {loss_name}")
        # A loss entry without a weight function means an unweighted fit.
        w_func = loss_cfg['w_func']
        weights = w_func(y_train) if w_func is not None else None
        # NOTE(review): the same model objects are re-fit in place for every
        # loss and month. If a wrapper's fit() does not reset its state,
        # rebuild them here with `models = initialize_models()`.
        for mod, model in models.items():
            print(f"Fitting {mod}")
            model.fit(X_train, y_train, weights)
            preds = model.predict(X_test)
            # Compute each metric once; reuse it for storage and reporting.
            rmse_test = rmse(preds, y_test)
            rmse_test_ros = rmse(ros(preds), y_test_ros)
            errs = loss_cfg["errs"][mod]
            errs["rmse_test"].append(rmse_test)
            errs["rmse_test_ROS"].append(rmse_test_ros)
            print(f"Test RMSE for {mod}: {rmse_test}")
            print(f"Test ROS RMSE for {mod}: {rmse_test_ros}")

In [None]:
# Tables of mean test error with one row per loss function and one column
# per model: df1 holds RMSE, df2 holds RMSE on the ROS-transformed values.
loss_vec = list(loss_dict)
models_vec = list(models)
table_shape = (len(loss_vec), len(models_vec))
df1 = pd.DataFrame(np.zeros(table_shape), index=loss_vec, columns=models_vec)
df2 = pd.DataFrame(np.zeros(table_shape), index=loss_vec, columns=models_vec)

for loss_name, loss_cfg in loss_dict.items():
    for mod, errs in loss_cfg["errs"].items():
        # Average the errors collected across the monthly splits.
        df1.loc[loss_name, mod] = np.mean(errs['rmse_test'])
        df2.loc[loss_name, mod] = np.mean(errs['rmse_test_ROS'])

In [None]:
# Mean test RMSE: rows are loss functions, columns are models.
df1

In [None]:
# Mean test RMSE on ROS-transformed values: rows are loss functions, columns are models.
df2

In [None]:
# Scatter of the mean test RMSE of the 'lm' model against the loss function.
loss_labels = df1.index
lm_rmse = df1["lm"]
plt.scatter(loss_labels, lm_rmse)
plt.ylabel('RMSE Test Data')
plt.xlabel('Loss Function')
# Loss names are long, so turn the tick labels vertical.
plt.xticks(rotation=90)
plt.title('Test RMSE by Loss Function - Linear Regression')

In [None]:
# One row of subplots per model, each plotted against the loss function:
# left column is test RMSE, right column is test RMSE on ROS.
N = len(models_vec) # number of rows of subplots
# squeeze=False keeps `axes` two-dimensional, so axes[i, j] indexing also
# works when there is only a single model.
fig, axes = plt.subplots(N, 2, figsize=(10, 5*N), squeeze=False)
for i, mod in enumerate(models_vec):
    # Access the subplot at row i, column 0
    ax1 = axes[i, 0]
    ax1.scatter(df1.index, df1[mod])
    ax1.tick_params(axis='x', rotation=90)
    ax1.set_title(f'Test RMSE - Model {mod}')

    # Access the subplot at row i, column 1
    ax2 = axes[i, 1]
    ax2.scatter(df2.index, df2[mod])
    ax2.tick_params(axis='x', rotation=90)
    # Show the y axis in scientific notation.
    ax2.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
    ax2.set_title(f'Test RMSE on ROS - Model {mod}')

plt.tight_layout()
# Call show(); the bare name `plt.show` only references the function.
plt.show()