# Test Loss Functions on Multiple Models

In [1]:
# Standard library
import os
import os.path as osp
import pickle
import sys
from datetime import timedelta

# Third-party
import pandas as pd
import numpy as np
import xgboost as xg
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import yaml
import matplotlib.pyplot as plt

# Local modules ('src' is appended to sys.path first)
sys.path.append('src')
from utils import create_exp_function, loss_setup, initialize_models
from data_funcs import train_test_split_spacetime
from fmda_models import LM, XGB, RF
from metrics import ros_0wind, ros_3wind, rmse
import reproducibility

## Read Data

In [2]:
# Load the pickled RAWS data frame and drop every row containing a missing
# value. NOTE: unpickling executes arbitrary code, so only load trusted files.
df = pd.read_pickle("data/raws_df.pkl").dropna()
# Show the available columns.
df.columns

Index(['Ew', 'Ed', 'temp', 'rh', 'rain', 'precip_accum', 'fm', 'wind', 'solar',
       'time_raws', 'STID', 'lat', 'lon', 'elev', 'hour', 'doy', 'date'],
      dtype='object')

In [3]:
# Display the cleaned data frame (the notebook shows a truncated head/tail view).
df

Unnamed: 0,Ew,Ed,temp,rh,rain,precip_accum,fm,wind,solar,time_raws,STID,lat,lon,elev,hour,doy,date
2023-05-17 03:09:00,13.147834,14.552503,284.817,48.0,0.00,470.408,7.5,1.790,1.0,2023-05-17 03:09:00,CPTC2,38.45944,-109.04694,8088,3,137,2023-05-17 03:09:00
2023-05-17 04:09:00,13.247513,14.652182,284.261,48.0,0.00,470.408,8.1,1.790,0.0,2023-05-17 04:09:00,CPTC2,38.45944,-109.04694,8088,4,137,2023-05-17 04:09:00
2023-05-17 05:09:00,12.876054,14.274707,284.261,46.0,0.00,470.408,8.6,1.790,0.0,2023-05-17 05:09:00,CPTC2,38.45944,-109.04694,8088,5,137,2023-05-17 05:09:00
2023-05-17 06:09:00,13.446692,14.851361,283.150,48.0,0.00,470.408,9.2,1.790,0.0,2023-05-17 06:09:00,CPTC2,38.45944,-109.04694,8088,6,137,2023-05-17 06:09:00
2023-05-17 07:09:00,14.000874,15.412836,283.150,51.0,0.00,470.408,9.6,1.790,0.0,2023-05-17 07:09:00,CPTC2,38.45944,-109.04694,8088,7,137,2023-05-17 07:09:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-14 19:17:00,5.988904,7.156126,293.706,20.0,0.00,6.096,9.4,1.343,970.0,2024-05-14 19:17:00,TT815,37.52272,-108.48178,7576,19,135,2024-05-14 19:17:00
2024-05-14 20:17:00,9.688784,11.031949,290.372,35.0,0.00,6.096,7.3,0.448,426.0,2024-05-14 20:17:00,TT815,37.52272,-108.48178,7576,20,135,2024-05-14 20:17:00
2024-05-14 21:17:00,11.794500,13.189754,289.261,45.0,5.08,11.176,8.0,1.343,594.0,2024-05-14 21:17:00,TT815,37.52272,-108.48178,7576,21,135,2024-05-14 21:17:00
2024-05-14 22:17:00,7.987530,9.267817,292.039,28.0,0.00,11.176,8.0,1.790,742.0,2024-05-14 22:17:00,TT815,37.52272,-108.48178,7576,22,135,2024-05-14 22:17:00


## Setup Models

In [4]:
# Read the per-model hyperparameters from the YAML configuration file.
with open('models/params.yaml', 'r') as params_file:
    params = yaml.safe_load(params_file)

# Display the parsed parameter dictionary.
params

{'xgb': {'max_depth': 4,
  'eta': 0.1,
  'min_child_weight': 1,
  'subsample': 0.8,
  'colsample_bytree': 0.9,
  'scale_pos_weight': 1,
  'n_estimators': 120,
  'gamma': 0.1},
 'rf': {'n_estimators': 50,
  'criterion': 'squared_error',
  'max_depth': 8,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'max_features': 0.8,
  'bootstrap': True,
  'max_samples': None,
  'random_state': None,
  'verbose': 0,
  'warm_start': False},
 'mlp': {'hidden_units': 10,
  'activation': 'relu',
  'optimizer': 'adam',
  'epochs': 10,
  'batch_size': 32,
  'validation_split': 0.2,
  'dropout': 0.2,
  'learning_rate': 0.001},
 'lm': {'fit_intercept': True}}

In [5]:
# Ten evenly spaced weight values from 0.01 to 0.25, rounded to 4 decimals.
weight_grid = np.linspace(0.01, .25, 10).round(4)
# loss_setup (from utils) returns the model dictionary and the loss dictionary;
# the loss dictionary holds, per loss function, a weight function ('w_func')
# and the error lists ('errs') that the analysis below fills in.
models, loss_dict = loss_setup(params=params, ws=weight_grid)

## Run Analysis

In [6]:
# Subset of columns used as model features.
cols = [
    "Ed", "rain", "wind", "solar",
    "hour", "doy",
    "lat", "lon", "elev",
]

# Fix the random seeds before any model is fit.
reproducibility.set_seed(42)

resetting random seeds to 42


For each loss function and each model, we collect two arrays of errors on the test set: one with the RMSE on the test fuel moisture observations, and one with the RMSE on the same observations transformed to ROS.

In [7]:
# Rolling-window experiment.
#
# Starting at the first timestamp in `df`, take a 30-day window, split it into
# train/test sets with train_test_split_spacetime, fit every model under every
# loss function, and record the test RMSE on the raw predictions and on the
# ros_3wind-transformed predictions. The window start is then shifted by
# `tdelta` days until a full window no longer fits in the data.
window_days = 30  # length of each train/test period, in days
tdelta = 2 # number of days to shift train/test period

# Start from empty error lists so that re-running this cell does not append a
# second copy of every result onto the lists left over from a previous run.
for loss_entry in loss_dict.values():
    for mod_errs in loss_entry["errs"].values():
        for key in ("t", "rmse_test", "rmse_test_ROS"):
            mod_errs[key].clear()

t = df.index.min()
i = 0
# Latest window start for which a full window still fits in the data.
t_last = df.index.max() - timedelta(days=window_days)
while t <= t_last:
    print("~"*50)
    print(f"Iteration: {i}")
    print(f"t: {t}")
    # Build train/test from 30 day period after t
    df_temp = df[
        (df.index >= t) & (df.index < (t + timedelta(days=window_days)))
    ]
    X_train, X_test, y_train, y_test = train_test_split_spacetime(
        df_temp,
        test_days = 2,
        spatial_test_frac = 0.2,
        verbose = True
    )
    X_train = X_train[cols]
    X_test = X_test[cols]
    # The transformed test observations are the same for every model and loss
    # in this window, so compute them once.
    # NOTE(review): assumes ros_3wind is a pure function of its input.
    ros_test = ros_3wind(y_test)

    # Run models
    # Reinitialize models dictionary to prevent multiple fitting iterations
    models = initialize_models(params)
    for loss_name, loss_entry in loss_dict.items():
        print("~"*50)
        print(f"Running models for loss func: {loss_name}")
        # A loss function without a weight function is fit unweighted.
        w_func = loss_entry['w_func']
        weights = w_func(y_train) if w_func is not None else None

        for mod in models:
            print(f"Fitting {mod}")
            models[mod].fit(X_train, y_train, weights)
            preds = models[mod].predict(X_test)
            # Compute each metric once and reuse it for storage and logging.
            rmse_fm = rmse(preds, y_test)
            rmse_ros = rmse(ros_3wind(preds), ros_test)
            mod_errs = loss_entry["errs"][mod]
            mod_errs["t"].append(t)
            mod_errs["rmse_test"].append(rmse_fm)
            mod_errs["rmse_test_ROS"].append(rmse_ros)
            print(f"Test RMSE for {mod}: {rmse_fm}")
            print(f"Test ROS RMSE for {mod}: {rmse_ros}")

    # Shift the window start by tdelta days (equal to test_days above).
    i += 1 # iteration counter
    t = t + timedelta(days=tdelta)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Iteration: 0
t: 2023-05-17 02:22:00
Number of Training Observations: 52180
Number of Training Locations: 84
Number of Features: 16
Time range Train: ('2023-05-17 02:22:00', '2023-06-14 02:18:00')
----------
Number of Test Observations: 1052
Number of Test Locations: 22
Time range Test: ('2023-06-14 02:20:00', '2023-06-16 02:20:00')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Running models for loss func: MSE
Fitting XGB
Training XGB with params: {'max_depth': 4, 'eta': 0.1, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.9, 'scale_pos_weight': 1, 'n_estimators': 120, 'gamma': 0.1}
Predicting with XGB
Test RMSE for XGB: 3.857785409141178
Test ROS RMSE for XGB: 0.006082616282438361
Fitting LM
Training LM with params: {'fit_intercept': True}
Predicting with LM
Test RMSE for LM: 4.182792561172982
Test ROS RMSE for LM: 0.0069023249236351775
Fitting RF
Training RF with params: {'n_estimators': 50, 'criterion': 'squared_

In [8]:
# Build one DataFrame per (loss function, model) pair for each metric:
# results_fm collects the RMSE values stored under 'rmse_test' and
# results_ros those stored under 'rmse_test_ROS'.
results_fm = []
results_ros = []
for loss_name, loss_entry in loss_dict.items():
    for mod, mod_errs in loss_entry["errs"].items():
        for metric_key, frames in (
            ('rmse_test', results_fm),
            ('rmse_test_ROS', results_ros),
        ):
            values = mod_errs[metric_key]
            n_values = len(values)
            # Repeat the loss and model labels so every row is self-describing.
            frames.append(pd.DataFrame({
                'RMSE': values,
                'Loss': [loss_name] * n_values,
                'Model': [mod] * n_values,
                't': mod_errs['t'],
            }))

In [9]:
# Stack the per-(loss, model) frames row-wise into a single frame. Each
# frame's own row index is kept, so index values repeat across groups.
# This rebinds results_fm from a list of frames to one DataFrame.
results_fm = pd.concat(results_fm, axis=0)
results_fm

Unnamed: 0,RMSE,Loss,Model,t
0,3.857785,MSE,XGB,2023-05-17 02:22:00
1,3.301552,MSE,XGB,2023-05-19 02:22:00
2,2.768666,MSE,XGB,2023-05-21 02:22:00
3,3.432500,MSE,XGB,2023-05-23 02:22:00
4,4.353253,MSE,XGB,2023-05-25 02:22:00
...,...,...,...,...
162,3.311077,ROS,RF,2024-04-05 02:22:00
163,3.499201,ROS,RF,2024-04-07 02:22:00
164,4.323424,ROS,RF,2024-04-09 02:22:00
165,4.242069,ROS,RF,2024-04-11 02:22:00


In [10]:
# Stack the per-(loss, model) frames row-wise into a single frame. Each
# frame's own row index is kept, so index values repeat across groups.
# This rebinds results_ros from a list of frames to one DataFrame.
results_ros = pd.concat(results_ros, axis=0)
results_ros

Unnamed: 0,RMSE,Loss,Model,t
0,0.006083,MSE,XGB,2023-05-17 02:22:00
1,0.005584,MSE,XGB,2023-05-19 02:22:00
2,0.005014,MSE,XGB,2023-05-21 02:22:00
3,0.008346,MSE,XGB,2023-05-23 02:22:00
4,0.007606,MSE,XGB,2023-05-25 02:22:00
...,...,...,...,...
162,0.006433,ROS,RF,2024-04-05 02:22:00
163,0.005433,ROS,RF,2024-04-07 02:22:00
164,0.007845,ROS,RF,2024-04-09 02:22:00
165,0.007034,ROS,RF,2024-04-11 02:22:00


In [11]:
# Write Dataframes
output_dir = "outputs"
# Create the output directory if it is missing, so the results of the long
# run above are not lost to a FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)

for filename, frame in (
    ("results_fm.pkl", results_fm),
    ("results_ros.pkl", results_ros),
):
    with open(osp.join(output_dir, filename), 'wb') as handle:
        pickle.dump(frame, handle, protocol=pickle.HIGHEST_PROTOCOL)