# Hello and welcome

In [1]:
# import lib and data
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

# train
y_all = pd.read_csv("data/target_train.csv")
y_all["dtm"] = pd.to_datetime(y_all["dtm"])

# comp
energy_data_2 = pd.read_csv("data_comp/Energy_Data_20240119_20240519.csv")
energy_data_2["dtm"] = pd.to_datetime(energy_data_2["dtm"])

energy_data_2["Wind_MWh_credit"] = 0.5 * energy_data_2["Wind_MW"] - energy_data_2["boa_MWh"]
energy_data_2["Solar_MWh_credit"] = 0.5 * energy_data_2["Solar_MW"]
energy_data_2["total_generation_MWh"] = energy_data_2["Wind_MWh_credit"] + energy_data_2["Solar_MWh_credit"]


# Train and Comp for Random Forest

In [2]:
# Train data
# RandomForest split dates
# 65% and 35 % split
#split_date = pd.Timestamp("2022-10-01 00:00:00+00:00")

# 80 %  and 20 % split 
split_date = pd.Timestamp("2023-03-15 00:00:00+00:00")

# naive predictions are for yesterday values, shift 1 day 
y_all["nav_pred"] = y_all["total_generation_MWh"].shift(48)

# filter data with spliting date
y_filt = y_all[(y_all["dtm"] >= split_date)]

# cal MAE
mae_train = mean_absolute_error(y_filt["total_generation_MWh"], y_filt["nav_pred"])
print(f"Train Naive MAE: {mae_train} MWh")

# Comp data 
# competition dates
start_date_comp = pd.Timestamp("2024-02-20 00:00:00+00:00")
end_date_comp = pd.Timestamp("2024-05-19 23:30:00+00:00")

# nav pred, shift 1 day 
energy_data_2["nav_pred"] = energy_data_2["total_generation_MWh"].shift(48)
energy_filt = energy_data_2[(energy_data_2["dtm"] >= start_date_comp) & (energy_data_2["dtm"] <= end_date_comp)]

# cal MAE
mae_comp = mean_absolute_error(energy_filt["total_generation_MWh"], energy_filt["nav_pred"])
print(f"Comp Naive MAE: {mae_comp} MWh")


Train Naive MAE: 214.22408079035466 MWh
Comp Naive MAE: 164.0164399063994 MWh


# Quantiles 

In [3]:
# we have our lovely competition functions 
def pinball(y, q, alpha):
    return (y - q) * alpha * (y >= q) + (q - y) * (1 - alpha) * (y < q)

def pinball_score(df):
    scores = [
        pinball(
            y=df["total_generation_MWh"],
            q=df[f"q{qu}"],
            alpha=qu/100
        ).mean()
        for qu in range(10, 100, 10)
    ]
    return np.mean(scores)

def create_naive_df(y_true, y_naive):
    naive_preds = {f"q{int(q*100)}": y_naive for q in quantiles}
    naive_preds["total_generation_MWh"] = y_true
    return pd.DataFrame(naive_preds)

# Xavire ~ "Assume same value for all quantiles"
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# train
df_naive_train_total = create_naive_df(y_filt["total_generation_MWh"].values, y_filt["nav_pred"].values)
df_naive_train_solar = create_naive_df(y_filt["Solar_MWh_credit"].values, y_filt["nav_pred"].values)
df_naive_train_wind  = create_naive_df(y_filt["Wind_MWh_credit"].values, y_filt["nav_pred"].values)

score_naive_train_total = pinball_score(df_naive_train_total)
score_naive_train_solar = pinball_score(df_naive_train_solar)
score_naive_train_wind  = pinball_score(df_naive_train_wind)

print(f"Naive Pinball Score (Training - Total): {score_naive_train_total:.4f}")
print(f"Naive Pinball Score (Training - Solar): {score_naive_train_solar:.4f}")
print(f"Naive Pinball Score (Training - Wind):  {score_naive_train_wind:.4f}")

# comp
df_naive_comp_total = create_naive_df(energy_filt["total_generation_MWh"].values, energy_filt["nav_pred"].values)
df_naive_comp_solar = create_naive_df(energy_filt["Solar_MWh_credit"].values, energy_filt["nav_pred"].values)
df_naive_comp_wind  = create_naive_df(energy_filt["Wind_MWh_credit"].values, energy_filt["nav_pred"].values)

score_naive_comp_total = pinball_score(df_naive_comp_total)
score_naive_comp_solar = pinball_score(df_naive_comp_solar)
score_naive_comp_wind  = pinball_score(df_naive_comp_wind)

print(f"Naive Pinball Score (Competition - Total): {score_naive_comp_total:.4f}")
print(f"Naive Pinball Score (Competition - Solar): {score_naive_comp_solar:.4f}")
print(f"Naive Pinball Score (Competition - Wind):  {score_naive_comp_wind:.4f}")



Naive Pinball Score (Training - Total): 107.1120
Naive Pinball Score (Training - Solar): 130.6055
Naive Pinball Score (Training - Wind):  146.6069
Naive Pinball Score (Competition - Total): 82.0082
Naive Pinball Score (Competition - Solar): 123.3838
Naive Pinball Score (Competition - Wind):  118.6926


## Other Metrics

In [None]:
# COPIED FROM XGBoost 
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error, mean_squared_error, r2_score

def smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0  # avoid division by zero
    return 100 * np.mean(diff)

# Mean Absolute Percentage Error
# own function to forcefully ingore division by zero
def mape(y_true, y_pred):
    mask = y_true != 0
    return 100 * np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

# Continuous Ranked Probability Score
def crps(y_true, quantile_preds, quantiles):
    y = y_true.reshape(-1, 1)
    q_vals = np.array(quantiles).reshape(1, -1)
    indicator = (y < quantile_preds).astype(float)
    return np.mean((indicator - q_vals) * (quantile_preds - y))

# Weighted Interval Score
def wis(y_true, quantile_preds, quantiles):
    # Assumes symmetric quantile pairs around the median
    lower_idxs = list(range(len(quantiles) // 2))
    upper_idxs = list(range(len(quantiles) - 1, len(quantiles) // 2 - 1, -1))
    
    wis_total = 0.0
    for l_idx, u_idx in zip(lower_idxs, upper_idxs):
        alpha = quantiles[u_idx] - quantiles[l_idx]
        lower = quantile_preds[:, l_idx]
        upper = quantile_preds[:, u_idx]
        range_ = upper - lower
        below = np.maximum(lower - y_true, 0)
        above = np.maximum(y_true - upper, 0)
        wis_total += range_ + 2 / alpha * below + 2 / alpha * above

    return np.mean(wis_total / len(lower_idxs))

def evaluate_forecast(df, quantiles):
    y_true = df["total_generation_MWh"].values
    y_pred = df["q50"].values
    quantile_preds = np.stack([df[f"q{int(q*100)}"].values for q in quantiles], axis=-1)
    
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mape_ = mape(y_true, y_pred)
    smape_ = smape(y_true, y_pred)
    crps_ = crps(y_true, quantile_preds, quantiles)
    wis_ = wis(y_true, quantile_preds, quantiles)

    return {
        "MAE": mae,
        "MAPE": mape_,
        "SMAPE": smape_,
        "RMSE": rmse,
        "R2": r2,
        "CRPS": crps_,
        "WIS": wis_
    }

# Evaluate the naive forecast
print("\nNaive Training:")
metrics_solar = evaluate_forecast(df_naive_train_solar, quantiles)
metrics_wind = evaluate_forecast(df_naive_train_wind, quantiles)
metrics_combined = evaluate_forecast(df_naive_train_total, quantiles)

print("Solar:", metrics_solar)
print("Wind:", metrics_wind)
print("Combined:", metrics_combined)

print("\nNaive Compitition:")
metrics_solar = evaluate_forecast(df_naive_comp_solar, quantiles)
metrics_wind = evaluate_forecast(df_naive_comp_wind, quantiles)
metrics_combined = evaluate_forecast(df_naive_comp_total, quantiles)

print("Solar:", metrics_solar)
print("Wind:", metrics_wind)
print("Combined:", metrics_combined)

#Naive Training:
#Solar: {'MAE': 261.2110606713105, 'MAPE': np.float64(23356705984.953804), 'SMAPE': np.float64(124.70708829117754), 'RMSE': np.float64(335.36563234985823), 'R2': -1.1981277329779472, 'CRPS': np.float64(130.60553033565526), 'WIS': np.float64(1360.4742743297422)}
#Wind: {'MAE': 293.2137349596339, 'MAPE': np.float64(3081.8617203684903), 'SMAPE': np.float64(104.03742142049852), 'RMSE': np.float64(379.9607823810718), 'R2': -2.4194630090998515, 'CRPS': np.float64(146.60686747981697), 'WIS': np.float64(1527.1548695814267)}
#Combined: {'MAE': 214.22408079035466, 'MAPE': np.float64(751654.908239123), 'SMAPE': np.float64(72.00844117266698), 'RMSE': np.float64(277.91152587505707), 'R2': 0.08762907713740009, 'CRPS': np.float64(107.11204039517733), 'WIS': np.float64(1115.7504207830973)}

#Naive Compitition:
#Solar: {'MAE': 246.76759867444252, 'MAPE': np.float64(808636177.4192668), 'SMAPE': np.float64(131.35173324830305), 'RMSE': np.float64(294.940091736018), 'R2': -0.7439452661026538, 'CRPS': np.float64(123.38379933722126), 'WIS': np.float64(1285.2479097627215)}
#Wind: {'MAE': 237.38522016747024, 'MAPE': np.float64(5622.1169310669075), 'SMAPE': np.float64(91.18835686466751), 'RMSE': np.float64(324.868068734957), 'R2': -3.734020250353444, 'CRPS': np.float64(118.69261008373513), 'WIS': np.float64(1236.381355038908)}
#Combined: {'MAE': 164.0164399063994, 'MAPE': np.float64(50248.13193458385), 'SMAPE': np.float64(64.77845581841274), 'RMSE': np.float64(221.09959971332623), 'R2': 0.24256197152292414, 'CRPS': np.float64(82.00821995319967), 'WIS': np.float64(854.2522911791634)}



Naive Training:
Solar: {'MAE': 261.2110606713105, 'MAPE': np.float64(23356705984.953804), 'SMAPE': np.float64(124.70708829117754), 'RMSE': np.float64(335.36563234985823), 'R2': -1.1981277329779472, 'CRPS': np.float64(130.60553033565526), 'WIS': np.float64(1360.4742743297422)}
Wind: {'MAE': 293.2137349596339, 'MAPE': np.float64(3081.8617203684903), 'SMAPE': np.float64(104.03742142049852), 'RMSE': np.float64(379.9607823810718), 'R2': -2.4194630090998515, 'CRPS': np.float64(146.60686747981697), 'WIS': np.float64(1527.1548695814267)}
Combined: {'MAE': 214.22408079035466, 'MAPE': np.float64(751654.908239123), 'SMAPE': np.float64(72.00844117266698), 'RMSE': np.float64(277.91152587505707), 'R2': 0.08762907713740009, 'CRPS': np.float64(107.11204039517733), 'WIS': np.float64(1115.7504207830973)}

Naive Compitition:
Solar: {'MAE': 246.76759867444252, 'MAPE': np.float64(808636177.4192668), 'SMAPE': np.float64(131.35173324830305), 'RMSE': np.float64(294.940091736018), 'R2': -0.7439452661026538, '

  diff = np.abs(y_true - y_pred) / denominator
  diff = np.abs(y_true - y_pred) / denominator
  diff = np.abs(y_true - y_pred) / denominator
  diff = np.abs(y_true - y_pred) / denominator
  diff = np.abs(y_true - y_pred) / denominator
  diff = np.abs(y_true - y_pred) / denominator
