# Hello and welcome

In [None]:
# import lib and data
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

# Training targets: load the CSV and parse the "dtm" column into datetimes.
y_all = pd.read_csv("data/target_train.csv").assign(
    dtm=lambda frame: pd.to_datetime(frame["dtm"])
)

# Competition-period energy data, with the generation target derived from it.
# The later keyword arguments read columns created by the earlier ones, so the
# order inside assign() matters.
# NOTE(review): the 0.5 factor presumably converts MW to MWh over a half-hour
# period -- confirm against the data description.
energy_data_2 = pd.read_csv("data_comp/Energy_Data_20240119_20240519.csv").assign(
    dtm=lambda frame: pd.to_datetime(frame["dtm"]),
    # Wind credit: half of Wind_MW minus boa_MWh.
    Wind_MWh_credit=lambda frame: 0.5 * frame["Wind_MW"] - frame["boa_MWh"],
    # Solar credit: half of Solar_MW.
    Solar_MWh_credit=lambda frame: 0.5 * frame["Solar_MW"],
    # Target: wind credit plus solar credit.
    total_generation_MWh=lambda frame: frame["Wind_MWh_credit"] + frame["Solar_MWh_credit"],
)


# Train and Comp for Random Forest

In [None]:
# Naive baseline: predict each period with the value observed one day earlier.
# One day is 48 rows (the code shifts by 48 for "1 day").
# NOTE(review): shift() is row-based, so this assumes both frames are sorted by
# "dtm" with no missing or duplicated periods -- confirm.
HALF_HOURS_PER_DAY = 48

# --- Training data ---
# RandomForest split date: only rows on or after it are scored.
split_date = pd.Timestamp("2022-10-01 00:00:00+00:00")

y_all["nav_pred"] = y_all["total_generation_MWh"].shift(HALF_HOURS_PER_DAY)

on_or_after_split = y_all["dtm"] >= split_date
y_filt = y_all.loc[on_or_after_split]

# MAE of the naive prediction on the training window
mae_train = mean_absolute_error(
    y_true=y_filt["total_generation_MWh"],
    y_pred=y_filt["nav_pred"],
)
print(f"Train Naive MAE: {mae_train} MWh")

# --- Competition data ---
# Competition window; both end points are included in the filter below.
start_date_comp = pd.Timestamp("2024-02-20 00:00:00+00:00")
end_date_comp = pd.Timestamp("2024-05-19 23:30:00+00:00")

energy_data_2["nav_pred"] = energy_data_2["total_generation_MWh"].shift(HALF_HOURS_PER_DAY)

in_comp_window = energy_data_2["dtm"].between(start_date_comp, end_date_comp)
energy_filt = energy_data_2.loc[in_comp_window]

# MAE of the naive prediction on the competition window
mae_comp = mean_absolute_error(
    y_true=energy_filt["total_generation_MWh"],
    y_pred=energy_filt["nav_pred"],
)
print(f"Comp Naive MAE: {mae_comp} MWh")


Train Naive MAE: 210.74215825708842
Comp Naive MAE: 164.0164399063994


# Quantiles 

In [21]:
# we have our lovely competition functions 
def pinball(y, q, alpha):
    """Pinball (quantile) loss of forecast ``q`` against observation ``y``.

    Args:
        y: Observed value(s); scalars or element-wise-compatible arrays/Series.
        q: Forecast value(s) for the quantile being scored.
        alpha: Quantile level as a fraction (e.g. 0.1 for the 10% quantile).

    Returns:
        ``alpha * (y - q)`` where ``y >= q`` and ``(1 - alpha) * (q - y)``
        where ``y < q``, evaluated element-wise.
    """
    # Observation at or above the forecast: weighted by alpha.
    at_or_above = (y - q) * alpha * (y >= q)
    # Observation below the forecast: weighted by (1 - alpha).
    below = (q - y) * (1 - alpha) * (y < q)
    return at_or_above + below

def pinball_score(df):
    """Mean pinball loss across the decile columns ``q10`` .. ``q90`` of ``df``.

    For every level in 10, 20, ..., 90 the column ``q<level>`` is scored
    against ``total_generation_MWh`` with ``alpha = level / 100``. Each
    level's losses are averaged over rows, and the nine per-level means are
    then averaged with equal weight.

    Args:
        df: Frame holding ``total_generation_MWh`` and columns ``q10``..``q90``.

    Returns:
        The average of the nine per-quantile mean losses.
    """
    per_level_means = []
    for level in range(10, 100, 10):
        losses = pinball(
            y=df["total_generation_MWh"],
            q=df[f"q{level}"],
            alpha=level / 100,
        )
        per_level_means.append(losses.mean())
    return np.mean(per_level_means)

# Xavire ~ "Assume same value for all quantiles"
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


def _naive_quantile_preds(frame, levels):
    """Build the columns pinball_score() needs from a naive forecast.

    Every quantile column ``q<percent>`` gets the same values, taken from
    ``frame["nav_pred"]``; the observed ``total_generation_MWh`` values are
    added alongside.

    Args:
        frame: DataFrame with ``nav_pred`` and ``total_generation_MWh`` columns.
        levels: Quantile levels as fractions (e.g. 0.1 for the ``q10`` column).

    Returns:
        Dict mapping column name to a value array, ready for ``pd.DataFrame``.
    """
    # round(), not int(): level * 100 is a float product that can land just
    # below the intended integer (0.29 * 100 == 28.999999999999996), and int()
    # would then truncate to the wrong column label.
    preds = {f"q{round(level * 100)}": frame["nav_pred"].values for level in levels}
    preds["total_generation_MWh"] = frame["total_generation_MWh"].values
    return preds


# training
naive_preds = _naive_quantile_preds(y_filt, quantiles)
df_naive = pd.DataFrame(naive_preds)

score = pinball_score(df_naive)
print(f"Naive pinball score (training): {score}")

# competition
# naive_preds, df_naive and score are rebound here, so after this cell they
# hold the competition-period values.
naive_preds = _naive_quantile_preds(energy_filt, quantiles)
df_naive = pd.DataFrame(naive_preds)

score = pinball_score(df_naive)
print(f"Naive pinball score (competition): {score}")


Naive pinball score (training): 105.3710791285442
Naive pinball score (competition): 82.00821995319968
