In [None]:
# Change the kernel's working directory so the relative "data/..." paths used below resolve.
# NOTE(review): this is a user-specific location — adjust it when running on another machine.
%cd ~/projects/wind/

In [None]:
import polars as pl
import numpy as np
import plotly.express as px
from datetime import datetime, timedelta

In [None]:
from wind.preprocess.prepare_area_data import AREA_FEATURES
from wind.model.pred_local_windpower import get_hparams

# --- Configuration ----------------------------------------------------------
dataset_path = "data/windpower_area_dataset.parquet"
features = AREA_FEATURES
val_start_date = datetime(2025, 1, 1, 0, 0)  # forecasts issued from here on are held out
target = "relative_power"
weight = "operating_power_max"  # sample weight; also rescales relative predictions later

study_name = "em0_area_model_xgb_2"
params = get_hparams(study_name)

# Lazy scan restricted to rows with a known target and `lt` <= 48
# (presumably lead time in hours — TODO confirm the unit).
data = pl.scan_parquet(dataset_path).filter(
    pl.col(target).is_not_null(), pl.col("lt") <= 48
)

# Training split: forecasts issued before the validation start, `em == 0` rows only.
data_train = data.filter(pl.col("time_ref") < val_start_date).filter(pl.col("em") == 0)
# Validation split: forecasts issued on/after the validation start, restricted to target
# times falling on the calendar day after the issue date.
# NOTE(review): unlike the training split there is no `em` filter here — confirm intended.
data_val = data.filter(
    pl.col("time_ref") >= val_start_date,
    pl.col("time") >= pl.col("time_ref").dt.date() + timedelta(days=1),
    pl.col("time") < pl.col("time_ref").dt.date() + timedelta(days=2),
)

# Materialise every split exactly once. Collecting features, target and weight in
# separate queries re-reads the parquet file for each of them and relies on all
# queries returning rows in the same order; one collect guarantees aligned rows.
# `dict.fromkeys` drops duplicates (e.g. when the weight column is also a feature).
_model_columns = list(dict.fromkeys([*features, target, weight]))
_train_frame = data_train.select(_model_columns).collect()
_val_frame = data_val.select(_model_columns).collect()

X_train = _train_frame.select(features)
X_val = _val_frame.select(features)

y_train = _train_frame.get_column(target)
y_val = _val_frame.get_column(target)

w_train = _train_frame.get_column(weight)
w_val = _val_frame.get_column(weight)

In [None]:
from xgboost import XGBRegressor

# Fit the area model with capacity-based sample weights, then attach the validation
# predictions to the (lazy) validation frame: once as relative power and once
# rescaled by the weight column.
model = XGBRegressor(**params).fit(X_train, y_train, sample_weight=w_train)
y_pred = model.predict(X_val)
data_val = data_val.with_columns(
    **{
        "area_relative_pred": y_pred,
        "area_power_pred": y_pred * w_val,
    }
)

In [None]:
# Horizontal bar chart of the fitted model's feature importances, sorted ascending.
idx = model.feature_importances_.argsort()
px.bar(
    x=model.feature_importances_[idx],
    y=model.feature_names_in_[idx],
    orientation="h",
)

In [None]:
# Step 1: collapse rows sharing (time_ref, time, bidding_area) — mean of the
# predictions, first value of the observations.
# Step 2: RMSE per bidding area for the summed-local and the area-model predictions.
(
    data_val.group_by(["time_ref", "time", "bidding_area"])
    .agg(
        pl.col("power").first(),
        pl.col("sum_local_pred").mean(),
        pl.col("area_power_pred").mean(),
        pl.col("relative_power").first(),
        pl.col("relative_sum_local_pred").mean(),
        pl.col("area_relative_pred").mean(),
    )
    .group_by("bidding_area")
    .agg(
        RMSE_sum_local=(pl.col("sum_local_pred") - pl.col("power")).pow(2).mean().sqrt(),
        RMSE_area=(pl.col("area_power_pred") - pl.col("power")).pow(2).mean().sqrt(),
    )
    .sort("bidding_area")
    .collect()
)

In [None]:
import plotly.graph_objs as go

# Time-series plot for one bidding area: mean prediction over the rows sharing
# (time_ref, time, bidding_area), the observed power, and a mean +/- 1.96 * std band.
bidding_area = "ELSPOT NO1"
df_plot = (
    data_val.filter(
        pl.col("bidding_area") == bidding_area,
        # pl.col("lt") < 24,
    )
    .group_by("time_ref", "time", "bidding_area")
    .agg(
        power=pl.col("power").first(),
        pred_mean=pl.col("area_power_pred").mean(),
        pred_std=pl.col("area_power_pred").std(),
    )
    .with_columns(
        upper=pl.col("pred_mean") + 1.96 * pl.col("pred_std"),
        lower=pl.col("pred_mean") - 1.96 * pl.col("pred_std"),
    )
    .sort("time")
    .collect()
)

fig = go.Figure(
    [
        go.Scatter(
            # This trace shows the model output (`pred_mean`); it was previously
            # mislabelled "Measurement" in the legend.
            name="Prediction",
            x=df_plot["time"],
            y=df_plot["pred_mean"],
            mode="lines",
            line=dict(color="rgb(31, 119, 180)"),
        ),
        go.Scatter(
            name="True",
            x=df_plot["time"],
            y=df_plot["power"],
            mode="lines",
            line=dict(color="rgb(214, 41, 26)"),
        ),
        # The two invisible bound traces must stay adjacent and in this order:
        # "tonexty" fills from the lower bound up to the trace added just before it.
        go.Scatter(
            name="Upper Bound",
            x=df_plot["time"],
            y=df_plot["upper"],
            mode="lines",
            marker=dict(color="#444"),
            line=dict(width=0),
            showlegend=False,
        ),
        go.Scatter(
            name="Lower Bound",
            x=df_plot["time"],
            y=df_plot["lower"],
            marker=dict(color="#444"),
            line=dict(width=0),
            mode="lines",
            fillcolor="rgba(68, 68, 68, 0.3)",
            fill="tonexty",
            showlegend=False,
        ),
    ]
)
fig.update_layout(
    yaxis=dict(title=dict(text="Area Power")),
    title=dict(text=bidding_area),
    hovermode="x",
)
fig.show()

In [None]:
# Overlaid histograms of (mean prediction - observed power) per bidding area,
# restricted to rows with `lt` < 24.
px.histogram(
    data_frame=(
        data_val.filter(pl.col("lt") < 24)
        .group_by(["time_ref", "time", "bidding_area"])
        .agg(
            power=pl.col("power").first(),
            pred_mean=pl.col("area_power_pred").mean(),
            pred_std=pl.col("area_power_pred").std(),
        )
        .with_columns(error=pl.col("pred_mean").sub(pl.col("power")))
        .sort("bidding_area")
        .collect()
    ),
    x="error",
    color="bidding_area",
    barmode="overlay",
    nbins=50,
)

In [None]:
# Materialise the full training split for inspection (all columns, not only the features).
data_train.collect()

In [None]:
# Installed capacity over time for the currently selected `bidding_area`:
# cross-join every timestamp with every wind park of the area, keep the parks
# whose `prod_start_new` is not later than the timestamp, and sum their capacity.
# NOTE(review): `windpower` is only defined in a later cell of this notebook, and the
# CSV path starts with "../" unlike the other "data/..." paths — confirm both.
times = windpower.lazy().select("time")
max_capacity = pl.scan_csv("../data/windparks_enriched.csv", try_parse_dates=True).filter(
    pl.col("bidding_area").eq(bidding_area)
)

bidding_area_capacity = (
    times.join(max_capacity, how="cross")
    .filter(pl.col("time").ge(pl.col("prod_start_new")))
    .group_by("time")
    .agg(pl.col("operating_power_max").sum())
    .sort("time")
)
bidding_area_capacity.collect()

In [None]:
# Feature frame for the univariate time-series experiments: relative power,
# cyclical hour-of-day / day-of-year encodings, lagged power and EWMAs of lag 1.
num_lagged_times = 24
ewma_spans = [3, 6, 12, 24]
ts_data = (
    # `bidding_area_capacity` is a LazyFrame and later cells call `.collect()` on
    # `ts_data`, so the left side must be lazy too; `.lazy()` is a no-op when
    # `windpower` already is a LazyFrame. Joining an eager frame to a lazy one raises.
    windpower.lazy()
    .join(bidding_area_capacity, on="time")
    .with_columns(relative_power=pl.col("power") / pl.col("operating_power_max"))
    .with_columns(
        # NOTE(review): `TAU`, `hour` and `doy_frac` are not defined in any visible
        # cell — presumably 2*pi, the hour of day and the fraction of the year elapsed
        # as polars expressions; define them before running this cell.
        sin_hod=(TAU * hour / 24.0).sin(),
        cos_hod=(TAU * hour / 24.0).cos(),
        sin_doy=(TAU * doy_frac).sin(),
        cos_doy=(TAU * doy_frac).cos(),
    )
    .with_columns(
        # Lags are row shifts, so they assume one row per consecutive time step.
        # NOTE(review): lags use absolute `power` while the target is `relative_power`.
        pl.col("power").shift(k).alias(f"lag_{k:02d}")
        for k in range(1, num_lagged_times + 1)
    )
    .with_columns(
        # Smooth lag_01 (not the current value) so no information from the target
        # time step enters the feature. The "emwa_" spelling is kept as-is because
        # other code may reference these column names.
        pl.col("lag_01").ewm_mean(span=span).alias(f"emwa_{span:02d}")
        for span in ewma_spans
    )
    .filter(pl.col("time") >= datetime(2022, 1, 1))
)

In [None]:
# Relative power over time for the engineered time-series frame.
px.line(
    data_frame=ts_data.select(["time", "relative_power"]).collect(),
    x="time",
    y="relative_power",
)

In [None]:
# Same capacity computation as the single-area variant, but for every bidding area
# at once and materialised eagerly.
# NOTE(review): this rebinds `bidding_area_capacity` from a LazyFrame to a DataFrame,
# so re-running earlier cells afterwards sees a different type.
times = windpower.lazy().select("time")
max_capacity = pl.scan_csv("../data/windparks_enriched.csv", try_parse_dates=True)

bidding_area_capacity = (
    times.join(max_capacity, how="cross")
    .filter(pl.col("time").ge(pl.col("prod_start_new")))
    .group_by(["bidding_area", "time"])
    .agg(pl.col("operating_power_max").sum())
    .sort(["bidding_area", "time"])
    .collect()
)

In [None]:
# Chronological split: train on [train_start, val_start), validate from val_start on.
train_start = datetime(2024, 1, 1)
val_start = datetime(2025, 1, 1)
ts_train = ts_data.filter(
    pl.col("time").is_between(train_start, val_start, closed="left")
)
ts_val = ts_data.filter(pl.col("time").ge(val_start))

# Endogenous series as a 1-D array, cyclical time features as the exogenous matrix.
y_train = ts_train.select("relative_power").collect().to_numpy()[:, 0]
X_train = (
    ts_train.select(["sin_hod", "cos_hod", "sin_doy", "cos_doy"]).collect().to_numpy()
)

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# SARIMAX on the training target with the cyclical time features as exogenous
# regressors and a constant trend. The seasonal period of 24 corresponds to one day
# if the series is hourly (presumably — TODO confirm the sampling interval).
# Stationarity/invertibility constraints are switched off for the optimiser.
mod = SARIMAX(
    y_train,
    order=(1, 0, 1),
    seasonal_order=(1, 1, 1, 24),
    exog=X_train,
    trend="c",
    enforce_stationarity=False,
    enforce_invertibility=False,
)

In [None]:
# Fit the SARIMAX model with statsmodels' default optimiser settings.
res = mod.fit()

In [None]:
# Validation window: the 90 days that follow the last training timestamp.
last_datetime = ts_train.select(pl.col("time").max()).collect().to_series()
ts_val = ts_data.filter(
    # Strictly after the last training timestamp: the forecast's first step is the
    # step *after* the end of the training sample, so including `last_datetime`
    # itself (as `>=` did) shifts `val_time`, `y_val` and the exogenous rows by one
    # step relative to the forecast.
    pl.col("time") > last_datetime,
    pl.col("time") <= last_datetime.dt.offset_by(f"{90 * 24}h"),
)
# Exogenous regressors for the forecast horizon, same columns/order as in training.
X_val = (
    ts_val.select(
        "sin_hod",
        "cos_hod",
        "sin_doy",
        "cos_doy",
    )
    .collect()
    .to_numpy()
)
val_time = ts_val.select("time").collect().to_series()
y_val = ts_val.select("relative_power").collect().to_series()
val_time.shape

In [None]:
# Out-of-sample forecast with one step per validation timestamp, driven by the
# validation exogenous matrix.
fc = res.get_forecast(exog=X_val, steps=val_time.shape[0])
yhat = fc.predicted_mean
ci = fc.conf_int(alpha=0.05)  # alpha=0.05 -> 95% interval, columns are (lower, upper)

In [None]:
import plotly.graph_objs as go

# SARIMAX forecast against the observed relative power, with the 95% confidence
# band from `fc.conf_int`.
fig = go.Figure(
    [
        go.Scatter(
            # `yhat` is the model forecast; it was previously labelled "Measurement".
            name="Forecast",
            x=val_time,
            y=yhat,
            mode="lines",
            line=dict(color="rgb(31, 119, 180)"),
        ),
        go.Scatter(
            name="True",
            x=val_time,
            y=y_val,
            mode="lines",
            # Distinct colour: the observed series used to share the forecast's
            # colour, making the two lines indistinguishable.
            line=dict(color="rgb(214, 41, 26)"),
        ),
        # The bound traces must stay adjacent and in this order: "tonexty" fills from
        # the lower bound up to the trace added just before it.
        go.Scatter(
            name="Upper Bound",
            x=val_time,
            y=ci[:, 1],
            mode="lines",
            marker=dict(color="#444"),
            line=dict(width=0),
            showlegend=False,
        ),
        go.Scatter(
            name="Lower Bound",
            x=val_time,
            y=ci[:, 0],
            marker=dict(color="#444"),
            line=dict(width=0),
            mode="lines",
            fillcolor="rgba(68, 68, 68, 0.3)",
            fill="tonexty",
            showlegend=False,
        ),
    ]
)
fig.update_layout(
    # The plotted quantity is the `relative_power` column, not local power.
    yaxis=dict(title=dict(text="Relative power")),
    title=dict(text="SARIMAX forecast with 95% confidence interval"),
    hovermode="x",
)
fig.show()

In [None]:
def _fit_and_forecast(
    y: pl.Series,
    horizon: int,
    K_yearly: int = 6,
    sarima_order=(1, 0, 1),
    sarima_seasonal=(1, 1, 1, 24),
    last_params: np.ndarray | None = None,
):
    """Fit a SARIMAX model with a constant trend and forecast `horizon` steps ahead.

    No exogenous regressors are used: the yearly Fourier terms an earlier draft
    planned were never wired in, so `K_yearly` is accepted only for backward
    compatibility and is ignored.

    Args:
        y: Endogenous series to fit.
        horizon: Number of steps to forecast past the end of `y`.
        K_yearly: Unused (kept so existing callers do not break).
        sarima_order: Non-seasonal (p, d, q) order.
        sarima_seasonal: Seasonal (P, D, Q, s) order; the default period is 24.
        last_params: Parameter vector of a previous fit used as optimiser start
            values (warm start), or None for a cold start.

    Returns:
        Tuple `(yhat, conf_int, params)`: the predicted mean, the 95% confidence
        interval of the forecast, and the fitted parameters (suitable as
        `last_params` for the next call).
    """
    mod = SARIMAX(
        y,
        order=sarima_order,
        seasonal_order=sarima_seasonal,
        trend="c",
        enforce_stationarity=False,
        enforce_invertibility=False,
    )

    fit_kwargs = dict(disp=False, maxiter=10)
    if last_params is not None:
        # Warm start across rolling cutoffs: begin from the previous solution.
        # NOTE(review): the iteration cap is *raised* to 500 for warm starts while
        # cold starts stop after 10 iterations — confirm this is not inverted.
        fit_kwargs["start_params"] = last_params
        fit_kwargs["maxiter"] = 500

    res = mod.fit(**fit_kwargs)
    fc = res.get_forecast(steps=horizon)
    yhat = fc.predicted_mean
    ci = fc.conf_int(alpha=0.05)  # 95% interval
    return yhat, ci, res.params

In [None]:
# List forecast issue times (`time_ref`) that follow a gap of more than 24 hours
# since the previous issue time.
# NOTE(review): `weather_forecast` is not defined in any visible cell.
weather_forecast.select(pl.col("time_ref").unique().sort()).with_columns(
    hours_since_last=(pl.col("time_ref") - pl.col("time_ref").shift()).dt.total_hours()
).filter(pl.col("hours_since_last") > 24).collect()

In [None]:
# Draw one random (time_ref, windpark_name) combination from the prediction file.
# No seed is set, so every run picks a different combination.
sample = (
    pl.scan_parquet("data/single_model_pred.parquet")
    .select(["time_ref", "windpark_name"])
    .unique()
    .collect()
    .sample(n=1)
)
windpark = sample.item(0, "windpark_name")
time_ref = sample.item(0, "time_ref")

In [None]:
# All distinct (time_ref, windpark_name) combinations in the prediction file,
# materialised once so the next cell can resample cheaply.
samples = (
    pl.scan_parquet("data/single_model_pred.parquet")
    .select("time_ref", "windpark_name")
    .unique()
    .collect()
)

In [None]:
# Pick a random (windpark, time_ref) pair and plot the per-`em` predictions
# together with the observed local power for that forecast run.
sample = samples.sample(n=1)
windpark = sample.item(0, "windpark_name")
time_ref = sample.item(0, "time_ref")
title = f"{windpark} {time_ref:%Y-%m-%d %H:%M}"

# Predictions: `em` is cast to string so it can share a column with the "target" label.
local_pred = pl.scan_parquet("data/single_model_pred.parquet").select(
    [
        "time_ref",
        "time",
        "sid",
        "windpark_name",
        pl.col("em").cast(pl.String),
        pl.col("local_power_pred").alias("local_power"),
    ]
)
# Observations: one row per (time_ref, time, sid, windpark_name), tagged em="target".
local_power = (
    pl.scan_parquet("data/windpower_single_model_dataset.parquet")
    .group_by(["time_ref", "time", "sid", "windpark_name"])
    .agg(em=pl.lit("target"), local_power=pl.col("local_power").first())
)

df_plot = (
    pl.concat([local_pred, local_power])
    .filter(
        pl.col("windpark_name").eq(windpark),
        pl.col("time_ref").eq(time_ref),
    )
    .with_columns(target=pl.col("em").eq("target"))
    .sort(["time", "em"])
    .collect()
)

px.line(
    data_frame=df_plot,
    x="time",
    y="local_power",
    color="em",
    line_dash="target",
    title=title,
)

In [None]:
def ensemble_mean(variable, n_members=15):
    """Row-wise mean over the ensemble-member columns of `variable`.

    Args:
        variable: Column prefix; member columns are named `{variable}_00`,
            `{variable}_01`, ... (two-digit, zero-padded index).
        n_members: Number of member columns to average (default 15, the value
            that used to be hard-coded).

    Returns:
        A polars expression aliased to `{variable}_mean`.
    """
    member_columns = [f"{variable}_{k:02d}" for k in range(n_members)]
    return pl.mean_horizontal(pl.col(member_columns)).alias(f"{variable}_mean")


def ensemble_std(variable, n_members=15):
    """Row-wise standard deviation over the ensemble-member columns of `variable`.

    Args:
        variable: Column prefix; member columns are named `{variable}_00`,
            `{variable}_01`, ... (two-digit, zero-padded index).
        n_members: Number of member columns to include (default 15, the value
            that used to be hard-coded).

    Returns:
        A polars expression aliased to `{variable}_std`.
    """
    member_columns = [f"{variable}_{k:02d}" for k in range(n_members)]
    # Gather the members into one list per row, then reduce that list.
    return pl.concat_list(pl.col(member_columns)).list.std().alias(f"{variable}_std")


# Per-park ensemble predictions (lazy) and the matching observations.
local_preds = pl.scan_parquet("data/local_power_pred.parquet")

power = pl.scan_parquet("data/windpower_ensemble_dataset.parquet").select(
    ["time_ref", "time", "lt", "sid", "windpark_name", "local_power", "power"]
)

# Sum each ensemble member's park-level predictions within a bidding area, then
# summarise the 15 area-level members by their mean and standard deviation.
local_pred_sum = (
    local_preds.filter(pl.col("time_ref").gt(datetime(2025, 1, 1)))
    .join(power, on=["time_ref", "time", "sid", "windpark_name"])
    .group_by(["bidding_area", "time_ref", "time", "lt"], maintain_order=True)
    .agg(
        pl.col("power").first(),
        *[
            pl.col(f"local_power_pred_{k:02d}").sum().alias(f"power_pred_{k:02d}")
            for k in range(15)
        ],
    )
    .select(
        [
            "time_ref",
            "time",
            "lt",
            "bidding_area",
            "power",
            ensemble_mean("power_pred").alias("power_pred_local_mean"),
            ensemble_std("power_pred").alias("power_pred_local_std"),
        ]
    )
    .collect()
)

local_pred_sum

In [None]:
# Observed area power vs. the mean of the summed local predictions.
px.scatter(
    data_frame=local_pred_sum,
    x="power",
    y="power_pred_local_mean",
    color="bidding_area",
    height=700,
)

In [None]:
# Area-level ensemble predictions for forecasts issued after 2025-01-01, reduced to
# the ensemble mean and standard deviation of the `area_power_pred_XX` columns.
area_pred = (
    pl.read_parquet("data/area_power_pred.parquet")
    .filter(pl.col("time_ref") > datetime(2025, 1, 1))
    .select(
        "time_ref",
        "time",
        "lt",
        "bidding_area",
        "power",
        ensemble_mean("area_power_pred").alias("power_pred_area_mean"),
        ensemble_std("area_power_pred").alias("power_pred_area_std"),
    )
)
area_pred

In [None]:
# Observed area power vs. the area model's ensemble-mean prediction.
px.scatter(
    data_frame=area_pred,
    x="power",
    y="power_pred_area_mean",
    color="bidding_area",
    height=700,
)

In [None]:
# Observed power and both prediction variants over time for one bidding area
# (rows with `lt` < 24 only), reshaped to long format for plotting.
bidding_area = "ELSPOT NO4"
px.line(
    data_frame=(
        local_pred_sum.join(area_pred, on=["time_ref", "time", "lt", "bidding_area"])
        .filter(pl.col("bidding_area").eq(bidding_area), pl.col("lt").lt(24))
        .unpivot(
            index=["time_ref", "time", "lt"],
            on=["power", "power_pred_local_mean", "power_pred_area_mean"],
        )
    ),
    x="time",
    y="value",
    color="variable",
)

In [None]:
# RMSE per bidding area for the summed-local and the area-model ensemble means.
(
    local_pred_sum.join(area_pred, on=["time_ref", "time", "lt", "bidding_area"])
    .group_by("bidding_area")
    .agg(
        RMSE_local=(pl.col("power_pred_local_mean") - pl.col("power")).pow(2).mean().sqrt(),
        RMSE_area=(pl.col("power_pred_area_mean") - pl.col("power")).pow(2).mean().sqrt(),
    )
    .sort("bidding_area")
)

In [None]:
# All value columns of the area predictions over time for the selected bidding
# area (rows with `lt` < 24 only), in long format.
px.line(
    data_frame=area_pred.filter(
        pl.col("bidding_area").eq(bidding_area), pl.col("lt").lt(24)
    ).unpivot(index=["time_ref", "time", "lt", "bidding_area"]),
    x="time",
    y="value",
    color="variable",
)

In [None]:
import polars as pl

# Lazy handles on the ensemble feature dataset and the per-park ensemble predictions.
ensemble_dataset = pl.scan_parquet("data/windpower_ensemble_dataset.parquet")
ensemble_preds = pl.scan_parquet("data/local_power_pred.parquet")

In [None]:
from prepare_ensemble_data import ENSEMBLE_MEMBERS

# Columns used as additional group keys in the aggregation below, i.e. they are
# carried through unchanged instead of being aggregated.
no_agg_features = [
    "ELSPOT NO1",
    "ELSPOT NO2",
    "ELSPOT NO3",
    "ELSPOT NO4",
    "sin_hod",
    "cos_hod",
    "sin_doy",
    "cos_doy",
]


def weighted_mean(variable: str) -> pl.Expr:
    """Mean of `variable` weighted by the `operating_power_max` column.

    Returns an aggregation expression that keeps the name `variable`.
    """
    capacity = pl.col("operating_power_max")
    weighted_total = (pl.col(variable) * capacity).sum()
    return (weighted_total / capacity.sum()).alias(variable)


def em_mean(variable: str, em: int) -> pl.Expr:
    """Plain mean of the column `{variable}_{em:02d}` (one ensemble member)."""
    member_column = f"{variable}_{em:02d}"
    return pl.col(member_column).mean().alias(member_column)


def em_weighted_mean(variable: str, em: int) -> pl.Expr:
    """Capacity-weighted mean of the column `{variable}_{em:02d}` (one ensemble member)."""
    member_column = f"{variable}_{em:02d}"
    return weighted_mean(member_column)


# Join features with predictions, aggregate within each group and show the last rows.
# NOTE(review): `sid` and `windpark_name` are among the group keys, so each group
# presumably holds a single park — confirm whether an area-level aggregation was intended.
(
    ensemble_dataset.join(
        ensemble_preds, on=["time_ref", "time", "sid", "windpark_name"]
    )
    .group_by(
        [
            "time_ref",
            "time",
            "lt",
            "sid",
            "windpark_name",
            "bidding_area",
            "power",
            *no_agg_features,
        ]
    )
    .agg(
        pl.col("operating_power_max").sum(),
        pl.col("mean_production").sum(),
        pl.col("num_turbines").sum(),
        weighted_mean("location_mean_ws"),
        *[em_mean("ws_power_scaled", em) for em in ENSEMBLE_MEMBERS],
        *[em_mean("ws_turbine_scaled", em) for em in ENSEMBLE_MEMBERS],
        *[em_mean("wind_power_density_scaled", em) for em in ENSEMBLE_MEMBERS],
        *[em_weighted_mean("wind_alignment", em) for em in ENSEMBLE_MEMBERS],
    )
    .tail()
    .collect()
)

In [None]:
# Observed local and area power per (time_ref, time, sid, windpark_name), loaded eagerly.
# Note that this rebinds `local_power` to a DataFrame.
local_power = (
    pl.scan_parquet("data/windpower_ensemble_dataset.parquet")
    .select("time_ref", "time", "sid", "windpark_name", "local_power", "power")
    .collect()
)
local_power

In [None]:
def ensemble_mean(variable, n_members=15):
    """Row-wise mean over the ensemble-member columns of `variable`.

    NOTE: this redefines the helper of the same name from an earlier cell.

    Args:
        variable: Column prefix; member columns are named `{variable}_00`,
            `{variable}_01`, ... (two-digit, zero-padded index).
        n_members: Number of member columns to average (default 15, the value
            that used to be hard-coded).

    Returns:
        A polars expression aliased to `{variable}_mean`.
    """
    member_columns = [f"{variable}_{k:02d}" for k in range(n_members)]
    return pl.mean_horizontal(pl.col(member_columns)).alias(f"{variable}_mean")


def ensemble_median(variable, n_members=15):
    """Row-wise median over the ensemble-member columns of `variable`.

    Args:
        variable: Column prefix; member columns are named `{variable}_00`,
            `{variable}_01`, ... (two-digit, zero-padded index).
        n_members: Number of member columns to include (default 15, the value
            that used to be hard-coded).

    Returns:
        A polars expression aliased to `{variable}_median`.
    """
    member_columns = [f"{variable}_{k:02d}" for k in range(n_members)]
    # Gather the members into one list per row, then reduce that list.
    return (
        pl.concat_list(pl.col(member_columns)).list.median().alias(f"{variable}_median")
    )


def ensemble_std(variable, n_members=15):
    """Row-wise standard deviation over the ensemble-member columns of `variable`.

    NOTE: this redefines the helper of the same name from an earlier cell.

    Args:
        variable: Column prefix; member columns are named `{variable}_00`,
            `{variable}_01`, ... (two-digit, zero-padded index).
        n_members: Number of member columns to include (default 15, the value
            that used to be hard-coded).

    Returns:
        A polars expression aliased to `{variable}_std`.
    """
    member_columns = [f"{variable}_{k:02d}" for k in range(n_members)]
    # Gather the members into one list per row, then reduce that list.
    return pl.concat_list(pl.col(member_columns)).list.std().alias(f"{variable}_std")


# Evaluation frame: ensemble mean/std of the local predictions, a mean +/- 1.96 * std
# band, and the residual against the observed local power.
windpark_name = "Roan Vindpark"

df_eval = (
    # `local_preds` comes from `scan_parquet` (lazy) while `local_power` was collected
    # into a DataFrame; a lazy/eager join raises, so both sides are made lazy
    # (`.lazy()` is a no-op on a LazyFrame).
    local_preds.lazy()
    # .filter(pl.col("windpark_name") == windpark_name)
    # .filter((pl.col("time") - pl.col("time_ref")).dt.total_hours() < 24)
    .join(local_power.lazy(), on=["time_ref", "time", "sid", "windpark_name"])
    .select(
        "time_ref",
        "time",
        "sid",
        "windpark_name",
        "bidding_area",
        "power",
        "local_pred_time_ref",
        "local_power",
        ensemble_mean("local_power_pred"),
        ensemble_std("local_power_pred"),
    )
    .with_columns(
        lower_bound=pl.col("local_power_pred_mean")
        - 1.96 * pl.col("local_power_pred_std"),
        upper_bound=pl.col("local_power_pred_mean")
        + 1.96 * pl.col("local_power_pred_std"),
        residual=pl.col("local_power_pred_mean") - pl.col("local_power"),
    )
    # Materialise here: the frame is displayed below and handed to plotly in the
    # next cell, both of which need actual data rather than a query plan.
    .collect()
)
df_eval

In [None]:
# Histogram of (summed local ensemble-mean prediction - observed area power)
# for one bidding area.
bidding_area = "ELSPOT NO3"
px.histogram(
    data_frame=(
        df_eval.filter(pl.col("bidding_area").eq(bidding_area))
        .group_by(["bidding_area", "time_ref", "time"])
        .agg(
            pl.col("power").first(),
            pl.col("local_power").sum(),
            pl.col("local_power_pred_mean").sum(),
        )
        .with_columns(residual=pl.col("local_power_pred_mean").sub(pl.col("power")))
    ),
    x="residual",
    facet_col="bidding_area",
)

In [None]:
import geopandas as gpd

# Load the NVE geodatabase and keep only the rows that have a `sakID`.
gdf = gpd.read_file("data/NVEData.gdb")
gdf = gdf[gdf["sakID"].notnull()]
# gdf = gdf[gdf["totaltAntTurbiner"].notnull()]
gdf

In [None]:
# Distinct values of the `status` column.
gdf["status"].unique()

In [None]:
# Join the park-level dataset with predictions and compute the squared error per row,
# keeping only rows where both the observation and the prediction are present.
# NOTE(review): `preds` is not defined in any visible cell; this also rebinds `data`,
# which an earlier cell uses for a different frame.
data = (
    pl.read_parquet("data/windpower_dataset.parquet")
    .join(preds, on=["time_ref", "time", "sid", "windpark_name"])
    .filter(
        pl.col("local_power").is_not_null(), pl.col("local_power_pred").is_not_null()
    )
    .with_columns(sqerror=(pl.col("local_power_pred") - pl.col("local_power")) ** 2)
)
data

In [None]:
# Daily MSE per wind park inside one bidding area, then the mean MSE per park
# among the 100 worst park-days.
bidding_area = "ELSPOT NO3"
daily_error = (
    # A bare string passed to `filter` is interpreted as a *column name*, so the
    # original `data.filter(bidding_area)` used the indicator column named
    # "ELSPOT NO3" as the row mask. That is now explicit, and the cast makes it
    # work for 0/1-encoded indicators as well as boolean ones.
    data.filter(pl.col(bidding_area).cast(pl.Boolean))
    .group_by(
        "sid",
        "windpark_name",
        pl.col("time").dt.date(),
        "operating_power_max",
        "mean_production",
        "num_turbines",
    )
    .agg(MSE=pl.col("sqerror").mean())
    .sort("MSE")  # ascending, so `.tail(100)` below selects the largest errors
)
daily_error.tail(100).group_by(pl.col("windpark_name")).agg(pl.col("MSE").mean())
# px.bar(daily_error.filter(pl.col("windpark_name") == "Geitfjellet Vindpark"), "time", "MSE")

In [None]:
from datetime import datetime
import polars as pl
import xgboost as xgb
import numpy as np
import optuna

# Ensemble member whose raw weather variables are used as features.
em = 0
# Model input columns. The order of this list is the column order seen by the model,
# so keep it stable between training and prediction.
features = [
    "lt",
    "operating_power_max",
    "mean_production",
    "num_turbines",
    "ELSPOT NO1",
    "ELSPOT NO2",
    "ELSPOT NO3",
    "ELSPOT NO4",
    f"ws10m_{em:02d}",
    f"wd10m_{em:02d}",
    f"t2m_{em:02d}",
    f"rh2m_{em:02d}",
    f"mslp_{em:02d}",
    f"g10m_{em:02d}",
    "wind_alignment",
    "ws_magnitude",
    "ws10m_mean",
    "t2m_mean",
    "rh2m_mean",
    "mslp_mean",
    "g10m_mean",
    "ws10m_std",
    "t2m_std",
    "rh2m_std",
    "mslp_std",
    "g10m_std",
    "now_air_temperature_2m",
    "now_air_pressure_at_sea_level",
    "now_relative_humidity_2m",
    "now_precipitation_amount",
    "now_wind_speed_10m",
    "now_wind_direction_10m",
    "now_air_density",
    "location_mean_ws",
    "now_wind_power_density",
    "sin_hod",
    "cos_hod",
    "sin_doy",
    "cos_doy",
    "air_density",
    "wind_power_scaled",
    "wind_turbine_scaled",
    "wind_power_density",
    "wind_power_density_scaled",
]
target = "local_power"

# Chronological split on the forecast issue time: before the cutoff -> train,
# on/after the cutoff -> validation. Rows without a target are dropped first.
cutoff_date = datetime(2024, 1, 1, 0, 0)
df = (
    pl.read_parquet("data/windpower_dataset.parquet")
    .filter(pl.col(target).is_not_null())
    .sort("time_ref", "time", "bidding_area")
)

df_train = df.filter(pl.col("time_ref") < cutoff_date)
df_val = df.filter(pl.col("time_ref") >= cutoff_date)
X_train = df_train.select(features)
X_val = df_val.select(features)
y_train = df_train.get_column(target)
y_val = df_val.get_column(target)


# Reuse the best hyper-parameters of an existing Optuna study, then switch the
# objective to quantile regression for the 5% and 95% quantiles, so `predict`
# returns one column per quantile.
study = optuna.load_study(
    study_name="local_power_xgb_6",
    storage="sqlite:///optuna.db",
)
best_params = study.best_params
best_params["n_estimators"] = study.best_trial.user_attrs["n_estimators"]
best_params["objective"] = "reg:quantileerror"
best_params["quantile_alpha"] = np.array([0.05, 0.95])
# NOTE(review): training is pinned to a CUDA device; this cell needs a GPU as written.
best_params["device"] = "cuda"
best_params["tree_method"] = "hist"

model = xgb.XGBRegressor(**best_params)
model.fit(X_train, y_train)

In [None]:
# Predict both quantiles for the validation rows and check the output shape.
pred = model.predict(X_val)
pred.shape

In [None]:
# One row per wind park with its `operating_power_max`, sorted ascending.
df_val.group_by("windpark_name").agg(pl.col("operating_power_max").first()).sort(
    "operating_power_max"
)

In [None]:
# Attach the predicted quantiles to the validation rows and keep one wind park
# with `lt` < 24. The lower quantile is floored at zero; the upper one is left as predicted.
windpark_name = "Kvitfjell"
df_eval = (
    df_val.with_columns(
        q05=np.maximum(pred[:, 0], 0),
        q95=pred[:, 1],
    )
    .filter(
        pl.col("windpark_name").eq(windpark_name),
        pl.col("lt").lt(24),
    )
    .select(
        ["time_ref", "time", "bidding_area", "power", "local_power", "lt", "q05", "q95"]
    )
    .sort("time")
)
df_eval

In [None]:
# Share of observations below the 5% quantile, above the 95% quantile, and
# outside the interval altogether.
df_eval.select(
    under=(pl.col("local_power") < pl.col("q05")).mean(),
    over=(pl.col("local_power") > pl.col("q95")).mean(),
    out=(
        (pl.col("local_power") < pl.col("q05"))
        | (pl.col("local_power") > pl.col("q95"))
    ).mean(),
)

In [None]:
import plotly.graph_objs as go

# Observed local power with the predicted 5%-95% quantile band.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        name="Measurement",
        x=df_eval["time"],
        y=df_eval["local_power"],
        mode="lines",
        line={"color": "rgb(31, 119, 180)"},
    )
)
# The bound traces are added back to back: "tonexty" fills from the lower bound
# up to the trace added immediately before it.
fig.add_trace(
    go.Scatter(
        name="Upper Bound",
        x=df_eval["time"],
        y=df_eval["q95"],
        mode="lines",
        marker={"color": "#444"},
        line={"width": 0},
        showlegend=False,
    )
)
fig.add_trace(
    go.Scatter(
        name="Lower Bound",
        x=df_eval["time"],
        y=df_eval["q05"],
        mode="lines",
        marker={"color": "#444"},
        line={"width": 0},
        fillcolor="rgba(68, 68, 68, 0.3)",
        fill="tonexty",
        showlegend=False,
    )
)
fig.update_layout(
    yaxis={"title": {"text": "Local Power"}},
    title={"text": "Continuous, variable value error bars"},
    hovermode="x",
)
fig.show()

In [None]:
import requests

# Fetch the wind power plants currently in operation from the NVE API.
url = "https://api.nve.no/web/WindPowerplant/GetWindPowerPlantsInOperation"
# url = "https://api.nve.no/web/WindPowerplant/GetWindPowerPlants"

# Without a timeout `requests` can block forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
response.raise_for_status()
data = response.json()

In [None]:
# Flatten the API response into a DataFrame (one row per returned record).
windparks = pl.json_normalize(data)
# windparks.select(pl.col("Turbiner")).explode("Turbiner").unnest("Turbiner").group_by(
#     pl.col("TurbinType")
# ).agg(n=pl.col("AntallTurbiner").sum()).sort("n")
windparks  # .explode("Turbiner").unnest("Turbiner").filter(pl.col("DatoUtavdrift").is_not_null(), pl.col("InstallertEffekt_MW") > 0)

## Nordpool API

In [None]:
import requests

In [None]:
# Fetch the fuel types exposed by the Nord Pool UMM API's infrastructure endpoint.
url = "https://ummapi.nordpoolgroup.com/infrastructure/fueltypes"
fueltypes = requests.get(url).json()
fueltypes

In [None]:
# Fetch up to 2000 UMM messages for one unit, selected via the `units` query parameter.
url = "https://ummapi.nordpoolgroup.com/messages"
res = requests.get(
    url,
    params={"limit": 2000, "units": ["50WP00000001827F"]},
)
res.json()

In [None]:
# Norwegian bidding zones with the area codes passed to the UMM API.
areas = [
    dict(name="NO1", code="10YNO-1--------2"),
    dict(name="NO2", code="10YNO-2--------T"),
    dict(name="NO3", code="10YNO-3--------J"),
    dict(name="NO4", code="10YNO-4--------9"),
    dict(name="NO5", code="10Y1001A1001A48H"),
]
url = "https://ummapi.nordpoolgroup.com/messages"

# Page through all messages, 2000 at a time, advancing `skip` by the number of
# items actually received. Stops on a non-200 response (printing the status),
# on an empty page, or once `skip` has reached the reported total.
# Optional filters that were tried: "messageTypes", a single area code,
# "fuelTypes", "publicationStartDate".
messages = []
skip = 0
while True:
    res = requests.get(
        url,
        params={
            "limit": 2000,
            "areas": [a["code"] for a in areas],
            "skip": skip,
        },
    )
    if res.status_code != 200:
        print(res.status_code)
        break

    content = res.json()
    if not content["items"]:
        break

    messages += content["items"]
    skip = skip + len(content["items"])
    print(
        f"Retrieved: {len(content['items'])} ---- Progress: {skip}/{content['total']}"
    )
    if skip >= content["total"]:
        break

In [None]:
# One row per message with selected fields of its first unit: the first entry of
# `productionUnits` when that list is present, otherwise the first entry of
# `generationUnits`.
unit_fields = ["name", "eic", "areaName", "installedCapacity", "timePeriods", "fuelType"]
outages = pl.json_normalize(messages, infer_schema_length=1000).select(
    [
        pl.when(pl.col("productionUnits").is_not_null())
        .then(pl.col("productionUnits").list.first().struct.field(u))
        .otherwise(pl.col("generationUnits").list.first().struct.field(u))
        .alias(u)
        for u in unit_fields
    ]
)

In [None]:
# Number of messages per fuel type.
px.histogram(data_frame=outages, x="fuelType")

In [None]:
# Like the previous `outages` frame, but all message columns are kept and every
# entry of the unit's `timePeriods` list becomes its own row with the period
# fields pulled out as columns.
unit_fields = ["name", "eic", "areaName", "installedCapacity", "timePeriods"]
period_fields = ["unavailableCapacity", "availableCapacity", "eventStart", "eventStop"]
outages = (
    pl.json_normalize(messages, infer_schema_length=500)
    .with_columns(
        [
            pl.when(pl.col("productionUnits").is_not_null())
            .then(pl.col("productionUnits").list.first().struct.field(u))
            .otherwise(pl.col("generationUnits").list.first().struct.field(u))
            .alias(u)
            for u in unit_fields
        ]
    )
    .explode("timePeriods")
    .with_columns([pl.col("timePeriods").struct.field(p) for p in period_fields])
    .drop("timePeriods")
)

In [None]:
# Persist the outage table.
# NOTE(review): the frame still carries the original message columns; if any of them
# is a list/struct column the CSV writer will reject it — confirm or drop them first.
outages.write_csv("data/outages.csv")

In [None]:
# Make sure `windparks` is an eager DataFrame. `.lazy()` first makes this safe to
# run regardless of whether `windparks` currently is a LazyFrame or already a
# DataFrame (a plain `.collect()` fails on a DataFrame), and safe to re-run.
windparks = windparks.lazy().collect()

In [None]:
# Match wind parks to their outage messages via the unit's EIC code.
# The previous version ended in `.filter(pl.col())`, which raises because `pl.col`
# needs a column name, and used the key "eid_code".
# NOTE(review): "eic_code" is the key other cells use for the wind park table —
# confirm it is the right column, and re-add a filter here if one was intended.
windparks.join(outages, left_on="eic_code", right_on="eic")

In [None]:
# Request the station list from the UMM infrastructure endpoint; the response object
# is displayed so the HTTP status can be checked.
url = "https://ummapi.nordpoolgroup.com/infrastructure/stations"
# url = "https://ummapi.nordpoolgroup.com/infrastructure/assets"

res = requests.get(url)
res

In [None]:
# Wind park table from the bidding-zone CSV; show its row count.
windparks = pl.read_csv("data/windparks_bidzone.csv", try_parse_dates=True)
windparks.height

In [None]:
# Flatten the JSON payload of the most recent `res` response into a DataFrame.
windpark_lookup = pl.json_normalize(res.json())
windpark_lookup

In [None]:
# NVE wind park register reduced to name, id and bidding area (renamed to the
# notebook's column conventions); parks without a bidding area are dropped.
windparks_nve = (
    pl.read_csv("data/windparks_nve.csv", separator=";", decimal_comma=True)
    .select(
        pl.col("Kraftverknavn").alias("windpark_name"),
        pl.col("KraftverkID").alias("windpark_id"),
        pl.col("Prisområde").alias("bidding_area"),
    )
    .filter(pl.col("bidding_area").is_not_null())
    .sort("windpark_id")
)
windparks_nve

In [None]:
def get_local_windpower(path: str) -> pl.LazyFrame:
    """Read the wide per-park production CSV and return it in long format.

    The file is read with every column as text. Its first data row holds the
    numeric id of each wind park column; the remaining rows hold the readings.

    Args:
        path: Location of the semicolon-separated CSV file.

    Returns:
        LazyFrame with columns `time` (naive UTC), `windpark_name`,
        `local_power` (Float32) and `windpark_id` (Int64), restricted to
        timestamps after 2020-01-01 and without null readings.
    """
    from datetime import date

    raw = pl.scan_csv(path, separator=";", decimal_comma=True, infer_schema=False)

    # First row: one id per park column -> (windpark_name, windpark_id) lookup.
    park_ids = (
        raw.head(1)
        .drop("time")
        .unpivot(variable_name="windpark_name", value_name="windpark_id")
        .with_columns(pl.col("windpark_id").cast(pl.Int64))
    )

    # Timestamps are given in UTC+1; convert to UTC and drop the time zone because
    # all other datetimes in the project are naive UTC.
    utc_time = (
        pl.col("time")
        .str.to_datetime("%Y-%m-%d %H:%M:%S", time_unit="ns", time_zone="+01:00")
        .dt.convert_time_zone("UTC")
        .dt.replace_time_zone(None)
    )
    # Readings use a decimal comma; normalise it before casting to float.
    readings = pl.exclude("time").str.replace(",", ".").cast(pl.Float32)

    return (
        raw.slice(1)
        .select(utc_time, readings)
        .filter(pl.col("time") > date(2020, 1, 1))
        .unpivot(index="time", variable_name="windpark_name", value_name="local_power")
        .drop_nulls()
        .join(park_ids, on="windpark_name")
    )


# Long-format local production joined with the NVE register on the park id.
# A full join keeps unmatched rows from both sides, so the final expression lists
# the rows that ended up without a bidding area.
local_power = (
    get_local_windpower("data/windpower2002-2024_utcplus1.csv")
    .collect()
    .join(windparks_nve, on="windpark_id", how="full")
)
local_power.filter(pl.col("bidding_area").is_null())

In [None]:
# Local production over time for the two parks named "Sandøy" and "Nye Sandøy".
px.line(
    data_frame=local_power.filter(
        pl.col("windpark_name").is_in(["Sandøy", "Nye Sandøy"])
    ),
    x="time",
    y="local_power",
)

In [None]:
# Area-level production in long format: the parquet's index column becomes `time`,
# every other column becomes one (bidding_area, power) row, and the bidding area
# is reduced to the integer given by the last character of the column name.
windpower = (
    pl.read_parquet("data/wind_power_per_bidzone.parquet")
    .rename({"__index_level_0__": "time"})
    .unpivot(index="time", variable_name="bidding_area", value_name="power")
    .with_columns(pl.col("bidding_area").str.tail(1).cast(pl.Int64))
)
windpower

In [None]:
# Predictions per `em` for one park, limited to lead times below 24 hours.
px.line(
    pl.scan_parquet("data/single_model_pred.parquet")
    .filter(
        pl.col("sid") == "Raggovidda",
        # The lead time is the duration (time - time_ref). The closing parenthesis
        # used to sit after `< 24`, which applied `.dt.total_hours()` to the
        # datetime column `time_ref` instead of to the difference.
        (pl.col("time") - pl.col("time_ref")).dt.total_hours() < 24,
    )
    .collect(),
    "time",
    "local_power_pred",
    color="em",
)

In [None]:
# Show up to 20 table rows from here on (global polars display setting).
pl.Config.set_tbl_rows(20)
# RMSE per bidding area and year between the summed local production and the
# area-level production series.
(
    local_power.group_by(["time", "bidding_area"])
    .agg(local_power=pl.col("local_power").sum().cast(pl.Float64))
    .join(windpower, on=["time", "bidding_area"])
    .group_by("bidding_area", year=pl.col("time").dt.year())
    .agg(RMSE=(pl.col("local_power") - pl.col("power")).pow(2).mean().sqrt())
    .sort(["bidding_area", "year"])
)

In [None]:
# Compare two corrections of the summed local production towards the area series
# (years up to 2022, bidding areas 2 and 3, summed local power above 10):
#   * multiplicative: scale by the per-area median of power / local_power
#   * power law:      raise to the per-area median of log(power) / log(local_power)
# and report the RMSE of each against the area series.
ratio = (
    local_power.group_by(["time", "bidding_area"])
    .agg(local_power=pl.col("local_power").sum().cast(pl.Float64))
    .join(windpower, on=["time", "bidding_area"])
    .filter(
        pl.col("time").dt.year().le(2022),
        pl.col("bidding_area").is_in([2, 3]),
        pl.col("local_power").gt(10),
    )
    .with_columns(
        ratio=pl.col("power") / pl.col("local_power"),
        log_ratio=pl.col("power").log() / pl.col("local_power").log(),
    )
    # Replace the row-wise ratios by their per-area medians (same column names).
    .with_columns(
        pl.col("ratio").median().over("bidding_area"),
        pl.col("log_ratio").median().over("bidding_area"),
    )
    .with_columns(
        ratio_adjusted=pl.col("local_power") * pl.col("ratio"),
        log_adjusted=pl.col("local_power").pow(pl.col("log_ratio")),
    )
    .group_by("bidding_area")
    .agg(
        RMSE_ratio=(pl.col("ratio_adjusted") - pl.col("power")).pow(2).mean().sqrt(),
        RMSE_log=(pl.col("log_adjusted") - pl.col("power")).pow(2).mean().sqrt(),
    )
    .sort("bidding_area")
)
ratio

In [None]:
# Area production from both sources on one chart (years up to 2022): colour
# encodes the bidding area, dash style encodes the source.
px.line(
    data_frame=(
        pl.concat(
            [
                windpower.with_columns(source=pl.lit("statkraft")),
                local_power.group_by(["time", "bidding_area"]).agg(
                    power=pl.col("local_power").sum().cast(pl.Float64),
                    source=pl.lit("NVE"),
                ),
            ]
        )
        .filter(pl.col("time").dt.year().le(2022))
        .sort(["time", "bidding_area", "source"])
    ),
    x="time",
    y="power",
    color="bidding_area",
    line_dash="source",
)

In [None]:
# Lookup table; it is joined to the wind park table on `eic_code` below.
windpark_lookup = pl.read_csv("data/windparks_lookup.csv")

# NVE register: strip the spaces from the mean-production figures before casting
# them to integers, and copy the park name into `windpark_nve` for the name join.
windparks_nve = pl.read_csv(
    "data/windparks_nve.csv", separator=";", decimal_comma=True, try_parse_dates=True
).with_columns(
    pl.col("Middelproduksjon [GWh]").str.replace_all(" ", "").cast(pl.Int64),
    pl.col("Kraftverknavn").alias("windpark_nve"),
)
# Bidding-zone table enriched with the lookup; `name` is copied to `windpark`.
windparks = (
    pl.read_csv("data/windparks_bidzone.csv", try_parse_dates=True)
    .join(windpark_lookup, on="eic_code", how="inner")
    .with_columns(pl.col("name").alias("windpark"))
)
# Left join by park name: parks without an NVE match keep nulls in the NVE columns.
windparks_match = windparks.join(
    windparks_nve, left_on="windpark", right_on="windpark_nve", how="left"
)

In [None]:
# Inspect the result of the name-based match.
windparks_match

In [None]:
# Average ratios over the matched parks, used to impute missing values below.
mw_per_turbine = windparks_match.select(
    pl.col("Installert effekt [MW]").truediv(pl.col("Antall turbiner")).mean()
).item()
GWh_per_MW = windparks_match.select(
    pl.col("Middelproduksjon [GWh]").truediv(pl.col("Installert effekt [MW]")).mean()
).item()

windparks_enriched = (
    # Capacity: prefer the NVE figure, fall back to the existing column.
    windparks_match.with_columns(
        operating_power_max=pl.coalesce("Installert effekt [MW]", "operating_power_max")
    )
    # Turbine count: NVE figure if present, otherwise estimated from the capacity.
    # Mean production is always derived from the capacity via the average ratio.
    .with_columns(
        num_turbines=pl.coalesce(
            "Antall turbiner", pl.col("operating_power_max") / mw_per_turbine
        ),
        mean_production=pl.col("operating_power_max") * GWh_per_MW,
    )
    .select(
        [
            "bidding_area",
            "substation_name",
            pl.col("name").alias("windpark_name"),
            "prod_start_new",
            "Første turbin",
            "Fullt idriftsatt",
            "Fylke",
            "Kommune",
            "operating_power_max",
            "mean_production",
            "num_turbines",
        ]
    )
)

In [None]:
# Rows whose `substation_name` occurs more than once in the enriched table.
windparks_enriched.filter(pl.col("substation_name").is_duplicated())
# windparks_match.filter(pl.col("substation_name") == "Stokkeland")

In [None]:
import torch

# Load the saved tensor dataset and convert features and targets to float32.
# NOTE(security): `torch.load` unpickles the file, which can execute arbitrary code —
# only load files from a trusted source.
blob = torch.load("data/torch_dataset_all_zones.pt")
X = blob["X"].float()
y = blob["y"].float()
y.shape

In [None]:
# Drop the samples whose target is NaN and write the result back.
# NOTE(review): this overwrites the very file the previous cell loaded, so the
# unfiltered dataset is lost after the first run. The boolean-mask indexing of `X`
# presumably relies on `y` having one value per sample — confirm the shapes.
nan_filter = ~torch.isnan(y)
X = X[nan_filter]
y = y[nan_filter]
torch.save(
    {
        "X": X,
        "y": y,
    },
    "data/torch_dataset_all_zones.pt",
)