In [None]:
import polars as pl
import numpy as np
import plotly.express as px

In [None]:
# Read the forecast table, keep the three columns used below and cast `time`
# to a nanosecond-precision datetime.
forecast = pl.read_csv("data/ts_forecast.csv")
forecast = forecast.select(
    pl.col("time_ref"),
    pl.col("time").cast(pl.Datetime("ns")),
    pl.col("power_forecast"),
)
forecast

In [None]:
from datetime import datetime

# Long-format power for ELSPOT NO3 after 2025-01-01, plus one lagged
# (shifted by one row) exponentially weighted mean per `com` value. The
# result is unpivoted again so every series becomes a `variable` for plotting.
windpower = (
    pl.scan_parquet("data/wind_power_per_bidzone.parquet")
    .rename({"__index_level_0__": "time"})
    .unpivot(index="time", variable_name="bidding_area", value_name="power")
    .filter(
        pl.col("bidding_area").eq("ELSPOT NO3"),
        pl.col("time").gt(datetime(2025, 1, 1)),
    )
    .with_columns(
        [
            pl.col("power")
            .shift()
            .ewm_mean(com=com)
            .over("bidding_area", order_by="time")
            .alias(f"EWM {com}")
            for com in (0, 6, 12, 24)
        ]
    )
    .unpivot(index=["time", "bidding_area"])
    .collect()
)
px.line(windpower, x="time", y="value", color="variable")

In [None]:
# Time span covered by the forecast table.
start_time, end_time = forecast.select(
    pl.col("time").min().alias("start"),
    pl.col("time").max().alias("end"),
).row(0)
bidding_area = "ELSPOT NO3"

# Observed power for the bidding area, shaped like `forecast` (same three
# column names) so the two frames can be concatenated; the observations are
# tagged with time_ref == "true_power".
windpower = (
    pl.scan_parquet("data/wind_power_per_bidzone.parquet")
    .select(
        time_ref=pl.lit("true_power"),
        time=pl.col("__index_level_0__"),
        power_forecast=pl.col(bidding_area),
    )
    .filter(pl.col("time").is_between(start_time, end_time))
    .sort("time")
    .collect()
)
windpower

In [None]:
# Stack forecasts and observations. Observations carry the sentinel
# time_ref == "true_power", so a row is a forecast exactly when its time_ref
# is NOT that sentinel (the flag was previously inverted).
pl.concat([forecast, windpower]).with_columns(
    is_forecast=pl.col("time_ref") != "true_power"
)

In [None]:
# Plot every forecast run (one line per time_ref) together with the observed
# power. Observations carry the sentinel time_ref == "true_power", so
# `is_forecast` must be True for every other row; the flag was previously
# inverted, which mislabelled the colour legend.
px.line(
    pl.concat([forecast, windpower]).with_columns(
        is_forecast=pl.col("time_ref") != "true_power"
    ),
    "time",
    "power_forecast",
    color="is_forecast",
    line_group="time_ref",
)

In [None]:
# Wind-park metadata, scanned lazily with date-like columns parsed on read.
windparks = pl.scan_csv("data/windparks_enriched.csv", try_parse_dates=True)
# Attach the park metadata to the weather forecast by matching the forecast's
# `sid` against the park's `substation_name` (default join type).
weather_forecast = pl.scan_parquet("data/met_forecast.parquet").join(
    windparks, left_on="sid", right_on="substation_name"
)

# Power per bidding zone in long format: one row per (power_time, bidding_area),
# plus one boolean indicator column per zone "ELSPOT NO1" .. "ELSPOT NO4".
windpower = (
    (
        pl.scan_parquet("data/wind_power_per_bidzone.parquet").rename(
            {"__index_level_0__": "power_time"}
        )
    )
    .unpivot(index="power_time", variable_name="bidding_area", value_name="power")
    .with_columns(
        (pl.col("bidding_area") == f"ELSPOT NO{k}").alias(f"ELSPOT NO{k}")
        for k in range(1, 5)
    )
)
# Preview the last rows of the long table.
windpower.tail(10).collect()

In [None]:
# For every (bidding_area, time_ref) pair in the forecast, summarise the power
# observed in the 24 hours before time_ref (lower bound inclusive, time_ref
# itself excluded): the mean over the whole window and the mean over the rows
# from the last 3 hours of it.
(
    weather_forecast.select("bidding_area", "time_ref")
    .unique()
    .join(windpower, on="bidding_area")
    .filter(
        pl.col("power_time").is_between(
            pl.col("time_ref") - pl.duration(hours=24),
            pl.col("time_ref"),
            closed="left",
        )
    )
    .group_by("bidding_area", "time_ref")
    .agg(
        pl.col("power").mean().alias("last_day_mean"),
        pl.col("power")
        .filter(pl.col("power_time") >= pl.col("time_ref") - pl.duration(hours=3))
        .mean()
        .alias("last_values_mean"),
    )
    .head(10)
    .collect()
)

In [None]:
# Re-scan the raw forecast file. This rebinds `weather_forecast` to the plain
# parquet scan, without any columns joined onto it.
weather_forecast = pl.scan_parquet("data/met_forecast.parquet")

# If `time` is stored as naive UTC strings, parse it first (uncomment):
# weather_forecast = weather_forecast.with_columns(
#     pl.col("time").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S", strict=False)
# )

# If you want LOCAL-time features (recommended for power data), convert the time zone first:
# weather_forecast = weather_forecast.with_columns(
#     pl.col("time").dt.replace_time_zone("UTC").dt.convert_time_zone("Europe/Oslo")
# )

TAU = 2 * np.pi  # 2π

hour = pl.col("time").dt.hour().cast(pl.Float64)  # 0..23
doy = pl.col("time").dt.ordinal_day().cast(pl.Float64)  # 1..365/366
# Use fractional day-of-year so the seasonal cycle is smooth across days:
doy_frac = (doy - 1 + hour / 24.0) / 365.2425  # ~[0,1)

# Encode both cycles as (sin, cos) pairs so the end of a period sits next to
# its start (hour 23 next to hour 0, late December next to early January).
weather_forecast = weather_forecast.with_columns(
    # Hour-of-day (period = 24)
    (TAU * hour / 24.0).sin().alias("sin_hod"),
    (TAU * hour / 24.0).cos().alias("cos_hod"),
    # Day-of-year (period ≈ 365.2425 to handle leap years smoothly)
    (TAU * doy_frac).sin().alias("sin_doy"),
    (TAU * doy_frac).cos().alias("cos_doy"),
)

# Preview the first rows including the four new feature columns.
weather_forecast.head(5).collect()

In [None]:
# Number of non-null `sid` entries per station, smallest first.
(
    weather_forecast.group_by("sid")
    .agg(pl.col("sid").count().alias("n"))
    .sort("n")
    .collect()
)

In [None]:
# Import the class rather than the module: an earlier cell already binds the
# name `datetime` to the class via `from datetime import datetime`, and
# rebinding it to the module here would make that cell fail when re-run
# ("'module' object is not callable").
from datetime import datetime

# px.histogram(
#     weather_forecast.filter(pl.col("time_ref") == datetime(2024, 2, 16, 12, 0, 0))
#     .collect(),
#     "sid",
# )

# Distribution of forecast issue times for one station after 2024-02-15 12:00.
px.histogram(
    weather_forecast.filter(
        pl.col("time_ref") > datetime(2024, 2, 15, 12, 0, 0),
        pl.col("sid") == "Havøygavlen",
    ).collect(),
    "time_ref",
)

In [None]:
# Compare the first `ws10m_*` column with the row-wise mean and standard
# deviation taken across the 15 columns ws10m_00 .. ws10m_14.
# NOTE(review): presumably these columns are ensemble members of the 10 m wind
# speed — confirm against the data description.
weather_forecast.select(
    pl.col("ws10m_00"),
    ws=pl.mean_horizontal(pl.col(f"ws10m_{k:02d}" for k in range(15))),
    ws_std=pl.concat_list(pl.col(f"ws10m_{k:02d}" for k in range(15))).list.std(),
).head(100).collect()

In [None]:
# Lazily scan the nowcast (observed weather) table.
weather_nowcast = pl.scan_parquet("data/met_nowcast.parquet")
# Show the column names that result from renaming `windpark` to `sid` and
# prefixing every column other than `windpark` and `time` with "now_".
# Only the schema is resolved here; `weather_nowcast` itself is not modified.
weather_nowcast.select(
    pl.col("windpark").alias("sid"),
    "time",
    pl.exclude("windpark", "time").name.prefix("now_"),
).head(5).collect_schema().names()

In [None]:
bid_zone = "ELSPOT NO3"

# Power series of one bid zone, tagged with the calendar date of each row.
windpower = (
    pl.scan_parquet("data/wind_power_per_bidzone.parquet")
    .select(
        time=pl.col("__index_level_0__"),
        bid_zone_power=pl.col(bid_zone),
    )
    .sort("time")
    .with_columns(pl.col("time").dt.date().alias("issue_date"))
)

# Per-day summary keyed by the FOLLOWING date, so that joining on `issue_date`
# gives every row the mean and the last value of the previous day.
windpower_prev = windpower.group_by(
    (pl.col("issue_date") + pl.duration(days=1)).alias("issue_date")
).agg(
    pl.col("bid_zone_power").mean().alias("last_day_mean"),
    pl.col("bid_zone_power").sort_by("time").last().alias("last_value"),
)

windpower = windpower.join(windpower_prev, on="issue_date")

# (pandas leftover:  = windpower.tz_localize('UTC').tz_convert('CET'))
windpower.head(5).collect()

In [None]:
# Long-format power for all zones: one row per (time, bid_zone), a boolean
# indicator column per zone "ELSPOT NO1" .. "ELSPOT NO4", and the calendar
# date of each row.
windpower = (
    pl.scan_parquet("data/wind_power_per_bidzone.parquet")
    .rename({"__index_level_0__": "time"})
    .unpivot(index="time", variable_name="bid_zone", value_name="power")
    .with_columns(
        [
            pl.col("bid_zone").eq(f"ELSPOT NO{k}").alias(f"ELSPOT NO{k}")
            for k in range(1, 5)
        ]
    )
    .with_columns(pl.col("time").dt.date().alias("issue_date"))
)

# Per-zone daily summary keyed by the FOLLOWING date, so the join below gives
# every row the previous day's mean and last value for its zone.
windpower_prev = windpower.group_by(
    "bid_zone",
    (pl.col("issue_date") + pl.duration(days=1)).alias("issue_date"),
).agg(
    pl.col("power").mean().alias("last_day_mean"),
    pl.col("power").sort_by("time").last().alias("last_value"),
)
# Left join: rows without a previous day are kept, with null summaries.
windpower = windpower.join(windpower_prev, on=["bid_zone", "issue_date"], how="left")

# (pandas leftover:  = windpower.tz_localize('UTC').tz_convert('CET'))
windpower.head(26).collect()

In [None]:
# Stack the raw power and the two previous-day features into one `value`
# column and plot the last 3000 rows of the sorted table.
windpower_plot = windpower.unpivot(
    on=["power", "last_day_mean", "last_value"],
    index=["time", "bid_zone"],
).sort(["time", "bid_zone", "variable"])
px.line(
    windpower_plot.tail(3000).collect(),
    x="time",
    y="value",
    color="bid_zone",
    line_group="variable",
)

In [None]:
# Wind-park metadata with bid-zone assignment, scanned lazily with date-like
# columns parsed on read. This rebinds `windparks` to a different file than
# the "windparks_enriched.csv" scan used earlier in the notebook.
windparks = pl.scan_csv("data/windparks_bidzone.csv", try_parse_dates=True)
# Disabled filter: would keep, per substation, only rows whose eic_code equals
# the first eic_code seen for that substation.
# .filter(
#     pl.col("eic_code") == pl.col("eic_code").first().over("substation_name")
# )
# Materialise the full table for inspection (`windparks` itself stays lazy).
windparks.collect()  # .group_by("substation_name").agg(n=pl.count("eic_code")).sort("n").collect()

In [None]:
# EIC code(s) registered for the "Stokkeland" substation.
(
    windparks.filter(pl.col("substation_name").eq("Stokkeland"))
    .select(pl.col("eic_code"))
    .collect()
)

In [None]:
# Per bidding area: number of non-null EIC codes, total and mean maximum
# operating power, and the latest production start date among its parks.
windparks.group_by("bidding_area").agg(
    pl.col("eic_code").count().alias("num_stations"),
    pl.col("operating_power_max").sum().alias("total_power_max"),
    pl.col("operating_power_max").mean().alias("mean_power_max"),
    pl.col("prod_start_new").max().alias("min_valid_date"),
).collect()

In [None]:
# Modelling this bid zone
bid_zone = "ELSPOT NO2"

# Selecting the windparks in the bid zone from the metadata
_windparks_in_bid_zone = windparks.filter(pl.col("bidding_area") == bid_zone)

# Selecting the observed power of the bid zone straight from the wide parquet
# file (one column per zone). The notebook-level `windpower` cannot be used
# here: at this point it is the long-format table whose "ELSPOT NO<k>" columns
# are boolean zone indicators, so selecting `bid_zone` from it would return
# True/False flags (and every timestamp once per zone) instead of power.
_windpower_in_bid_zone = pl.scan_parquet(
    "data/wind_power_per_bidzone.parquet"
).select(
    pl.col("__index_level_0__").alias("time"),
    pl.col(bid_zone),
)

# Calculating the observed wind-speed statistics across the windparks in the
# bid zone, per timestamp. `wind_speed_weighted` weights each park's wind
# speed by its `operating_power_max`.
_weather_nowcast_in_bid_zone = (
    weather_nowcast.join(
        _windparks_in_bid_zone,
        left_on="windpark",
        right_on="substation_name",
        how="inner",
    )
    # .filter(pl.col("windpark").is_in(_windparks_in_bid_zone["substation_name"].implode()))
    .group_by("time")
    .agg(
        mean_wind_speed=pl.col("wind_speed_10m").mean(),
        median_wind_speed=pl.col("wind_speed_10m").median(),
        std_wind_speed=pl.col("wind_speed_10m").std(),
        wind_speed_weighted=(
            pl.col("wind_speed_10m") * pl.col("operating_power_max")
        ).sum()
        / pl.col("operating_power_max").sum(),
    )
)

# Combining power and weather into one frame, matched on timestamp
data_bidzone = _windpower_in_bid_zone.join(_weather_nowcast_in_bid_zone, on="time")

# Keeping only data after the latest production start among the parks in the
# zone (i.e. when all of them are operational), and dropping incomplete rows
min_valid_date = (
    _windparks_in_bid_zone.select(pl.col("prod_start_new").max()).collect().item()
)
data_bidzone = data_bidzone.filter(pl.col("time") > min_valid_date).drop_nulls()

In [None]:
import plotly.express as px

# Bid-zone power against the median wind speed across its parks, coloured by
# the spread (standard deviation) of wind speed between parks.
px.scatter(
    data_bidzone.collect(),
    x="median_wind_speed",
    y=bid_zone,
    color="std_wind_speed",
    opacity=0.5,
    height=700,
)

In [None]:
from scipy.optimize import curve_fit
import numpy as np


def logistic_curve(x, L: float, k: float, x0: float):
    """Evaluate the logistic function ``L / (1 + exp(-k * (x - x0)))``.

    Args:
        x: Input value(s). NumPy arrays, NumPy scalars and plain Python
            numbers are evaluated with ``np.exp``. Any other object is
            expected to support arithmetic with scalars and to provide an
            ``.exp()`` method, as polars expressions and series do.
        L: Upper asymptote of the curve.
        k: Steepness of the curve.
        x0: Midpoint, i.e. the ``x`` at which the curve equals ``L / 2``.

    Returns:
        The curve evaluated at ``x``, of the same kind as ``x`` (array in,
        array out; expression in, expression out).
    """
    if isinstance(x, (np.ndarray, np.generic, int, float)):
        return L / (1 + np.exp(-k * (x - x0)))
    # polars has no top-level `pl.exp`; the exponential is a method on the
    # expression/series itself.
    return L / (1 + (-k * (x - x0)).exp())


# Materialise predictor and target with ONE collect. `data_bidzone` is a lazy
# query that is re-executed on every `.collect()`, and it contains a group_by
# and a join whose output row order is not guaranteed; collecting x and y
# separately could therefore pair values from different rows.
_fit_frame = data_bidzone.select("wind_speed_weighted", bid_zone).collect()
xdata = _fit_frame["wind_speed_weighted"].to_numpy()
ydata = _fit_frame[bid_zone].to_numpy()

# Fit (L, k, x0) of the logistic power curve; p0 is the initial guess.
# `res` is the (optimal parameters, covariance) pair returned by curve_fit.
res = curve_fit(logistic_curve, xdata, ydata, p0=[1000, 1, 6])
res

In [None]:
# Wide table with one wind-speed column per windpark in the bid zone plus the
# bid zone's power column; the `time` column is dropped at the end.
hourly_station = (
    weather_nowcast.join(
        _windparks_in_bid_zone,
        left_on="windpark",
        right_on="substation_name",
        how="inner",
    )
    # Keep only observations made after the park's production start.
    .filter(pl.col("time") > pl.col("prod_start_new"))
    .select("time", "windpark", "wind_speed_10m", "operating_power_max")
    .collect()
    # One column per windpark, one row per timestamp.
    # NOTE(review): no aggregate function is given, so this presumably needs a
    # single row per (time, windpark) — verify the metadata has no duplicate
    # substation names.
    .pivot("windpark", index="time", values="wind_speed_10m")
    # Keep only timestamps for which every park has a value.
    .drop_nulls()
    .join(_windpower_in_bid_zone.collect(), on="time")
    .drop("time")
)

In [None]:
# Histogram of transformed wind speed, one facet per windpark in the bid zone,
# using only observations after each park's production start.
px.histogram(
    weather_nowcast.join(
        _windparks_in_bid_zone,
        left_on="windpark",
        right_on="substation_name",
        how="inner",
    )
    .filter(pl.col("time") > pl.col("prod_start_new"))
    .select("time", "windpark", "wind_speed_10m", "operating_power_max")
    # NOTE(review): despite its name, `log_wind` is the cube root of the wind
    # speed (power 1/3), not a logarithm — confirm which one is intended.
    .with_columns(log_wind=(pl.col("wind_speed_10m")) ** (1 / 3))
    .collect(),
    "log_wind",
    facet_col="windpark",
    facet_col_wrap=4,
    height=1000,
)

In [None]:
# Weight of each park in the aggregate power model: its maximum operating
# power, keyed by substation name.
park_weights = {
    row["substation_name"]: row["operating_power_max"]
    for row in _windparks_in_bid_zone.select("substation_name", "operating_power_max")
    .collect()
    .to_dicts()
}

# Wind-speed columns of `hourly_station`, in column order. Selecting by
# membership in `park_weights` (rather than "everything except the power
# column") keeps this cell re-runnable after a later cell adds a `pred`
# column to `hourly_station`, which would otherwise be treated as a park and
# cause a KeyError in `park_weights`.
parks = [col for col in hourly_station.columns if col in park_weights]


def P(x, L: float, k: float, x0: float):
    """Total modelled power: weighted sum of per-park logistic curves.

    Column ``i`` of ``x`` holds the wind speed of ``parks[i]``. Every park
    shares the same logistic parameters ``(L, k, x0)`` and contributes its
    curve scaled by ``park_weights[park]``.
    """
    return sum(
        park_weights[park] * logistic_curve(x[:, col], L, k, x0)
        for col, park in enumerate(parks)
    )

In [None]:
# Predictor matrix with one column per park (in `parks` order) and the bid
# zone's power as a 1-D target array.
xdata = hourly_station.select(parks).to_numpy()
ydata = hourly_station.select(bid_zone).to_numpy()[:, 0]

# Fit the shared logistic parameters of the per-park model `P`;
# p0 is the initial guess for (L, k, x0).
res = curve_fit(P, xdata, ydata, p0=[1, 1, 6])
# res[0] holds the optimal parameter values.
L, k, x0 = res[0]

In [None]:
# Fitted logistic parameters of the per-park model.
print(L, k, x0)

In [None]:
# Add the model's prediction for every row as column `pred`
# (re-running overwrites the existing `pred` column).
hourly_station = hourly_station.with_columns(pred=P(xdata, L, k, x0))

In [None]:
# Observed bid-zone power and model prediction against the mean wind speed
# across all parks.
px.scatter(
    hourly_station.with_columns(
        pl.mean_horizontal(parks).alias("mean_wind")
    ).unpivot(on=[bid_zone, "pred"], index="mean_wind"),
    x="mean_wind",
    y="value",
    color="variable",
)

## Nordpool API

In [None]:
# Earliest timestamp in the power data, as an ISO-8601 string.
windpower.select(pl.min("time")).collect().item().isoformat()

In [None]:
import requests

# Bidding areas to query, with the area codes passed to the API.
areas = [
    {"name": "NO1", "code": "10YNO-1--------2"},
    {"name": "NO2", "code": "10YNO-2--------T"},
    {"name": "NO3", "code": "10YNO-3--------J"},
    {"name": "NO4", "code": "10YNO-4--------9"},
    # {"name": "NO5", "code": "10Y1001A1001A48H"},
]
url = "https://ummapi.nordpoolgroup.com/messages"

# Page through the endpoint, `limit` items at a time, using `skip` as the
# offset. The loop ends on a non-200 response (the status is printed and
# whatever was fetched so far is kept), on an empty page, or once `skip`
# reaches the total reported by the API.
messages = []
skip = 0
while True:
    res = requests.get(
        url,
        params={
            "limit": 2000,
            # "messageTypes": "ProductionUnavailability",
            "areas": [a["code"] for a in areas],
            # NOTE(review): presumably the fuel-type id for wind — confirm
            # against the API documentation.
            "fuelTypes": 19,
            # "publicationStartDate": "2020-01-01T00:00:00",
            "skip": skip,
        },
        # Without a timeout, requests waits indefinitely on a stalled
        # connection and the cell would hang.
        timeout=60,
    )
    if res.status_code != 200:
        print(res.status_code)
        break

    content = res.json()
    if len(content["items"]) == 0:
        break
    messages.extend(content["items"])
    skip += len(content["items"])
    print(
        f"Retrieved: {len(content['items'])} ---- Progress: {skip}/{content['total']}"
    )
    if skip >= content["total"]:
        break

In [None]:
# Fields read from the first unit of each message, and fields read from each
# of that unit's time periods.
unit_fields = ["name", "eic", "areaName", "installedCapacity", "timePeriods"]
period_fields = ["unavailableCapacity", "availableCapacity", "eventStart", "eventStop"]
# Flatten the downloaded messages into one row per (unit, time period).
outages = (
    pl.json_normalize(messages, infer_schema_length=500)
    .select(
        # Take each field from the first entry of `productionUnits` when that
        # list is present, otherwise from the first entry of `generationUnits`.
        # Only the first unit of a message is used.
        pl.when(pl.col("productionUnits").is_not_null())
        .then(pl.col("productionUnits").list.first().struct.field(u))
        .otherwise(pl.col("generationUnits").list.first().struct.field(u))
        .alias(u)
        for u in unit_fields
    )
    # One row per time period, then lift the period fields to top-level columns.
    .explode("timePeriods")
    .with_columns(pl.col("timePeriods").struct.field(p) for p in period_fields)
    .drop("timePeriods")
)

In [None]:
# Persist the flattened outage table (overwrites any existing file).
outages.write_csv("data/outages.csv")

In [None]:
# Materialise the wind-park metadata as an eager DataFrame. Going through
# `.lazy()` first makes the cell safe to re-run: once `windparks` is already a
# DataFrame it has no `.collect()` method, while `.lazy()` is available on
# both LazyFrame and DataFrame.
windparks = windparks.lazy().collect()

In [None]:
# Match outage messages to wind parks by EIC code. The metadata column is
# `eic_code` (the previous "eid_code" was a typo), and the trailing
# `.filter(pl.col())` was removed because `pl.col()` without a column name
# raises before any filtering can happen.
windparks.join(outages, left_on="eic_code", right_on="eic")

In [None]:
# Station infrastructure listing from the UMM API.
url = "https://ummapi.nordpoolgroup.com/infrastructure/stations"
# url = "https://ummapi.nordpoolgroup.com/infrastructure/assets"

# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(url, timeout=60)
res

In [None]:
# Eagerly read the wind-park metadata and show how many rows it has.
windparks = pl.read_csv("data/windparks_bidzone.csv", try_parse_dates=True)
windparks.shape[0]

In [None]:
# Flatten the JSON body of the most recent response held in `res` into a table.
windpark_lookup = pl.json_normalize(res.json())
windpark_lookup

In [None]:
# NVE wind-park table (semicolon-separated, decimal commas), with the plant
# name copied into `windpark_nve`.
windparks_nve = pl.read_csv(
    "data/windparks_nve.csv", separator=";", decimal_comma=True
)
windparks_nve = windparks_nve.with_columns(
    pl.col("Kraftverknavn").alias("windpark_nve")
)
windparks_nve

In [None]:
# Lookup table read from disk; this rebinds `windpark_lookup`.
windpark_lookup = pl.read_csv("data/windparks_lookup.csv")

# NVE table: "Middelproduksjon [GWh]" is read as text, so spaces are stripped
# before casting to integer; the plant name is copied into `windpark_nve`,
# which is used as the join key below.
windparks_nve = pl.read_csv(
    "data/windparks_nve.csv", separator=";", decimal_comma=True, try_parse_dates=True
).with_columns(
    pl.col("Middelproduksjon [GWh]").str.replace_all(" ", "").cast(pl.Int64),
    windpark_nve=pl.col("Kraftverknavn"),
)
# Bid-zone metadata restricted to parks found in the lookup (inner join on
# eic_code); the lookup's `name` column is copied into `windpark`.
windparks = (
    pl.read_csv("data/windparks_bidzone.csv", try_parse_dates=True)
    .join(windpark_lookup, on="eic_code", how="inner")
    .with_columns(windpark=pl.col("name"))
)
# Attach NVE attributes by exact name match; parks without a match keep null
# NVE columns because this is a left join.
windparks_match = windparks.join(
    windparks_nve, left_on="windpark", right_on="windpark_nve", how="left"
)  # .with_columns(no_match=pl.col("windpark_nve").is_null() | pl.col("windpark").is_null())

In [None]:
# Inspect the result of matching the metadata against the NVE table.
windparks_match

In [None]:
# Average ratios over the matched parks, used below as fallbacks:
# installed MW per turbine, and mean annual production (GWh) per installed MW.
mw_per_turbine = windparks_match.select(
    (pl.col("Installert effekt [MW]") / pl.col("Antall turbiner")).mean()
).item()
GWh_per_MW = windparks_match.select(
    (pl.col("Middelproduksjon [GWh]") / pl.col("Installert effekt [MW]")).mean()
).item()

windparks_enriched = (
    # Prefer the NVE installed capacity; fall back to the metadata value.
    windparks_match.with_columns(
        operating_power_max=pl.coalesce("Installert effekt [MW]", "operating_power_max")
    )
    .with_columns(
        # Prefer the NVE turbine count; otherwise estimate it from capacity.
        num_turbines=pl.coalesce(
            "Antall turbiner", pl.col("operating_power_max") / mw_per_turbine
        ),
        # NOTE(review): unlike `num_turbines`, this always uses the estimate
        # and never the known "Middelproduksjon [GWh]" value — confirm whether
        # a coalesce with that column was intended.
        mean_production=pl.col("operating_power_max") * GWh_per_MW,
    )
    # Keep the columns of the enriched table, renaming `name` to `windpark_name`.
    .select(
        "bidding_area",
        "substation_name",
        pl.col("name").alias("windpark_name"),
        "prod_start_new",
        "Første turbin",
        "Fullt idriftsatt",
        "Fylke",
        "Kommune",
        "operating_power_max",
        "mean_production",
        "num_turbines",
    )
)

In [None]:
# Rows whose substation_name occurs more than once in the enriched table.
windparks_enriched.filter(pl.col("substation_name").is_duplicated())
# windparks_match.filter(pl.col("substation_name") == "Stokkeland")

In [None]:
import torch

# Load the saved dataset and convert features and targets to float32.
blob = torch.load("data/torch_dataset_all_zones.pt")
X = blob["X"].to(torch.float32)
y = blob["y"].to(torch.float32)
y.shape

In [None]:
# Keep only the entries whose target is not NaN, then write the filtered
# tensors back to the same file the dataset was loaded from.
nan_filter = torch.isnan(y).logical_not()
X, y = X[nan_filter], y[nan_filter]
torch.save({"X": X, "y": y}, "data/torch_dataset_all_zones.pt")