In [128]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

In [57]:
# Read the three input tables used by the rest of the notebook:
# the reference power-curve parameters (CSV) plus the processed site table
# and the raw facilities table (both parquet).
df_parameters_ref = pd.read_csv(
    filepath_or_buffer="../data/raw/facilities/site_parameters.csv"
)
df_site_wind = pd.read_parquet(path="../data/processed/site_wind.parquet")
df_facilities_wind = pd.read_parquet(
    path="../data/raw/facilities/facilities_wind.parquet"
)

In [58]:
# Join the facilities table onto the sites by 'code', keep only the location
# columns, and derive 'state' by removing every non-letter character from
# 'network_region' (which is then dropped).
# NOTE: this rebinds df_site_wind, so the cell consumes its own input; re-run
# the loading cell before executing it a second time.
df_site_wind = (
    df_site_wind
    .merge(df_facilities_wind, on='code')
    .loc[:, ['code', 'lat', 'lon', 'network_region']]
    .assign(
        state=lambda frame: frame["network_region"].str.replace(
            r"[^A-Za-z]", "", regex=True
        )
    )
    .drop(columns=['network_region'])
)
df_site_wind

Unnamed: 0,code,lat,lon,state
0,ALBANY,-35.09,117.85,WEM
1,ARWF,-37.29,143.04,VIC
2,BADGINGARRA,-30.47,115.32,WEM
3,BHWF,-38.72,146.01,VIC
4,BANGOWF,-34.76,148.87,NSW
...,...,...,...,...
96,WOOLNTH1,-40.70,144.69,TAS
97,YSWF,-37.73,144.25,VIC
98,YAMBUK,-38.28,142.05,VIC
99,YANDIN,-30.69,115.65,WEM


In [None]:
# Haversine distance
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between points, using the haversine formula.

    All four arguments are coordinates in degrees and may be scalars or
    array-likes; they are combined with NumPy broadcasting, so a single
    origin can be compared against an array of destinations.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude / longitude of the second point(s), in degrees.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) on a sphere of radius 6371.0, i.e. in kilometres when
        that radius is read as kilometres.
    """
    R = 6371.0  # sphere radius; the result is expressed in the same unit
    lat1 = np.radians(np.asarray(lat1, dtype=float))
    lon1 = np.radians(np.asarray(lon1, dtype=float))
    lat2 = np.radians(np.asarray(lat2, dtype=float))
    lon2 = np.radians(np.asarray(lon2, dtype=float))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make arcsin(sqrt(a)) return NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arcsin(np.sqrt(a))


def prepare_donors(df_site_wind, df_parameters_ref, COL, PARAM):
    """Build the table of donor sites that have a complete parameter set.

    The reference parameter table is renamed to the canonical column names
    (``wind_farm``, ``state``, ``cut_in_speed``, ``V0``, ``b``,
    ``cut_out_speed``, ``rated_power``, ``lat``, ``lon``), the state labels
    are stripped and upper-cased, and the numeric columns are coerced with
    ``pd.to_numeric(errors="coerce")``. Rows missing any of those values are
    dropped.

    Parameters
    ----------
    df_site_wind : pandas.DataFrame
        Site table; only used as a source of coordinates when the reference
        table has no usable ``lat``/``lon``.
    df_parameters_ref : pandas.DataFrame
        Reference parameters, one row per donor site. Not modified.
    COL : dict
        Column names of ``df_site_wind`` under the keys ``"id"``,
        ``"state"``, ``"lat"`` and ``"lon"``.
    PARAM : dict
        Column names of ``df_parameters_ref`` under the keys ``"id"``,
        ``"state"``, ``"cut_in"``, ``"V0"``, ``"b"``, ``"cut_out"`` and,
        optionally, ``"rated"``, ``"lat"``, ``"lon"``.

    Returns
    -------
    pandas.DataFrame
        Donor rows with every canonical column present and non-null.

    Raises
    ------
    KeyError
        If a required column is absent after renaming.
    """
    param_cols = ["cut_in_speed", "V0", "b", "cut_out_speed", "rated_power"]

    def _clean_state(states):
        # astype(str) turns a missing state into the literal "nan"; mask those
        # entries back to NaN so the final dropna can actually remove them.
        return states.astype(str).str.strip().str.upper().where(states.notna())

    donors = df_parameters_ref.copy()

    donors = donors.rename(columns={
        PARAM["id"]: "wind_farm",
        PARAM["state"]: "state",
        PARAM["cut_in"]: "cut_in_speed",
        PARAM["V0"]: "V0",
        PARAM["b"]: "b",
        PARAM["cut_out"]: "cut_out_speed",
        PARAM.get("rated", "rated_power"): "rated_power",
        PARAM.get("lat", "lat"): "lat",
        PARAM.get("lon", "lon"): "lon",
    })

    donors["state"] = _clean_state(donors["state"])
    for c in param_cols:
        if c in donors.columns:
            donors[c] = pd.to_numeric(donors[c], errors="coerce")

    def _unusable(col):
        # A coordinate column is unusable when it is absent or entirely empty.
        return (col not in donors.columns) or donors[col].isna().all()

    # Fall back to the site table's coordinates when EITHER coordinate is
    # unusable (checking only "lat" would leave a missing "lon" to fail below).
    if _unusable("lat") or _unusable("lon"):
        coords = (
            df_site_wind.rename(columns={
                COL["id"]: "wind_farm",
                COL["state"]: "state",
                COL["lat"]: "lat",
                COL["lon"]: "lon",
            })[["wind_farm", "state", "lat", "lon"]]
            .drop_duplicates()
        )
        coords = coords.assign(state=_clean_state(coords["state"]))

        donors = (
            donors.drop(columns=["lat", "lon"], errors="ignore")
                  .merge(coords, on=["wind_farm", "state"], how="left")
        )

    donors["lat"] = pd.to_numeric(donors["lat"], errors="coerce")
    donors["lon"] = pd.to_numeric(donors["lon"], errors="coerce")

    # Only fully specified sites can act as donors.
    donors = donors.dropna(
        subset=["wind_farm", "state", "lat", "lon"] + param_cols
    )

    return donors

# KNN IDW
def assign_params_knn(
    df_site_wind,
    donors,
    COL,
    K=5,
    eps=1e-6,
    fallback="national"
):
    """Give every site a set of power-curve parameters, imputing missing ones.

    Sites that match a donor on ``(wind_farm, state)`` take that donor's
    parameters. Every other site receives the inverse-distance-weighted
    average of its ``K`` nearest donors (distance from ``haversine_km``,
    weight ``1 / (distance + eps)``). Donors are searched within the site's
    own state first; if that state has no donors and ``fallback`` is
    ``"national"``, all donors are used.

    Parameters
    ----------
    df_site_wind : pandas.DataFrame
        Site table. Not modified.
    donors : pandas.DataFrame
        Donor table with columns ``wind_farm``, ``state``, ``lat``, ``lon``,
        ``cut_in_speed``, ``V0``, ``b``, ``cut_out_speed``, ``rated_power``.
    COL : dict
        Column names of ``df_site_wind`` under the keys ``"id"``,
        ``"state"``, ``"lat"`` and ``"lon"``.
    K : int, default 5
        Maximum number of donors averaged per site.
    eps : float, default 1e-6
        Added to every distance so a zero distance does not divide by zero.
    fallback : str, default "national"
        ``"national"`` enables the all-donor fallback; any other value
        disables it.

    Returns
    -------
    pandas.DataFrame
        The renamed site table plus the five parameter columns and three
        diagnostics: ``K_used`` (number of donors averaged, 0 when none),
        ``donors_used`` (comma-joined donor ids, ``""`` when none) and
        ``min_dist_km`` (distance to the closest donor used; 0.0 for sites
        with their own parameters, NaN when nothing could be imputed).
    """
    param_cols = ["cut_in_speed", "V0", "b", "cut_out_speed", "rated_power"]
    diag_cols = ["K_used", "donors_used", "min_dist_km"]

    df = df_site_wind.copy()

    df = df.rename(columns={
        COL["id"]: "wind_farm",
        COL["state"]: "state",
        COL["lat"]: "lat",
        COL["lon"]: "lon",
    })

    df["state"] = df["state"].astype(str).str.strip().str.upper()
    df["lat"] = pd.to_numeric(df["lat"], errors="coerce")
    df["lon"] = pd.to_numeric(df["lon"], errors="coerce")

    donors_by_state = {s: g for s, g in donors.groupby("state")}

    # Sites that are themselves donors pick up their own parameters here.
    df = df.merge(
        donors[["wind_farm", "state"] + param_cols],
        on=["wind_farm", "state"],
        how="left"
    )

    has_params = df["V0"].notna() & df["rated_power"].notna()

    # Defaults describe "own parameters, no donors". Creating the columns one
    # at a time keeps K_used integer and min_dist_km float; writing all three
    # through a single mixed-type array would turn them into object columns.
    df["K_used"] = 0
    df["donors_used"] = ""
    df["min_dist_km"] = 0.0

    records = []

    for idx, r in df.loc[~has_params].iterrows():

        lat, lon, s = r["lat"], r["lon"], r["state"]

        # Start from "nothing could be imputed" and overwrite on success.
        record = {"_idx": idx}
        record.update(dict.fromkeys(param_cols, np.nan))
        record.update({"K_used": 0, "donors_used": "", "min_dist_km": np.nan})

        pool = None
        if np.isfinite(lat) and np.isfinite(lon):
            pool = donors_by_state.get(s)
            if pool is None and fallback == "national":
                pool = donors

        if pool is not None and not pool.empty:
            dist = haversine_km(lat, lon, pool["lat"].values, pool["lon"].values)
            pool = pool.assign(_dist=dist).sort_values("_dist")

            k_used = min(K, len(pool))
            nn = pool.iloc[:k_used]

            # Inverse-distance weights; eps guards against a zero distance.
            w = 1.0 / (nn["_dist"].values + eps)
            w_sum = w.sum()

            for col in param_cols:
                record[col] = float((nn[col].values * w).sum() / w_sum)
            record["K_used"] = k_used
            record["donors_used"] = ",".join(nn["wind_farm"].astype(str))
            record["min_dist_km"] = float(nn["_dist"].min())

        records.append(record)

    if records:
        imputed = pd.DataFrame(records).set_index("_idx")
        # Column-by-column so each column keeps its own dtype.
        for col in param_cols + diag_cols:
            df.loc[imputed.index, col] = imputed[col].to_numpy()

    return df



In [None]:

# Names of the identifier / state / coordinate columns in the site table.
COL = dict(id="code", state="state", lat="lat", lon="lon")

# Names of the corresponding columns in the reference parameter table.
PARAM = dict(
    id="code",
    state="state",
    cut_in="cut_in_speed",
    V0="V0",
    b="b",
    cut_out="cut_out_speed",
    rated="rated_power",
    lat="lat",
    lon="lon",
)

# Donor sites: reference rows with a complete set of parameters.
donors = prepare_donors(df_site_wind, df_parameters_ref, COL, PARAM)

# Every site gets parameters, imputed from up to 5 nearest donors if needed.
df_site_wind_full = assign_params_knn(df_site_wind, donors, COL, K=5)


In [95]:
# Index the parameter table by site id so parameters can be looked up with
# .loc[site, ...]. The guard makes the cell safe to re-run: without it a
# second execution raises KeyError because "wind_farm" is already the index.
if df_site_wind_full.index.name != "wind_farm":
    df_site_wind_full = df_site_wind_full.set_index("wind_farm")

In [96]:
# Persist the per-site parameter table (the index is written as the first column).
df_site_wind_full.to_csv(
    path_or_buf="../data/processed/parameters_wind_site.csv"
)

In [100]:
dir_sfcwind = Path("../data/processed/sfcwind")
# Path.glob("*") also yields dot-prefixed entries (for example
# ".ipynb_checkpoints" or ".DS_Store"), which pd.read_parquet cannot load, so
# they are skipped. Sorting makes the load order, and therefore the key order
# of dict_wind, deterministic.
files_sfcwind = sorted(
    file for file in dir_sfcwind.glob("*") if not file.name.startswith(".")
)
# One frame per entry, keyed by the file name without its extension.
dict_wind = {file.stem: pd.read_parquet(file) for file in files_sfcwind}

In [120]:
def wind_power_curve(df, cut_in_speed, V0, b, cut_out_speed, rated_power):
    """Add a ``power`` column computed from ``df['wind_100']``.

    Power follows the logistic curve
    ``rated_power / (1 + exp(-b * (wind_100 - V0)))`` and is set to 0 where
    ``wind_100 <= cut_in_speed`` or ``wind_100 >= cut_out_speed``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``wind_100`` column. It is not modified.
    cut_in_speed, cut_out_speed : float
        Wind speeds at or outside which the output is zero.
    V0 : float
        Midpoint of the logistic curve (output is ``rated_power / 2`` there).
    b : float
        Steepness of the logistic curve.
    rated_power : float
        Upper asymptote of the logistic curve.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``power`` column added (or replaced).
    """
    wind = df['wind_100']
    q = rated_power / (1 + np.exp(-1 * b * (wind - V0)))
    outside_operating_range = (wind <= cut_in_speed) | (wind >= cut_out_speed)

    # assign() returns a new frame, so the caller's frame is left untouched
    # and no temporary "q" column is ever written to (or dropped from) it.
    return df.assign(power=np.where(outside_operating_range, 0, q))

In [121]:
def extrapolate_wind(
    sfcwind_10,
    z_ref=10.0,
    z_hub=100.0,
    alpha=1/7
):
    """Scale wind speeds by the power-law factor ``(z_hub / z_ref) ** alpha``.

    ``sfcwind_10`` is converted with ``np.asarray`` and multiplied by that
    single factor, so the result is always a NumPy array. The parameter
    names suggest ``z_ref`` / ``z_hub`` are the measurement and target
    heights and ``alpha`` the shear exponent.
    """
    shear_factor = (z_hub / z_ref) ** alpha
    speeds = np.asarray(sfcwind_10)
    return speeds * shear_factor


In [122]:
# Add a "wind_100" column to every site frame, in place, by extrapolating
# its "sfcWind" column with the default extrapolate_wind settings.
for site in dict_wind:
    df = dict_wind[site]
    df["wind_100"] = extrapolate_wind(sfcwind_10=df["sfcWind"])

In [None]:
def wind_to_power(dict_wind, params_by_site, time_col="time"):
    """Apply each site's power curve to its wind time series.

    Parameters
    ----------
    dict_wind : dict
        Maps a site id to a DataFrame holding ``time_col`` and the
        ``wind_100`` column that ``wind_power_curve`` reads. The frames are
        copied, not modified.
    params_by_site : pandas.DataFrame
        Indexed by site id, with columns ``cut_in_speed``, ``V0``, ``b``,
        ``cut_out_speed`` and ``rated_power``.
    time_col : str, default "time"
        Column parsed with ``pd.to_datetime`` and used to sort each frame.

    Returns
    -------
    dict
        Maps each site id to its time-sorted frame as returned by
        ``wind_power_curve``.

    Raises
    ------
    KeyError
        If a site in ``dict_wind`` has no row in ``params_by_site``.
    """
    param_names = ["cut_in_speed", "V0", "b", "cut_out_speed", "rated_power"]
    dict_power = {}

    for site, df in dict_wind.items():
        d = df.copy()
        d[time_col] = pd.to_datetime(d[time_col])

        d = d.sort_values(time_col)

        # Read the parameters from the argument rather than from a notebook
        # global, so the function works with whatever table the caller passes.
        curve_params = {
            name: params_by_site.loc[site, name] for name in param_names
        }

        dict_power[site] = wind_power_curve(d, **curve_params)

    return dict_power


In [126]:
# Power time series per site, using the per-site parameter table.
dict_power_wind = wind_to_power(
    dict_wind=dict_wind,
    params_by_site=df_site_wind_full,
)


In [129]:
out_dir = "../data/processed/wind_power"
os.makedirs(out_dir, exist_ok=True)

# Write one parquet file per entry of dict_power_wind, named after its key;
# the frame index is not stored.
for key in dict_power_wind:
    df = dict_power_wind[key]
    file_path = os.path.join(out_dir, f"{key}.parquet")
    df.to_parquet(file_path, index=False)
