In [1]:
# import libraries
from pathlib import Path
import pandas as pd
import numpy as np
import os
from scipy.stats import norm
import pickle

In [3]:
# Load every file in the RSDS input directory into a dict keyed by file stem.
dir_rsds = Path("../data/processed/rsds_filter")
# Path.glob("*") also yields sub-directories and hidden entries (for example
# .ipynb_checkpoints or .DS_Store); skip those so pd.read_parquet only sees
# regular data files. Sorting makes the load order reproducible across runs.
files_rsds = sorted(
    file
    for file in dir_rsds.glob("*")
    if file.is_file() and not file.name.startswith(".")
)
dict_rsds = {file.stem: pd.read_parquet(file) for file in files_rsds}

In [None]:

def build_ecdf_model(values: np.ndarray, dropna: bool = True) -> dict:
    """Build an empirical-CDF model from a sample.

    Parameters
    ----------
    values : array-like
        Sample to fit; converted with ``np.asarray``.
    dropna : bool, default True
        When True, NaN entries are removed before fitting.

    Returns
    -------
    dict
        ``"x_sorted"`` (sample sorted ascending), ``"N"`` (number of samples
        kept) and ``"eps"`` (``1 / (N + 1)``, the clipping margin used by
        callers that keep probabilities strictly inside (0, 1)).

    Raises
    ------
    ValueError
        If fewer than 10 samples remain.
    """
    sample = np.asarray(values)
    if dropna:
        sample = sample[np.logical_not(np.isnan(sample))]

    n_samples = sample.size
    if n_samples < 10:
        raise ValueError(f"Not enough samples to build ECDF: {n_samples}")

    return {
        "x_sorted": np.sort(sample),
        "N": n_samples,
        "eps": 1.0 / (n_samples + 1),
    }


def ecdf_p(x: np.ndarray, model: dict, clip: bool = True) -> np.ndarray:
    """Evaluate the empirical CDF of ``model`` at ``x``.

    Parameters
    ----------
    x : array-like
        Query values.
    model : dict
        Mapping with ``"x_sorted"`` (ascending sample), ``"N"`` (sample size)
        and, when ``clip`` is True, ``"eps"`` (clipping margin).
    clip : bool, default True
        Clip probabilities to ``[eps, 1 - eps]``.

    Returns
    -------
    np.ndarray
        ``rank / (N + 1)`` for every query value, where ``rank`` is the number
        of sample points ``<= x``. NaN query values yield NaN.
    """
    xs = model["x_sorted"]
    N = model["N"]

    x_arr = np.asarray(x)
    p = np.searchsorted(xs, x_arr, side="right") / (N + 1)

    if clip:
        eps = model["eps"]
        p = np.clip(p, eps, 1 - eps)

    # searchsorted orders NaN after every real value, so a NaN query would
    # otherwise receive the highest probability N / (N + 1). Propagate the
    # missing value instead of reporting a spurious extreme.
    if np.issubdtype(x_arr.dtype, np.floating):
        missing = np.isnan(x_arr)
        if missing.any():
            p = np.where(missing, np.nan, p)

    return p


def add_ecdf_p_column(
    df: pd.DataFrame,
    value_col: str,
    time_col: str = "time",
    out_col: str = "p",
    model: dict | None = None,
    sort_by_time: bool = True,
    clip: bool = True
) -> tuple[pd.DataFrame, dict]:
    """Return a copy of ``df`` with an ECDF probability column added.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is copied, never modified.
    value_col : str
        Column whose values are scored.
    time_col : str, default "time"
        Column used for chronological sorting when present.
    out_col : str, default "p"
        Name of the probability column written to the copy.
    model : dict or None
        Pre-built ECDF model. When None, one is fitted on ``value_col`` with
        ``build_ecdf_model``.
    sort_by_time : bool, default True
        Convert ``time_col`` to datetime and sort by it (only if the column
        exists).
    clip : bool, default True
        Forwarded to ``ecdf_p``.

    Returns
    -------
    tuple[pd.DataFrame, dict]
        The augmented copy and the model that was used.
    """
    result = df.copy()

    can_sort = sort_by_time and time_col in result.columns
    if can_sort:
        result[time_col] = pd.to_datetime(result[time_col])
        result = result.sort_values(time_col)

    # Extract the values once; they feed both the fit and the evaluation.
    values = result[value_col].to_numpy()
    if model is None:
        model = build_ecdf_model(values)

    result[out_col] = ecdf_p(values, model, clip=clip)
    return result, model


In [None]:

def window_to_n(window: str) -> int:
    """Convert a duration string into a whole number of hours.

    The string is parsed with ``pd.Timedelta`` and any partial hour is
    floored, e.g. ``"24h"`` -> ``24``.
    """
    span = pd.Timedelta(window)
    whole_hours = span.total_seconds() // 3600
    return int(whole_hours)

def add_rolling_mean_by_samples(
    df: pd.DataFrame,
    value_col="rsds",
    time_col="time",
    window="24h",
    coverage=0.6,
    remove_night=True,
    night_threshold=0.0,
):
    """Add a sample-count rolling mean of ``value_col`` to a copy of ``df``.

    The copy is sorted by ``time_col`` (converted to datetime). When
    ``remove_night`` is True, only rows with ``value_col > night_threshold``
    are kept. The rolling window is a fixed number of rows,
    ``window_to_n(window)``, not a time span, so it counts the rows that
    remain after filtering.
    NOTE(review): using the hour count as a row count presumably assumes
    hourly sampling -- confirm against the input data.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; not modified.
    value_col, time_col : str
        Value and timestamp column names.
    window : str
        Duration string; also embedded in the output column name.
    coverage : float
        Fraction of the window that must be present; ``min_periods`` is
        ``max(1, ceil(n * coverage))``.
    remove_night : bool
        Drop rows whose value is not above ``night_threshold``.
    night_threshold : float
        Threshold used by the night filter.

    Returns
    -------
    tuple[pd.DataFrame, str]
        The processed copy and the name of the new column
        (``f"{value_col}_w{window}"``). If no rows remain, the column is
        all-NaN (empty).
    """
    out_col = f"{value_col}_w{window}"

    frame = df.copy()
    frame[time_col] = pd.to_datetime(frame[time_col])
    frame = frame.sort_values(time_col)

    if remove_night:
        is_day = frame[value_col] > night_threshold
        frame = frame[is_day].copy()

    # Nothing left to average: return the empty frame with the expected column.
    if frame.shape[0] == 0:
        frame[out_col] = np.nan
        return frame, out_col

    n = window_to_n(window)
    min_periods = max(1, int(np.ceil(n * coverage)))

    roller = frame[value_col].rolling(window=n, min_periods=min_periods)
    frame[out_col] = roller.mean()

    return frame, out_col


# Sites with fewer valid (non-NaN) rolling means than this are skipped.
MIN_ECDF_SAMPLES = 50

# One {site: ...} mapping per rolling window.
# NOTE(review): WINDOWS is not defined in the visible cells -- confirm it is
# assigned before this cell runs on a fresh kernel.
p_results_by_window = {win: {} for win in WINDOWS}
ecdf_models_by_window = {win: {} for win in WINDOWS}

for w in WINDOWS:
    for site, df_site in dict_rsds.items():

        # Rolling mean over the retained samples; the night filter keeps only
        # rows with rsds > 0.
        df_w, value_w_col = add_rolling_mean_by_samples(
            df_site,
            value_col="rsds",
            time_col="time",
            window=w,
            coverage=0.6,
            remove_night=True,
            night_threshold=0.0,
        )

        # Fit an ECDF only when enough rolling means are available.
        n_valid = df_w[value_w_col].notna().sum()
        if n_valid >= MIN_ECDF_SAMPLES:
            df_p, model = add_ecdf_p_column(
                df_w,
                value_col=value_w_col,
                time_col="time",
                out_col="p",
                model=None,
                sort_by_time=True,
                clip=True,
            )

            p_results_by_window[w][site] = df_p
            ecdf_models_by_window[w][site] = model


In [25]:
# Write each window's per-site result to <out_dir>/<window>/<site>.parquet.
out_dir = "../data/processed/solar_ecdf"
os.makedirs(out_dir, exist_ok=True)

for window, data in p_results_by_window.items():
    # One sub-directory per rolling window.
    out_dir_window = "/".join((out_dir, window))
    os.makedirs(out_dir_window, exist_ok=True)

    for key, df in data.items():
        file_path = os.path.join(out_dir_window, f"{key}.parquet")
        df.to_parquet(file_path, index=False)


In [14]:

# Persist the fitted ECDF models, nested as {window: {site: model}}.
# NOTE: dump `ecdf_models_by_window`, the dict populated by the fitting loop;
# no variable named `ecdf_models` is created in this notebook, so referencing
# it raises NameError on a fresh kernel (Restart & Run All).
with open("../data/processed/solar_ecdf_model.pkl", "wb") as f:
    pickle.dump(ecdf_models_by_window, f, protocol=pickle.HIGHEST_PROTOCOL)