In [None]:
import pandas as pd
import numpy as np


In [2]:
# Load the processed solar SREPI table. Later cells read its "solar_srepi",
# "window", "time" and "code" columns.
srepi_solar = pd.read_parquet("../data/processed/solar_srepi.parquet")

In [None]:
def build_threshold_by_window(
    df,
    srepi_col="solar_srepi",
    window_col="window",
    q=0.10,
):
    """Compute one SREPI quantile per window.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding at least ``srepi_col`` and ``window_col``.
    srepi_col : str
        Column with the index values the quantile is taken over.
    window_col : str
        Column whose distinct values define the groups.
    q : float
        Quantile passed to ``GroupBy.quantile`` (0.10 = 10th percentile).

    Returns
    -------
    dict
        ``{window value: q-quantile of srepi_col within that window}``.
        Rows with a missing ``srepi_col`` are removed first, so a window
        that only has missing values does not appear in the result.
    """
    # Remove rows without an index value before grouping.
    valid_rows = df.dropna(subset=[srepi_col])
    values_per_window = valid_rows.groupby(window_col)[srepi_col]
    window_quantiles = values_per_window.quantile(q)
    return window_quantiles.to_dict()

# Per-window drought cut-off: the 10% quantile of solar_srepi within each window.
threshold_by_window = build_threshold_by_window(
    srepi_solar, srepi_col="solar_srepi", window_col="window", q=0.10
)

threshold_by_window


{'120h': -1.2803520764856924,
 '12h': -1.2813736578264643,
 '1h': -1.2814346156278105,
 '24h': -1.2812624100524386,
 '48h': -1.2810364566962482,
 '4h': -1.2814275072695596,
 '72h': -1.2807959965375157}

In [None]:
def extract_drought_events_window_threshold(
    df,
    threshold_by_window: dict,
    srepi_col="solar_srepi",
    time_col="time",
    window_col="window",
    code_col="code",
    min_duration_h=2,
    max_duration_h=15*24,
    require_consecutive=True,
    sampling_step_h=1.0,
):
    """Extract drought events as runs of below-threshold SREPI values.

    A row is "in drought" when its ``srepi_col`` value is strictly below the
    threshold mapped from its ``window_col`` value. Within every
    (window, code) series, consecutive drought rows are merged into one event.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is copied, never modified. The helper columns
        ``"thr"``, ``"is_drought"`` and ``"_seg"`` are created on the copy.
    threshold_by_window : dict
        Maps each window value to its drought threshold. Windows missing from
        the mapping get a NaN threshold and therefore never produce events.
    srepi_col, time_col, window_col, code_col : str
        Names of the index, timestamp, window and series-identifier columns.
    min_duration_h, max_duration_h : float
        Inclusive bounds on ``duration_hours`` for an event to be kept.
    require_consecutive : bool
        If True, a time gap larger than ``1.5 * sampling_step_h`` between two
        successive rows also ends the current event.
    sampling_step_h : float
        Spacing of the time series in hours. It sets the gap tolerance and is
        added to ``end_time - start_time`` so that the last sample counts
        towards the duration. The default (1.0) matches hourly data.

    Returns
    -------
    pandas.DataFrame
        One row per event with columns ``window``, ``code``, ``threshold``,
        ``start_time``, ``end_time``, ``duration_hours``, ``n_points``,
        ``min_srepi`` and ``mean_srepi``. When no event is found the frame is
        empty but still carries these columns.
    """
    event_columns = [
        "window",
        "code",
        "threshold",
        "start_time",
        "end_time",
        "duration_hours",
        "n_points",
        "min_srepi",
        "mean_srepi",
    ]

    d0 = df.copy()
    d0[time_col] = pd.to_datetime(d0[time_col], errors="coerce")
    # Unparseable timestamps become NaT and sort to the end of their group,
    # where they would attach to a trailing drought run, turn its end_time
    # into NaT and get the whole event discarded by the duration filter.
    # They cannot be placed on the time axis, so drop them up front.
    d0 = d0.dropna(subset=[time_col])

    d0["thr"] = d0[window_col].map(threshold_by_window)
    # Comparisons against NaN (missing value or missing threshold) are False,
    # so such rows count as "not in drought".
    d0["is_drought"] = d0[srepi_col] < d0["thr"]

    max_gap_h = 1.5 * sampling_step_h
    events = []

    for (w, code), d in d0.groupby([window_col, code_col]):
        d = d.sort_values(time_col).reset_index(drop=True)

        # A new segment starts whenever the drought flag flips ...
        seg_break = d["is_drought"] != d["is_drought"].shift()
        if require_consecutive:
            # ... and, optionally, whenever samples are missing in between.
            dt_h = d[time_col].diff().dt.total_seconds().div(3600)
            seg_break = seg_break | (dt_h > max_gap_h)
        d["_seg"] = seg_break.cumsum()

        # Segments are homogeneous in is_drought (every flip opens a new one),
        # so selecting drought rows yields exactly the drought segments.
        for _, g in d[d["is_drought"]].groupby("_seg"):
            start_time = g[time_col].iloc[0]
            end_time = g[time_col].iloc[-1]
            duration_h = (
                (end_time - start_time).total_seconds() / 3600 + sampling_step_h
            )

            events.append({
                "window": w,
                "code": code,
                "threshold": g["thr"].iloc[0],
                "start_time": start_time,
                "end_time": end_time,
                "duration_hours": duration_h,
                "n_points": len(g),
                "min_srepi": g[srepi_col].min(),
                "mean_srepi": g[srepi_col].mean(),
            })

    # Passing the column list keeps the schema stable even with zero events.
    df_events_raw = pd.DataFrame(events, columns=event_columns)
    if df_events_raw.empty:
        return df_events_raw

    df_events = df_events_raw[
        (df_events_raw["duration_hours"] >= min_duration_h) &
        (df_events_raw["duration_hours"] <= max_duration_h)
    ].reset_index(drop=True)

    return df_events


In [11]:
# Extract drought events for every (window, code) series, using the
# per-window thresholds computed above in this notebook.
df_events = extract_drought_events_window_threshold(
    srepi_solar,
    threshold_by_window,
    srepi_col="solar_srepi",
    time_col="time",
    window_col="window",
    code_col="code",
    min_duration_h=2,
    max_duration_h=15 * 24,
    require_consecutive=True,
)

df_events

Unnamed: 0,window,code,threshold,start_time,end_time,duration_hours,n_points,min_srepi,mean_srepi
0,120h,0CULCSF,-1.280352,2014-06-03 23:30:00,2014-06-04 05:30:00,7.0,7,-1.459466,-1.436642
1,120h,0CULCSF,-1.280352,2014-06-04 23:30:00,2014-06-05 05:30:00,7.0,7,-1.518400,-1.506600
2,120h,0CULCSF,-1.280352,2014-06-05 23:30:00,2014-06-06 05:30:00,7.0,7,-1.556063,-1.535677
3,120h,0CULCSF,-1.280352,2014-06-06 22:30:00,2014-06-07 05:30:00,8.0,8,-1.593204,-1.555933
4,120h,0CULCSF,-1.280352,2014-06-07 22:30:00,2014-06-08 05:30:00,8.0,8,-1.590348,-1.529420
...,...,...,...,...,...,...,...,...,...
486566,72h,YATSF1,-1.280796,2023-07-08 23:30:00,2023-07-09 06:30:00,8.0,8,-2.912880,-2.270571
486567,72h,YATSF1,-1.280796,2023-07-09 23:30:00,2023-07-10 05:30:00,7.0,7,-1.987290,-1.931042
486568,72h,YATSF1,-1.280796,2023-07-10 23:30:00,2023-07-11 06:30:00,8.0,8,-1.851703,-1.648571
486569,72h,YATSF1,-1.280796,2023-07-11 23:30:00,2023-07-12 06:30:00,8.0,8,-1.556792,-1.402962


In [12]:
# Second name for the same events DataFrame (plain assignment, no copy is made).
df_events_solar = df_events

In [13]:
# Persist the extracted events as parquet, leaving out the DataFrame index.
df_events.to_parquet("../data/curated/solar_drought_events.parquet", index=False)


In [None]:
# 