In [None]:
import pandas as pd
from glob import glob
from pathlib import Path


rg_raw_data_dir = "data/7-23-25-scrape"
rg_fps = glob(f"{rg_raw_data_dir}/*.csv")

# read + show an example set of raw vals for a rain-gauge
ex_fp = rg_fps[0]
ex_df = pd.read_csv(ex_fp)
ex_df.head()

### 1. Grouping rain gauge data into one table
---
- Aggregate rain gauge raw data (i.e., bucket tips) into a ***sparse*** table with timestep index $\mathcal{T}_{CCRFCD}$
    - Why is this table "sparse"?
        - Timesteps $u \in \mathcal{T}_{CCRFCD}$ are not uniformly spaced
        - Each timestep $u$ (in theory) corresponds to **one** datapoint for **one** rain gauge
    - Sort `datetime-local` column 

| gauge-idx | datetime-local | utc-offset | datetime-utc | val |
| :---: | :---: | :---: | :---: | :---: |
| 4 | 2025-03-03-07:03 | -7 | 2025-03-03-14:03 | 3.5 |
| 4 | 2025-03-03-07:35 | -7 | 2025-03-03-14:35 | 3.6 |

In [None]:
from datetime import datetime
from zoneinfo import ZoneInfo


# PST/PDT
las_vegas_tz = ZoneInfo("America/Los_Angeles")


def gen_rg_df(df: pd.DataFrame, gauge_idx: int) -> pd.DataFrame:

    local_dts = []
    utc_dts = []

    date_arr = df['Date']; time_arr = df['Time']
    concat_str_arr = date_arr + ' ' + time_arr
    
    for dt_str in concat_str_arr:
        
        # local_dt
        naive_dt = datetime.strptime(dt_str, "%m/%d/%Y %H:%M:%S")
        local_dt = naive_dt.replace(tzinfo=las_vegas_tz)

        # local -> UTC
        utc_dt = local_dt.astimezone(ZoneInfo("UTC"))

        local_dts.append(local_dt)
        utc_dts.append(utc_dt)

    instant_rain_acc = df["Value"]

    gauge_df = pd.DataFrame({
        'gauge_idx': [gauge_idx] * len(local_dts), # repeat gauge_id for number of rows
        'local_time': local_dts,
        'utc_time': utc_dts,
        'gauge_acc_in': instant_rain_acc,
    })

    return gauge_df

In [None]:
from tqdm import tqdm


master_df: pd.DataFrame | None = None

# iterate through CCRFCD rain gauge raw data `.csv` files
# open each dataframe; calculate utc datetime, append df to `master_df`
for fp in tqdm(rg_fps):

    # hackey way of getting the rain gauge's idx from the fp
    rg_idx = int(fp.split("_")[-1: ][0].strip(".csv"))

    _df = pd.read_csv(fp)
    rg_df = gen_rg_df(_df, rg_idx)

    if master_df is None:
        master_df = rg_df
    else:
        
        # vertical concatination
        _temp_df = pd.concat([master_df, rg_df], axis=0)
        master_df = _temp_df

In [None]:
# hmm... there are some unrealistically large/small values in our dataset
master_df.sort_values("gauge_acc_in", ascending=False)

### 1.1 Removing non-rain gauges

- Some of these sensors arn't rain gauges! For example, `4651.0` is an annemometer, explaining the unusally high values.
- We'll simply mask out any sensor that isn't a rain gauge.

In [None]:
sensor_metadata = pd.read_csv("data/ccrfcd_rain_gauge_metadata.csv")

# every valid rain gauge contains the substring "(Rain)" in it's description
mask = sensor_metadata["old_name"].str.contains(r"\(Rain\)", na=False)

# set of valid rg ids
valid_rain_gauges = set(sensor_metadata[mask]["station_id"])

# mask out any row that doesn't contain a valid rain gauge id
valid_rain_gauges_mask = [idx in valid_rain_gauges for idx in master_df['gauge_idx']]

# let's check our highest/lowest precip. accumulation values again
master_df_cleaned = master_df[valid_rain_gauges_mask].sort_values("gauge_acc_in", ascending=False)
master_df_cleaned

In [None]:
checkpoint_one_fp = "data/__checkpoints__/ds_ckpt_1.csv"
master_df_cleaned.sort_values("local_time").to_csv(checkpoint_one_fp)

### 2. Selecting which MRMS 1H QPE files to download
---

- Construct $\mathcal{T}_{MRMS}$; i.e., determine which MRMS 1H-QPE timesteps to download
    - LV valley: (35.8, 36.4 / -115.4, -114.8)
    - 1 1km $\times$ 1km grid-cell in the LV valley >= 0.25 in. 1H QPE in a 24H period (i.e., 00:00-23:59 UTC)
- Next steps
    - All MRMS data is downloaded to: `data/events/2021-01-01_2025-07-25_all_events.csv`

In [21]:
import pandas as pd


checkpoint_one_fp = "data/__checkpoints__/ds_ckpt_1.csv"
ckpt_one_df = pd.read_csv(checkpoint_one_fp, )
ckpt_one_df.head()

Unnamed: 0.1,Unnamed: 0,gauge_idx,local_time,utc_time,gauge_acc_in
0,3753,4054.0,2021-01-01 08:00:25-08:00,2021-01-01 16:00:25+00:00,0.0
1,4652,4394.0,2021-01-01 08:01:13-08:00,2021-01-01 16:01:13+00:00,0.0
2,3263,4309.0,2021-01-01 08:01:44-08:00,2021-01-01 16:01:44+00:00,0.04
3,3643,8.0,2021-01-01 08:05:16-08:00,2021-01-01 16:05:16+00:00,0.0
4,3603,4574.0,2021-01-01 08:09:26-08:00,2021-01-01 16:09:26+00:00,0.0


In [25]:
# the previous aligned dataset we constructed
# generally, we can trust the mrms_qpe values?
prev_aligned_df_fp = 'data/events/2021-01-01_2025-07-25_all_events.csv'
prev_aligned_df = pd.read_csv(prev_aligned_df_fp)
prev_aligned_df

Unnamed: 0,start_time,end_time,station_id,lat,lon,gauge_qpe,mrms_qpe,delta_qpe,cum_gauge_qpe,cum_mrms_qpe
0,2021-01-23 23:00:00,2021-01-24 00:00:00,4709,35.925000,244.883000,0.00,0.003937,-0.003937,0.00,0.003937
1,2021-01-23 23:00:00,2021-01-24 00:00:00,4564,36.028250,244.996361,0.04,0.000000,0.040000,0.04,0.000000
2,2021-01-23 23:00:00,2021-01-24 00:00:00,4779,36.021861,245.041250,0.04,0.000000,0.040000,0.04,0.000000
3,2021-01-23 23:00:00,2021-01-24 00:00:00,4984,35.729250,244.811972,0.00,0.011811,-0.011811,0.00,0.011811
4,2021-01-23 23:00:00,2021-01-24 00:00:00,4724,35.939417,244.922250,0.00,0.015748,-0.015748,0.00,0.015748
...,...,...,...,...,...,...,...,...,...,...
2342106,2025-07-20 20:52:00,2025-07-20 21:52:00,3301,35.995444,245.136194,0.01,0.000000,0.010000,96.48,535.342526
2342107,2025-07-20 20:54:00,2025-07-20 21:54:00,3301,35.995444,245.136194,0.01,0.000000,0.010000,96.49,535.342526
2342108,2025-07-20 20:56:00,2025-07-20 21:56:00,3301,35.995444,245.136194,0.01,0.000000,0.010000,96.50,535.342526
2342109,2025-07-20 20:58:00,2025-07-20 21:58:00,3301,35.995444,245.136194,0.01,0.000000,0.010000,96.51,535.342526


### 3. Generate a ***dense*** table from `1` with a timestep index of $\mathcal{T}_{MRMS}$
---
- > $\forall_{t \in \mathcal{T}_{MRMS}} \forall_{k \in \mathcal{I}} \exists $ *row in datatable*

- 3a. $\forall_{t \in \mathcal{T}_{MRMS}} \forall_{k \in \mathcal{I}}$ ...
    - Lookup rows for $k$ between $[t_{start}, t_{end}]$
    - Calculate the *sum* of **positive** differences only
        - e.g., `[1, 1.2, 0.0, 0.3] -> [NaN, 0.2, NaN, 0.3] -> 0.5`
        - Rain gauges occasionally *reset*; negative rainfall amounts are impossible

| gauge-idx | start-datetime-utc | end-datetime-utc | gauge-1h-acc |
| :---: | :---: | :---: | :---: |
| 4 | 2025-03-03-14:00 | 2025-03-03-15:00 | 0.1 |
| 4 | 2025-03-03-14:02 | 2025-03-03-15:02 | 0.2 |
| 4 | ... | ... | ... |
| 4 | 2025-03-03-14:30 | 2025-03-03-15:30 | 0.1 |
| 5 | 2025-03-03-14:30 | 2025-03-03-15:30 | `NaN` |

In [None]:
from glob import glob
from datetime import datetime


# set of sparse datetime points in the rain gauge dataset
T_ccrfcd = set([datetime.fromisoformat(_dt) for _dt in ckpt_one_df["utc_time"]])

In [29]:
# set of all unique datetimes in MRMS dataset
T_mrms = set([datetime.fromisoformat(_dt) for _dt in prev_aligned_df["end_time"]])

# all_mrms_data_dir = "__temp"
# all_mrms_grib_files = glob(f"{all_mrms_data_dir}/*.grib2")

# for fp in all_mrms_grib_files:
#     dt_str = fp.split("_")[-1:][0].split(".")[0]
#     dt = datetime.strptime(dt_str, "%Y%m%d-%H%M%S")
#     T_mrms.add(dt)

len(T_mrms)

82658

##### 3.1 $\forall_{t \in \mathcal{T}_{MRMS}} \forall_{k \in \mathcal{I}} \exists $ *row in datatable*

From NSSL
> Product Creation
The one-hour QPE – Radar Only product is an aggregation of the Surface Precipitation Rate (SPR) field, which is updated every 2 minutes. The SPR fields from the previous 60 minutes are summed to create the QPE – Radar Only field, with the product ending at the indicated time. For instance, **a QPE – Radar Only field that is valid at 15:04Z is a summation of SPR fields from 14:04Z, 14:06Z, 14:08Z …to 15:04Z**. Values at or below 0.01 inches are removed to reduce the areal coverage of what is most likely false light precipitation.

In [30]:
import numpy as np
from datetime import timedelta


# set of unique rain gauge indicies
I = set(ckpt_one_df['gauge_idx'])

gauge_idxs = []
end_dts = []
start_dts = []


for dt in T_mrms:
    for gauge_idx in I:

        # MRMS 'valid time' is the end of a 1hr window
        end_dts.append(dt)
        
        start_dts.append(dt - timedelta(hours=1))
        gauge_idxs.append(gauge_idx)


dense_aligned_df = pd.DataFrame({
    'gauge_idx': gauge_idxs,
    'start_datetime_utc': start_dts,
    'end_datetime_utc': end_dts,
    'gauge_1h_acc': [np.nan] * len(end_dts)
})


dense_aligned_sorted_df = dense_aligned_df.sort_values(["start_datetime_utc", "gauge_idx"])
dense_aligned_sorted_df

Unnamed: 0,gauge_idx,start_datetime_utc,end_datetime_utc,gauge_1h_acc
16351501,2.0,2021-01-23 23:00:00,2021-01-24 00:00:00,
16351521,4.0,2021-01-23 23:00:00,2021-01-24 00:00:00,
16351504,5.0,2021-01-23 23:00:00,2021-01-24 00:00:00,
16351509,7.0,2021-01-23 23:00:00,2021-01-24 00:00:00,
16351507,8.0,2021-01-23 23:00:00,2021-01-24 00:00:00,
...,...,...,...,...
4439657,5224.0,2025-07-20 21:00:00,2025-07-20 22:00:00,
4439664,5234.0,2025-07-20 21:00:00,2025-07-20 22:00:00,
4439674,5244.0,2025-07-20 21:00:00,2025-07-20 22:00:00,
4439691,5274.0,2025-07-20 21:00:00,2025-07-20 22:00:00,


- we have: constructed a table with every unique (rain gauge, MRMS-timestep) combo
- want to: derive the 1hr accum precipitation for every row using the `ckpt_one` table

- $\mathcal{G}$: dataset of raw, CCRFCD rain gauge tips
    - $\mathcal{G}_{k} = \{(x_1, u_1), ..., (x_n, u_n) \}$
- $\mathcal{D}$: dense dataset containing MRMS timestamps that we want to populate
    - $\mathcal{D}_{k, t, t'} = \sum_{i}^m \delta x_i $ 
    - where $u_i \ge t$ and $u_m \le t'$

- > we can calculate the diffs for an entire gauge easily upfront

In [32]:
from zoneinfo import ZoneInfo

# we now have a table with blank entries for all unqiue (rain_gauge, MRMS-timestamp) combinations
# how can calculate the 1hr precipitation accumulation values for each of these entries?
# let's work through an example

# how much rain fell on gauge (4) between june and july of 2025?
t1 = datetime(year=2025, month=6, day=1).astimezone(ZoneInfo("UTC"))
t2 = datetime(year=2025, month=7, day=31).astimezone(ZoneInfo("UTC"))

subset = ckpt_one_df.copy()
subset = subset.set_index(pd.to_datetime(ckpt_one_df['utc_time']))
subset = subset[subset["gauge_idx"] == 4.0]

# element wise diffs; append a nan value to the start to match # rows
subset = subset.sort_index()
subset['gauge_acc_in_diff'] = [np.nan] + list(np.diff(subset['gauge_acc_in']))

start = pd.to_datetime(t1)
end   = pd.to_datetime(t2)

subset[start:end]['gauge_acc_in_diff'].sum()

np.float64(0.040000000000000036)

In [33]:
# dataframe we constructed in 1.; contains raw gauge precip. accumulation values
ckpt_one_df_idxd = ckpt_one_df.set_index(pd.to_datetime(ckpt_one_df['utc_time'])).sort_index()

dense_aligned_sorted_df_clone = dense_aligned_sorted_df.copy().sort_values("start_datetime_utc")

In [34]:
dense_aligned_sorted_df_clone.to_csv("data/__checkpoints__/ds_ckpt_3_1.csv")

In [None]:
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")


gauge_1hr_accums = []


# NOTE: think carefully through this section...
for i, row in tqdm(
    enumerate(dense_aligned_sorted_df_clone.itertuples()),
    total=len(dense_aligned_sorted_df_clone),
):

    # convert -> pd.Timestamp
    _t1 = pd.to_datetime(row.start_datetime_utc, utc=True)
    _t2 = pd.to_datetime(row.end_datetime_utc, utc=True)
    assert _t1 < _t2

    # limit subset to values for the current row's rain gauge
    gauge_all_df = ckpt_one_df_idxd.loc[ckpt_one_df_idxd["gauge_idx"] == row.gauge_idx]
    gauge_all_df['diffs'] = [np.nan] + list(np.diff(gauge_all_df['gauge_acc_in']))

    # grab subset of rain gauge tips that occured within [_t1, _t2]
    gauge_1hr_df = gauge_all_df.loc[_t1:_t2]
    gauge_prev_df = gauge_all_df.loc[:_t1]

    # if there are 0 data points prior to _t1, no way to compute diffs
    if len(gauge_prev_df) < 1:
        gauge_1hr_accums.append(np.nan)
        continue

    if len(gauge_1hr_df) == 0:
        gauge_1hr_accums.append(0.0)
        continue

    acc_precip = np.array(gauge_1hr_df['diffs']).clip(0).sum()

    gauge_1hr_accums.append(acc_precip)

In [38]:
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")


gauge_1hr_accums = []

# --- 1) Make sure index + window endpoints are ready (tz-aware + sorted) ---
if not isinstance(ckpt_one_df_idxd.index, pd.DatetimeIndex):
    ckpt_one_df_idxd.index = pd.to_datetime(ckpt_one_df_idxd.index, utc=True)
ckpt_one_df_idxd = ckpt_one_df_idxd.sort_index()

dense_aligned_sorted_df_clone["start_datetime_utc"] = pd.to_datetime(
    dense_aligned_sorted_df_clone["start_datetime_utc"], utc=True
)
dense_aligned_sorted_df_clone["end_datetime_utc"] = pd.to_datetime(
    dense_aligned_sorted_df_clone["end_datetime_utc"], utc=True
)

# --- 2) Precompute per-gauge diffs once, clip negatives, and build per-gauge cumsums ---
#    (This exactly mirrors your "[np.nan] + np.diff" logic via groupby().diff())
ckpt_one_df_idxd["diffs"] = (
    ckpt_one_df_idxd.groupby("gauge_idx", sort=False)["gauge_acc_in"].diff()
)
ckpt_one_df_idxd["diffs"] = ckpt_one_df_idxd["diffs"].clip(lower=0)
ckpt_one_df_idxd["csum"] = (
    ckpt_one_df_idxd.groupby("gauge_idx", sort=False)["diffs"].cumsum()
)

# Keep tiny per-gauge views for super fast lookups via searchsorted
gauge_groups = {
    g: df[["csum"]]  # DatetimeIndex retained as the index
    for g, df in ckpt_one_df_idxd.groupby("gauge_idx", sort=False)
}

# Micro-optimization: local binding for tight loop
_append = gauge_1hr_accums.append

# --- 3) Main loop: O(log n) boundary lookups + O(1) window sum via cumsum ---
for i, row in tqdm(
    enumerate(dense_aligned_sorted_df_clone.itertuples()),
    total=len(dense_aligned_sorted_df_clone),
):
    _t1 = row.start_datetime_utc
    _t2 = row.end_datetime_utc
    
    # (If any row still isn't a Timestamp for some reason, coerce once)
    if not isinstance(_t1, pd.Timestamp):
        _t1 = pd.to_datetime(_t1, utc=True)
    if not isinstance(_t2, pd.Timestamp):
        _t2 = pd.to_datetime(_t2, utc=True)
    assert _t1 < _t2

    gdf = gauge_groups.get(row.gauge_idx)
    if gdf is None or len(gdf) == 0:
        _append(np.nan)
        continue

    idx = gdf.index                  # DatetimeIndex
    csum = gdf["csum"].to_numpy()    # cumulative sum of clipped diffs

    # "No prior data" check matches your len(gauge_prev_df) < 1 logic (<= _t1)
    prev_count = idx.searchsorted(_t1, side="right")
    if prev_count < 1:
        _append(np.nan)
        continue

    # Window [t1, t2] inclusive
    left = idx.searchsorted(_t1, side="left")
    right = idx.searchsorted(_t2, side="right")

    # If no points in the window, it's 0.0 (same as len(gauge_1hr_df) == 0)
    if right - left == 0:
        _append(0.0)
        continue

    # Sum of diffs in [left, right) via cumulative sum (O(1))
    acc_precip = float(csum[right - 1] - (csum[left - 1] if left > 0 else 0.0))

    _append(acc_precip)

100%|██████████| 18184760/18184760 [06:41<00:00, 45326.45it/s]


In [39]:
dense_aligned_sorted_df_clone["gauge_acc_in"] = gauge_1hr_accums
dense_aligned_sorted_df_clone[['gauge_idx', 'start_datetime_utc', 'end_datetime_utc', 'gauge_acc_in']].dropna().to_csv("data/__checkpoints__/ds_ckpt_3.csv")

In [44]:
das_sub_df = dense_aligned_sorted_df_clone[['gauge_idx', 'start_datetime_utc', 'end_datetime_utc', 'gauge_acc_in']].dropna().sort_values("gauge_acc_in")
das_sub_df[::-1][:50]

Unnamed: 0,gauge_idx,start_datetime_utc,end_datetime_utc,gauge_acc_in
12927651,12.0,2021-03-16 01:44:00+00:00,2021-03-16 02:44:00+00:00,2.4
7272111,12.0,2021-03-16 01:50:00+00:00,2021-03-16 02:50:00+00:00,2.4
12776731,12.0,2021-03-16 01:42:00+00:00,2021-03-16 02:42:00+00:00,2.4
11724911,12.0,2021-03-16 01:32:00+00:00,2021-03-16 02:32:00+00:00,2.4
4753111,12.0,2021-03-16 01:46:00+00:00,2021-03-16 02:46:00+00:00,2.4
4197171,12.0,2021-03-16 01:34:00+00:00,2021-03-16 02:34:00+00:00,2.4
12843611,12.0,2021-03-16 01:48:00+00:00,2021-03-16 02:48:00+00:00,2.4
9639751,12.0,2021-03-16 01:36:00+00:00,2021-03-16 02:36:00+00:00,2.4
15968491,12.0,2021-03-16 01:38:00+00:00,2021-03-16 02:38:00+00:00,2.4
16022611,12.0,2021-03-16 01:40:00+00:00,2021-03-16 02:40:00+00:00,2.4


### 4. Append MRMS 1H QPE; $\forall_{t \in \mathcal{T}_{MRMS}} \forall_{k \in \mathcal{I}}$
---

- 4a. Get gauge location: $\mathcal{I}_k = (lat, lon)$
- 4b. Find nearest MRMS grid-cell to rain gauge $k' = (lat', lon') \approx (lat, lon)$
- 4c. Get $MRMS_{t, k'}$; append to aligned dataset $D$
    - $D_{t, k, mrms} = $ $ MRMS_{t, k'} \over{25.4} $
    - > *Note: we divide by 25.4 to convert MRMS data from millimeters to inches*

| gauge-idx | start-datetime-utc | end-datetime-utc | gauge-1h-acc | mrms-1h-qpe |
| :---: | :---: | :---: | :---: | :---: |
| 4 | 2025-03-03-14:00 | 2025-03-03-15:00 | 0.1 | 0.13 |
| 4 | 2025-03-03-14:02 | 2025-03-03-15:02 | 0.2 | 0.19 |
| 4 | ... | ... | ... | ... |
| 4 | 2025-03-03-14:30 | 2025-03-03-15:30 | 0.1 | 0.25 |
| 5 | 2025-03-03-14:30 | 2025-03-03-15:30 | `NaN` | 0.08 |

- > *Note: we should have `mrms-1h-acc` values for all rows in this dataset, but not necessarily `gauge-1h-acc` values*

In [45]:
import pandas as pd


ckpt_3_df = pd.read_csv("data/__checkpoints__/ds_ckpt_3.csv")[['gauge_idx', 'start_datetime_utc', 'end_datetime_utc', 'gauge_acc_in']]
ckpt_3_df.head()

Unnamed: 0,gauge_idx,start_datetime_utc,end_datetime_utc,gauge_acc_in
0,2.0,2021-01-23 23:00:00+00:00,2021-01-24 00:00:00+00:00,0.0
1,4464.0,2021-01-23 23:00:00+00:00,2021-01-24 00:00:00+00:00,0.0
2,4474.0,2021-01-23 23:00:00+00:00,2021-01-24 00:00:00+00:00,0.0
3,4479.0,2021-01-23 23:00:00+00:00,2021-01-24 00:00:00+00:00,0.0
4,4484.0,2021-01-23 23:00:00+00:00,2021-01-24 00:00:00+00:00,0.0


In [46]:
prev_aligned_df.head()

Unnamed: 0,start_time,end_time,station_id,lat,lon,gauge_qpe,mrms_qpe,delta_qpe,cum_gauge_qpe,cum_mrms_qpe
0,2021-01-23 23:00:00,2021-01-24 00:00:00,4709,35.925,244.883,0.0,0.003937,-0.003937,0.0,0.003937
1,2021-01-23 23:00:00,2021-01-24 00:00:00,4564,36.02825,244.996361,0.04,0.0,0.04,0.04,0.0
2,2021-01-23 23:00:00,2021-01-24 00:00:00,4779,36.021861,245.04125,0.04,0.0,0.04,0.04,0.0
3,2021-01-23 23:00:00,2021-01-24 00:00:00,4984,35.72925,244.811972,0.0,0.011811,-0.011811,0.0,0.011811
4,2021-01-23 23:00:00,2021-01-24 00:00:00,4724,35.939417,244.92225,0.0,0.015748,-0.015748,0.0,0.015748


In [17]:
gauge_metadata_df = pd.read_csv("data/ccrfcd_rain_gauge_metadata.csv")
gauge_metadata_df = gauge_metadata_df.loc[gauge_metadata_df['station_id'] > 0.0]
gauge_metadata_df.head()

Unnamed: 0.1,Unnamed: 0,station_id,name,old_name,type,oos,lat,lon
1,1,2.0,Willow Beach 2 (NPS),Willow Beach 2 (NPS) RAIN (Rain),NPS System,False,35.87789,-114.58875
3,3,4.0,Malpais Flattop Mesa (NPS),Malpais Flattop Mesa (NPS) RAIN (Rain),NPS System,False,35.82014,-114.66325
4,4,5.0,Householder Pass (NPS),Householder Pass (NPS) RAIN (Rain),NPS System,False,35.82578,-114.59625
6,6,7.0,Eldorado (NPS) - New Repeater,Eldorado (NPS) RAIN (Rain),NPS System,False,35.74642,-114.576
7,7,8.0,Willow Beach 8 (NPS),Willow Beach 8 (NPS) RAIN (Rain),NPS System,False,35.961,-114.62764


In [None]:
mrms_qpes = []

for row in tqdm(ckpt_3_df.itertuples(), total=len(ckpt_3_df)):

    station_id = row.gauge_idx
    end_time = row.end_datetime_utc
    
    _df = prev_aligned_df.loc[prev_aligned_df['station_id'] == station_id]
    _df = _df.loc[_df['end_time'] == end_time[:-6]]

    if _df.empty: 
        mrms_qpes.append(np.nan)
        continue

    assert len(_df['mrms_qpe']) == 1
    mrms_qpe = _df['mrms_qpe'].iloc[0]
    mrms_qpes.append(mrms_qpe)

In [71]:
ckpt_3_df['mrms_qpe'] = mrms_qpes
ckpt_3_df.dropna().to_csv("data/__checkpoints__/ds_ckpt_4.csv")

### 5. QC & Validation

In [2]:
import pandas as pd


ckpt_4_df = pd.read_csv("data/___checkpoints___/ds_ckpt_4.csv").drop("Unnamed: 0", axis=1)
ckpt_4_df

Unnamed: 0,gauge_idx,start_datetime_utc,end_datetime_utc,gauge_acc_in,mrms_qpe
0,4564.0,2021-01-23 23:00:00+00:00,2021-01-24 00:00:00+00:00,0.0,0.000000
1,4779.0,2021-01-23 23:00:00+00:00,2021-01-24 00:00:00+00:00,0.0,0.000000
2,4709.0,2021-01-23 23:00:00+00:00,2021-01-24 00:00:00+00:00,0.0,0.003937
3,4724.0,2021-01-23 23:00:00+00:00,2021-01-24 00:00:00+00:00,0.0,0.015748
4,4564.0,2021-01-23 23:02:00+00:00,2021-01-24 00:02:00+00:00,0.0,0.000000
...,...,...,...,...,...
1557501,3229.0,2025-07-20 03:40:00+00:00,2025-07-20 04:40:00+00:00,0.0,0.007874
1557502,3229.0,2025-07-20 03:42:00+00:00,2025-07-20 04:42:00+00:00,0.0,0.007874
1557503,3229.0,2025-07-20 03:44:00+00:00,2025-07-20 04:44:00+00:00,0.0,0.003937
1557504,3229.0,2025-07-20 03:46:00+00:00,2025-07-20 04:46:00+00:00,0.0,0.003937


In [5]:
ckpt_4_df.sort_values("mrms_qpe")[::-1]

Unnamed: 0,gauge_idx,start_datetime_utc,end_datetime_utc,gauge_acc_in,mrms_qpe
1553634,3914.0,2025-07-18 19:20:00+00:00,2025-07-18 20:20:00+00:00,0.95,2.606299
1553599,3914.0,2025-07-18 19:18:00+00:00,2025-07-18 20:18:00+00:00,0.95,2.598425
1553670,3914.0,2025-07-18 19:22:00+00:00,2025-07-18 20:22:00+00:00,0.95,2.586614
1553706,3914.0,2025-07-18 19:24:00+00:00,2025-07-18 20:24:00+00:00,0.95,2.570866
1553564,3914.0,2025-07-18 19:16:00+00:00,2025-07-18 20:16:00+00:00,0.91,2.547244
...,...,...,...,...,...
24,4564.0,2021-01-23 23:12:00+00:00,2021-01-24 00:12:00+00:00,0.00,0.000000
566225,3284.0,2023-03-22 12:14:00+00:00,2023-03-22 13:14:00+00:00,0.04,-0.118110
566224,3284.0,2023-03-22 12:12:00+00:00,2023-03-22 13:12:00+00:00,0.04,-0.118110
566223,3284.0,2023-03-22 12:10:00+00:00,2023-03-22 13:10:00+00:00,0.04,-0.118110
