In [None]:
import sys
sys.path.append('src')
import pandas as pd
import numpy as np
import os
import os.path as osp
import subprocess
from src.data_funcs import get_stids
import json
import pickle
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

## Retrieve Data From OpenWFM

The data is a RAWS dictionary formatted using code from `wrfxpy`. **TODO:** add a link to the instructions for reproducing the data creation.

In [None]:
filename = "raws_rocky_202305-202405.pkl"

# Check for the file in the local data directory; retrieve with wget if missing.
# The check has to look inside "data" because that is where wget saves the
# file (-P data) and where the notebook reads it from afterwards.
if not osp.exists(osp.join("data", filename)):
    base_url = "https://demo.openwfm.org/web/data/fmda/dicts/"
    # Build the URL by plain concatenation (base_url already ends with "/");
    # osp.join uses the OS path separator, which is not "/" on every platform.
    url = f"{base_url}{filename}"
    print(f"Retrieving data {url}")
    status = subprocess.call(f"wget -P data {url}", shell=True)
    # Report a failed download here rather than leaving it to surface later
    # as a confusing read error.
    if status != 0:
        print(f"wget exited with status {status}; {filename} may be missing or incomplete")

In [None]:
# Load the pickled RAWS dictionary from the local data directory.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# obtained from a trusted source.
dat = pd.read_pickle(filepath_or_buffer=f"data/{filename}")

## Format Data in DataFrame

The data from OpenWFM is a nested dictionary. The top-level dictionary keys are RAWS station IDs. Data is collected from each RAWS station that has fuel moisture observations. A subset of those stations has additional atmospheric sensors. In this analysis, we limit the RAWS stations to those with a complete set of sensors for the variables of theoretical interest to fuel moisture modeling.

In [None]:
# RAWS timeseries keys used throughout the notebook: a station is kept only
# if its "RAWS" dictionary contains every one of these, and they are the
# default columns extracted when building a station DataFrame.
raws_vars = [
    "Ew",
    "Ed",
    "temp",
    "rh",
    "rain",
    "precip_accum",
    "fm",
    "wind",
    "solar",
    "time_raws",
]

In [None]:
def filter_moisture(arr, name=None, verbose=True):
    """Mask physically unreasonable moisture values with NaN.

    Intended for the fm, Ed, and Ew series. Values less than 1 are treated as
    not physically reasonable and are replaced by NaN.

    Parameters
    ----------
    arr : array-like of numbers
        Moisture observations. Lists and integer arrays are accepted, and the
        input object is never modified.
    name : str, optional
        Label of the series being filtered. Currently unused; kept so that
        existing calls passing it continue to work.
    verbose : bool, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    numpy.ndarray
        New float array with the same shape as `arr`, with values below 1
        replaced by NaN.
    """
    # Copy into a float array: NaN cannot be stored in an integer array, and
    # working on a copy leaves the caller's data untouched.
    filtered = np.array(arr, dtype=float)
    filtered[filtered < 1] = np.nan

    return filtered


def filter_rain(rain, verbose=True):
    """Mask implausible rain observations with NaN.

    Two filters are applied:
      * values greater than 50 are treated as erroneously high;
      * values less than 0 are treated as invalid.

    Parameters
    ----------
    rain : array-like of numbers
        Rain observations. Lists and integer arrays are accepted, and the
        input object is never modified.
    verbose : bool, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    numpy.ndarray
        New float array with the same shape as `rain`, with out-of-range
        values replaced by NaN.
    """
    # Copy into a float array so NaN can be stored and the caller's data is
    # left untouched. `np.nan` is used because the `np.NaN` alias no longer
    # exists in NumPy 2.0.
    filtered = np.array(rain, dtype=float)
    filtered[filtered > 50] = np.nan  # filter out erroneously high
    # Negative values result from the diff function after the accumulated
    # precipitation (presumably precip_accum) goes back to zero.
    filtered[filtered < 0] = np.nan

    return filtered

In [None]:
def fix_data(d):
    """Apply the quality filters to the entries of dictionary `d`.

    The "fm", "Ed" and "Ew" entries are passed through filter_moisture and the
    "rain" entry through filter_rain. Each of those entries in `d` is replaced
    by the filtered result, and the same dictionary object is returned.
    """
    for moisture_key in ("fm", "Ed", "Ew"):
        d[moisture_key] = filter_moisture(d[moisture_key], name=moisture_key)
    d["rain"] = filter_rain(d["rain"])

    return d

In [None]:
def dict_to_df(d, tvars=raws_vars):
    """Convert one station dictionary into a time-indexed DataFrame.

    Parameters
    ----------
    d : dict
        Station dictionary with a "RAWS" entry (timeseries, including a
        "time_raws" sequence of "%Y-%m-%dT%H:%M:%SZ" strings) and a "loc"
        entry (static station fields).
    tvars : list of str, optional
        Names of the RAWS timeseries to copy into the DataFrame. Names that
        are missing from d["RAWS"] are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per entry of d["RAWS"]["time_raws"], indexed by the parsed
        timestamps. Columns are the selected timeseries, every "loc" field
        repeated on each row, "time_raws" as datetimes, and the derived
        "hour", "doy" and "date" fields.
    """
    raws = d["RAWS"]
    # RAWS timeseries vars
    data1 = {key: raws[key] for key in tvars if key in raws}
    # Static RAWS station location vars, fill to length of the timeseries
    n_obs = len(raws["time_raws"])
    data2 = {key: np.full(n_obs, value) for key, value in d["loc"].items()}
    # Combine into DF
    df = pd.DataFrame({**data1, **data2})
    # Parse the timestamps once, from the dictionary passed in (not from any
    # notebook-level variable), so the function works for any station dict.
    # DatetimeIndex keeps a datetime dtype even when there are no rows.
    times = pd.DatetimeIndex(
        [datetime.strptime(dt_str, "%Y-%m-%dT%H:%M:%SZ") for dt_str in raws["time_raws"]]
    )
    df["time_raws"] = times
    df.index = times
    # Add Derived Time Fields: hour of day, day of year
    df["hour"] = df.index.hour
    df["doy"] = df.index.dayofyear
    df["date"] = df.index

    return df

In [None]:
dfs = []  # one formatted DataFrame per station with a complete set of sensors

for k, subdict in dat.items():
    print("~"*50)
    loc = subdict["loc"]
    print(loc)
    # Skip stations that are missing any of the required RAWS variables
    if not all(key in subdict["RAWS"] for key in raws_vars):
        print(f"Incomplete sensor variables for {loc['STID']}")
        continue
    print(f"Formatting data for {loc['STID']}")
    # Filter the raw series, store them back on the station, then tabulate
    subdict["RAWS"] = fix_data(subdict["RAWS"])
    dfs.append(dict_to_df(subdict))

In [None]:
# Stack the per-station DataFrames row-wise into a single DataFrame
df = pd.concat(dfs, axis=0)

In [None]:
# Save the combined DataFrame as "raws_df0.pkl" inside the "data" directory,
# using the highest pickle protocol available
with open(osp.join("data", "raws_df0.pkl"), mode="wb") as handle:
    pickle.dump(df, handle, pickle.HIGHEST_PROTOCOL)