# Set observation values, weights and noise

In [None]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyemu

Load the "observation data"

In [None]:
# Path to the raw observation CSV inside the truth-model directory.
truth_model_d = os.path.join("..", "models", "synthetic-valley-truth-advanced-monthly")
obs_csv_fname = os.path.join(truth_model_d, "raw_obs.csv")
assert os.path.exists(obs_csv_fname)

# The first CSV column becomes the index; parse_dates=True asks pandas to
# parse that index as dates.
obsdf = pd.read_csv(obs_csv_fname, parse_dates=True, index_col=0)
obsdf

Since we know the "true" values for our forecast, we will load them up and put the truth values in the control file to make plotting easier (and more interesting)

In [None]:
# Path to the long-term-mean forecast values inside the truth-model directory.
truth_model_d = os.path.join("..", "models", "synthetic-valley-truth-advanced-monthly")
fore_csv_fname = os.path.join(truth_model_d, "swgw-longterm-means.csv")
assert os.path.exists(fore_csv_fname)

# The first CSV column becomes the row index; it is used further down to look
# values up by each observation's "quantity".
foredf = pd.read_csv(fore_csv_fname, index_col=0)
foredf

In [None]:
# Directory that holds the model and PEST files; the control file "pest.pst"
# is read from and written back to this directory.
working_d = "model_and_pest_files"
# Fail early if the working directory is missing.
assert os.path.exists(working_d)

Models are always going to be low-pass filters compared to the complex natural systems that generated the observations.  So it's usually a good idea to filter out high-frequency signal components:

In [None]:
# Centered rolling mean over 18 rows of the observation table; min_periods=1
# keeps values at the ends of the record where the window is incomplete.
smoothed = obsdf.rolling(window=18, center=True, min_periods=1).mean()

# One figure per observed series: smoothed (green) on top of raw (magenta).
# Explicit labels + a legend make the figure readable on its own.
for col in smoothed.columns:
    fig, ax = plt.subplots(1, 1, figsize=(6, 3))
    smoothed.loc[:, col].plot(ax=ax, c="g", label="smoothed")
    obsdf.loc[:, col].plot(ax=ax, c="m", label="raw")
    ax.set_title(col, loc="left")
    ax.legend(loc="best")

plt.show()

Now load the control file:

In [None]:
# Load the existing PEST control file from the working directory.
pst_path = os.path.join(working_d, "pest.pst")
pst = pyemu.Pst(pst_path)

In [None]:
# Shorthand handle to the control file's observation table. This is the same
# object held by `pst`, so column edits made through `obs` are what gets
# written out by pst.write() later on.
obs = pst.observation_data

In [None]:
# Show which columns the observation table carries (obsnme, usecol, datetime,
# quantity, ... are relied on in the cells below).
obs.columns

First set the truth values for the forecasts (just for plotting later)

In [None]:
# For every forecast column, find the observations whose name contains that
# column name and overwrite their obsval with the matching truth value.
for fore_col in foredf.columns:
    name_hits = obs.obsnme.str.contains(fore_col)
    fore_obs = obs.loc[name_hits, ["obsnme", "quantity"]]
    for obs_name, quantity in zip(fore_obs["obsnme"], fore_obs["quantity"]):
        # foredf is indexed by quantity; its columns are the forecast names.
        truth_val = foredf.loc[quantity, fore_col]
        obs.loc[obs_name, "obsval"] = truth_val
        print(fore_col, quantity)

Now for the tricky part: we need to find each simulated output that we have an observed counterpart for.  In practice, this usually requires some bespoke code/hackery (we are also going to set "observed" values from the forecast period, just so we can plot it later)

In [None]:
# Observations that have a `usecol` entry (i.e. a non-null usecol).
# NOTE: boolean-mask selection yields a separate frame, so `nnobs` is a
# snapshot; values written to `obs` afterwards are not reflected in it.
nnobs = obs.loc[obs.usecol.notna(), :]

In [None]:
# usecol name fragments for which the smoothed observed series supplies obsval.
prefixes = [
    "wt",
    "aq",
    "lake-stage",
    "lake-swgw",
    "riv-flow",
    "riv-swgw",
    "diff1",
    "diff0",
]
for prefix in prefixes:
    # Substring (regex) match on usecol, not an anchored prefix match.
    is_match = nnobs.usecol.str.contains(prefix)
    matched = nnobs.loc[is_match, :].copy()
    print(prefix, matched.shape)
    matched["datetime"] = pd.to_datetime(matched.datetime)
    # Walk each usecol in order of first appearance and copy the smoothed
    # value at the observation's datetime into the control-file table.
    for usecol, col_obs in matched.groupby("usecol", sort=False):
        for name, dt in zip(col_obs.obsnme, col_obs.datetime):
            obs.loc[name, "obsval"] = smoothed.loc[dt, usecol]

Now we need to set the weights and expected noise for each observation datum:

In [None]:
# Start from a clean slate: every observation is zero-weighted and has no
# noise standard deviation or bounds until explicitly assigned below.
obs["weight"] = 0.0
for noise_col in ("standard_deviation", "lower_bound", "upper_bound"):
    obs[noise_col] = np.nan

In [None]:
obs_dict = {}
hist_prefixes = ["wt", "aq", "lake-stage", "diff1", "diff0"]
for prefix in hist_prefixes:
    uobs = nnobs.loc[nnobs.usecol.str.startswith(prefix), :].copy()
    print(uobs.usecol.unique())
    uobs["datetime"] = pd.to_datetime(uobs.datetime)
    # Only rows dated before 2015 receive weight (the history-matching period).
    hist_uobs = uobs.loc[uobs.datetime.dt.year < 2015, :]
    hist_names = hist_uobs.obsnme
    obs.loc[hist_names, "datetime"] = hist_uobs.datetime
    if "lake" in prefix:
        obs.loc[hist_names, "weight"] = 3.0
        obs.loc[hist_names, "standard_deviation"] = 0.3
    elif "diff" in prefix:
        print(prefix)
        # `nnobs` was sliced from `obs` before the observed values were
        # written into obs.obsval, so its obsval column is stale. Read the
        # current values from `obs` so the value-dependent weights and noise
        # levels are based on the observed data.
        hist_vals = obs.loc[hist_names, "obsval"].values
        obs.loc[hist_names, "weight"] = [
            5.0 if oval > 0.1 else 1.0 for oval in hist_vals
        ]
        # Noise is 10% of the value, floored at 0.01.
        obs.loc[hist_names, "standard_deviation"] = [
            max(0.01, oval * 0.1) for oval in hist_vals
        ]
        # Lower bound of zero; used later to repair noise realizations.
        obs.loc[hist_names, "lower_bound"] = 0.0
    else:
        obs.loc[hist_names, "weight"] = 2.0
        obs.loc[hist_names, "standard_deviation"] = 0.5

If this is an advanced model, we can also use riv-flow information for history matching (one benefit of a more complex model).

In [None]:
# Only present when the control file carries a "riv-flow" output column.
if "riv-flow" in obs.usecol.unique():
    uobs = nnobs.loc[nnobs.usecol == "riv-flow", :].copy()
    uobs["datetime"] = pd.to_datetime(uobs.datetime)
    # Only rows dated before 2015 receive weight (the history-matching period).
    hist_uobs = uobs.loc[uobs.datetime.dt.year < 2015, :].copy()
    # `nnobs` was sliced from `obs` before the observed values were written
    # into obs.obsval, so refresh obsval from `obs`; otherwise the noise and
    # weights below would be derived from stale control-file values.
    hist_uobs["obsval"] = obs.loc[hist_uobs.obsnme, "obsval"].values
    # Noise is 10% of the flow magnitude, floored at 0.2.
    hist_uobs["standard_deviation"] = [
        max(0.2, oval * 0.1) for oval in np.abs(hist_uobs.obsval.values)
    ]
    # Weight is the inverse of the noise standard deviation.
    hist_uobs["weight"] = 1 / hist_uobs.standard_deviation.values
    obs.loc[hist_uobs.obsnme, "standard_deviation"] = (
        hist_uobs.standard_deviation.values
    )
    obs.loc[hist_uobs.obsnme, "weight"] = hist_uobs.weight.values
    obs.loc[hist_uobs.obsnme, "datetime"] = hist_uobs.datetime

    print(hist_uobs.loc[:, ["obsval", "standard_deviation", "weight"]])

In [None]:
# Sanity check: list the observation groups that pyemu reports as non-zero
# weighted after the assignments above.
pst.nnz_obs_groups

Set noptmax to 0, save the control file and do a test run:

In [None]:
# noptmax = 0 for the test run described above.
pst.control_data.noptmax = 0

# Overwrite the control file in place (version-2 format), then launch
# pestpp-ies from inside the working directory.
pst_path = os.path.join(working_d, "pest.pst")
pst.write(pst_path, version=2)
pyemu.os_utils.run("pestpp-ies pest.pst", cwd=working_d)

Now we are going to generate some autocorrelated timeseries noise to use in the history matching:

In [None]:
# Work on a copy of the non-zero-weighted observations only.
nzobs = obs.loc[obs.weight > 0, :].copy()
obs["distance"] = np.nan

# Process observation groups in sorted order.
grps = sorted(nzobs.obgnme.unique())
struct_dict = {}
for grp in grps:
    grp_obs = nzobs.loc[nzobs.obgnme == grp, :].copy()
    grp_obs["datetime"] = pd.to_datetime(grp_obs.datetime)
    # "distance" is elapsed whole days since the group's first observation.
    elapsed = grp_obs.datetime - grp_obs.datetime.min()
    grp_obs["distance"] = elapsed.dt.days
    obs.loc[grp_obs.obsnme, "distance"] = grp_obs.distance
    # Exponential variogram with a = 365 * 20 (same day units as "distance").
    vario = pyemu.geostats.ExpVario(contribution=1.0, a=365 * 20)
    geostruct = pyemu.geostats.GeoStruct(variograms=vario, name=grp)
    # Map each group's structure to the observation names it applies to.
    struct_dict[geostruct] = grp_obs.obsnme.to_list()

In [None]:
# Draw 1000 noise realizations for the control file's observations, using the
# per-group structures (and their observation-name lists) in struct_dict.
noise = pyemu.helpers.autocorrelated_draw(pst, struct_dict, num_reals=1000)

We know from expert knowledge that the vertical head differences are never negative between the water table and aquifer, so let's repair any noise realizations that have that condition

In [None]:
# Observations that have a lower bound defined.
lbnd = obs.loc[pd.notna(obs.lower_bound), "lower_bound"]
for name, bnd in lbnd.items():
    # Raise any realization that fell below the bound up to the bound itself;
    # values at or above the bound are left untouched.
    vals = noise.loc[:, name].values
    noise.loc[:, name] = np.where(vals < bnd, bnd, vals)
    print(name)

Save the noise ensemble, tell ies about it, and do an noptmax=-2 test run:

In [None]:
# Persist the noise ensemble next to the control file.
noise_path = os.path.join(working_d, "noise.csv")
noise.to_csv(noise_path)

# Reset the pestpp options to just the existing parameter ensemble entry plus
# the new observation-noise ensemble file.
par_en_file = pst.pestpp_options["ies_par_en"]
pst.pestpp_options = {"ies_par_en": par_en_file, "ies_obs_en": "noise.csv"}

# noptmax = -2 for the test run described above.
pst.control_data.noptmax = -2
pst_path = os.path.join(working_d, "pest.pst")
pst.write(pst_path, version=2)
pyemu.os_utils.run("pestpp-ies pest.pst", cwd=working_d)

As you can see from the phi group summary, we need some rebalanced weights.  One way to do this is with the `ies_phi_factor_file` option:

In [None]:
# Phi factors per group tag; a separate set that also includes "riv-flow" is
# used when the control file carries that output.
has_riv_flow = "riv-flow" in obs.usecol.unique()
if has_riv_flow:
    phi_factors = {"lake": 0.15, "aq": 0.25, "wt": 0.25, "diff": 0.15, "riv-flow": 0.2}
else:
    phi_factors = {"lake": 0.2, "aq": 0.3, "wt": 0.3, "diff": 0.2}
# phi_factors["diff"] = 1e-20

# Write "tag,factor" rows with no header line, register the file with ies,
# then rewrite the control file and run again.
ser = pd.Series(phi_factors)
phi_fac_path = os.path.join(working_d, "phi_facs.csv")
ser.to_csv(phi_fac_path, index=True, header=False)
pst.pestpp_options["ies_phi_factor_file"] = "phi_facs.csv"
pst.write(os.path.join(working_d, "pest.pst"), version=2)
pyemu.os_utils.run("pestpp-ies pest.pst", cwd=working_d)