In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pyemu
import pandas as pd

from pyemu.emulators import DSI

# Recap: Emulators

An emulator or surrogate is a model that mimics the behavior of physical models (FOSM). Instead of running the full, expensive simulation, you use the emulator to quickly predict outputs for new inputs.

- It is trained on data from the FOSM.
- It learns the relationship between inputs (features) and outputs (targets).
- Once trained, it provides fast approximations of the system’s response

<font color="#ff7700ff">

Un <code>emulador</code> es un modelo que imita el comportamiento de un modelo(s) fisico (FOSM). En vez de correr el conjunto de modelos completo (que puede ser lenta o costosa), usamos el <code>emulador</code> para predecir rápidamente los resultados para nuevos valores de entrada.

- Se entrena con datos del ensamble.
- Aprende la relación entre las <code>entradas</code> (features) y las <code>salidas</code> (targets).
- Una vez entrenado, permite obtener aproximaciones rápidas de la respuesta del sistema.

</font>

# Data Space Inversion (DSI)
DSI is a surrogate modelling approach that works by mapping statistical relationships between observations. It's super fast and relatively robust. It is good for uncertainty quantification, data assimilation and even optimization.

<font color="#ff7700ff">
<code>DSI</code> es un emulador que funciona mapeando relaciones estadísticas entre observaciones. Es súper rápida y bastante robusta. Sirve para cuantificar incertidumbre, asimilación de datos y hasta para optimización.
</font>



# Generate training data from the FOSM
Undertaking DSI relies on the existence of an ensemble of model-generated outputs (i.e., observations in the pest control file) for both historical observation quantities (e.g., heads in the pit, flows in the GDE drain, etc.) AND forecast quantities of interest - this is important so we will say it again: DSI requires the results of a Monte Carlo set of runs for both historic and future/scenario (prediction) conditions. These results are generated by running the model with a range of parameter values, which are usually sampled from the prior parameter distribution. Note that this distribution does not need to be Gaussian, and each model "parameterization" can be as complex as the user desires. Generating the combined historic-future/scenario output ensemble is the only time that the numerical model needs to be run. Ideally, the ensemble size should be as large as you can afford. However, once generated, the DSI data-driven/emulator "model" runs very quickly.

<font color="#ff7700ff">
Para usar <code>DSI</code> necesitas tener un <code>ensamble</code> de salidas generadas por el modelo (o sea, observaciones en el archivo de control de pest) tanto para cantidades históricas (ejemplo: niveles en los pozos, caudales en el dren GDE, etc) como para los pronósticos que te interesan. Esto es clave: <code>DSI</code> requiere los resultados de un set Monte Carlo de corridas para condiciones históricas y de pronóstico/escenario. Estos resultados se generan corriendo el modelo con distintos valores de parámetros, que normalmente se muestrean de la distribución previa. Ojo, esa distribución no tiene que ser gaussiana, y cada "parametrización" puede ser tan compleja como quieras. Generar el ensamble combinado de salidas históricas y de pronóstico es la única vez que necesitas correr el modelo numérico. Idealmente, el tamaño del ensamble debería ser lo más grande que puedas permitirte. Pero una vez generado, el <code>modelo</code> emulador/data-driven de <code>DSI</code> corre rapidísimo.
</font>

## Run the Prior ensemble

In [None]:
# Template directory holding the full-order model PEST setup; the check
# below stops early if it is missing (it is created by pstfrom.ipynb).
t_d = "pst_template_pmc"
if not os.path.exists(t_d):
    raise Exception("need to run pstfrom.ipynb notebook")

In [None]:
# Load the PEST control file from the template directory.
pst = pyemu.Pst(os.path.join(t_d,"pest.pst"))

# Display the control-file object as the cell output.
pst

In [None]:
# Observation table of the control file; `obs` is reused by later cells
# (e.g. to look up `obsval` and `weight`).
obs = pst.observation_data
obs.head()

In [None]:
# Show the number of realizations currently configured for pestpp-ies.
pst.pestpp_options['ies_num_reals']
# pst.pestpp_options

In [None]:
# Load the prior parameter ensemble from its binary (.jcb) file in the template directory.
pr_pe = pyemu.ParameterEnsemble.from_binary(pst, os.path.join(t_d,"prior_pe.jcb"))
# Display the ensemble dimensions.
pr_pe.shape


In [None]:
# Number of realizations pestpp-ies should use for the prior Monte Carlo run.
# Fixed at 100 here; set it to pr_pe.shape[0] instead to use every
# realization in the loaded prior parameter ensemble.
pst.pestpp_options['ies_num_reals'] = 100


In [None]:
# noptmax = -1: presumably makes pestpp-ies evaluate the prior ensemble only,
# without any parameter update (matches the "Run the Prior ensemble" section) —
# confirm against the PEST++ manual.
pst.control_data.noptmax = -1

In [None]:
# Overwrite the control file in the template directory with the updated options (version-2 format).
pst.write(os.path.join(t_d,"pest.pst"),version=2)


In [None]:
# Master directory for the training (prior Monte Carlo) run.
m_d = "master_train"

In [None]:
# Run pestpp-ies on pest.pst in parallel: workers are started from the
# template directory `t_d`, and the master runs in `m_d`.
pyemu.os_utils.start_workers(t_d,
                            "pestpp-ies",
                            "pest.pst",
                            num_workers=15,
                            worker_root=".",
                            master_dir=m_d,
                            )

# Training your emulator

## Feature engineering

Feature engineering is a data preprocessing technique that transforms raw data into more effective features (input variables) for machine learning models, improving model accuracy and performance.

In [None]:
# Re-load the control file from the master directory so that the run
# results stored there can be accessed through `pst.ies`.
pst = pyemu.Pst(os.path.join(m_d, "pest.pst"))
pst

In [None]:
# Copy of the `obsen0` observation ensemble produced by the training run.
oe_pr = pst.ies.obsen0.copy()
oe_pr.shape

In [None]:
# Training data for DSI: an independent copy of the prior observation ensemble.
data = oe_pr.copy()

data.head()

In [None]:
# Forecast observation names, read from the comma-separated "forecasts"
# PEST++ option; surrounding whitespace is stripped so the names can be used
# directly as column labels.
forecasts = [name.strip() for name in pst.pestpp_options['forecasts'].split(',')]

# Histogram of each forecast across the training realizations.
data.loc[:,forecasts].hist(figsize=(10,5))
plt.tight_layout()

In [None]:
# Distinct values of the `oname` column in the observation table.
obs.oname.unique()

In [None]:
# Columns whose name starts with "oname:hk"; these are the columns that get
# the log10 transform when the DSI object is built.
logcols = [c for c in data.columns if c.startswith("oname:hk")]
# Histogram of one of those columns (the third) in its original units.
data.loc[:, logcols[2]].hist()

The `DSI` object accepts a list of dictionaries to specify transforms on a column-by-column basis. You can specify implemented transforms (log10, normal-score, standard scaling, min/max scaling) or pass in any standard scikit-learn transformer. The latter case does not have built-in error and edge-case handling, so user beware.

<font color="#ff7700ff">
El objeto <code>DSI</code> acepta una lista de diccionarios para especificar transformaciones columna por columna. Puedes usar transformaciones ya implementadas en <code>pyEMU</code> (<code>log10</code>, <code>normal-score</code>, <code>standardscaling</code>, <code>min/max scaling</code>) o pasar cualquier transformador estándar de scikit-learn. En este último caso, no nos hacemos cargo!
</font>

In [None]:
# Transform definitions handed to DSI: log10 on the hk columns, then a
# normal-score transform (no "columns" key given for the latter).
# NOTE(review): per the original author's remark the order of the list
# matters — presumably the transforms are applied in sequence; confirm in pyemu.
transforms = [
            {"type":"log10", "columns":logcols},
            {"type":"normal_score"}, #MUST BE SEQUENTIAL!
            ]

# Build the DSI emulator from the training data; `pst` is optional.
dsi = DSI(pst=pst, #optional...
          data=data,
          transforms=transforms,
        #   energy_threshold=0.999, # the truncated-svd energy threshold
          )

# Train the emulator.
dsi.fit()

In [None]:
# List the attribute names stored on the fitted DSI object.
dsi.__dict__.keys()

Let's have a look at our transformed data

In [None]:
# Original (untransformed) training data, for comparison with the transformed data.
data.head()

In [None]:
# Copy of the transformed training data held by the DSI object.
data_transformed = dsi.data_transformed.copy()
data_transformed.head()

In [None]:
# Side-by-side histograms of the last hk column: original vs. transformed values.
fig, axs = plt.subplots(1, 2, figsize=(10, 4))

col = logcols[-1]

panels = zip(axs, (data, data_transformed), ("Original Data", "Transformed Data"))
for panel, frame, title in panels:
    panel.set_title(title)
    panel.hist(frame.loc[:, col])

fig.tight_layout()

In [None]:
# Side-by-side histograms of one forecast: original vs. transformed values.
# Both panels use the same forecast column so that they match the figure title.
fig,axs=plt.subplots(1,2,figsize=(10,4))

forecast_col = forecasts[1]

axs[0].set_title("Original Data")
axs[0].hist(data.loc[:,forecast_col])
axs[1].set_title("Transformed Data")
axs[1].hist(data_transformed.loc[:,forecast_col])
plt.suptitle("Forecast: {}".format(forecast_col))
fig.tight_layout()

In [None]:
# Shape of the singular-value array stored on the fitted DSI object.
dsi.s.shape #singular values

## Let's do some fun predicting

To run the dsi model directly, you can call `dsi.predict()` and pass in a `pvals` array (i.e., a vector of random normal values with shape equal to `dsi.s`). This is effectively the dsi "forward run". When we set up pestpp, we are parameterizing the `pvals` vector and allowing pestpp to adjust the values to improve the fit with observations.

<font color="#ff7700ff">
Para correr el modelo <code>DSI</code> directamente, puedes llamar a <code>dsi.predict()</code> y pasarle un array <code>pvals</code> (o sea, un vector de valores normales aleatorios con la misma forma que <code>dsi.s</code>). Esto es como hacer un "forward run" con DSI. Cuando armamos <code>pestpp</code>, estamos parametrizando el vector <code>pvals</code> y dejando que <code>pestpp</code> ajuste esos valores para mejorar el ajuste con las observaciones.
</font>

In [None]:
# p values of 0 should give us the mean
pvals = np.zeros_like(dsi.s)
pvals

In [None]:
# Emulator "forward run" for the all-zero pvals vector.
svals = dsi.predict(pvals)
svals

In [None]:
# Column-wise mean of the training data, to compare with the prediction above.
data.mean()

In [None]:
# Difference between the zero-pvals prediction and the training mean, sorted
# so the largest deviations appear at the ends.
(svals - data.mean()).sort_values()

In [None]:
# Same difference, left in its original (unsorted) order.
svals - data.mean()

# Prepare the DSI-PEST setup

In pyEMU, `pyemu.emulators` is the entry point for all things emulation. The minimum requirement to instantiate a `DSI` object is the training data set. When initialized it prepares in memory the various components required for DSI and associated analyses. 

Optionally you can specify the energy level truncation for SVD, data transformations (we will get to this later) and an existing `Pst` object from the full-order model. `DSI` will use information in the `Pst` to help construct a dsi pestpp template directory later. 

<font color="#ff7700ff">
En <code>pyEMU</code>, <code>pyemu.emulators</code> es el punto de entrada para todo lo relacionado con emulación. Lo mínimo que necesitas para crear un objeto <code>DSI</code> es el set de datos de entrenamiento. Al inicializarlo, prepara en memoria los componentes necesarios para DSI y los análisis asociados.

Opcionalmente puedes especificar el nivel de energía para truncar el SVD, las transformaciones de datos (esto lo vemos más adelante) y un objeto <code>Pst</code> del modelo completo. <code>DSI</code> usa la info del <code>Pst</code> para ayudarte a armar el template de pestpp para DSI más adelante.
</font>

In [None]:
# Template directory for the DSI pestpp setup.
dsi_t_d = "pst_template_dsi"

In [None]:
# Build the pestpp setup for the emulator in `dsi_t_d`; the returned object
# is used below as the DSI control file (`dpst`).
dpst = dsi.prepare_pestpp(t_d = dsi_t_d)
dpst

In [None]:
# Run settings for the DSI control file: noptmax of 2 and 100 realizations.
dpst.control_data.noptmax = 2
dpst.pestpp_options["ies_num_reals"] = 100

In [None]:
# Write the DSI control file into its template directory (version-2 format).
dpst.write(os.path.join(dsi_t_d, "dsi.pst"),version=2)

In [None]:
# DSI parameter values read from the template directory (first CSV column is
# the index); presumably written there by dsi.prepare_pestpp() — verify.
pvals = pd.read_csv(os.path.join(dsi_t_d, "dsi_pars.csv"), index_col=0)

# Settings for the parallel pestpp-ies run of the emulator.
md = "master_dsi"  # master directory (plain string: no f-string placeholders needed)
num_workers = 30
worker_root = "."

Right on! We are ready to get cracking. Let's run pestpp-ies and see what we get

In [None]:
# Run pestpp-ies on the DSI control file using Python workers: each worker
# calls `pyemu.helpers.dsi_pyworker` with the fitted emulator and the pvals
# table instead of launching an external model.
pyemu.os_utils.start_workers(
    dsi_t_d, # the master dsi template folder
    "pestpp-ies", # the pestpp to use
    "dsi.pst", # the control file
    num_workers=num_workers,
    worker_root=worker_root,
    master_dir=md, # the master directory
    #port=_get_port(),
    ppw_function=pyemu.helpers.dsi_pyworker, #pyworkers!
    ppw_kwargs={
        "dsi": dsi, "pvals": pvals,
    }
)

# For the WIN!

In [None]:
# `nnz_obs` of the DSI control file (presumably the count of non-zero weighted observations).
dpst.nnz_obs

In [None]:
# Re-load the DSI control file from the master directory so the run results
# are available through `dpst.ies`, then preview the phi table.
dpst = pyemu.Pst(os.path.join(md, "dsi.pst"))
dpst.ies.phiactual.head()

In [None]:
# Mean phi of the DSI run, one point per row of the phi table, on a log axis.
phidf = dpst.ies.phiactual

fig,ax=plt.subplots(1,1,figsize=(7,6))
ax.plot(phidf.index,phidf['mean'],"bo-", label='dsi')

ax.set_yscale('log')
ax.set_ylabel('Phi')
ax.set_xlabel('Iteration')

# Annotate with values taken from the DSI control file (`dpst`), i.e. the
# same run whose phi is plotted, rather than from the full-order model `pst`.
ax.text(0.7,0.9,
        (f"nnz_obs: {dpst.nnz_obs}\n phi_dsi: {phidf['mean'].iloc[-1]:.2e}"
        ),
        transform=ax.transAxes,ha="right",va="top")

ax.legend()

Let's check how we did with that posterior!

In [None]:
# Observation ensembles of the DSI run: `obsen0` and the ensemble of the
# last iteration recorded in the phi table.
oe_pr = dpst.ies.obsen0
oe_pt = dpst.ies.get("obsen", dpst.ies.phiactual.iteration.max())

# One figure per forecast: grey = first ensemble, blue = last-iteration
# ensemble, red line = the `obsval` stored in the observation table.
for f in forecasts:
    # Explicit figure/axes so the histograms never land on a leftover figure.
    fig, ax = plt.subplots(1, 1)
    oe_pr.loc[:,f].plot(kind="hist",ax=ax,fc="0.5",alpha=0.5,density=True)
    oe_pt.loc[:,f].plot(kind="hist",ax=ax,fc="b",alpha=0.5,density=True)
    ylim = ax.get_ylim()
    v = obs.loc[f,"obsval"]
    ax.plot([v,v],ylim,"r-",lw=2)
    ax.set_title(f)
    plt.show()

In [None]:
# hk observations with integer row/column indices for placing values on a grid.
hkobs = obs.loc[obs.oname=="hk",:].astype({"i": int, "j": int})

fig,axes = plt.subplots(1,2,figsize=(10,5))

# Both arrays share one shape, large enough for the highest i/j index.
grid_shape = (hkobs.i.max()+1, hkobs.j.max()+1)

# log10 of the "base" realization, scattered onto the grid.
prarr = np.zeros(grid_shape)
prarr[hkobs.i,hkobs.j] = np.log10(oe_pr.loc["base", hkobs.obsnme])
ptarr = np.zeros(grid_shape)
ptarr[hkobs.i,hkobs.j] = np.log10(oe_pt.loc["base", hkobs.obsnme])

# Common colour limits so the two panels are directly comparable.
vmin = min(prarr.min(),ptarr.min())
vmax = max(prarr.max(),ptarr.max())

panel_specs = zip(
    axes,
    (prarr, ptarr),
    ("Prior H$_k$ field base realization",
     "Posterior H$_k$ field base realization"),
)
for panel, arr, title in panel_specs:
    cb = panel.imshow(arr, vmin=vmin, vmax=vmax)
    panel.set_title(title)
    plt.colorbar(cb,ax=panel, fraction=0.046, pad=0.04)
plt.tight_layout()

In [None]:
# Per-realization comparison of ensemble values against the observed values
# for the non-"gde", non-zero weighted observations.
fig,ax = plt.subplots(1,1)
noise = dpst.ies.noise

keep = (~obs.obsnme.str.contains("gde")) & (obs.weight>0)
hobs = obs.loc[keep,:].sort_values(by="obsval")
hvals = hobs.obsval.values

# (ensemble, positional format, keyword style), drawn in this order for each
# realization: noise in red with markers, then oe_pr in grey, then oe_pt in blue.
layers = (
    (noise, "r-", {"marker": ".", "alpha": 0.5}),
    (oe_pr, "0.5", {"alpha": 0.5}),
    (oe_pt, "b", {"alpha": 0.5}),
)
for real in oe_pt.index:
    for ens, fmt, style in layers:
        vals = ens.loc[real,hobs.obsnme].values
        ax.plot(hvals, vals, fmt, **style)

# Square the axes around the combined data range and add the 1:1 line.
xlim, ylim = ax.get_xlim(), ax.get_ylim()
mn, mx = min(xlim[0],ylim[0]), max(xlim[1],ylim[1])
ax.plot([mn,mx],[mn,mx],"k--",lw=3)
ax.set_xlim(mn,mx)
ax.set_ylim(mn,mx)

# Effect of training data

In [None]:

# Names from `nnz_obs_names` of the DSI control file.
nzobsnmes = dpst.nnz_obs_names

# NOTE: this rebinds `obs` to the DSI control file's observation table;
# earlier cells used the table of the full-order model control file.
obs = dpst.observation_data
# Observation names whose `usecol` value is 'ghb'.
obsnmes = obs.loc[obs.usecol=='ghb'].obsnme.tolist()
obsnmes

In [None]:
# Re-train the emulator on the first `nreal` training realizations and run
# pestpp-ies on each resulting setup, one template/master directory per size.
realseq = [5, 10,50,100]
num_workers = 30
worker_root = "."
for nreal in realseq:

    # Fresh transform definitions for every DSI instance.
    transforms = [
                  {"type":"log10", "columns":obsnmes},
                  {"type":"normal_score"},
                  ]

    # Keep only the 'ghb' and `nzobsnmes` columns, and the first `nreal` rows.
    data_redux = data.loc[:,obsnmes+nzobsnmes].iloc[:nreal].copy()

    # Matching subset of the observation table for the reduced control file.
    obs_redux = obs.loc[obsnmes+nzobsnmes].copy()
    obs_redux.index.name = "index"

    dsi = DSI(
            data = data_redux,
            transforms = transforms,
            verbose = False)

    dsi.fit()
    dsi_t_d = f"template_dsi_{nreal}"

    dpst_tmp = dsi.prepare_pestpp(t_d = dsi_t_d,
                              observation_data=obs_redux)

    # A single iteration; the summary cell that follows reads iteration 1.
    dpst_tmp.control_data.noptmax = 1
    # The ensemble size stays at 100 regardless of the training-set size.
    dpst_tmp.pestpp_options["ies_num_reals"] = 100

    dpst_tmp.write(os.path.join(dsi_t_d, "dsi.pst"),version=2)

    pvals = pd.read_csv(os.path.join(dsi_t_d, "dsi_pars.csv"), index_col=0)
    md = f"master_dsi_{nreal}"
    pyemu.os_utils.start_workers(
        dsi_t_d,"pestpp-ies","dsi.pst", num_workers=num_workers,
        worker_root=worker_root, master_dir=md,
        ppw_function=pyemu.helpers.dsi_pyworker,
        ppw_kwargs={
            "dsi": dsi, "pvals": pvals,
        }
    )

    # Re-load from the master directory and report phi; a bare expression
    # inside a loop is never displayed, so print it explicitly.
    dpst_tmp = pyemu.Pst(os.path.join(md, "dsi.pst"))
    print(f"training realizations: {nreal}")
    print(dpst_tmp.ies.phiactual.head())

In [None]:
# Mean phi at iteration 1 for each training-set size.
realseq = [5, 10,50,100]
phis = []
for nreal in realseq:
    md = f"master_dsi_{nreal}"
    dpst = pyemu.Pst(os.path.join(md, "dsi.pst"))

    phi = dpst.ies.phiactual
    # Store a plain float (not a 1x1 DataFrame) so the values plot cleanly.
    phis.append(float(phi.loc[phi.iteration==1, "mean"].iloc[0]))

fig, ax = plt.subplots(1,1,figsize=(7,6))
ax.scatter(realseq, phis)
ax.set_yscale("log")
ax.set_xlabel("Number of training realizations")
ax.set_ylabel("Mean phi (iteration 1)");