# Scenario B - Baseline Variation (multiple runs)

In this scenario the baseline underlying a spectrum with a fixed number of Gaussian peaks is varied between no baseline, a constant y-offset and a linear baseline. All datasets contain 3 peaks and the noise level is kept constant at 1%.

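The models used in the inference of the parameters are formulated as follows, where $M$ is the number of peaks, $A_m$, $\mu_m$ and $\sigma_m$ are the amplitude, position and width of peak $m$, $a_0$ and $a_1$ are the offset and slope of the baseline, and $\epsilon$ is the noise term: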

1. No baseline:
\begin{equation}
\large y = f(x) = \sum\limits_{m=1}^M \big[A_m \cdot e^{-\frac{(x-\mu_m)^2}{2\cdot\sigma_m^2}}\big] + \epsilon
\end{equation}

2. Constant offset:
\begin{equation}
\large y = f(x) = \sum\limits_{m=1}^M \big[A_m \cdot e^{-\frac{(x-\mu_m)^2}{2\cdot\sigma_m^2}}\big] + a_0 + \epsilon
\end{equation}

3. Linear baseline:
\begin{equation}
\large y = f(x) = \sum\limits_{m=1}^M \big[A_m \cdot e^{-\frac{(x-\mu_m)^2}{2\cdot\sigma_m^2}}\big] + a_0 + a_1 \cdot x + \epsilon
\end{equation}

This notebook performs a series of inference runs on sets of generated spectra. New spectra are generated for each run and stored on disk. After inference has been run, only the summary statistics are stored before the next run is started.

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymc3 as pm
import arviz as az

# optional ArviZ plot style, currently disabled
#az.style.use('arviz-darkgrid')

# report the installed PyMC3 version in the notebook output
print('Running on PyMC3 v{}'.format(pm.__version__))

## Import local modules

In [None]:
import os
import sys
sys.path.append('../../modules')
import datagen as dg
import models as mdl
import results as res
import figures as fig
import settings as cnf

## Local configuration

In [None]:
# output for results and images
out_path      = './output_mruns'
file_basename = out_path + '/scenario_baseline'

# run configuration; this dict is later handed to the settings module and
# read by the data generation and inference cells
conf = {
    # scenario name (this notebook varies the baseline, not the peak number)
    'scenario': 'baseline variation',

    # initialization method for sampler
    'init_mode': 'adapt_diag',

    # probabilistic model (priors)
    'prior_model': 'lognormal',

    # provide peak positions to the model as testvalues ('yes'/'no')
    'peak_info': 'yes',

    # data mode ('generate'/'preload')
    'data_mode': 'generate',

    # number of runs
    'nruns': 4,

    # number of cores to run sampling chains on
    'ncores': 2,

    # number of samples per chain
    'nsamples': 2000,
}

In [None]:
# create the output directory if it is missing; exist_ok avoids the
# check-then-create race of testing os.path.exists() first
os.makedirs(out_path, exist_ok=True)

# show the active configuration as the cell output
conf

## Save configuration

In [None]:
# hand the configuration dict and the output path to the project's settings
# module (imported as cnf); presumably this writes the configuration to a
# file under out_path -- verify in settings.save
cnf.save(out_path, conf)

# Generate data and plot

In [None]:
# list of wavelengths (x-values): 200 up to (excluding) 400 in steps of 2
xval = list(range(200, 400, 2))

# number of spectra per baseline type
nsets = 4

# baseline variation
baselines = ['none', 'offset', 'linear']

# baseline label of every dataset, in generation order (nsets per baseline)
lbline = [bl for bl in baselines for _rep in range(nsets)]

# total number of datasets (per run)
tsets = nsets * len(baselines)

# total number of inference runs within one run of the outer loop:
# every baseline model is fitted to every dataset
truns = tsets * len(baselines)

# generate nruns sets of spectra
for r in range(conf['nruns']):
    print("Generating dataset {0} of {1}".format(r+1,conf['nruns']))

    ldata, lpeaks = [], []

    # create output directory for data; exist_ok avoids the
    # check-then-create race of testing os.path.exists() first
    out_dir = out_path + '/run_{0:02d}'.format(r+1)
    os.makedirs(out_dir, exist_ok=True)

    # nsets spectra for each baseline type, in the same order as lbline
    for blv in baselines:
        for _rep in range(nsets):
            df, peaks, _ = dg.data_generator(xvalues=xval, nsamples=15, npeaks=3, tbaseline=blv)
            ldata.append(df)
            lpeaks.append(peaks)

    # save data and peak information to disk (dataset files are numbered from 01)
    for number, dataset in enumerate(ldata, start=1):
        dataset.to_csv(out_dir + '/dataset_{0:02d}.csv'.format(number), index=False)
    dg.data_save(out_dir + '/peakinfo.csv', lpeaks)

    # plot datasets in a grid with two columns
    filen = out_dir + '/scenario_baseline'
    fig.plot_datasets(ldata, lpeaks, dims=(tsets // 2, 2), figure_size=(12, int(tsets * 1.8)),
                      savefig='yes', fname=filen, scenario='baseline', labels=lbline)

In [None]:
# overview of the experiment size as (message template, value) pairs
size_report = (
    ("total number of multiple runs                    : {0}", conf['nruns']),
    ("total number of baseline variations              : {0}", len(baselines)),
    ("total number of datasets per baseline variation  : {0}", nsets),
    ("total number of datasets per model               : {0}", tsets),
    ("total number of inference runs (per single loop) : {0}", truns),
)
for template, value in size_report:
    print(template.format(value))

# Load data, run inference, visualize, collect results and save 

In [None]:
# convert the x-values (a plain Python list) to a float32 numpy array;
# this array is what gets passed to the model as xvalues
x_val = np.array(xval, dtype='float32')

In [None]:
# dataframe to hold multiple run results
res_df = pd.DataFrame()

# summary frame of every finished run; res_df is rebuilt from this list after
# each run (DataFrame.append was removed in pandas 2.0), so an interrupted
# notebook still holds the results of all completed runs
run_frames = []

# run the whole loop of inference, posterior sampling, results collection and saving
for r in range(conf['nruns']):
    print("starting loop {0}/{1}".format(r+1,conf['nruns']))

    models, traces, lmodbase = [], [], []

    # load datasets from disk
    data_dir = out_path + '/run_{0:02d}'.format(r+1)
    ldata, lpeaks, _ = dg.data_load(tsets, data_dir)

    # store dataset y-values in list (column selection taken from the first dataset)
    cols = ldata[0].columns
    y_val = [dataset[cols].values for dataset in ldata]

    # actual inference run number
    inf_run = 1

    # fit every baseline model to every dataset of this run
    for bl in baselines:
        print("running: baseline-{0} model".format(bl))
        for i, observations in enumerate(y_val):
            model_args = dict(xvalues=x_val, observations=observations, npeaks=3,
                              pmodel=conf['prior_model'], baseline=bl)
            if conf['peak_info'] == 'yes':
                # pass the known peak positions, flattened and sorted ascending
                model_args['mu_peaks'] = np.sort(np.array(lpeaks[i], dtype=float).flatten())
            model_g = mdl.model_pvoigt(**model_args)
            models.append(model_g)

            with model_g:
                print("({6}:{2}/{3}) running inference on dataset #{0}/{1} [{4} model: {5} data]"
                      .format(i+1,len(ldata),inf_run,truns,bl,lbline[i],r+1))
                trace_g = pm.sample(conf['nsamples'], init=conf['init_mode'], cores=conf['ncores'])

            # remember which model was fitted to which kind of data
            lmodbase.append((bl, lbline[i]))
            traces.append(trace_g)
            inf_run += 1

    # save model figure as image (once), foreach model; the models list holds
    # len(ldata) consecutive entries per baseline, so the first model of
    # baseline number z sits at index z * len(ldata)
    if r == 0:
        for z, bl in enumerate(baselines):
            img = pm.model_to_graphviz(models[z * len(ldata)])
            img.render(filename=file_basename + '_model_{0}'.format(bl), format='png')

    # posterior predictive traces
    ppc = [pm.sample_posterior_predictive(trace, samples=500, model=model)
           for trace, model in zip(traces, models)]

    # collect the results, concat single run result to overall result
    # NOTE(review): epsilon_real is hard-coded to 0.05 while the notebook
    # introduction states a noise level of 1% -- confirm which value and
    # units res.get_results_summary expects
    lruns = [str(r+1)] * truns
    df = res.get_results_summary(traces, ppc, y_val, epsilon_real=0.05, sets=tsets,
                                 labels=lmodbase, multimodels='yes', scenario='baseline', runlist=lruns)

    run_frames.append(df)
    res_df = pd.concat(run_frames, ignore_index=True)

## Show results and save

In [None]:
# show the results accumulated over all runs as the cell output
res_df

In [None]:
# write the combined results of all runs to one .csv file inside out_path;
# index=False leaves the dataframe's row index out of the file
res_df.to_csv(out_path + '/scenario_baseline_mruns.csv', index=False)

In [None]:
# final call into the project's settings module (imported as cnf) for this
# output path; presumably it finalizes what cnf.save started -- verify in
# settings.close
cnf.close(out_path)