In [None]:
"""
This can be the preprocessing demonstration and serve as an automated test until a more formal module can be established. It will need to individually demonstrate smoothing, sharpening and baseline subtraction.
"""


In [None]:
%reload_ext autoreload
%autoreload 3 --print

import logging
from pca_analysis import xr_signal

from pca_analysis.definitions import PARAFAC2_TESTSET
import plotly.io as pio
import xarray as xr
import darkdetect

# Notebook-level logger, set to DEBUG so all messages it emits are let through.
logger = logging.getLogger(__name__)

logger.setLevel(logging.DEBUG)

# Keep xarray reprs compact: the data and coordinate sections start collapsed.
xr.set_options(display_expand_data=False, display_expand_coords=False)

# Switch plotly to its dark template when darkdetect reports a dark theme
# (presumably the OS-level appearance setting - confirm with darkdetect docs).
if darkdetect.isDark():
    pio.templates.default = "plotly_dark"

# Load the dataset referenced by PARAFAC2_TESTSET; the bare `ds` on the last
# line makes its repr the cell output.
ds = xr.load_dataset(PARAFAC2_TESTSET)
ds


## Smoothing

The criterion is that, with the default `find_peaks` parameters, no peaks are detected within the first 0.77 seconds. This can be achieved through Savitzky-Golay (savgol) smoothing.

## Sharpening

## Baseline Subtraction

To simplify tool development, we should first subtract the baseline from each sample. Whether or not there is a true baseline is questionable; however, the rise and fall roughly corresponds with the change in concentration of methanol in the mobile phase, which potentially introduces background absorption. Either way, the data will be easier to work with once the baselines are zeroed.

In [None]:
import numpy as np
from pybaselines.smooth import snip


def apply_snip(da: xr.DataArray, **kwargs) -> xr.Dataset:
    """
    Estimate a SNIP baseline for every slice of `da` and subtract it.

    `da` is iterated along its first dimension and each slice is handed to
    `pybaselines.smooth.snip`, so each slice is assumed to be a single 1D
    signal (e.g. a (sample, time) array) - TODO confirm against the caller.

    Parameters
    ----------
    da : xr.DataArray
        The signals to correct. Its own name is used as the key of the input
        variable in the returned dataset; an unnamed array is stored under
        "raw_data".
    **kwargs
        Forwarded unchanged to `snip` (e.g. `max_half_window`).

    Returns
    -------
    xr.Dataset
        Three variables sharing the dims/coords of `da`: the input (under its
        name), "baselines" (the fitted baselines) and "data_corr" (the input
        minus the baselines).

    docs: https://pybaselines.readthedocs.io/en/latest/api/pybaselines/smooth/index.html
    """
    # xr.merge cannot convert an unnamed DataArray into a dataset variable, so
    # give it the conventional name instead of failing.
    if da.name is None:
        da = da.rename("raw_data")
    data_name = da.name

    # snip returns a (baseline, params) pair; only the baseline is kept.
    blines_ = np.stack([snip(x, **kwargs)[0] for x in da])

    # Reuse the dims/coords of the input for the baselines.
    blines_da = da.copy(data=blines_)
    blines_da.name = "baselines"

    da_ = xr.merge([da, blines_da])
    # Look the input up by its actual name rather than a hard-coded
    # "raw_data", which raised KeyError for any differently named array.
    da_ = da_.assign(data_corr=lambda x: x[data_name] - x["baselines"])
    return da_


# Fit and subtract a SNIP baseline (max_half_window=30) for the raw data, then
# keep only the part of the result where `mins` is below 30.
ds = apply_snip(ds.raw_data, max_half_window=30)
ds = ds.where(ds.mins < 30, drop=True)

# Plot the raw and baseline-corrected traces against `mins`, grouped by
# `id_rank`; the two style dicts presumably pair with `data_keys` in order
# (cadetblue for raw, red for corrected). The resulting figure is the cell
# output.
xr_signal.facet_plot_multiple_traces(
    ds,
    grouper=["id_rank"],
    data_keys=["raw_data", "data_corr"],
    x="mins",
    trace_kwargs=[
        {"mode": "lines", "line": {"color": "cadetblue"}, "opacity": 0.55},
        {"mode": "lines", "line": {"color": "red"}, "opacity": 0.95},
    ],
    col_wrap=3,
    fig_kwargs={"y_title": "au"},
).update_layout(height=1500)


The results are quite acceptable. Without much fiddling, ranges are identified within which a reasonable number of peaks fall (2 < x < 6). The only drawback is that some of the parameter values are currently hard-coded, data-dependent values, meaning that a different baseline subtraction will require different values. This is a problem to be solved down the track, but it essentially means that every run will require a little manual tuning.
