To develop a PARAFAC2 pipeline, we first need to choose a test set. This should be a group of related samples with relatively high peak prominence and, preferably, easily clustered peaks.


In [None]:
%reload_ext autoreload
%autoreload 3 --print

# get the test data for the shiraz samples as a single xarray dataset and plot it
import logging

import duckdb as db
import plotly.express as px
import polars as pl
from database_etl import get_data
from pca_analysis import xr_signal

from pca_analysis.definitions import DB_PATH_UV
from pca_analysis.get_sample_data import get_ids_by_varietal
import plotly.io as pio
import xarray as xr
import darkdetect

# Notebook-level logger, set to DEBUG verbosity.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Keep xarray reprs compact: show the data and coords sections collapsed.
xr.set_options(display_expand_coords=False, display_expand_data=False)

# Switch plotly to its dark template when darkdetect reports a dark theme.
if darkdetect.isDark():
    pio.templates.default = "plotly_dark"


# Open the database at DB_PATH_UV; the connection is only used while loading.
with db.connect(DB_PATH_UV) as conn:
    # Look up the ids for the "shiraz" varietal (presumably run ids, since they
    # are passed on as `runids` below -- confirm against get_ids_by_varietal).
    ids = get_ids_by_varietal("shiraz", conn)

    # Fetch the data for those ids. output="xr" presumably requests an xarray
    # Dataset (TODO confirm against database_etl.get_data); the result is used
    # with xarray Dataset methods further down.
    ds = get_data(output="xr", con=conn, runids=ids)

# Make samples addressable by a compact, human-friendly label: dense-rank the
# "id" values and attach the integer ranks as a new "id_rank" coordinate.
ds = ds.assign_coords(
    id_rank=(
        "id",
        ds.coords["id"].to_dataframe()["id"].rank(method="dense").astype(int),
    )
)
# Use "id_rank" as the dimension in place of "id".
ds = ds.swap_dims(id="id_rank")
# Give the signal variable a more descriptive name.
ds = ds.rename(imgs="raw_data")
# Cast the wavelength labels to int so they can be selected by integer value.
ds = ds.assign(wavelength=("wavelength", ds["wavelength"].astype(int).data))

# Faceted overview of the raw signal at wavelength 256: traces are grouped by
# "id_rank" and plotted against "mins", with the y axis shared across panels.
fig = xr_signal.facet_plot_multiple_traces(
    ds.sel(wavelength=256),
    grouper=["id_rank"],
    data_keys=["raw_data"],
    x_key="mins",
    trace_kwargs=[{"mode": "lines", "line": {"color": "cadetblue"}}],
    col_wrap=3,
    fig_kwargs={"y_title": "au", "shared_yaxes": True},
).update_layout(height=1000)

# Render the dataset summary followed by the overview figure.
display(ds, fig)


As we can see, sample 2 (`id_rank` 2) is very much an outlier when compared to the other samples, so it will be removed.


In [None]:
# Remove the outlier sample by label. `drop_sel` only drops the matching entry
# along "id_rank"; unlike `where(..., drop=True)` it does not mask values first,
# so integer variables keep their dtype and variables without an "id_rank"
# dimension are not broadcast against the condition.
# errors="ignore" keeps the cell a no-op when it is re-run after the sample has
# already been removed.
ds = ds.drop_sel(id_rank=2, errors="ignore")


In [None]:
# Attach a human-readable description to the dataset attributes.
ds = ds.assign_attrs(
    description="shiraz dataset for development of PARAFAC2 pipeline. Each id_rank is a unique sample."
)
# Show the annotated dataset as the cell output.
ds


This dataset is written to `PARAFAC2_TESTSET` for use in downstream development. Any modifications to the test set should be made here and then propagated downstream.

In [None]:
from pca_analysis.definitions import PARAFAC2_TESTSET

# Persist the curated test set as a netCDF file at PARAFAC2_TESTSET.
ds.to_netcdf(path=PARAFAC2_TESTSET)
