2023-10-30

Continuing from here: [creating_3d_dataset](/Users/jonathan/mres_thesis/wine_analysis_hplc_uv/src/wine_analysis_hplc_uv/notebooks/creating_3d_dataset.ipynb)

This document contains my PCA analysis of a sample HPLC-DAD dataset, including rank estimation.

In [None]:
# initialise environment

%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn.objects as so
import seaborn as sns
from mcr_methods import Preprocessing, MCR_Analysis
from pymcr.constraints import ConstraintNonneg, ConstraintNorm
from pymcr.regressors import OLS, NNLS

mcr_pro = Preprocessing()
mcr_anal = MCR_Analysis()

In [None]:
# Load the dataset stored in tidy_3d_dset_raw.parquet and preview the first rows.
# NOTE(review): the path is absolute and machine-specific, so this cell will
# only run where that directory layout exists.
dset = pd.read_parquet(
    "/Users/jonathan/mres_thesis/wine_analysis_hplc_uv/src/wine_analysis_hplc_uv/notebooks/tidy_3d_dset_raw.parquet"
)
dset.head()

In [None]:
# Keep only the rows whose samplecode is "111", drop the listed descriptor
# columns, and renumber the rows from zero under an index named "i".
dropped_cols = ["detection", "samplecode", "wine", "color", "varietal", "id", "mins"]
is_sample_111 = dset["samplecode"] == "111"
d111 = dset.loc[is_sample_111].drop(columns=dropped_cols)
d111 = d111.reset_index(drop=True).rename_axis("i")

d111

First the data needs to be smoothed and baseline corrected.

In [None]:
# Reshape wide -> long: one row per (i, wavelength) pair with the reading in
# "abs". ignore_index=False carries the "i" index through the melt so that
# reset_index() can turn it into an ordinary column afterwards.
long_d111 = pd.melt(
    d111, var_name="wavelength", value_name="abs", ignore_index=False
)
m_d111 = long_d111.reset_index()
m_d111

In [None]:
# display head of melted frame
m_d111.head()

In [None]:
# Overlay the raw "abs" signal of every wavelength on one small axes, one line
# per wavelength, with the legend suppressed.
fig, ax = plt.subplots(figsize=(5, 3), dpi=150)

p = sns.lineplot(
    data=m_d111, x="i", y="abs", hue="wavelength", legend=False, ax=ax
)

In [None]:
# Smooth the signal. The keyword name suggests Savitzky-Golay settings are
# forwarded; presumably "abs" is smoothed per "wavelength" into a new "smooth"
# column -- confirm against Preprocessing.smooth.
savgol_settings = dict(window_length=4, polyorder=2)
m_d111 = mcr_pro.smooth(
    m_d111, "wavelength", "abs", "smooth", savgol_kws=savgol_settings
)

# Check the result visually on a single wavelength ("256").
smoothed_256 = m_d111.groupby("wavelength").get_group("256")
so.Plot(data=smoothed_256, x="i", y="smooth").add(so.Line()).show()

In [None]:
# Baseline-correct the smoothed signal.
# presumably writes the corrected signal to "bcorr" and the fitted baseline to
# "bline" (the latter is read below) -- confirm against
# Preprocessing.baseline_subtract.
m_d111 = mcr_pro.baseline_subtract(m_d111, "smooth", "wavelength", "bcorr")

display(m_d111.head())


def _show(df):
    """Display *df* as a notebook side effect and hand it back unchanged."""
    display(df)
    return df


# Overlay the "smooth" and "bline" columns for the 256 trace, displaying the
# frame before and after it is melted to long form.
signals_256 = (
    m_d111.groupby("wavelength").get_group("256").loc[:, ["i", "smooth", "bline"]]
)
signals_256 = _show(signals_256)
long_256 = _show(
    signals_256.melt(var_name="signal", value_name="abs", id_vars="i")
)
so.Plot(data=long_256, x="i", y="abs", color="signal").add(so.Line())

Next, the data needs to be scaled and centered. Before that, subset to the region of interest, i < 4000.

In [None]:
# Restrict the data to the region of interest on "i" (bound 4000; whether the
# bound is inclusive depends on Preprocessing.subset -- confirm).
m_d111_subset = mcr_pro.subset(m_d111, "i", 4000)

display(m_d111_subset.head())

# Plot the baseline-corrected 256 trace of the subset.
subset_256 = m_d111_subset.groupby("wavelength").get_group("256").reset_index()
so.Plot(data=subset_256, x="i", y="bcorr").add(so.Line())

In [None]:
# Scale and center the "bcorr" column; the result is read back below from the
# "scale_center" column.
m_d111_subset = mcr_pro.scale_and_center(m_d111_subset, "bcorr")

# Plot the scaled-and-centred 256 trace.
scaled_256 = m_d111_subset.groupby("wavelength").get_group("256")
so.Plot(data=scaled_256, x="i", y="scale_center").add(so.Line()).show()

# PCA

@juan_mcriter_2020 says that PCA can be used to estimate the number of compounds in $X$. @nardecchia_2020 says that this is based on "the scree test for the number of factors". Plotting eigenvalues against components, the chemical rank is defined as the point at which the curve elbows.

Component selection is necessarily somewhat arbitrary, so I will keep only those components whose variance % is greater than 1E-3.

In [None]:
# Pivot to a wide matrix (rows: "i", columns: "wavelength") of the
# scaled-and-centred signal and pass it to the component estimator.
scaled_matrix = m_d111_subset.pivot_table(
    index="i", columns="wavelength", values="scale_center"
)
n_components = mcr_anal.calculate_components(scaled_matrix)

Therefore for this dataset, the chemical rank is 6. This is very surprising, as I was expecting at least as many components as peaks.

## Counting Peaks

In [None]:
# Find the peaks, defined as those at least as prominent as 2% of the signal
# maximum (prom_ratio=0.02). Presumably evaluated per "wavelength" group --
# confirm against MCR_Analysis.detect_peaks.
peak_settings = dict(
    grouper="wavelength",
    target_col="bcorr",
    peaks_colname="bcorr_peaks",
    prom_ratio=0.02,
)
m_d111_subset = mcr_anal.detect_peaks(df=m_d111_subset, **peak_settings)

display(m_d111_subset.head())

# Overlay the detected peaks (dots) on the 256 trace (line).
peaks_256 = m_d111_subset.groupby("wavelength").get_group("256")
so.Plot(peaks_256, x="i").add(so.Line(), y="bcorr").add(so.Dot(), y="bcorr_peaks")

In [None]:
# a prominence ratio of 0.02 (2%) is appropriate for 256nm, but is it
# appropriate for all wavelengths? Compare three wavelengths side by side,
# each facet with its own y-axis.
compare_wavelengths = ["190", "256", "400"]
compare_df = m_d111_subset.loc[
    m_d111_subset["wavelength"].isin(compare_wavelengths)
]
(
    so.Plot(compare_df, x="i")
    .facet("wavelength")
    .share(y=False)
    .add(so.Line(), y="bcorr")
    .add(so.Dot(), y="bcorr_peaks")
    .layout(size=(15, 3))
)

In [None]:
# Same peak overlay, for the "190" wavelength on its own.
peaks_190 = m_d111_subset.groupby("wavelength").get_group("190")
so.Plot(peaks_190, x="i").add(so.Line(), y="bcorr").add(so.Dot(), y="bcorr_peaks")

In [None]:
# Number of detected peaks (non-null "bcorr_peaks" entries) per wavelength,
# drawn as a line plot.
peaks_per_wavelength = m_d111_subset.groupby("wavelength")["bcorr_peaks"].count()
peaks_per_wavelength.plot()

So yeah, there is a disconnect between the expected components and the number of peaks. Ah well, pushing on.

# SIMPLISMA

SIMPLe-to-use Interactive Self-modeling Mixture Analysis.

Selection of pure variables from $D$.

First published by @windig_1991.

In [None]:
m_d111_subset

In [None]:
# Main Algorithm

# Pivot the long frame to a wide matrix of the scaled-and-centred signal:
# rows are "i", columns are "wavelength".
m_d111_subset_aug = m_d111_subset.pivot_table(
    columns="wavelength", index="i", values="scale_center"
)
# Run Simplisma
# NOTE(review): the meaning of the two positional 5s is not visible from this
# notebook. The PCA section reports a chemical rank of 6 -- confirm whether one
# of these arguments is the number of components and whether 5 is intended.
sp, concs = mcr_anal.simplisma(m_d111_subset_aug.values, 5, 5)

In [None]:
sp.shape

In [None]:
plt.plot(sp)

In [None]:
plt.plot(m_d111_subset_aug)

In [None]:
sp.shape

In [None]:
m_d111_subset_aug.shape

# MCR

In [None]:
# Run the project's MCR-ALS wrapper, passing the SIMPLISMA output `sp` as S.
# NOTE(review): D is passed transposed (wavelength rows, "i" columns) --
# confirm this orientation matches what `sp` and the wrapper expect.
mcrar = mcr_anal.mcr_als(
    D=m_d111_subset_aug.T,
    S=sp,
    mcr_als_kws=dict(
        max_iter=100,
        st_regr="OLS",
        c_regr="OLS",
        # presumably applied to the C estimate in list order -- verify
        c_constraints=[ConstraintNonneg(), ConstraintNorm()],
        tol_increase=1e4,
        tol_n_above_min=10,
    ),
    mcr_als_fit_kws=dict(verbose=True),
)
mcrar

In [None]:
copt = mcrar.C_opt_

In [None]:
copt.shape

In [None]:
plt.plot(m_d111_subset_aug)

In [None]:
# Wrap the C_opt_ array in a DataFrame (default integer row/column labels) and
# preview it.
copt = pd.DataFrame(copt)
copt.head()

In [None]:
# Long form of copt: "column" holds the original column label, "conc" the value.
m_copt = copt.melt(var_name="column", value_name="conc")
m_copt.head()

In [None]:
plt.plot(mcrar.C_opt_.dot(mcrar.ST_opt_).T)

In [None]:
# Form the product C_opt_ . ST_opt_ and label it with the row/column labels of
# the pivoted input matrix.
# NOTE(review): the model was fitted on m_d111_subset_aug.T, so confirm the
# product really has the ("i" x wavelength) shape these labels assume.
out = pd.DataFrame(
    mcrar.C_opt_.dot(mcrar.ST_opt_),
    columns=m_d111_subset_aug.columns,
    index=m_d111_subset_aug.index,
)
out.head()

In [None]:
# Melt the pivoted input matrix back to long form and plot its "256" trace.
input_long = m_d111_subset_aug.reset_index().melt(
    id_vars="i", var_name="wavelength", value_name="abs"
)
input_long.groupby("wavelength").get_group("256")["abs"].plot()

In [None]:
# Melt `out` to long form ("i", "wavelength", "abs") and plot its "256" trace.
melt_out = pd.melt(
    out.reset_index(), id_vars="i", var_name="wavelength", value_name="abs"
)
out_256 = melt_out.groupby("wavelength").get_group("256")
out_256["abs"].plot()

In [None]:
melt_out.head()

In [None]:
# Replace missing values with 0 -- presumably so the contour plot below
# receives no NaNs (verify that 0 is a sensible fill for this signal).
melt_out = melt_out.fillna(0)
melt_out.head()

In [None]:
# Filled contour of the long-form values over (wavelength, i), with the
# contour bands listed in a legend placed outside the axes.
# `levels` is the keyword matplotlib's contour functions recognise; the
# previous `level=10` is not a contour parameter, so the requested number of
# levels was never applied.
cc = plt.tricontourf(melt_out.wavelength, melt_out.i, melt_out["abs"], levels=10)
artists, labels = cc.legend_elements()
plt.legend(artists, labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

In [None]:
# Persist the pivoted, scaled-and-centred matrix to parquet.
# NOTE(review): absolute, machine-specific output path.
opath = "/Users/jonathan/mres_thesis/wine_analysis_hplc_uv/src/wine_analysis_hplc_uv/notebooks/pca_sample.parquet"
m_d111_subset_aug.to_parquet(opath)

2023-11-01

Conclusion: a prototype pipeline has been established but interpretation of the results escapes me. Possibly D is rank deficient and analysis of a multiset will reveal more useful results, but in the meantime I am moving on to different tacks.