In [None]:
import numpy as np
import duckdb as db
import polars as pl
import plotnine as p9
from wine_analysis_hplc_uv.chapter_one import baselines, get_samples, polars_extension
from wine_analysis_hplc_uv import definitions

# Is Signal Baseline Height a Function of % Methanol in Mobile Phase?

## Abstract

A notebook containing an analysis and consideration of whether or not to remove a fitted baseline from all samples automatically. The outcome was that, while there is an observable correlation between methanol AUC (Area Under Curve) and baseline AUC, it was decided that this was coincidental, arising from the fact that an increased %Me increases the pace of elution of components, in turn increasing the average baseline AUC, and that, since white wines had little to no observable baseline, the source of the baseline is the sample. We concluded by stating that baseline removal would be acceptable for inter-varietal studies, but is generally advised against.

## Introduction

TODO

## Results

### AUC as a Function of Gradient and Sample

TODO:
  - [ ] complete this hypothesis test.

I hypothesise that the baseline is a convolution of the % composition of methanol in the mobile phase. To test this, find the AUC as a function of % methanol for each sample.

Possible approach: fit a baseline, compare the profile of the baseline to the methanol curve.


|    min  |   A%  |    B%  |  Flow mL/min |  Pressure psi |
|---|---|---|---|---|
|  38.00 | 0.00 | 100.00 |  1.000 |  6999.96 |
|  40.00 | 0.00 | 100.00 | 1.000  | 6999.96 |
|  42.00 | 95.00 |  5.00 | 1.000  | 6999.96 |
|  52.00 | 95.00 |  5.00 | 1.000  | 6999.96 |

So the methanol gradient is modelled as a three-segment piecewise-linear curve: the first segment is 0 to 38 mins with a gradient of 2.5 %/min, the second is 38 to 42 mins with a gradient of zero, and the third is 42 to 52 mins with a gradient of -9.5 %/min.

#### Methanol Curve

In [None]:
# generate the methanol curve
def gen_methanol_curve(n_points: int = 1000) -> pl.DataFrame:
    """
    generate the modelled % methanol in the mobile phase over a 52 minute run.

    the profile is piecewise linear through the (mins, % methanol) breakpoints
    (0, 5), (38, 100), (42, 100) and (52, 5): a 2.5 %/min ramp up, a hold at
    100%, then a -9.5 %/min ramp back down.

    NOTE(review): the gradient table in this notebook lists B% = 5.00 at
    42 min. If B% is methanol, the drop may actually occur between 40 and
    42 min rather than between 42 and 52 min. The breakpoints below keep the
    existing model unchanged - confirm against the instrument method.

    Parameters
    ----------
    n_points
        number of evenly spaced time points sampled from 0 to 52 mins
        inclusive. Defaults to 1000.

    Returns
    -------
    pl.DataFrame
        one row per time point, with columns 'mins' and '% methanol'.
    """
    # (time, % methanol) breakpoints of the modelled gradient
    breakpoint_mins = [0.0, 38.0, 42.0, 52.0]
    breakpoint_pct = [5.0, 100.0, 100.0, 5.0]

    x = np.linspace(breakpoint_mins[0], breakpoint_mins[-1], n_points)

    # interpolating over the breakpoints keeps the curve aligned with x by
    # construction, instead of relying on the concatenation order of masked windows
    methanol_pct = np.interp(x, breakpoint_mins, breakpoint_pct)

    return pl.DataFrame({"mins": x, "% methanol": methanol_pct})


# build the gradient profile once, then show it as a table and as a line plot
methanol_curve = gen_methanol_curve()
display(methanol_curve)

methanol_plot = (
    p9.ggplot(
        methanol_curve.with_row_index("idx"),
        p9.aes(x="mins", y="% methanol"),
    )
    + p9.geom_line()
    + p9.ggtitle("% Methanol as a Function of Time in a 2.5% Gradient")
)
display(methanol_plot)

#### Baselines

##### Samples

In [None]:
# fetch the samples from the database, then overlay their first 30 minutes

n_samples = 10

with db.connect(definitions.DB_PATH) as con:
    samples = get_samples.get_samples(
        con=con,
        detection="raw",
        n_samples=n_samples,
        distinct_wine=True,
    )

# NOTE(review): 'samples' in the query presumably resolves to the variable
# assigned above (duckdb replacement scan) - confirm
samples_to_30_mins = db.sql(
    "from samples where mins <= 30 order by sample_num, idx"
).pl()

(
    p9.ggplot(
        polars_extension.to_enum(samples_to_30_mins, "sample_num"),
        p9.aes(x="mins", y="absorbance", color="sample_num"),
    )
    + p9.geom_line()
    + p9.ggtitle(f"Overlay of Random 'Raw' Samples n = {n_samples} @ 256 nm")
)

In [None]:
# compute the baselines, then join them and the corrected signals onto the samples

baseline_results = baselines.calculate_baselines(
    df=samples,
    grp_col="sample_num",
    y_col="absorbance",
)

fitted_baselines = baseline_results["baselines"]
corrected_signals = baseline_results["corrected"]

signals = baselines.join_baseline_result_signals(
    signals=samples,
    baselines=fitted_baselines,
    corrected_signals=corrected_signals,
)
signals.head()

In [None]:
# plot correction overlay


def plot_correction_overlay(df: pl.DataFrame, grp: str) -> None:
    """
    display an overlay of the original signal, the fitted baseline and the corrected
    signal, faceted into one panel per value of the 'grp' column.

    reads the 'mins', 'absorbance', 'baseline' and 'corrected' columns of 'df'.
    """

    # one line layer per trace, each with a fixed colour rather than a mapped one
    raw_layer = p9.geom_line(
        p9.aes(x="mins", y="absorbance"), color="blue", alpha=0.5
    )
    baseline_layer = p9.geom_line(p9.aes(x="mins", y="baseline"), color="red")
    corrected_layer = p9.geom_line(p9.aes(x="mins", y="corrected"), color="black")

    # layers are added in the same order as before so the draw order is unchanged
    overlay = p9.ggplot(df)
    overlay = overlay + raw_layer
    overlay = overlay + baseline_layer
    overlay = overlay + corrected_layer
    overlay = overlay + p9.facet_wrap(facets=grp)
    overlay = overlay + p9.theme(figure_size=(16, 8))
    overlay = overlay + p9.ggtitle(
        "Base Signal, Fitted Baseline and Corrected Signal by Sample"
    )

    display(overlay)


# convert 'sample_num' via to_enum before it is used as the facet column
signals_for_facets = polars_extension.to_enum(signals, "sample_num")
plot_correction_overlay(df=signals_for_facets, grp="sample_num")

## Discussion

On the question of correlation between the % methanol gradient and baseline AUC, the two appear to be only tangentially related. Apart from samples such as 17, the majority of components have eluted by the 30 minute mark, which is just over halfway up the methanol gradient. In samples possessing high AUC relative to zero, such as 1, 2, 3, 8, 17, 47, 66, 84, 98, etc., the baseline becomes significant after the 5 minute mark, where methanol is > 12.5% of the mobile phase composition. It is the 5 to 30 minute range that would benefit from baseline removal.

## Conclusion

TODO