In [None]:
import duckdb as db
import polars as pl
import numpy as np
import plotnine as p9
from wine_analysis_hplc_uv import definitions

# Selecting Representative Shiraz

This notebook contains the exploration and selection of a representative set of raw detected shiraz samples for display in the section "Quantitative Analysis". The intent is to show visually where time domain peak regions exist.

## Get Data

To get the raw shiraz data we will need to join the metadata and chromatospectral data.

In [None]:
def get_raw_shiraz(con: db.DuckDBPyConnection) -> pl.DataFrame:
    """
    fetch the raw-detection shiraz signals recorded at 256 nm as a polars frame.

    the chromatogram rows are left joined to the sample metadata on `id`, limited
    to the first 30 minutes, and ordered by sample number then index.
    """
    raw_shiraz_sql = """
    select
        sm.wine as wine,
        sm.detection,
        sm.samplecode,
        sm.sample_num,
        cs.idx,
        cs.mins,
        cs.absorbance
    from
        pbl.chromatogram_spectra_long as cs
    left join
        pbl.sample_metadata as sm
    on
        sm.id=cs.id
    where
        wavelength=256
    and
        varietal='shiraz'
    and
        detection='raw'
    and
        mins<30
    order by
        sm.sample_num, cs.idx
    """

    # run the query on the supplied connection and materialise it via polars
    return con.sql(raw_shiraz_sql).pl()


# open the project database only for as long as the query needs it
with db.connect(definitions.DB_PATH) as con:
    shiraz = get_raw_shiraz(con)
shiraz.describe()

In [None]:
# get the categories sorted from lowest to highest


def to_enum(df: pl.DataFrame, col: str):
    """
    return `df` with `col` recast as an Enum.

    the categories are the column's unique values, sorted in their original dtype
    and then converted to strings.
    """
    categories = df.get_column(col).unique().sort().cast(str)
    as_enum = pl.col(col).cast(str).cast(pl.Enum(categories))
    return df.with_columns(as_enum)


# sample_num becomes an Enum so later cells can treat it as a category
shiraz = to_enum(shiraz, "sample_num")
shiraz.dtypes

In [None]:
# overlay every sample's signal on shared axes, one colour per sample

(
    p9.ggplot(
        data=shiraz,
        mapping=p9.aes(x="mins", y="absorbance", color="sample_num"),
    )
    + p9.geom_line()
)

This is too much, visually. We want to display three samples chosen by AUC: the minimum, the median and the maximum. We also need to remove sample 75, as it is bad data relative to the other samples, vastly lacking in intensity and complexity.

## Dropping Sample 75

In [None]:
# remove sample 75 and plot again
filter_75 = pl.col("sample_num").eq("75")
shiraz_without_75 = shiraz.filter(~filter_75)
# a bare expression in the middle of a cell is never displayed, so assert the
# removal rather than relying on eyeballing an (invisible) empty frame
assert shiraz_without_75.filter(filter_75).is_empty(), "sample 75 is still present"

(
    p9.ggplot(
        shiraz_without_75,
        p9.aes(x="mins", y="absorbance", color="sample_num"),
    )
    + p9.geom_line()
)

## Selecting AUC Distributed Samples

As a first base visual display of variation amongst the raw shiraz @ 256nm dataset, we can use the distribution of AUC, namely the minima, median and maxima. This will provide three samples with minimal overlap.

In [None]:
# `np.trapz` is deprecated from NumPy 2.0 in favour of `np.trapezoid`; pick whichever
# the installed NumPy provides so this cell runs on both old and new versions
_trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

# integrate absorbance over time for each sample, largest area first
aucs = (
    shiraz_without_75.group_by("sample_num")
    .agg(
        pl.map_groups(
            # shift absorbance by its minimum so the integrated baseline sits at zero
            ["mins", pl.col("absorbance").sub(pl.col("absorbance").min())],
            # x[0] is the `mins` series, x[1] the shifted absorbance series
            function=lambda x: _trapezoid(x=x[0], y=x[1]),
        ).alias("auc")
    )
    .sort("auc", descending=True)
)
aucs

To rationalise this result, let's compare it with the maxima peak height.

## Maxima Peak Height

In [None]:
# tallest absorbance reading of each sample
maximas = (
    shiraz_without_75
    .group_by("sample_num")
    .agg(maxima_height=pl.col("absorbance").max())
)
maximas

We then want to compare AUC with height.

## Comparison AUC vs Maxima Height

To compare these two metrics, plot a line of best fit and observe correlation.

In [None]:
def join_auc_and_heights(
    auc: pl.DataFrame, height: pl.DataFrame, key: str
) -> pl.DataFrame:
    """
    combine the auc and height tables into one frame, matching rows on `key`.

    the join strategy is left at the polars default.
    """
    return auc.join(height, on=key)


# one row per sample holding both metrics, largest auc first
metrics = join_auc_and_heights(aucs, maximas, "sample_num").sort(
    "auc",
    descending=True,
)
metrics

In [None]:
# add wine to metrics
# `metrics` and `shiraz` inside the query are the polars frames defined in earlier
# cells; DuckDB picks them up from the notebook namespace by name.
# TODO: fix this so the sample_num category sorts correctly for the below plot.
# NOTE(review): sample_num is cast to integer in the query and `to_enum` sorts the
# unique values before converting them to strings, so the Enum order here looks
# numeric already - confirm whether the TODO is still outstanding.
metrics_with_names = db.sql(
    """
    select
        cast(l.sample_num as integer) as sample_num,
        r.wine,
        l.auc,
        l.maxima_height,
    from
        metrics as l
    inner join
        (
        select
            distinct sample_num as sample_num,
            wine,
        from
            shiraz
            ) as r
    on
        l.sample_num=r.sample_num
    order by
        l.auc
    """
).pl()
# re-encode the integer sample numbers as an Enum, then order rows by that Enum
metrics_with_names = to_enum(metrics_with_names, col="sample_num").sort("sample_num")
metrics_with_names

In [None]:
# NOTE(review): repeats the display that already ends the previous cell
metrics_with_names

### Viz

In [None]:
def plot_scatter(
    metrics: pl.DataFrame,
    auc_key: str,
    height_key: str,
    id_key: str,
    nudge_y: float = 30,
) -> None:
    """
    plot auc vs height with point labels.

    the figure is drawn immediately; nothing is returned.

    Parameters
    ----------
    metrics: frame holding one row per sample.
    auc_key: column mapped to the y axis.
    height_key: column mapped to the x axis.
    id_key: column used for both the point colour and the text label.
    nudge_y: vertical offset of the text labels, in y-axis units.
    """

    (
        p9.ggplot(metrics, p9.aes(height_key, auc_key, label=id_key))
        + p9.geom_point(
            p9.aes(color=id_key),
        )
        # lift the labels off the points so both stay readable
        + p9.geom_text(nudge_y=nudge_y)
        + p9.ggtitle("AUC vs peak maxima height")
        + p9.theme(figure_size=(16, 4))
    ).draw(show=True)


# arguments in order: frame, auc column, height column, label/colour column
plot_scatter(metrics_with_names, "auc", "maxima_height", "sample_num")

### Correlation

We will calculate the correlation between AUC and peak height as Pearson's correlation coefficient using `scipy.stats.linregress` [docs](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.linregress.html).

In [None]:
from scipy import stats


def generate_linear_curve(slope: float, intercept: float, x):
    """
    evaluate the straight line `slope * x + intercept` at the supplied x value(s)
    """
    scaled = slope * x
    return scaled + intercept


def calculate_r_squared_annotation_position(
    x, y, x_pos: float, y_pos: float
) -> dict[str, float]:
    """
    derive annotation coordinates from the spread of x and y.

    each coordinate is the range (max - min) of its axis multiplied by the matching
    `x_pos` / `y_pos` factor. the axis minimum is not added back, so the result is
    measured from zero rather than from the smallest data value.
    """
    x_span = max(x) - min(x)
    y_span = max(y) - min(y)

    return dict(x=x_span * x_pos, y=y_span * y_pos)


def plot_metrics_with_lobf_overlay(
    df: pl.DataFrame,
    lobf,
    r_text: float,
    x_key: str,
    y_key: str,
    id_key: str,
    label_x: float,
    label_y: float,
):
    """
    plot x and y variables with a fitted line and an r^2 annotation.

    the figure is drawn immediately and the fitted values are then displayed;
    nothing is returned.

    Parameters
    ----------
    df: frame holding the points to plot.
    lobf: fitted y values, one per row of `df`.
    r_text: r^2 value shown in the annotation, formatted to 3 significant digits.
    x_key, y_key: columns mapped to the x and y axes.
    id_key: column used for the point colour and the text label.
    label_x, label_y: factors applied to the x / y data range to place the annotation.
    """

    # store the fitted values as a column so the line layer can map to it by name
    df_ = df.with_columns(lobf=lobf)

    r_sq_anno_pos = calculate_r_squared_annotation_position(
        x=df[x_key], y=df[y_key], x_pos=label_x, y_pos=label_y
    )

    (
        p9.ggplot(
            df_,
            p9.aes(
                x_key,
                y_key,
                label=id_key,
            ),
        )
        + p9.geom_line(p9.aes(y="lobf"))
        + p9.geom_point(
            p9.aes(
                color=id_key,
            )
        )
        + p9.geom_text(nudge_y=100)
        # see <https://ggplot2.tidyverse.org/reference/annotate.html>
        + p9.annotate(
            x=r_sq_anno_pos["x"],
            y=r_sq_anno_pos["y"],
            geom="label",
            label=f"$r^2 = {r_text:.3}$",
        )
        + p9.ggtitle("AUC vs peak maxima height with LOBF")
        + p9.theme(figure_size=(16, 4))
    ).draw(show=True)

    # `display` is the IPython built-in available inside the notebook kernel
    display(lobf)


def fit_metrics(
    df: pl.DataFrame, x_key: str, y_key: str, id_key: str, label_x, label_y
):
    """
    regress `y_key` on `x_key`, draw the points with the fitted line, and hand back
    the regression result together with the fitted y values.
    """
    xs = df.get_column(x_key).to_numpy()
    ys = df.get_column(y_key).to_numpy()

    regression = stats.linregress(x=xs, y=ys)
    fitted_line = generate_linear_curve(
        slope=regression.slope, intercept=regression.intercept, x=xs
    )
    # the annotation shows r squared rather than r itself
    r_squared = np.power(regression.rvalue, 2)

    plot_metrics_with_lobf_overlay(
        df=df,
        lobf=fitted_line,
        r_text=r_squared,
        x_key=x_key,
        y_key=y_key,
        id_key=id_key,
        label_x=label_x,
        label_y=label_y,
    )  # type: ignore
    return regression, fitted_line

In [None]:
# fit across every remaining sample; the label factors position the r^2 annotation
result, lobf = fit_metrics(
    df=metrics_with_names,
    x_key="maxima_height",
    y_key="auc",
    id_key="sample_num",
    label_x=1.2,
    label_y=1.2,
)

In [None]:
# repeat the fit with sample 6 left out
metrics_without_6 = metrics_with_names.filter(pl.col("sample_num").ne("6"))

result_without_6, lobf_without_6 = fit_metrics(
    df=metrics_without_6,
    x_key="maxima_height",
    y_key="auc",
    id_key="sample_num",
    label_x=1.7,
    label_y=1.6,
)

In [None]:
from IPython.display import Markdown as md

# render the r^2 comparison as markdown so the numbers track the fitted results
md(
    f"As we can see, the exclusion of sample 6 decreases the correlation from {np.power(result.rvalue, 2):.3} to {np.power(result_without_6.rvalue, 2):.3}, indicating that sample 6 is an outlier, as we have no reason to suspect that AUC is directly correlated with peak height."
)

### Outlier Definition and Detection

It is obvious from the plot above that sample 6 sits away from the cluster of other samples.

## Selecting Representative Samples by AUC

To select the representatives we will find the samples possessing the minimum, median and maximum AUC.

In [None]:
# generate a table with the minimum, median and maximum auc samples indicated by 'sample_num'
# `metrics_without_6` in the query is the polars frame from the earlier cell.
# the median row is found with quantile_disc rather than median(): median() averages
# the two middle values when the row count is even, so no row would equal it and the
# 'median' entry would silently vanish; quantile_disc always returns an existing value.

bound_aucs = db.sql("""
with t1 as (select * from metrics_without_6)
select
    sample_num,
    auc,
    'min' as stat
from
    t1
where
    auc=(select min(auc) from t1)
union
    select
       sample_num,
       auc,
       'max' as stat,
    from
       t1
    where
        auc=(select max(auc) from t1)
union
    select
       sample_num,
       auc,
       'median' as stat,
    from
       t1
    where
        auc=(select quantile_disc(auc, 0.5) from t1)
order by
       auc
""").pl()

bound_aucs

In [None]:
def _bound(stat: str, column: str):
    """
    return the single `column` value of the `bound_aucs` row labelled `stat`.

    `.item()` raises if the filter does not leave exactly one value.
    """
    return bound_aucs.filter(pl.col("stat").eq(stat))[column].item()


md(f"""
As can be seen in the table above, sample {_bound('min', 'sample_num')} is the minimum with an auc of {_bound('min', 'auc'):.3} while sample {_bound('median', 'sample_num')} with an auc of {_bound('median', 'auc'):.3} is the median, and finally sample {_bound('max', 'sample_num')} is the maximum with an auc of {_bound('max', 'auc'):.3}.
""")

And to demonstrate the plot:

In [None]:
# keep only the signals whose sample_num is listed in 'bound_aucs'

plot_samples = shiraz_without_75.filter(
    pl.col("sample_num").is_in(bound_aucs["sample_num"])
)
# show which samples survived the filter
plot_samples.select("sample_num").unique()

In [None]:
# generate the plot
# sample_num is an Enum at this point; casting an Enum straight to int yields the
# category's underlying index rather than the number it spells, which would mislabel
# the legend. going through str first recovers the real sample numbers, and
# `to_enum` then rebuilds the categories in numeric order.
p = (
    p9.ggplot(
        to_enum(
            plot_samples.with_columns(pl.col("sample_num").cast(str).cast(int)),
            "sample_num",
        ),
        p9.aes(x="mins", y="absorbance", color="sample_num"),
    )
    + p9.geom_line()
    + p9.scale_x_continuous(breaks=np.arange(0, 30, 2.5))
    + p9.ggtitle("Minimum, Median, and Maximum AUC 'raw' Shiraz")
)
p

And output the image as png:

In [None]:
# write the figure next to the notebook (path is relative to the working directory)
p.save(filename="min_max_median_raw_shiraz.png")