In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from wine_analysis_hplc_uv import definitions
from wine_analysis_hplc_uv.modeling import pca
import matplotlib.pyplot as plt
import seaborn as sns
import duckdb as db
import pandas as pd
import numpy as np

# Open the database only for as long as it takes to fetch the data. The
# try/finally releases the connection even if pca.get_data raises, so a failed
# load does not leave the database file held open by the kernel.
con = db.connect(definitions.DB_PATH)
try:
    pwine_data = pca.get_data(con)
finally:
    con.close()
wine_data_fig = pca.build_figure(pwine_data)

### Low Maxima Wines

The first question is what is going on with those wines with a very high amplitude and very low maxima. Let's pull those up.


In [None]:
def low_max_wines(data, threshold=50):
    """Return the samples whose maximum value is below ``threshold``.

    Parameters
    ----------
    data : pd.DataFrame
        One column per sample. The column index must be named ``code_wine``
        and each label is expected to look like ``"<code>_-<wine>"``.
    threshold : float, default 50
        Samples whose column maximum is strictly less than this are returned.

    Returns
    -------
    pd.DataFrame
        Columns ``code``, ``wine`` and ``max`` on a fresh 0..k-1 index, one
        row per low-maximum sample. Empty (with the same three columns) when
        no sample is below the threshold.
    """
    col_max = data.max().reset_index(name='max')
    low = col_max.loc[col_max['max'] < threshold].reset_index(drop=True)

    parts = (
        low['code_wine']
        # Split on the first separator only, so a wine name that itself
        # contains "_-" stays in one piece instead of adding a third column.
        .str.split("_-", n=1, expand=True)
        # Guarantee exactly two columns even when nothing matched the filter
        # or a label has no separator (missing parts become NaN).
        .reindex(columns=[0, 1])
        .set_axis(['code', 'wine'], axis=1)
    )
    return pd.concat([parts, low[['max']]], axis=1)
# NOTE: this rebinds the name `low_max_wines` from the function defined in this
# cell to the DataFrame it returns. The function can no longer be called by
# that name until this cell is re-run; later cells rely on the DataFrame binding.
low_max_wines = low_max_wines(pwine_data)
low_max_wines

For these wines, it's not just a question of whether they are failed runs, because they may be somewhat salvageable with scaling. It depends more on the profile than the scale.

So we need to investigate the profiles, but also the injection volumes, which can be found in `chemstation_metadata`.

First get the sample codes back from `id_wine`, then join them to `chemstation_metadata` to get the injection volume.

At the same time, plot each individually.


In [None]:
def plot_low_max_wines(low_max_wines, data=None):
    """Plot each low-maximum sample's signal on its own axes.

    Parameters
    ----------
    low_max_wines : pd.DataFrame
        Must contain string columns ``code`` and ``wine``. They are re-joined
        with ``"_-"`` to rebuild the column labels used to index ``data``.
    data : pd.DataFrame, optional
        Signals, one column per ``code_wine`` label. Defaults to the
        notebook-level ``pwine_data`` so existing calls keep working.
    """
    if data is None:
        data = pwine_data

    labels = (low_max_wines['code'] + "_-" + low_max_wines['wine']).to_list()

    # Keep the original 4x2 grid as a minimum but add rows when there are more
    # than eight samples, instead of indexing past the end of the axes array.
    n_rows = max(4, -(-len(labels) // 2))  # ceil(len / 2)
    fig, axs = plt.subplots(n_rows, 2, figsize=(12, 2 * n_rows))
    axs = axs.flatten()

    for ax, label in zip(axs, labels):
        sns.lineplot(data[label], ax=ax)
        ax.set_ylabel('abs')
        ax.set_title(label)

    fig.tight_layout()
plot_low_max_wines(low_max_wines)

So, the following wines are invalid:

| #   | code     | wine                                                    |
|---|---|---|
|  0 | 128    | 2019 mount pleasant wines mount henry shiraz pinot noir |
|  1 | 161    | 2021 le juice fleurie fleurie gamay                     |
|  2 | 163    | 2015 yangarra estate shiraz mclaren vale                |
|  3 | 164    | 2015 yangarra estate old vine grenache                  |
|  4 | 165    | 2020 izway shiraz bruce                                 |
|  5 | ca0101 | 2021 yering station pinot noir                          |
|  6 | ca0301 | 2021 chris ringland shiraz                              |

As a list:


In [None]:
# Sample codes of the low-maximum wines as a plain Python list.
low_max_wines['code'].to_list()

In [None]:
# Wine names of the low-maximum samples as a plain Python list.
low_max_wines['wine'].to_list()

Now we remove the `low_max_wines` from the dataset and observe any changes in the biplot.


In [None]:
# Long format: one row per (sample label, value) pair.
signals_long = pwine_data.melt()
sample_parts = (
    signals_long['code_wine']
    .str.split("_-", expand=True)
    .set_axis(['code', 'wine'], axis=1)
)
# Drop every row whose sample code is listed in definitions.BAD_CUPRAC_SAMPLES;
# the helper columns are only needed for this filter.
signals_kept = (
    pd.concat([sample_parts, signals_long], axis=1)
    .query('code not in @definitions.BAD_CUPRAC_SAMPLES')
    .drop(['code','wine'], axis=1)
)
# Number the rows within each sample so the long table can be pivoted back to
# one column per sample, indexed by that running count.
no_low_max_wine = (
    signals_kept
    .assign(i=signals_kept.groupby('code_wine').cumcount())
    .pivot(columns="code_wine", values="value", index="i")
)
no_low_max_wine.head()

In [None]:
# Title the figure built from the full `pwine_data` set and show it as the
# cell's output.
title = wine_data_fig.suptitle("CUPRAC Red Dataset")
wine_data_fig

In [None]:
# Build the same kind of figure from the filtered data so it can be compared
# with `wine_data_fig`.
no_low_max_wine_fig = pca.build_figure(no_low_max_wine)
title = no_low_max_wine_fig.suptitle("Filtered for Low Maxima")
no_low_max_wine_fig.tight_layout()

After removing the low maxima signals we can see that the PCA biplot tightens up considerably.

## Misaligned Wines

There is at least one signal (not always visible on the plot at the moment) whose maximum appears after 3000. To identify this wine, we can plot the distribution of the maxima positions.


In [None]:


# Row index at which each sample reaches its maximum, one row per sample.
idxmax_by_sample = no_low_max_wine.idxmax().to_frame(name='idxmax').reset_index()
# Split each "<code>_-<wine>" label into its two parts and place them ahead of
# the original columns.
idxmax_label_parts = idxmax_by_sample['code_wine'].str.split("_-", expand=True)
idxmax_label_parts = idxmax_label_parts.set_axis(['code', 'wine'], axis=1)
wines_idxmax = pd.concat([idxmax_label_parts, idxmax_by_sample], axis=1)
wines_idxmax.head()

In [None]:
# Histogram of the row index at which each sample reaches its maximum.
dist_fig, ax = plt.subplots(figsize=(10,6))
# Plot the `idxmax` column explicitly instead of relying on DataFrame.plot to
# skip the non-numeric label columns.
wines_idxmax['idxmax'].plot.hist(bins=34, ax=ax, edgecolor='black')
ax.set(title='abs maxima idx dist', xlabel='maxima idx', ylabel='number of samples');

In [None]:
# Quartiles of the maxima positions. q1, q3 and IQR are read by the outlier
# query in the following cell, so their names must stay as they are.
q1, q3 = wines_idxmax['idxmax'].quantile([0.25, 0.75])
IQR = q3 - q1
display(IQR)

In [None]:
# Tukey fences: a sample is an outlier when its maximum position lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
lower_fence = q1 - 1.5 * IQR
upper_fence = q3 + 1.5 * IQR
is_outlier = (wines_idxmax['idxmax'] < lower_fence) | (wines_idxmax['idxmax'] > upper_fence)
outliers = wines_idxmax[is_outlier]
display(outliers.sort_values('idxmax'))

So there are definitely outliers, but it is a fairly normal distribution. Let's observe the individual signals.


In [None]:
# One panel per idxmax outlier, two panels per row. The grid keeps the original
# 3x2 shape as a minimum and adds rows as needed, so more than six outliers no
# longer index past the end of `axs`.
outlier_labels = outliers['code_wine'].to_list()
n_rows = max(3, -(-len(outlier_labels) // 2))  # ceil(len / 2)
idxmax_outlier_fig, axs = plt.subplots(n_rows, 2, figsize=(12, 8 * n_rows / 3))
axs = axs.flatten()
for ax, label in zip(axs, outlier_labels):
    no_low_max_wine[label].plot(ax=ax)
    ax.set_title(label, fontsize = 7)
figtitle = idxmax_outlier_fig.suptitle("idxmax outlier samples")
idxmax_outlier_fig.tight_layout()

A pattern emerges - 4/6 wines are Pinot Noir, one is Shiraz. The Bardolino is representative, there's nothing to compare it to. There are two questions to answer now:

- [x] Is this all the pinot noir?
- [ ] how much does this torbreck shiraz differ from the others?

In [None]:
# count all pinot noir depending on presence of 'pinot noir' or 'bourgogne' in their name.
# Select the pinot noir samples: any column whose label contains
# 'pinot noir' or 'bourgogne'. Show the matching labels and how many there are.
pinot_noir = no_low_max_wine.filter(regex='.*pinot noir.*|.*bourgogne.*', axis=1)
display(pinot_noir.columns, len(pinot_noir.columns))

So there are 9 pinot noir in the dataset at this point, yet only 4 turned up as outliers. Let's plot them.

In [None]:
# One panel per pinot noir on shared axes, two panels per row. The grid keeps
# the original 4x2 shape as a minimum and adds rows when there are more than
# eight samples, instead of indexing past the end of `axs`.
pinot_labels = list(pinot_noir.columns)
n_rows = max(4, -(-len(pinot_labels) // 2))  # ceil(len / 2)
pinot_noir_fig, axs = plt.subplots(n_rows, 2, sharey=True, sharex=True)
axs = axs.flatten()
for ax, label in zip(axs, pinot_labels):
    pinot_noir[label].plot(ax=ax)
    # Title with the wine name only; split once so a "_-" inside the name is
    # not cut off.
    ax.set_title(label.split("_-", 1)[1], fontsize =8)

pinot_noir_fig.suptitle('CUPRAC Pinot Noirs')
pinot_noir_fig.tight_layout()

In [None]:
# Restrict the outlier table to pinot noirs: index by wine name, then keep the
# rows whose name matches the same regex used to select the `pinot_noir` columns.
outlier_pn = outliers.set_index('wine').filter(regex='.*pinot noir.*|.*bourgogne.*', axis=0)
outlier_pn

So it appears that only 4 of 7 pinot noirs are outliers. Of those, are they also low abs signals relative to the others?

In [None]:
# Peak value of each pinot noir signal (column-wise maximum).
pinot_noir.max()

In [None]:
# Preview of pd.cut: assign each index value to one of 5 equal-width bins.
pd.cut(pinot_noir.index, 5)

NOTE TO SELF WHEN GET BACK!

rolling is a method of aggregation as well as a dimensionality reduction that provides a series of values based on the average of a defined size window.

What I am looking for is a method of calculating the average across a number of individuals within set bins. ergo a cut followed by a groupby on the cuts is the way to go.

In [None]:
# Mean of every signal within 50 equal-width bins of the index. The figure is
# created here with matplotlib's constrained layout and handed to pandas via
# `ax`, because the `layout` keyword of DataFrame.plot describes a subplot grid
# (rows, cols) rather than a matplotlib layout engine.
binned_fig, binned_ax = plt.subplots(constrained_layout=True)
(
    pinot_noir
    .assign(bins=pd.cut(pinot_noir.index, 50))
    # observed=False keeps every bin, including empty ones, in the result.
    .groupby('bins', observed=False)
    .mean()
    .plot(ax=binned_ax, title='Pinot noir signals: mean per index bin')
)
binned_ax.legend(bbox_to_anchor=(1.1,1.05))

# Unbinned signals on their own figure for comparison.
raw_fig, raw_ax = plt.subplots()
pinot_noir.plot(ax=raw_ax, title='Pinot noir signals: unbinned')
raw_ax.legend(bbox_to_anchor=(1.1,1.05))

In [None]:
# Peak values of only those pinot noirs that were flagged as idxmax outliers.
pinot_noir[outlier_pn['code_wine']].max()

In [None]:
# Overlay the pinot noir signals for index labels 0 through 2500 (.loc slicing
# is label-based and includes both ends), rounded to 3 decimals.
pinot_noir_early = pinot_noir.loc[0:2500,:].round(3)
pinot_noir_early.plot(alpha = 0.7)
plt.legend(bbox_to_anchor=(1.1,1.05))