New Hypothesis: The first x principal components of each sample within a dataset will be sufficient to distinguish the samples by class through XGBoost.

Need:
- Dataset with signals and class labels.
- Preprocessing pipeline.

In [None]:
%reload_ext autoreload
%autoreload 2

# initialization

import pandas as pd
import numpy as np
import seaborn.objects as so
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from wine_analysis_hplc_uv.notebooks.mcr import mcr_methods
from sklearn import decomposition
from wine_analysis_hplc_uv.signal_analysis.signal_analysis import SignalAnalyzer

In [None]:
class DSetPrepper(
    mcr_methods.Preprocessing,
    mcr_methods.PCA,
    decomposition.PCA,
    SignalAnalyzer,
):
    """Combine the project's preprocessing, PCA and signal-analysis classes.

    The notebook uses a single instance of this class as a namespace for the
    helper methods inherited from the four bases.
    """

    def __init__(self):
        # No base-class initialiser is invoked here.
        # NOTE(review): attributes the parents' __init__ would normally set are
        # presumably absent - verify before calling methods that rely on them.
        pass


dset_prepper = DSetPrepper()

In [None]:
# Load the raw tidy dataset from parquet and preview the first rows.
# NOTE(review): the path is absolute and specific to one machine; it has to be
# edited before this cell can run anywhere else.
dset = pd.read_parquet(
    "/Users/jonathan/mres_thesis/wine_analysis_hplc_uv/src/wine_analysis_hplc_uv/notebooks/tidy_3d_dset_raw.parquet"
)
dset.head()

Get the reds.

In [None]:
# select the reds

reds = dset.groupby('color').get_group('red')
reds.head()

Now get a description of the varietals; we want to be able to classify by varietal.

In [None]:
reds.groupby('id')['mins'].plot()

In [None]:
counts_by_varietal = reds.groupby(['varietal'])['id'].nunique()
counts_by_varietal

There you go. Somewhat imbalanced, but it is what it is for now. Now to process them.

The processing pipeline was developed for one sample with a single grouping column, but we now have two ("id" and "wavelength"), so we will have to see whether it still works.

In [None]:
# reset index, add an aggregate column 'code_wine' from wine and samplecode, drop unnecessary columns.

def adjust_df(df):
    """Return a re-indexed copy of `df` keyed by a combined 'code_wine' label.

    The result has a fresh 0..n-1 index, a new first column 'code_wine' built
    as ``samplecode + "_" + wine``, and no 'detection', 'color', 'samplecode'
    or 'wine' columns. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'detection', 'color', 'samplecode' and 'wine'.

    Returns
    -------
    pandas.DataFrame
    """
    # reset_index returns a new frame, so the in-place insert below cannot
    # leak into the caller's object.
    out = df.reset_index(drop=True)

    # DataFrame.insert mutates in place and returns None, so it is called as a
    # plain statement instead of being hidden inside a conditional expression.
    out.insert(loc=0, column='code_wine', value=out['samplecode'] + "_" + out['wine'])

    return out.drop(columns=['detection', 'color', 'samplecode', 'wine'])


reds = adjust_df(reds)
display(reds.head())
            

72, 115, a0301, 97, a0101, 99. All have abnormalities in the y axis. Just remove them for now.

In [None]:
# Show `reds` without the rows whose code_wine contains 72, 115, a0301 or 99.
# NOTE(review): the result is only displayed, not assigned, so `reds` itself is
# unchanged afterwards; 97 and a0101 from the list in the preceding note are
# not part of the pattern - confirm whether that is intended.
reds.loc[lambda df: ~(df.code_wine.str.contains("72|115|a0301|99"))]

In [None]:
# List the samples whose time axis is not strictly increasing, i.e. at least
# one step in `mins` is negative or zero.
(
    reds
    .groupby('code_wine')
    .filter(lambda grp: bool((grp.mins.diff() <= 0).any()))
    ['code_wine']
    .unique()
)

In [None]:
reds.groupby('code_wine').get_group('a0301_2021 chris ringland shiraz').mins.plot()

In [None]:
# 72, 115, a0301, 97, a0101, 99

In [None]:
any(reds.loc[lambda df: df.code_wine.str.contains("115")].mins.diff()==0)

In [None]:
# Group once and reuse the grouping instead of rebuilding it for every lookup.
grps = reds.groupby('code_wine')

# Mean step between consecutive `mins` values of the first sample in the
# grouping; used as the reference sampling interval.
first_code = next(iter(grps.groups))
expected_gradient = grps.get_group(first_code).mins.diff().mean()
expected_gradient

In [None]:
reds

In [None]:
# plot of time domain over index for all samples. Slow to render, hence commented out 2023-11-02

# reds.pipe(lambda x: so.Plot(x.reset_index(names='i'), x='i',y='mins').add(so.Line()).facet('code_wine', wrap=3).layout(size=(15,100)).share(x=False, y=False)
#           )

In [None]:
# samples with identified abnormalities in time domain based on the plots above

abnormalities=reds.loc[lambda x: x.code_wine.str.contains('72|115|a0301|60|98')]
abnormalities['code_wine'].unique()

In [None]:
# plot time domain of abnormal samples
abnormalities.pipe(lambda df: so.Plot(df.reset_index(names='i'), x='i',y='mins')
                   .add(so.Line())
                   .facet('code_wine', wrap=2)
                   .share(x=False, y=False)
                   .layout(size=(20,15))
                   )

In [None]:
# plot chromatograms (column '256' against mins) of abnormal samples,
# one facet per code_wine

abnormalities.pipe(
    lambda df: so.Plot(df, x='mins', y='256')
    .add(so.Line())
    .facet('code_wine', wrap=2)
    .share(x=False, y=False)
    .layout(size=(20, 15))
)

In [None]:
def reassign_time(df, grouper: str | list[str], freq_str: str):
    
    df = df.assign(
        mins=lambda x: x.groupby(grouper)["mins"].transform(
            lambda x: pd.timedelta_range(start=0, periods=len(x), freq=freq_str).total_seconds()/60
        )
    )

    return df


reds = reassign_time(reds, "code_wine", "0.4S")
reds.head()

In [None]:
# resample to 2S

def resample_to_2s(df):
    """Resample each sample onto a 2-second grid and return `mins` as float minutes.

    `mins` is converted from minutes to a timedelta index, every
    ('varietal', 'id', 'code_wine') group is resampled with the '2S' rule,
    interpolated and forward-filled, and `mins` is finally converted back to
    minutes. The grouping columns and `mins` end up as ordinary columns.

    NOTE(review): the `unit` argument of `pd.TimedeltaIndex` and the uppercase
    'S' frequency alias are reported as deprecated in recent pandas releases -
    confirm against the installed version.
    """
    
    grouper = ['varietal','id','code_wine']
    
    df = (
        df
        # the whole chain below runs inside this one lambda
        .pipe(lambda df: df
              # interpret the float minutes as timedeltas so that a time-based
              # resample is possible
              .assign(mins=
                  pd.TimedeltaIndex(df.mins, unit='m')
              )
        .set_index('mins')
        # group_keys=False keeps the group labels out of the result index
        .groupby(grouper, group_keys=False)
        # per sample: resample to 2 seconds, interpolate, then forward-fill
        # whatever is still missing
        .apply(lambda grp:
        grp
        .resample('2S').interpolate()
        .ffill()
        )
        # turn the grouping columns and `mins` back into ordinary columns
        .set_index(grouper, append=True).reset_index('mins').reset_index()
        # timedelta -> float minutes
        .assign(mins=lambda df: df.mins.dt.total_seconds()/60)
        )
    )
    
    return df
    
resampled_reds = resample_to_2s(reds);
resampled_reds.head()

In [None]:
def melt_reds(df):
    """Reshape the wide frame to long form with one row per wavelength reading.

    'varietal', 'id', 'code_wine' and 'mins' are kept as identifier columns;
    every other column is melted into 'wavelength' (former column name) and
    'signal' (value). 'mins' is then moved to the second-to-last position, so
    the column order ends with ..., 'wavelength', 'mins', 'signal'.
    The first rows of the result are displayed before it is returned.
    """
    id_cols = ['varietal', 'id', 'code_wine', 'mins']
    long_df = df.melt(id_vars=id_cols, value_name='signal', var_name='wavelength')

    # Re-insert 'mins' just before the last column.
    ordered = [name for name in long_df.columns if name != 'mins']
    ordered.insert(-1, 'mins')

    odf = long_df.loc[:, ordered]
    display(odf.head())
    return odf
    
mreds = melt_reds(resampled_reds)

In [None]:
# smooth

mreds = mreds.assign(smoothed=lambda df: df.pipe(dset_prepper._smooth, grouper=['id','wavelength'], col='signal'))
mreds.head()

In [None]:
mreds

In [None]:
# baseline subtract

# import os
# bcorr_path = os.path.join(os.getcwd(),'bcorr.parquet')

# Add a 'bcorr' column: the output of dset_prepper._baseline_subtract applied
# to the 'smoothed' column with ["id", "wavelength"] as grouper.
mreds = mreds.assign(
    bcorr=lambda df: df.pipe(
        dset_prepper._baseline_subtract,
        grouper=["id", "wavelength"],
        col="smoothed",
        asls_kws=dict(max_iter=100, tol=1e-3, lam=1e5),
    )
)

mreds.head()

In [None]:
ricci_49 = mreds.groupby(['code_wine','wavelength']).get_group((
    '49_2020 matias riccitelli malbec hey malbec!','256')).reset_index()

so.Plot(ricci_49, x='mins').add(so.Line(),y='smoothed').add(so.Line(color='red', alpha=0.5),
                                                            y='bcorr').show()

The next step is to subset to a 'region of interest' to reduce the total dataset size. The hypothesis is that it will correspond to fewer than 4000 observations.

In [None]:
def find_peaks(df):
    """Run `dset_prepper.detect_peaks` on the 'bcorr' column of `df`.

    The call uses ['id', 'wavelength'] as grouper, 'peaks' as the name of the
    output column and a prominence ratio of 0.02, and returns whatever
    `detect_peaks` returns.
    """
    return dset_prepper.detect_peaks(
        df,
        grouper=['id', 'wavelength'],
        target_col='bcorr',
        peaks_colname='peaks',
        prom_ratio=0.02,
    )

mreds = find_peaks(mreds)
mreds.head()

In [None]:
# plot 200, 256, 350 to observe peak detection results

temp = mreds.loc[lambda x: (x.wavelength.isin(['200','256','350']) & (x.code_wine==list(x.groupby('code_wine').groups.keys())[0]))]

(so.Plot(temp
        
         , x='mins')
 .add(so.Line(), y='bcorr')
 .add(so.Dot(color='red',marker='x',alpha=0.75), y='peaks')
 .facet('wavelength')
 .layout(size=(15,5))
 .share(y=False).show())
temp.head()

In [None]:
# how many samples have peaks after 30 minutes? do this another time, bin by 10 minute 
# intervals, count not nas, produce a histogram of counts per bin.

def find_peaks_after_30_mins(df):
    """Placeholder: currently only displays the first rows of `df`.

    The binning and counting of peaks described in the comment above is not
    implemented yet.
    """
    preview = df.head()
    display(preview)
    
    
find_peaks_after_30_mins(temp)

In [None]:
# subset

def subset(df, *, max_mins=30, wavelength='256'):
    """Keep the rows of one wavelength that lie before a time cut-off.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form frame with 'mins' and 'wavelength' columns.
    max_mins : float, default 30
        Exclusive upper bound on 'mins'.
    wavelength : str, default '256'
        Value of the 'wavelength' column to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no row matches. Raised explicitly so the check also runs when
        Python is started with -O, which strips ``assert`` statements.
    """
    keep = (df['mins'] < max_mins) & (df['wavelength'] == wavelength)
    selected = df.loc[keep].reset_index(drop=True)

    if selected.empty:
        raise ValueError(
            f"subset is empty: no rows with mins < {max_mins} "
            f"and wavelength == {wavelength!r}"
        )

    return selected
    
mreds = subset(mreds)
mreds.head()

In [None]:
# scale and center

def scale_and_center(df):
    """Apply `dset_prepper.scale_and_center` to 'bcorr' separately for each 'id'.

    The per-sample results are combined, given a fresh 0..n-1 index, previewed
    with `display`, and returned.
    NOTE(review): later cells read a 'scale_center' column, presumably added by
    `dset_prepper.scale_and_center` - confirm against that method.
    """
    per_sample = df.groupby('id', as_index=False)
    scaled = per_sample.apply(dset_prepper.scale_and_center, 'bcorr')
    scaled = scaled.reset_index(drop=True)

    # Sanity check that the apply produced a DataFrame.
    assert isinstance(scaled, pd.DataFrame)

    display(scaled.head())

    return scaled

mreds = scale_and_center(mreds);

In [None]:
mreds_pivot = mreds.pivot_table(columns=['varietal','code_wine','id'], index='mins',values='scale_center').sort_index(axis=1)
mreds_pivot

In [None]:
def export_dset(df):
    """Write `df` to 'pca_dset.parquet' in the current working directory.

    An existing file of that name is handed to `DataFrame.to_parquet` as the
    target path. Returns None.
    """
    import os

    destination = os.path.join(os.getcwd(), "pca_dset.parquet")
    df.to_parquet(destination)

mreds_pivot.pipe(export_dset)