# Creating a Downsampled Test Dataset

This notebook creates a parquet file of downsampled, CUPRAC-detected Shiraz signals at 450 nm.


In [None]:
%load_ext autoreload
%autoreload 2

from wine_analysis_hplc_uv.get_test_data import gd
from wine_analysis_hplc_uv.signal_processing.mindex_signal_processing import SignalProcessor
import pandas as pd
import numpy as np
# Processor whose methods are piped over the data in the cells below.
sigpro = SignalProcessor()
# Load the source data via the project's get_test_data helper.
# NOTE(review): presumably returns a pandas DataFrame (it is used with .pipe below) — confirm.
df = gd()

In [None]:
# setup: reshape to long format, strip the "id"/"detection" columns when both
# are present, reshape to tidy format, then discard the known-bad samples.
df = (
    df.pipe(sigpro.long_format)
    .pipe(
        lambda long_df: (
            long_df.drop(columns=["id", "detection"])
            if {"id", "detection"}.issubset(long_df.columns)
            else long_df
        )
    )
    .pipe(sigpro.tidy_format)
    .drop(columns=["163", "165", "ca0301"])  # bad samples
)
df

## Downsampling

In [None]:
# Run the processing stages one after another: timescale adjustment, offset
# correction, then downsampling with the "2S" argument.
rdf = sigpro.adjust_timescale(df)
rdf = sigpro.correct_offset(rdf)
rdf = sigpro.downsample_signal(rdf, "2S")
rdf
# Note: torbreck-struie appears to have 30 fewer observations than the other samples (cause unknown); its profile still looks good. This leaves a number of NaT/NaN values, which is not a big deal.

In [None]:
# fill in torbreck-struie missing values


# rdf =
def fillfunc(df, sample="torbreck-struie", freq="2s"):
    """Fill the missing observations of one sample's group of columns.

    Intended to be passed to a column-wise ``groupby(...).apply``; the group is
    identified through the ``name`` attribute of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One group of columns. Its columns must carry the levels
        "samplecode", "wine" and "vars", with "mins" and "value" entries in
        the "vars" level (these are the names the body stacks on and assigns).
    sample : str, default "torbreck-struie"
        Group name to repair. Every other group is returned untouched.
    freq : str, default "2s"
        Spacing of the regenerated "mins" column. The default is a two-second
        step, spelled with the lowercase alias because the uppercase "S"
        alias is deprecated in recent pandas releases.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``df.name != sample``; otherwise a new frame in
        which "mins" is an evenly spaced timedelta range starting at zero and
        missing "value" entries are forward-filled.
    """
    if df.name != sample:
        return df

    column_levels = ["samplecode", "wine"]
    return (
        # dropna=False keeps the all-NaN rows: those are the ones being filled.
        df.stack(column_levels, dropna=False)
        .assign(
            # Replace the time axis wholesale so the previously-NaT rows get
            # valid, evenly spaced timestamps.
            mins=lambda stacked: pd.timedelta_range(
                start="0", periods=stacked.shape[0], freq=freq
            )
        )
        # Series.ffill() replaces the deprecated fillna(method="ffill").
        .assign(value=lambda stacked: stacked["value"].ffill())
        .unstack(column_levels)
        # unstack puts "vars" first; restore the original level order.
        .reorder_levels(["samplecode", "wine", "vars"], axis=1)
    )


# Run fillfunc once per samplecode by grouping the columns (axis=1) on that level.
# NOTE(review): droplevel(0, axis=1) presumably strips the group-key level that
# apply() prepends to the columns — confirm against the displayed frame.
rdf = rdf.groupby(["samplecode"], axis=1).apply(fillfunc).droplevel(0, axis=1)
rdf

In [None]:
import matplotlib.pyplot as plt

# Overlay every sample's downsampled signal on one set of axes.
fig, ax = plt.subplots()

# Iterate over the groups explicitly instead of calling groupby.apply purely for
# its side effects: apply collects and returns one Axes per group, which would
# otherwise be echoed as the cell output.
for _, sample_df in rdf.stack(["samplecode", "wine"]).groupby("samplecode"):
    sample_df.plot(
        x="mins",
        y="value",
        ax=ax,
        # Legend entry: the first level-1 value of the stacked index.
        label=sample_df.index.get_level_values(1)[0],
    )

# Label the figure so it can be read on its own; the trailing semicolon
# suppresses the repr of the return value.
ax.set(xlabel="mins", ylabel="value", title="Downsampled signals by sample");

## Write to File

A comparison of read times for parquet and CSV found that parquet reads were twice as fast, so the dataset is written as parquet.

In [None]:
# Persist the filled, downsampled frame as a parquet file.
# NOTE(review): the destination is an absolute, machine-specific path.
rdf.to_parquet(
    "/Users/jonathan/mres_thesis/wine_analysis_hplc_uv/tests/test_data/processing_test_set/cupshz_testset.pq"
)

In [None]:
# Read the file back from the same location to inspect what was written.
a = pd.read_parquet(
    path="/Users/jonathan/mres_thesis/wine_analysis_hplc_uv/tests/test_data/processing_test_set/cupshz_testset.pq"
)
a