# Gross Time Alignment on Last Peak


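Trying to help the decomposition by removing gross time differences between samples: each sample's last-peak time is compared with the mean last-peak time across the sample set, and that difference is subtracted from the sample's time axis.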

In [None]:
%reload_ext autoreload
%autoreload 3 --print

# get the test data for the selected runs as a single xarray object (output="xr")
import logging

import duckdb as db
import plotly.express as px
import polars as pl
from database_etl import get_data

from pca_analysis.definitions import DB_PATH_UV
from pca_analysis.get_sample_data import get_ids_by_varietal
import xarray as xr

# Notebook logger, with its level lowered to DEBUG.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Switch off xarray's `display_expand_data` option for the reprs shown below.
xr.set_options(display_expand_data=False)

# Look up the ids for "shiraz" and load the matching runs as an xarray object.
# The connection is only needed while loading.
with db.connect(DB_PATH_UV) as conn:
    ids = get_ids_by_varietal("shiraz", conn)
    ds = get_data(output="xr", con=conn, runids=ids)

# Add `id_rank`, a dense integer rank of `id` along the `id` dimension, as a
# more human-friendly label for each sample.
id_rank = ds.coords["id"].to_dataframe()["id"].rank(method="dense").astype(int)
ds = ds.assign_coords(id_rank=("id", id_rank))
ds


In [None]:
import plotly.express as px


def px_line_pipe(ds):
    """Draw ``imgs`` against ``mins`` as thin plotly lines coloured by ``id_rank``.

    ``ds`` is flattened with ``to_dataframe().reset_index()`` and handed to
    ``px.line``; the traces are then set to a line width of 0.75. The value
    returned by ``update_traces`` is passed back to the caller.
    """
    long_df = ds.to_dataframe().reset_index()
    fig = px.line(long_df, x="mins", y="imgs", color="id_rank")
    return fig.update_traces(line={"width": 0.75})


# Overlay of every sample at wavelength "256".
px_line_pipe(ds.sel(wavelength="256"))


Firstly, some gross alignment can be introduced. Namely, we can align all samples on their last peak by shifting each sample's times according to its last-peak time, since the majority of samples have very similar times for that peak. What is that peak time? The last peak lies between 40 and 44 mins.

In [None]:
# Cut the data down to wavelength "256" and the 40-44 min window that holds
# the last peak.
window = {"wavelength": "256", "mins": slice(40, 44)}
last_peak = ds.sel(window)
# The figure is bound to a name rather than left as the cell's trailing
# expression.
lp_fig = px_line_pipe(last_peak)


And now with peak detection:

In [None]:
from pca_analysis import xr_signal

# NOTE(review): this binds the very same object as
# `xr_signal.find_peaks_defaults` (no copy is made), so the item assignment
# below changes that module attribute in place for the rest of the session.
find_peaks_kws = xr_signal.find_peaks_defaults
find_peaks_kws["prominence"] = 0.5
# Run `xr_signal.find_peaks_dataset` on the windowed data with
# array_key="imgs", x_key="mins" and grouper=["id_rank"]; the result replaces
# `last_peak`.
# NOTE(review): `find_peaks_kws` is not passed to this call. Presumably
# `find_peaks_dataset` reads the (now modified) module defaults itself -
# confirm, otherwise the prominence setting has no effect here.
last_peak = last_peak.pipe(
    xr_signal.find_peaks_dataset,
    array_key="imgs",
    grouper=["id_rank"],
    x_key="mins",
)


In [None]:
# Flatten the detected peaks to a long table and keep only the rows that
# actually carry a peak value; the remaining rows are NaN in "peaks".
peaks_long = last_peak.peaks.to_dataframe().reset_index()
has_peak = peaks_long["peaks"].notna()
peaks = peaks_long.loc[has_peak, ["id_rank", "mins", "peaks"]]

# Rank the peak times (smallest time gets rank 1) and draw one bar per row,
# labelled by id_rank.
ranked_peaks = peaks.set_index("id_rank")["mins"].rank()
ranked_peaks.plot.bar()


As we can see, there is quite a lot of dispersion.

In [None]:
# 30-bin histogram of the peak-time ranks.
# NOTE(review): these are ranks, not times; the histogram of the actual peak
# times is drawn from `peak_mins` in a later cell.
ranked_peaks.plot(kind="hist", bins=30)


But, as we can see, it's vaguely normal, so computing an average and subtracting it will be acceptable.

In [None]:
# Peak times as a one-column frame ("mins") indexed by id_rank.
peak_mins = peaks.set_index("id_rank")[["mins"]]

# 30-bin histogram of the peak times themselves.
peak_mins.plot(kind="hist", bins=30)


In [None]:
# Mean of the detected peak times over all rows of `peaks`.
mean = peaks.loc[:, "mins"].mean()


In [None]:
# Signed offset of every peak time from the mean peak time
# (positive: the peak is later than the mean).
peaks_mean_diff = peak_mins.sub(mean)
peaks_mean_diff.plot(kind="bar")


Subtracting the mean centers the peak times on zero. Subtracting each sample's difference (its peak time minus the mean) from that sample's times aligns every sample on the mean.


In [None]:
# Display the per-sample offsets (peak time minus mean peak time).
peaks_mean_diff


In [None]:
# Take each peak time and remove its own offset from the mean; since the
# offset is (peak time - mean), every adjusted peak lands on the mean.
adjusted_peaks = peaks.set_index("id_rank")[["mins"]].sub(peaks_mean_diff)

# adjusted_peaks.plot.bar()
adjusted_peaks.plot(kind="line")


In [None]:
# Long table of the `id_rank` / `mins` coordinates, indexed by id_rank, to
# serve as the left side of the merge with the per-sample offsets.
coords_long = ds[["id_rank", "mins"]].to_dataframe().reset_index()
left = coords_long.loc[:, ["id_rank", "mins"]].set_index("id_rank")
left


In [None]:
import pandas as pd

# Attach each sample's peak offset ("diff") to every time point of that
# sample, then shift the time axis by it.
#
# `diff` is (peak time - mean peak time), so it must be SUBTRACTED to move a
# sample's peak onto the mean: p - (p - mean) == mean. Adding it, as this cell
# did previously, pushes the peak to 2p - mean and doubles the misalignment;
# the `adjusted_peaks` cell above already subtracts.
#
# NOTE(review): with how="left", any id_rank missing from `peaks_mean_diff`
# gets NaN adjusted times, and an id_rank appearing more than once there
# would duplicate rows - this assumes exactly one detected peak per sample.
adjusted = (
    pd.merge(
        left=left,
        right=peaks_mean_diff.rename({"mins": "diff"}, axis=1),
        how="left",
        on="id_rank",
    )
    .assign(mins=lambda x: x["mins"] - x["diff"])
    .drop("diff", axis=1)
)
adjusted


In [None]:
# Store the adjusted times on the dataset as `mins_adjusted`, using the
# flattened values of `adjusted`.
# NOTE(review): the array is passed without dimension names, so xarray likely
# treats it as a new 1-D `mins_adjusted` dimension rather than a coordinate
# over the sample and time dimensions - confirm. Attaching it per sample would
# need a (dims, values) tuple with values reshaped in the row order of
# `adjusted`.
ds = ds.assign_coords(mins_adjusted=adjusted.values.flatten())
# Show the "256" wavelength slice to inspect the result.
ds.sel(wavelength="256")


## Conclusion

This is finicky and only changes times by about 0.02 mins here and there. Not worth pursuing at this time (2024-12-09T15:28:57).
