# Determining Time Axis Offset Pattern


While determining the [time precision](determining_time_precision.ipynb), I noticed that once the time axis was adjusted to a millisecond scale, sample 154 showed a clear offset of 15 milliseconds.


In [None]:
# setup
from wine_analysis_hplc_uv import definitions
from wine_analysis_hplc_uv.db_methods import get_data, pivot_wine_data
import pandas as pd
import duckdb as db

# Notebook-wide pandas display settings.
# pd.options.display.width = None
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 15)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.colheader_justify", "left")

# Single database connection shared by every query cell below.
con = db.connect(definitions.DB_PATH)


def fetch_dataset(con):
    """Fetch the pivoted wine data for cuprac detection, wavelength 450, shiraz.

    ``get_data.get_wine_data`` is called first with those filters (its return
    value is not used here), then the result of
    ``pivot_wine_data.pivot_wine_data(con)`` is returned as-is.
    """
    filters = {
        "detection": ("cuprac",),
        "wavelength": (450,),
        "varietal": ("shiraz",),
    }
    get_data.get_wine_data(con, **filters)
    return pivot_wine_data.pivot_wine_data(con)


# Build a frame for sample "154" (its "mins" and "value" columns) indexed by the
# time axis, with columns ordered as (samplecode, wine, vars).
df154 = (
    fetch_dataset(con)
    .loc[:, pd.IndexSlice["154", :, ["mins", "value"]]]
    .stack(["samplecode", "wine"])
    .reset_index()
    .set_index(["mins", "samplecode", "wine"])
    .unstack(["samplecode", "wine"])
    .reorder_levels(["samplecode", "wine", "vars"], axis=1)
    # interpret the "mins" index as minutes and convert it to timedeltas
    .pipe(lambda df: df.set_index(pd.to_timedelta(df.index, unit="minutes")))
    # round to millisecond precision; "ms" is used instead of the "L" alias,
    # which pandas has deprecated
    .pipe(
        lambda df: df.set_index(df.index.round("ms"))
    )  # refer to README.md/decisions 1
)
display(df154.head())
display(df154.columns)
display(df154.shape)

In [None]:
# Replace df154 with a table describing the spacing of its time axis:
#   mins_index - time elapsed since the first observation
#   diff       - gap between each observation and the previous one
#   diff_mode  - the most common gap, repeated on every row for comparison
df154 = (
    df154.index.to_frame()
    .rename_axis("mins_index")
    .diff(1)
    .rename({"mins": "diff"}, axis=1)
    .reset_index()
    # Series.mode() returns a Series indexed 0..k-1; assigning it directly would
    # align on the index and fill only the first row(s). Take the first mode as
    # a scalar so it broadcasts to every row (NaT when there are no diffs).
    .assign(diff_mode=lambda df: next(iter(df["diff"].mode()), pd.NaT))
    .assign(mins_index=lambda df: df.loc[:, "mins_index"] - df.loc[0, "mins_index"])
)
display(df154)

This ends up being very clean. Now the question is: will it work for every sample?


In [None]:
def fetch_all_samples(con):
    """Return the "mins" time axis of every cuprac / wavelength-450 sample.

    ``get_data.get_wine_data`` is called with the detection and wavelength
    filters (no varietal filter), then the pivoted data is reduced to its
    "mins" columns. The values are interpreted as minutes, converted to
    timedeltas and rounded to millisecond precision. The returned frame has
    columns ordered as (samplecode, wine, vars).
    """
    get_data.get_wine_data(con, detection=("cuprac",), wavelength=(450,))
    df = (
        pivot_wine_data.pivot_wine_data(con)
        .loc[:, pd.IndexSlice[:, :, ["mins"]]]
        .stack(["samplecode", "wine"])
        .assign(
            # The conversion is elementwise, so no per-sample groupby is needed.
            # `.dt.round` is the documented way to round a timedelta Series to
            # a frequency; `Series.round` is documented for numeric decimals.
            mins=lambda df: pd.to_timedelta(df["mins"], unit="minutes").dt.round(
                "ms"
            )
        )
        .unstack(["samplecode", "wine"])
        .reorder_levels(["samplecode", "wine", "vars"], axis=1)
    )
    return df


adf = fetch_all_samples(con)
adf.head()

There appears to be a consistent offset. Now, what happens if I subtract each sample's first value from its whole time axis?


In [None]:
# Re-base each (samplecode, wine) time axis on its own first value, so that
# every series starts at zero, then restore the wide layout with sorted columns.
adf = (
    adf.stack(["samplecode", "wine"])
    .assign(
        mins=lambda long_df: long_df.groupby(["samplecode", "wine"])[
            "mins"
        ].transform(lambda times: times.sub(times.iloc[0]))
    )
    .unstack(["samplecode", "wine"])
    .reorder_levels(["samplecode", "wine", "vars"], axis=1)
    .sort_index(level=0, axis=1, sort_remaining=True)
)
display(adf.head())

That looks right, but I need to be certain. The following returns the count of unique values in each row. Any row with more than one unique value indicates that there is more going on than a scalar offset.


In [None]:
# Count the distinct time values in each row across all samples. Once every
# series has had its first value subtracted, a pure scalar offset leaves exactly
# one distinct value per row, so any row returned here (count > 1) means the
# samples differ by more than an offset. An empty result means none do.
(
    adf.dropna()  # keep only rows present in every series, so lengths match
    .nunique(axis=1)  # number of distinct values in each row
    .loc[lambda counts: counts > 1]  # keep rows where the samples disagree
)

In [None]:
# Close the database connection opened in the setup cell.
con.close()

OK, that's convincing enough for me. As of 2023-08-23 22:47:37, I am going to assume the full dataset follows the same pattern.

In summary: every sample's time axis carries its own offset, equal to the value of its first measurement. Subtracting that first value from the axis aligns the data so that the first measurement is at zero.

The caveat is that the observation frequency must be the same for all samples.
