# Determining Time Axis Offset Pattern


While determining the [time precision](determining_time_precision.ipynb) I noticed that once adjusted to a millisecond scale, it was clear that there was an offset in sample 154 of 15 milliseconds.


In [1]:
# setup
from wine_analysis_hplc_uv import definitions
from wine_analysis_hplc_uv.db_methods import get_data, pivot_wine_data
import pandas as pd
import duckdb as db

# pd.options.display.width = None
pd.options.display.max_rows = 20
pd.options.display.max_columns = 15
pd.options.display.max_colwidth = None
pd.options.display.colheader_justify = "left"

con = db.connect(definitions.DB_PATH)


def fetch_dataset(con):
    get_data.get_wine_data(
        con, detection=("cuprac",), wavelength=(450,), varietal=("shiraz",)
    )
    df = pivot_wine_data.pivot_wine_data(con)
    return df


df154 = (
    fetch_dataset(con)
    .loc[:, pd.IndexSlice["154", :, ["mins", "value"]]]
    .stack(["samplecode", "wine"])
    .reset_index()
    .set_index(["mins", "samplecode", "wine"])
    .unstack(["samplecode", "wine"])
    .reorder_levels(["samplecode", "wine", "vars"], axis=1)
    .pipe(lambda df: df.set_index(pd.to_timedelta(df.index, unit="minutes")))
    .pipe(
        lambda df: df.set_index(df.index.round("L"))
    )  # refer to README.md/decisions 1
)
display(df154.head())
display(df154.columns)
display(df154.shape)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Function get_wine_data took 4.6980 seconds to execute



samplecode,154,154
wine,2020 leeuwin estate shiraz art series,2020 leeuwin estate shiraz art series
vars,i,value
mins,Unnamed: 1_level_3,Unnamed: 2_level_3
0 days 00:00:00.150000,0,2.260849
0 days 00:00:00.550000,1,2.960749
0 days 00:00:00.950000,2,3.189206
0 days 00:00:01.350000,3,3.115371
0 days 00:00:01.750000,4,2.675273


MultiIndex([('154', '2020 leeuwin estate shiraz art series',     'i'),
            ('154', '2020 leeuwin estate shiraz art series', 'value')],
           names=['samplecode', 'wine', 'vars'])

(6000, 2)

In [2]:
df154 = (
    df154.index.to_frame()
    .rename_axis("mins_index")
    .diff(1)
    .rename({"mins": "diff"}, axis=1)
    .reset_index()
    .assign(diff_mode=lambda df: df.loc[:, "diff"].mode())
    .assign(mins_index=lambda df: df.loc[:, "mins_index"] - df.loc[0, "mins_index"])
    .pipe(lambda df: df if display(df) is None else df)
)

Unnamed: 0,mins_index,diff,diff_mode
0,0 days 00:00:00,NaT,0 days 00:00:00.400000
1,0 days 00:00:00.400000,0 days 00:00:00.400000,NaT
2,0 days 00:00:00.800000,0 days 00:00:00.400000,NaT
3,0 days 00:00:01.200000,0 days 00:00:00.400000,NaT
4,0 days 00:00:01.600000,0 days 00:00:00.400000,NaT
...,...,...,...
5995,0 days 00:39:58,0 days 00:00:00.400000,NaT
5996,0 days 00:39:58.400000,0 days 00:00:00.400000,NaT
5997,0 days 00:39:58.800000,0 days 00:00:00.400000,NaT
5998,0 days 00:39:59.200000,0 days 00:00:00.400000,NaT


Which ends up being very clean. Now the question is, will that work for every sample?


In [96]:
def fetch_all_samples(con):
    get_data.get_wine_data(con, detection=("cuprac",), wavelength=(450,))
    df = (
        pivot_wine_data.pivot_wine_data(con)
        .loc[:, pd.IndexSlice[:, :, ["mins"]]]  #
        .stack(["samplecode", "wine"])
        .assign(
            mins=lambda df: df.groupby(["samplecode", "wine"])["mins"]
            .transform(pd.to_timedelta, **dict(unit="minutes"))
            .round("L")
        )
        .unstack(["samplecode", "wine"])
        .reorder_levels(["samplecode", "wine", "vars"], axis=1)
    )
    return df


adf = fetch_all_samples(con)
adf.head()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Function get_wine_data took 4.8921 seconds to execute



samplecode,116,117,119,120,121,122,123,...,ca0201,ca0301,crawford-cab,mt-diff-bannockburn-pn,st hugo gsm,stoney-rise-pn,torbreck-struie
wine,2021 marco de bartoli lucido,2022 greywacke sauvignon blanc,2021 bortolin angelo valdobbiadene extra dry,2021 stefano lubiana pinot noir,2021 jumping juice pinot noir,2021 lethbridge wines pinot noir,2021 baglio di grìsi nero davola,...,2019 kendall-jackson chardonnay vintners reserve,2021 chris ringland shiraz,2018 crawford river cabernets,2020 mt. difficulty pinot noir bannockburn,2021 st hugo grenache shiraz mataro,2021 stoney rise pinot noir,2021 torbreck shiraz the struie
vars,mins,mins,mins,mins,mins,mins,mins,...,mins,mins,mins,mins,mins,mins,mins
i,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
0,0 days 00:00:00.287000,0 days 00:00:00.262000,0 days 00:00:00.375000,0 days 00:00:00.237000,0 days 00:00:00.075000,0 days 00:00:00.062000,0 days 00:00:00.387000,...,0 days 00:00:00.187000,0 days 00:00:00.037000,0 days 00:00:00.400000,0 days 00:00:00.012000,0 days 00:00:00.037000,0 days 00:00:00.125000,0 days 00:00:00.337000
1,0 days 00:00:00.687000,0 days 00:00:00.662000,0 days 00:00:00.775000,0 days 00:00:00.637000,0 days 00:00:00.475000,0 days 00:00:00.462000,0 days 00:00:00.787000,...,0 days 00:00:00.587000,0 days 00:00:00.437000,0 days 00:00:00.800000,0 days 00:00:00.412000,0 days 00:00:00.437000,0 days 00:00:00.525000,0 days 00:00:00.737000
2,0 days 00:00:01.087000,0 days 00:00:01.062000,0 days 00:00:01.175000,0 days 00:00:01.037000,0 days 00:00:00.875000,0 days 00:00:00.862000,0 days 00:00:01.187000,...,0 days 00:00:00.987000,0 days 00:00:00.837000,0 days 00:00:01.200000,0 days 00:00:00.812000,0 days 00:00:00.837000,0 days 00:00:00.925000,0 days 00:00:01.137000
3,0 days 00:00:01.487000,0 days 00:00:01.462000,0 days 00:00:01.575000,0 days 00:00:01.437000,0 days 00:00:01.275000,0 days 00:00:01.262000,0 days 00:00:01.587000,...,0 days 00:00:01.387000,0 days 00:00:01.237000,0 days 00:00:01.600000,0 days 00:00:01.212000,0 days 00:00:01.237000,0 days 00:00:01.325000,0 days 00:00:01.537000
4,0 days 00:00:01.887000,0 days 00:00:01.862000,0 days 00:00:01.975000,0 days 00:00:01.837000,0 days 00:00:01.675000,0 days 00:00:01.662000,0 days 00:00:01.987000,...,0 days 00:00:01.787000,0 days 00:00:01.637000,0 days 00:00:02,0 days 00:00:01.612000,0 days 00:00:01.637000,0 days 00:00:01.725000,0 days 00:00:01.937000


There looks to be a consistent offset. Now if I subtract the first value from all..


In [99]:
adf = (
    adf.stack(["samplecode", "wine"])
    .assign(
        mins=lambda df: df.groupby(["samplecode", "wine"])["mins"].transform(
            lambda x: x - x.iloc[0]
        )
    )  # adjust time axis by initial value so they all start at 1
    .unstack(["samplecode", "wine"])
    .reorder_levels(["samplecode", "wine", "vars"], axis=1)
    .sort_index(level=0, axis=1, sort_remaining=True)
    .pipe(lambda df: df if display(df.head()) else df)
)

samplecode,116,117,119,120,121,122,123,...,ca0201,ca0301,crawford-cab,mt-diff-bannockburn-pn,st hugo gsm,stoney-rise-pn,torbreck-struie
wine,2021 marco de bartoli lucido,2022 greywacke sauvignon blanc,2021 bortolin angelo valdobbiadene extra dry,2021 stefano lubiana pinot noir,2021 jumping juice pinot noir,2021 lethbridge wines pinot noir,2021 baglio di grìsi nero davola,...,2019 kendall-jackson chardonnay vintners reserve,2021 chris ringland shiraz,2018 crawford river cabernets,2020 mt. difficulty pinot noir bannockburn,2021 st hugo grenache shiraz mataro,2021 stoney rise pinot noir,2021 torbreck shiraz the struie
vars,mins,mins,mins,mins,mins,mins,mins,...,mins,mins,mins,mins,mins,mins,mins
i,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
0,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,...,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00
1,0 days 00:00:00.400000,0 days 00:00:00.400000,0 days 00:00:00.400000,0 days 00:00:00.400000,0 days 00:00:00.400000,0 days 00:00:00.400000,0 days 00:00:00.400000,...,0 days 00:00:00.400000,0 days 00:00:00.400000,0 days 00:00:00.400000,0 days 00:00:00.400000,0 days 00:00:00.400000,0 days 00:00:00.400000,0 days 00:00:00.400000
2,0 days 00:00:00.800000,0 days 00:00:00.800000,0 days 00:00:00.800000,0 days 00:00:00.800000,0 days 00:00:00.800000,0 days 00:00:00.800000,0 days 00:00:00.800000,...,0 days 00:00:00.800000,0 days 00:00:00.800000,0 days 00:00:00.800000,0 days 00:00:00.800000,0 days 00:00:00.800000,0 days 00:00:00.800000,0 days 00:00:00.800000
3,0 days 00:00:01.200000,0 days 00:00:01.200000,0 days 00:00:01.200000,0 days 00:00:01.200000,0 days 00:00:01.200000,0 days 00:00:01.200000,0 days 00:00:01.200000,...,0 days 00:00:01.200000,0 days 00:00:01.200000,0 days 00:00:01.200000,0 days 00:00:01.200000,0 days 00:00:01.200000,0 days 00:00:01.200000,0 days 00:00:01.200000
4,0 days 00:00:01.600000,0 days 00:00:01.600000,0 days 00:00:01.600000,0 days 00:00:01.600000,0 days 00:00:01.600000,0 days 00:00:01.600000,0 days 00:00:01.600000,...,0 days 00:00:01.600000,0 days 00:00:01.600000,0 days 00:00:01.600000,0 days 00:00:01.600000,0 days 00:00:01.600000,0 days 00:00:01.600000,0 days 00:00:01.600000


Looks right, but I need to be certain. The following will return the count of unique values by row. Any row with more than 1 unique value indicates that there is more going on than a scalar offset.


In [94]:
(
    adf_.dropna()  # make all series the same length
    .agg([pd.unique], axis=1)  # get unique values in each row
    .explode(
        "unique"
    )  # expand on the index so each unique value per row now has multiple entries
    .groupby("i")  # groupby index to see how many unique elements per row index
    .size()
    .where(
        lambda s: s > 1
    )  # filter out any row indexes where more than 1 unique value. If false, returns NaN
    .dropna()  # remaining rows will be rows that had more than 1 unique value
)

Series([], dtype: float64)

Ok, that's convincing enough for me. As of 2023-08-23 22:47:37 I am going to assume the full dataset follows the same pattern.

In summary: all data time axes have a varying offset equal to the value of the first measurement. Subtracting the first value from the axis will align the data so that the first measurement is zero.

The caveat is that the observation frequency must be the same for all samples.
