In [None]:
# setup
from wine_analysis_hplc_uv import definitions
from wine_analysis_hplc_uv.db_methods import get_data, pivot_wine_data
import pandas as pd
import duckdb as db

# Notebook-wide pandas display settings (display.width is left at its default).
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 15)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.colheader_justify", "left")

# DuckDB connection to the database file named by definitions.DB_PATH.
con = db.connect(definitions.DB_PATH)
def fetch_dataset(con):
    """
    Return the pivoted wine data for the CUPRAC / 450 / shiraz selection.

    Calls ``get_data.get_wine_data`` on ``con`` with detection ``'cuprac'``,
    wavelength ``450`` and varietal ``'shiraz'``, then returns whatever
    ``pivot_wine_data.pivot_wine_data(con)`` produces.

    NOTE(review): the return value of ``get_wine_data`` is discarded, so it
    presumably stages its result inside the database for the pivot step to
    read - confirm against ``db_methods``.

    Parameters
    ----------
    con
        Database connection handed to both helper calls.
    """
    get_data.get_wine_data(
        con,
        detection=("cuprac",),
        wavelength=(450,),
        varietal=("shiraz",),
    )
    return pivot_wine_data.pivot_wine_data(con)

# Build the working frame for sample '154': keep its 'mins' and 'value'
# columns and re-key the rows on the 'mins' time column.
# NOTE(review): the slice below assumes the column levels are ordered
# (samplecode, wine, vars) in the fetched frame - confirm against pivot_wine_data.
df154 = (fetch_dataset(con)
         # all rows; columns with first level '154', any second level, and
         # third level in {'mins', 'value'}
         .loc[:,pd.IndexSlice['154', :,['mins','value']]]
         # move the 'samplecode' and 'wine' column levels into the row index
         .stack(['samplecode','wine'])
         # flatten every index level into ordinary columns so 'mins' can be used as a key
         .reset_index()
         # key the rows by (mins, samplecode, wine)
         .set_index(["mins",'samplecode','wine'])
         # pivot 'samplecode' and 'wine' back out to columns, leaving 'mins' as the row index
         .unstack(['samplecode','wine'])
         # put the column levels in (samplecode, wine, vars) order
         .reorder_levels(['samplecode','wine','vars'], axis=1)
         # Disabled: conversion of the index to timedeltas; the index stays as raw 'mins' values here.
         #.pipe(lambda df: df.set_index(pd.to_timedelta(df.index, unit='minutes')))
         
         
)


# Identifying the Precision of Spectrum Chromatogram Observations in my Dataset

There is a question of what precision the time points of my observations have. For example, sample 154:

In [None]:
# Preview the first five rows of the sample-154 frame.
df154.head(n=5)

The second time point of this sample is:

In [None]:
# Take the second time point of the index as text so its digits can be counted.
obs = df154.index.astype(str)[1]
print(obs)
# Count the characters that follow the decimal point.
print("num sigfigs:", len(obs.split(".")[1]))

Unfortunately, even the 'raw' data in my database sometimes has a precision of 18 digits, which cannot possibly be correct and must be a symptom of floating-point datatypes in Python. To settle this once and for all, I could either decide on the coarsest time scale that still retains unique values in the time column, or check a .UV file.

In [None]:
import rainbow as rb
import os

# Read the first x-axis label directly from a raw DAD1.UV file to see how many
# digits the source data itself carries.
filepath = os.path.join(definitions.LIB_DIR, "cuprac", "131.D")
uv_file = rb.read(filepath).get_file("DAD1.UV")
obs = uv_file.xlabels[0]
print(obs)
# Count the characters that follow the decimal point in the label's string form.
fraction_digits = str(obs).split(".")[1]
print("num sigfigs:", len(fraction_digits))

Well, I have been vindicated, as rainbow is also returning 18 significant figures. Thus the other approach is required: identify an appropriate level of granularity by testing several time scales and seeing when duplicate values appear. Observe the millisecond ('L', spelled 'ms' in newer pandas) and second ('S', spelled 's' in newer pandas) scales (refer to [offset alias](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases) for the notation):

In [None]:
# Same frame as df154, but with the index (in minutes) converted to timedeltas
# so it can be rounded to different time scales.
df154_ = df154.set_index(pd.to_timedelta(df154.index, unit="minutes"))

In [None]:
# Count the time points that collide with an earlier one once the index is
# rounded to the millisecond. "ms" is the current pandas alias for
# milliseconds; the older "L" spelling is deprecated since pandas 2.2.
# int(...) keeps the displayed result a plain Python integer.
int(df154_.index.round(freq="ms").duplicated().sum())

In [None]:
# Count the time points that collide with an earlier one once the index is
# rounded to the second. "s" is the current pandas alias for seconds; the
# older "S" spelling is deprecated since pandas 2.2.
# int(...) keeps the displayed result a plain Python integer.
int(df154_.index.round(freq="s").duplicated().sum())

It appears that no duplicates are detected at the millisecond scale ('L'); however, at the second scale ('S'), over half of the observation points are now duplicates. Thus we will continue at the millisecond scale.