In [1]:
import bioeq
import polars as pl
import statsmodels
from bioeq import Crossover2x2
import numpy as np

In [2]:
# Load Simdata

df_simdata = pl.read_csv(
    source="https://raw.githubusercontent.com/shaunporwal/bioeq/refs/heads/main/simdata/bioeq_simdata_1.csv"
)

In [3]:
bioeq.__version__

'0.1.1'

In [4]:
df_simdata.columns

['SubjectID',
 'Period',
 'Sequence',
 'Formulation',
 'Time (hr)',
 'Concentration (ng/mL)']

In [5]:
df_simdata = df_simdata.rename(
    {
        "SubjectID": "subject_id",
        "Period": "period",
        "Sequence": "sequence",
        "Formulation": "formulation",
        "Time (hr)": "time",
        "Concentration (ng/mL)": "concentration",
    }
)

In [6]:
df_simdata.columns

['subject_id', 'period', 'sequence', 'formulation', 'time', 'concentration']

In [7]:
# Instantiate the class with the correct column names
crossover = Crossover2x2(
    data=df_simdata,
    subject_col="subject_id",
    seq_col="sequence",
    period_col="period",
    time_col="time",
    conc_col="concentration",
    form_col="formulation",
)

# Test the AUC calculation function
auc_results = crossover.df_params
print(auc_results)

shape: (10, 11)
┌────────────┬────────┬──────────┬─────────────┬───┬───────────┬──────┬──────────┬──────────┐
│ subject_id ┆ period ┆ sequence ┆ formulation ┆ … ┆ Cmax      ┆ Tmax ┆ log_AUC  ┆ log_Cmax │
│ ---        ┆ ---    ┆ ---      ┆ ---         ┆   ┆ ---       ┆ ---  ┆ ---      ┆ ---      │
│ i64        ┆ i64    ┆ str      ┆ str         ┆   ┆ f64       ┆ f64  ┆ f64      ┆ f64      │
╞════════════╪════════╪══════════╪═════════════╪═══╪═══════════╪══════╪══════════╪══════════╡
│ 1          ┆ 1      ┆ TR       ┆ Reference   ┆ … ┆ 49.449766 ┆ 0.0  ┆ 4.617735 ┆ 3.900957 │
│ 1          ┆ 2      ┆ TR       ┆ Test        ┆ … ┆ 49.507197 ┆ 0.0  ┆ 4.610431 ┆ 3.902118 │
│ 2          ┆ 1      ┆ TR       ┆ Reference   ┆ … ┆ 49.571954 ┆ 0.0  ┆ 4.605317 ┆ 3.903425 │
│ 2          ┆ 2      ┆ TR       ┆ Test        ┆ … ┆ 50.394233 ┆ 0.0  ┆ 4.629355 ┆ 3.919877 │
│ 3          ┆ 1      ┆ TR       ┆ Reference   ┆ … ┆ 50.948843 ┆ 0.0  ┆ 4.599128 ┆ 3.930822 │
│ 3          ┆ 2      ┆ TR       ┆ Test     

In [8]:
crossover.run_anova("log_AUC")

Formulation levels: ['Reference' 'Test']
Period levels: [1 2]
Sequence levels: ['TR']
Error: Sequence is confounded (only one level). Provide data with ≥2 sequence levels.


In [9]:
crossover.run_nlme("log_AUC")

Formulation levels: ['Reference' 'Test']
Period levels: [1 2]
Sequence levels: ['TR']
Error: Sequence is confounded (only one level). Provide data with ≥2 sequence levels.


In [10]:
# Step 1: Sort the DataFrame to ensure correct time order within each group.
df_sorted = df_simdata.sort([subject_col, period_col, form_col, time_col])

NameError: name 'subject_col' is not defined

In [None]:
df_sorted.columns

In [11]:
# Step 2: Group by subject, period, and formulation, aggregating time and concentration into lists.
grouped_df = df_sorted.group_by([subject_col, period_col, form_col]).agg(
    [
        pl.col(time_col).alias("times"),
        pl.col(conc_col).alias("concentrations"),
    ]
)

In [None]:
grouped_df

In [None]:
grouped_df

In [None]:
df_simdata.head()

In [None]:
df_single_case = df_simdata.filter(
    (pl.col("subject_id") == 1)
    & (pl.col("period") == 1)
    & (pl.col("formulation") == "Reference")
)

df_single_case_small = df_single_case.select(pl.col("time"), pl.col("concentration"))

row_num = df_single_case_small["concentration"].len()

for conc in df_single_case_small["concentration"]:
    print(conc)

In [None]:
row_num

In [None]:
df_simdata.select("Sequence").unique().sort(by="Sequence")

In [None]:
df_simdata = pl.read_csv(
    source="https://raw.githubusercontent.com/statist-bhfz/bioeq/refs/heads/master/testdata.csv",
    separator=";",
    truncate_ragged_lines=True,
)


df

In [None]:
df_simdata = df.with_columns(  # with_columns doesn't modify cols in place, so has to be casted back to df
    pl.col("subj").cast(dtype=pl.Int64),
    pl.col("seq").cast(dtype=pl.Int64),
    pl.col("prd").cast(dtype=pl.Int64),
    pl.col("drug").cast(dtype=pl.String),
    pl.col("time").cast(dtype=pl.String).str.replace(r",", ".").cast(dtype=pl.Float64),
    pl.col("conc").cast(dtype=pl.String).str.replace(r",", ".").cast(dtype=pl.Float64),
)

In [None]:
df_simdata.columns

In [36]:
subject_col = "subj"
time_col = "time"
conc_col = "conc"

In [66]:
# Validate required columns

df = df_simdata.clone()
df = df.sort([subject_col, time_col]).clone()

In [None]:
# The below is probably something that has to be developed in parallel to this bioeq package, it's a whole separate entity

# In doing the data validation, what we want is probably something like:

# Parse

# 1. State all expected raw colnames and types or if applicable, expected categorical values
# 2. State raw col to parsed col relationship
# 3. See if there are problematic values
# 4. Resolve problematic values and create parsed cols
# 5. Check that they have been resolved
# 6. Assign the correct types to all the parsed cols once we're happy with col vals

# Validate

# 7. Check that all col values are reasonable to human judgement (pointblank in R)
# 8. Change those that aren't or filter out, and proceed with cleaned data (get_data_pass() in R)

# Derive

# 9. Create a list of cols to derive and which cols they are derived from
# 10. Derive the columns in the dataframe
# 11. Check that we derived all the columns that we said we would
# 12. Check that the col vals now have reasonable values, and if not either filter or replace (pointblank in R)