In [45]:
import bioeq
import polars as pl
from bioeq import Crossover2x2
import numpy as np

In [2]:
# Load Simdata

df_simdata = pl.read_csv(
    source="https://raw.githubusercontent.com/shaunporwal/bioeq/refs/heads/main/simdata/bioeq_simdata_1.csv"
)

In [3]:
bioeq.__version__

'0.1.0.1'

In [4]:
df_simdata.columns

['SubjectID',
 'Period',
 'Sequence',
 'Formulation',
 'Time (hr)',
 'Concentration (ng/mL)']

In [12]:
df_simdata = df_simdata.rename(
    {
        "SubjectID": "subject_id",
        "Period": "period",
        "Sequence": "sequence",
        "Formulation": "formulation",
        "Time (hr)": "time",
        "Concentration (ng/mL)": "concentration",
    }
)

ColumnNotFoundError: 'rename' on column: 'SubjectID' is invalid

Schema at this point: Schema:
name: subject_id, field: Int64
name: period, field: Int64
name: sequence, field: String
name: formulation, field: String
name: time, field: Float64
name: concentration, field: Float64


Resolved plan until failure:

	---> FAILED HERE RESOLVING THIS_NODE <---
DF ["subject_id", "period", "sequence", "formulation"]; PROJECT */6 COLUMNS

In [6]:
df_simdata.columns

['subject_id', 'period', 'sequence', 'formulation', 'time', 'concentration']

In [13]:
# Instantiate the class with the correct column names
crossover = Crossover2x2(
    data=df_simdata,
    subject_col="subject_id",
    seq_col="sequence",
    period_col="period",
    time_col="time",
    conc_col="concentration",
    form_col="formulation",
)

# Test the AUC calculation function
auc_results = crossover.calculate_auc()
print(auc_results)

AttributeError: 'DataFrame' object has no attribute 'groupby'

In [25]:
subject_col = "subject_id"
period_col = "period"
form_col = "formulation"
time_col = "time"
conc_col = "concentration"

In [39]:
# Step 1: Sort the DataFrame to ensure correct time order within each group.
df_sorted = df_simdata.sort([subject_col, period_col, form_col, time_col])

In [40]:
df_sorted.columns

['subject_id', 'period', 'sequence', 'formulation', 'time', 'concentration']

In [46]:
# Step 2: Group by subject, period, and formulation, aggregating time and concentration into lists.
grouped_df = df_sorted.group_by([subject_col, period_col, form_col]).agg(
    [
        pl.col(time_col).alias("times"),
        pl.col(conc_col).alias("concentrations"),
    ]
)

In [47]:
grouped_df

subject_id,period,formulation,times,concentrations
i64,i64,str,list[f64],list[f64]
2,1,"""Reference""","[0.0, 0.5, … 8.0]","[49.571954, 38.197632, … 3.47527]"
4,2,"""Test""","[0.0, 0.5, … 8.0]","[48.23696, 39.264123, … 1.847062]"
1,1,"""Reference""","[0.0, 0.5, … 8.0]","[49.449766, 39.455472, … 0.0]"
5,1,"""Reference""","[0.0, 0.5, … 8.0]","[50.69844, 39.113641, … 0.670034]"
3,1,"""Reference""","[0.0, 0.5, … 8.0]","[50.948843, 40.52089, … 0.46904]"
3,2,"""Test""","[0.0, 0.5, … 8.0]","[51.524242, 39.263039, … 1.239981]"
4,1,"""Reference""","[0.0, 0.5, … 8.0]","[49.884352, 38.638935, … 1.2594]"
1,2,"""Test""","[0.0, 0.5, … 8.0]","[49.507197, 39.332619, … 0.487989]"
5,2,"""Test""","[0.0, 0.5, … 8.0]","[49.230332, 40.152211, … 1.985263]"
2,2,"""Test""","[0.0, 0.5, … 8.0]","[50.394233, 39.062258, … 0.280222]"


In [57]:
grouped_df = grouped_df.with_columns(
    pl.map_batches(
        ["times", "concentrations"],
        lambda df: df.apply(
            lambda row: np.trapezoid(row["concentrations"], row["times"]), axis=1
        ),
    ).alias("AUC")
)


thread '<unnamed>' panicked at crates/polars-python/src/map/lazy.rs:200:19:
python function failed: 'list' object has no attribute 'apply'


PanicException: python function failed: 'list' object has no attribute 'apply'

In [None]:
# Step 4: Drop the temporary list columns and return the final DataFrame.
grouped_df = grouped_df.drop(["times", "concentrations"])

In [40]:
df_simdata.head()

subject_id,period,sequence,formulation,time,concentration
i64,i64,str,str,f64,f64
1,1,"""TR""","""Reference""",0.0,49.449766
1,1,"""TR""","""Reference""",0.5,39.455472
1,1,"""TR""","""Reference""",1.0,30.800394
1,1,"""TR""","""Reference""",2.0,19.762422
1,1,"""TR""","""Reference""",4.0,5.849937


In [41]:
df_single_case = df_simdata.filter(
    (pl.col("subject_id") == 1)
    & (pl.col("period") == 1)
    & (pl.col("formulation") == "Reference")
)

df_single_case_small = df_single_case.select(pl.col("time"), pl.col("concentration"))

row_num = df_single_case_small["concentration"].len()

for conc in df_single_case_small["concentration"]:
    print(conc)

49.44976551083605
39.45547222602065
30.800393819376495
19.762422181837874
5.849937318012263
2.365206238120429
0.0


In [10]:
row_num

7

In [28]:
df_simdata.select("Sequence").unique().sort(by="Sequence")

Sequence
str
"""TR"""


In [None]:
df_simdata = pl.read_csv(
    source="https://raw.githubusercontent.com/statist-bhfz/bioeq/refs/heads/master/testdata.csv",
    separator=";",
    truncate_ragged_lines=True,
)


df

In [None]:
df_simdata = df.with_columns(  # with_columns doesn't modify cols in place, so has to be casted back to df
    pl.col("subj").cast(dtype=pl.Int64),
    pl.col("seq").cast(dtype=pl.Int64),
    pl.col("prd").cast(dtype=pl.Int64),
    pl.col("drug").cast(dtype=pl.String),
    pl.col("time").cast(dtype=pl.String).str.replace(r",", ".").cast(dtype=pl.Float64),
    pl.col("conc").cast(dtype=pl.String).str.replace(r",", ".").cast(dtype=pl.Float64),
)

In [None]:
df_simdata.columns

In [36]:
subject_col = "subj"
time_col = "time"
conc_col = "conc"

In [None]:
# Validate required columns

df = df_simdata.clone()
df = df.sort([subject_col, time_col]).clone()

In [None]:
# Compute AUC for each group
auc_df = df.group_by(subject_col).agg(
    pl.struct([time_col, conc_col])
    .apply(
        lambda rows: np.trapz(
            [row[conc_col] for row in rows],
            [row[time_col] for row in rows],
        )
    )
    .alias("AUC")
)

In [None]:
# The below is probably something that has to be developed in parallel to this bioeq package, it's a whole separate entity

# In doing the data validation, what we want is probably something like:

# Parse

# 1. State all expected raw colnames and types or if applicable, expected categorical values
# 2. State raw col to parsed col relationship
# 3. See if there are problematic values
# 4. Resolve problematic values and create parsed cols
# 5. Check that they have been resolved
# 6. Assign the correct types to all the parsed cols once we're happy with col vals

# Validate

# 7. Check that all col values are reasonable to human judgement (pointblank in R)
# 8. Change those that aren't or filter out, and proceed with cleaned data (get_data_pass() in R)

# Derive

# 9. Create a list of cols to derive and which cols they are derived from
# 10. Derive the columns in the dataframe
# 11. Check that we derived all the columns that we said we would
# 12. Check that the col vals now have reasonable values, and if not either filter or replace (pointblank in R)

In [None]:
cross

In [None]:
Crossover2x2()