In [1]:
from girder_client import GirderClient
import pandas as pd
import birdshot
import re
from collections import defaultdict

In [2]:
# Authenticate with GirderClient.
# The API key is read from the environment so the secret never lives in the
# notebook source or its version history.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked and replaced.
import os

api_key = os.environ.get("GIRDER_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GIRDER_API_KEY environment variable before running this notebook."
    )

client = GirderClient(apiUrl="https://data.htmdec.org/api/v1")
client.authenticate(apiKey=api_key)

{'_id': '6424afee4236ff9b0883f24b'}

In [3]:

def summarize_presence_by_sample(df: pd.DataFrame, campaign: str, group_prefixes=None) -> pd.DataFrame:
    """Summarize which values are present (non-null) for every row of ``df``.

    Columns whose label is ``"<prefix>.<letter>"`` (a single ASCII letter after
    the dot), with ``<prefix>`` taken from ``group_prefixes``, are collapsed
    into one output column per prefix. Every other column is reported on its
    own.

    Parameters
    ----------
    df : pd.DataFrame
        Input table; each row is reported under its index label.
    campaign : str
        Label written to the "Sample" column of the final totals row.
    group_prefixes : iterable of str, optional
        Prefixes that identify grouped columns. Defaults to the six
        measurement names listed in the function body. Any iterable is
        accepted, including one-shot generators.

    Returns
    -------
    pd.DataFrame
        One row per input row followed by one totals row. Columns are
        "Sample", then one column per prefix that matched at least one input
        column, then one column per remaining input column.

        * Per-row grouped cells hold the sorted, comma-separated suffix
          letters that are non-null, or the string "None" if there are none.
        * Per-row single cells hold "Yes" or "No".
        * Totals cells hold "<present>/<possible>" counts.
    """
    if group_prefixes is None:
        group_prefixes = [
            'Elastic Modulus', 'Elongation', 'Maximum ∂2σ/∂ε2',
            'UTS/YS Ratio', 'Ultimate Tensile Strength', 'Yield Strength'
        ]

    # Compile each pattern once instead of once per (column, prefix) pair.
    # Building a list also makes one-shot iterables of prefixes safe to reuse
    # for every column.
    prefix_patterns = [
        (prefix, re.compile(rf'^{re.escape(prefix)}\.([a-zA-Z])$'))
        for prefix in group_prefixes
    ]

    grouped_columns = defaultdict(list)
    single_columns = []

    for col in df.columns:
        grouped = None
        # Only string labels can carry a ".<letter>" suffix; anything else
        # (e.g. an integer column name) is reported as a single column instead
        # of crashing the regex match.
        if isinstance(col, str):
            for prefix, pattern in prefix_patterns:
                match = pattern.match(col)
                if match:
                    grouped = (prefix, match.group(1))
                    break  # first matching prefix wins

        if grouped is None:
            single_columns.append(col)
        else:
            prefix, suffix = grouped
            grouped_columns[prefix].append((col, suffix))

    # One vectorized null check for the whole frame; reused for both the
    # per-row entries and the totals row.
    present = df.notna()

    summary_rows = []
    for sample_id, flags in present.iterrows():
        entry = {"Sample": sample_id}

        # Grouped columns: list which subsample suffixes exist for this row.
        for prefix, cols in grouped_columns.items():
            present_suffixes = sorted(suffix for col, suffix in cols if flags.loc[col])
            entry[prefix] = ", ".join(present_suffixes) if present_suffixes else "None"

        # Non-grouped columns: simple presence flag.
        for col in single_columns:
            entry[col] = "Yes" if flags.loc[col] else "No"

        summary_rows.append(entry)

    # Totals row for the whole campaign.
    summary = {"Sample": campaign}
    total_rows = len(df)

    for prefix, cols in grouped_columns.items():
        total_present = sum(int(present[col].sum()) for col, _ in cols)
        summary[prefix] = f"{total_present}/{total_rows * len(cols)}"

    for col in single_columns:
        summary[col] = f"{int(present[col].sum())}/{total_rows}"

    summary_rows.append(summary)

    return pd.DataFrame(summary_rows)

In [4]:
# Campaign whose records are fetched and summarized in this cell.
campaign = 'CBA'

# Query the campaign's records through the authenticated client.
df = birdshot.query(client=client, campaign=campaign)

# Build the presence summary and export it next to the notebook.
summary_df = summarize_presence_by_sample(df=df, campaign=campaign)
summary_df.to_csv(
    f"{campaign}_presence_summary.csv",
    index=False,
)