In [None]:
import os
import re
from collections import defaultdict

import birdshot
import pandas as pd
from girder_client import GirderClient

# Authenticate with GirderClient.
# A key supplied through the GIRDER_API_KEY environment variable takes
# precedence, so the credential does not have to live in the notebook.
# NOTE(review): the literal fallback below is a credential committed to
# source. It is kept only so existing runs keep working; rotate the key and
# remove the fallback.
client = GirderClient(apiUrl="https://data.htmdec.org/api/v1")
client.authenticate(
    apiKey=os.environ.get("GIRDER_API_KEY")
    or "MFfpVN81hmOaUV7cTGsovnzdr0iB87ygR0RxkDYA"
)

# Campaign identifiers to analyze
campaigns = [
    "AAA", "AAB", "AAC", "AAD", "AAE",
    "BAA", "BBA", "BBB", "BBC", "CBA",
]

# Column-name prefixes used to group related columns together
group_prefixes = [
    'Elastic Modulus',
    'Elongation',
    'Maximum ∂2σ/∂ε2',
    'UTS/YS Ratio',
    'Ultimate Tensile Strength',
    'Yield Strength',
]

# Accumulator lists (three independent, initially empty lists) that collect
# one result object per campaign
all_samples_missing, all_columns_missing, all_grouped_column_missing = [], [], []

# Loop through each campaign and tally missing values three ways:
# per sample (row), per column, and per group of columns sharing a prefix.

# The group patterns do not depend on the campaign, so compile them once.
# A column belongs to a group when its name is the prefix, a dot, and exactly
# one ASCII letter (e.g. "<prefix>.A").
prefix_patterns = {
    prefix: re.compile(rf"^{re.escape(prefix)}\.[a-zA-Z]$")
    for prefix in group_prefixes
}

for campaign in campaigns:
    # Query the data for the given campaign.
    # The module imported by this file is `birdshot`; the previous reference
    # to `birdshot2` is not defined anywhere in this file.
    df = birdshot.query(campaign=campaign, client=client)

    # Boolean mask of missing cells, computed once and reused below
    missing_mask = df.isna()

    # Count missing per sample (row); the frame's index becomes 'Sample'
    row_missing = missing_mask.sum(axis=1)
    row_missing.name = "Missing Values"
    row_missing = row_missing.reset_index()
    row_missing.columns = ['Sample', 'Missing Values']
    row_missing['Campaign'] = campaign
    all_samples_missing.append(row_missing)

    # Count missing per column; the Series is named after the campaign
    column_missing = missing_mask.sum(axis=0)
    column_missing.name = campaign
    all_columns_missing.append(column_missing)

    # Grouped column missing values. A prefix with no matching column in
    # this campaign is left out rather than recorded as zero.
    grouped_missing = {}
    for prefix, pattern in prefix_patterns.items():
        matching_cols = [col for col in df.columns if pattern.match(col)]
        if matching_cols:
            grouped_missing[prefix] = missing_mask[matching_cols].sum().sum()

    grouped_df = pd.DataFrame.from_dict(grouped_missing, orient='index', columns=[campaign])
    all_grouped_column_missing.append(grouped_df)

# Combine results

# Per-sample table: stack every campaign's rows, then append one summary row
# per campaign whose 'Sample' field holds the campaign name.
samples_missing_df = pd.concat(all_samples_missing, ignore_index=True)
campaign_sums = (
    samples_missing_df
    .groupby('Campaign')['Missing Values']
    .sum()
    .reset_index()
)
campaign_sums = campaign_sums.assign(Sample=campaign_sums['Campaign'])
samples_missing_df = pd.concat(
    [samples_missing_df, campaign_sums],
    ignore_index=True,
)

# Per-column table: one row per campaign. The 'Total' row is added before the
# 'Total' column, so the column also sums the 'Total' row.
columns_missing_df = pd.concat(all_columns_missing, axis=1).T
columns_missing_df.loc['Total'] = columns_missing_df.sum(axis=0)
columns_missing_df['Total'] = columns_missing_df.sum(axis=1)
columns_to_sort = [col for col in columns_missing_df.columns if col != 'Total']
ordered_columns = sorted(columns_to_sort)
ordered_columns.append('Total')  # keep 'Total' as the last column
columns_missing_df = columns_missing_df[ordered_columns]

# Combine grouped missing per campaign: one column per campaign plus a
# row-wise 'Total', with rows ordered by index label.
grouped_missing_df = pd.concat(all_grouped_column_missing, axis=1)
grouped_missing_df = grouped_missing_df.assign(
    Total=grouped_missing_df.sum(axis=1)
).sort_index()

# Save all results: each summary table goes to its own CSV file. Only the
# per-sample table is written without its index.
for frame, path, keep_index in (
    (samples_missing_df, 'per_sample.csv', False),
    (columns_missing_df, 'per_column.csv', True),
    (grouped_missing_df, 'per_column_grouped.csv', True),
):
    frame.to_csv(path, index=keep_index)
