In [1]:

import pandas as pd
import numpy as np
import autoencodix as acx
from autoencodix.data.datapackage import DataPackage
from autoencodix.configs.stackix_config import StackixConfig
from autoencodix.configs.default_config import DataCase

# --- Step 1: generate synthetic data ---
# Seed NumPy's legacy global RNG so the sampled IDs and feature values are
# reproducible from a fresh kernel.
np.random.seed(42)
n_samples = 100
modalities = ["rna", "protein", "atac", "cytokine", "metabolite"]

# The first 65% of the sample IDs are present in every modality ("paired");
# the rest form a pool from which each modality draws its own subset.
n_paired = int(n_samples * 0.65)
all_ids = [f"sample_{i}" for i in range(n_samples)]
paired_ids = all_ids[:n_paired]
remaining_ids = all_ids[n_paired:]
n_extra = int(len(remaining_ids) * 0.7)

data_frames = {}
for mod in modalities:
    # Draw the modality-specific IDs first and the feature matrix second:
    # both consume the global RNG, so this order fixes the generated data.
    extra_ids = np.random.choice(remaining_ids, size=n_extra, replace=False)
    mod_ids = paired_ids + list(extra_ids)
    values = np.random.randn(len(mod_ids), 10)
    feature_names = [f"{mod}_feature_{i}" for i in range(10)]
    df = pd.DataFrame(values, index=mod_ids, columns=feature_names)
    data_frames[mod] = df

# Bundle the per-modality frames into a DataPackage. The annotation slot
# receives an independent copy of each feature frame, i.e. this test supplies
# no separate metadata table.
annotation_frames = {name: frame.copy() for name, frame in data_frames.items()}
dp = DataPackage(multi_bulk=data_frames, annotation=annotation_frames)

# NOTE(review): requires_paired=False presumably lets samples that are missing
# from some modalities stay in the data — confirm against StackixConfig.
stackix_config = StackixConfig(
    requires_paired=False,
    data_case=DataCase.MULTI_BULK,
)

# Per the recorded cell output, run() performs the pairing-aware split and
# then trains a model for each modality.
stackix = acx.Stackix(config=stackix_config, data=dp)
result = stackix.run()

# Sanity check: for each modality, the train/valid/test splits together must
# account for every sample that was passed in.
for mod in modalities:
    # Look up the three split sizes in a fixed train -> valid -> test order.
    train_len, valid_len, test_len = (
        len(getattr(result.datasets, split).datasets[mod].sample_ids)
        for split in ("train", "valid", "test")
    )
    total_len = sum((train_len, valid_len, test_len))
    print(f"Modality {mod}: train={train_len}, valid={valid_len}, test={test_len}, total={total_len}")
    expected_len = len(data_frames[mod])
    assert total_len == expected_len, "All samples must be assigned"


# Leakage check: every sample ID must end up in exactly one split.
# The splitting is pairing-aware, so a sample that occurs in several data
# modalities has to be placed in the same split for all of them; otherwise
# information would leak between train/valid/test. A sample that occurs in
# only one modality trivially sits in a single split, so the same assertion
# covers both cases.
all_sample_ids = set().union(*[df.index for df in data_frames.values()])

# Build the sample -> {split names} map in one pass over the split datasets,
# instead of re-scanning every sample_ids collection once per sample.
splits_by_sample = {}
for split_name, dataset in zip(
    ["train", "valid", "test"],
    [result.datasets.train, result.datasets.valid, result.datasets.test],
):
    for mod in modalities:
        for sid in dataset.datasets[mod].sample_ids:
            splits_by_sample.setdefault(sid, set()).add(split_name)

for sid in all_sample_ids:
    splits = splits_by_sample.get(sid, set())
    # A sample missing from every split is a different failure than leakage,
    # so report it with its own message.
    assert splits, f"Sample {sid} was not assigned to any split"
    assert len(splits) == 1, f"Sample {sid} appears in multiple splits across modalities"

print("Integration test passed!")

in handle_direct_user_data with data: <class 'autoencodix.data.datapackage.DataPackage'>
--- Running Pairing-Aware Split ---
Training each modality model...
Training modality: rna
Training modality: rna
Epoch 1 - Train Loss: 417.3006
Sub-losses: recon_loss: 417.2992, var_loss: 0.0014, anneal_factor: 0.0000, effective_beta_factor: 0.0000
Epoch 1 - Valid Loss: 29.3943
Sub-losses: recon_loss: 29.3943, var_loss: 0.0000, anneal_factor: 0.0000, effective_beta_factor: 0.0000
Epoch 2 - Train Loss: 402.7640
Sub-losses: recon_loss: 401.2875, var_loss: 1.4765, anneal_factor: 0.0344, effective_beta_factor: 0.0034
Epoch 2 - Valid Loss: 31.7762
Sub-losses: recon_loss: 31.7691, var_loss: 0.0071, anneal_factor: 0.0344, effective_beta_factor: 0.0034
Epoch 3 - Train Loss: 433.3524
Sub-losses: recon_loss: 408.7492, var_loss: 24.6032, anneal_factor: 0.9656, effective_beta_factor: 0.0966
Epoch 3 - Valid Loss: 34.3600
Sub-losses: recon_loss: 34.1360, var_loss: 0.2240, anneal_factor: 0.9656, effective_beta_f