# CONGASp Training on Custom Bin Count Data
This notebook demonstrates how to load your bin count and ploidy data, prepare it for CONGASp, train the model, and inspect/save the results.

## 1. Import Libraries

In [7]:
import sys
sys.path.append('/app')
import pandas as pd
import torch
from congas.Interface import Interface
from congas.models.LatentCategorical import LatentCategorical
from pyro.optim import ClippedAdam
from pyro.infer import TraceGraph_ELBO
import numpy as np


## 2. Load Bin Count and Ploidy Data

In [8]:
# Path to data folder (adjust if running inside Docker)
data_dir = './'  # Current directory

# Load bin count matrix (bins x cells); the first CSV column becomes the row index
bin_counts = pd.read_csv(data_dir + 'bin_counts_for_python.csv', index_col=0)

# Load ploidy file; only its 'ploidy' column is used
ploidy_df = pd.read_csv(data_dir + 'bin_ploidy.csv')
pld = torch.tensor(ploidy_df['ploidy'].values, dtype=torch.float32)

# The ploidy vector is handed to the model together with the number of bins,
# so fail early if the two files disagree on how many bins there are.
# NOTE(review): only the length is checked; row ORDER is assumed to match
# bin_counts -- confirm against how the two CSVs were exported.
if pld.shape[0] != bin_counts.shape[0]:
    raise ValueError(
        f"bin_ploidy.csv has {pld.shape[0]} rows but the bin count matrix has "
        f"{bin_counts.shape[0]} bins; expected one ploidy value per bin."
    )

# Split columns based on "146p" in column names:
# columns whose name contains the tag go to ATAC, all others go to RNA
atac_tag = "146p"
atac_cols = [col for col in bin_counts.columns if atac_tag in col]
rna_cols = [col for col in bin_counts.columns if atac_tag not in col]

# An empty modality would silently produce a zero-column tensor downstream
if not atac_cols or not rna_cols:
    raise ValueError(
        f"Column split on '{atac_tag}' produced {len(atac_cols)} ATAC and "
        f"{len(rna_cols)} RNA columns; both modalities must be non-empty."
    )

# Subset DataFrames
bin_counts_atac = bin_counts[atac_cols]
bin_counts_rna = bin_counts[rna_cols]


## 3. Prepare Data for CONGASp

In [12]:
# Float32 tensor copies of the per-modality count tables
data_rna = torch.tensor(bin_counts_rna.values, dtype=torch.float32)
data_atac = torch.tensor(bin_counts_atac.values, dtype=torch.float32)

# Bin count is read off the first axis of the RNA tensor; the ATAC tensor
# was subset from the same DataFrame, so it has the same number of rows
segments = data_rna.size(0)

# Normalization factors: column totals (sum over the first axis) per modality
norm_factor_rna = torch.sum(data_rna, dim=0)
norm_factor_atac = torch.sum(data_atac, dim=0)

# Bundle everything under the keys passed to interface.initialize_model
data_dict = dict(
    data_rna=data_rna,
    norm_factor_rna=norm_factor_rna,
    data_atac=data_atac,
    norm_factor_atac=norm_factor_atac,
    pld=pld,
    segments=segments,
)

In [20]:
# The three components handed to the CONGASp Interface:
# model class, optimizer class and loss class (all passed uninstantiated)
model = LatentCategorical
optimizer = ClippedAdam
loss = TraceGraph_ELBO

# Build the interface and attach the data before any parameters are set
interface = Interface(model, optimizer, loss)
interface.initialize_model(data_dict)

# Length-`segments` vector of ones; each per-segment parameter below is this
# vector scaled by a constant (every product is a fresh tensor)
ones_per_segment = torch.ones(segments)

# Model parameters passed to interface.set_model_params
param_dict = {
    # Number of clusters
    'K': 3,
    # Shape / rate parameters for RNA
    'theta_shape_rna': ones_per_segment * 20.0,
    'theta_rate_rna': ones_per_segment * 0.5,
    # Shape / rate parameters for ATAC
    'theta_shape_atac': ones_per_segment * 10.0,
    'theta_rate_atac': ones_per_segment * 0.5,
    # Weight between RNA and ATAC (0.5 = equal weight)
    'lambda': 0.5,
    # NOTE(review): the value is False although both RNA and ATAC data are
    # supplied. In this notebook the two matrices come from disjoint column
    # sets; presumably True is meant for paired same-cell measurements --
    # confirm against the CONGASp documentation before changing it.
    'multiome': False,
    # Use same mixture weights for both modalities
    'equal_mixture_weights': True,
    # Negative Binomial likelihood for both modalities
    'likelihood_rna': 'NB',
    'likelihood_atac': 'NB',
    # Initial NB size parameter per segment, RNA then ATAC
    'nb_size_init_rna': ones_per_segment * 10.0,
    'nb_size_init_atac': ones_per_segment * 5.0,
    # Number of possible copy number states
    'hidden_dim': 5,
}

interface.set_model_params(param_dict)

# Train the model for 200 steps; run() returns the loss trace and n_obs
losses, n_obs = interface.run(steps=200)

ELBO: 0.176901128  : 100%|██████████| 200/200 [11:04<00:00,  3.32s/it]


Done!





## 4. Set Up and Train the Model

## 5. Inspect and Save Results

In [21]:
# Pull the fitted parameters out of the trained interface
params = interface.learned_parameters()

# Save results as numpy file.
# NOTE(review): if `params` is a dict (as the key lookup below suggests),
# np.save stores it as a pickled object array; reading it back then needs
# np.load(path, allow_pickle=True).item() -- confirm this is the intended format.
results_path = data_dir + 'congas_results.npy'
np.save(results_path, params)

# Optionally, display cluster assignments (only when the key is present)
if 'assignment_rna' in params:
    rna_assignments = params['assignment_rna']
    print('Cluster assignments:', rna_assignments)



Computing assignment probabilities
Cluster assignments: [0 0 0 ... 2 0 0]
