# Notebook 1: Simulation of a Homogeneous Cell Population

### Objective
This notebook simulates a realistic dataset representing a single, homogeneous population of cells. The goal is to create a ground truth dataset to validate a deconvolution model. The simulation generates known mitochondrial cluster signatures and a population of cells whose mitochondrial compositions are all slight variations of a single, central "true" set of proportions. It includes several configurable parameters to control the characteristics of the data, such as the number of mutations per cluster and the distribution of cluster sizes.

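The main output is a single compressed file, `../sc_mito_vars/sim_data/<run_id>/numpy/simulated_data_regression.npz` (relative to the notebook's working directory), containing all the necessary data for the modeling step. The ground-truth matrices (as `.npy` and `.csv`), the simulation parameters, metadata, a per-cluster summary, and a text report are also written under `../sc_mito_vars/sim_data/<run_id>/`.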

In [None]:
### Cell 1: Imports and Configuration
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
from datetime import datetime

# Identifier of this simulation run. Cell 4 uses it as the name of the output
# sub-directory, and it is stored in the saved parameters and summary report.
run_id = "R10"
# --- Simulation Configuration ---
# Number of clusters (columns of the signature matrix K_true).
N_CLUSTERS = 50
# Number of candidate variant locations (rows of K_true).
N_LOCATIONS = 1650
# Number of simulated cells (rows of P_cells_true and C_observed).
N_CELLS = 100
# Inclusive lower/upper bounds on the number of variants drawn per cluster.
MIN_VARIANTS_PER_CLUSTER = 2
MAX_VARIANTS_PER_CLUSTER = 50

# --- Distribution-Shaping Parameters ---
# Controls variation between cells. Higher value = more homogeneous.
# The mean proportions are multiplied by this value to form the per-cell
# Dirichlet concentration vector.
HOMOGENEITY_ALPHA = 100.0 
# Controls the amount of simulated sequencing noise: standard deviation of the
# zero-mean Gaussian noise added to the VAFs before they are clipped to [0, 1].
NOISE_LEVEL = 0.01

# Controls the skew towards less-mutated clusters. Higher value = stronger skew.
# Used as the exponent in 1 / (num_variants + 1) ** SKEW_FACTOR.
SKEW_FACTOR = 2.0
# Controls the distribution of the number of mutations per cluster:
# P(count) is proportional to 1 / count ** MUTATION_DENSITY_SHAPE.
# shape = 0 -> uniform distribution. shape > 1 -> strong skew to low mutation counts.
MUTATION_DENSITY_SHAPE = 1.5

# Controls the skew of proportion sizes. Higher value = more dominant clusters.
# Used as the exponent applied to the base proportions before they are
# rescaled into the Dirichlet concentration vector of the mean proportions.
PROPORTION_SKEW = 5.0


In [None]:
### Cell 2: Data Generation Function

def generate_homogeneous_data(n_clusters, n_locations, n_cells, min_variants, max_variants,
                              homogeneity_alpha, noise_level, skew_factor, mutation_shape, proportion_skew,
                              seed=None):
    """
    Generates data for a single homogeneous population of cells, with control
    over the mean cluster sizes and the distribution of mutations per cluster.

    Args:
        n_clusters: Number of clusters (columns of ``K_true``).
        n_locations: Number of candidate variant locations (rows of ``K_true``).
        n_cells: Number of cells to simulate.
        min_variants: Inclusive lower bound on the variants drawn per cluster.
        max_variants: Inclusive upper bound on the variants drawn per cluster.
        homogeneity_alpha: Multiplier applied to the mean proportions to form
            the per-cell Dirichlet concentration; larger values give cells
            that are closer to the mean.
        noise_level: Standard deviation of the zero-mean Gaussian noise added
            to the noiseless VAFs.
        skew_factor: Exponent in ``1 / (num_variants + 1) ** skew_factor``,
            which favours clusters with fewer variants.
        mutation_shape: Exponent of the variant-count distribution,
            ``P(count) ∝ 1 / count ** mutation_shape`` (0 gives uniform).
        proportion_skew: Exponent applied to the base proportions before they
            are rescaled into the Dirichlet concentration of the mean.
        seed: Optional seed. ``None`` (the default) draws from NumPy's global
            random state exactly as before; any other value is passed to a
            private ``np.random.RandomState`` so the run is reproducible and
            the global state is left untouched.

    Returns:
        dict with the keys
            ``K_true``: (n_locations, n_clusters) 0/1 signature matrix.
            ``P_cells_true``: (n_cells, n_clusters) per-cell proportions.
            ``C_observed``: (n_cells, n_locations) noisy VAFs clipped to [0, 1].
            ``true_mean_proportions``: (n_clusters,) population mean proportions.
            ``num_variants_per_cluster``: (n_clusters,) column sums of ``K_true``.

    Raises:
        ValueError: If the variant bounds are inconsistent or exceed the
            number of locations available for sampling without replacement.
    """
    # Fail fast on impossible bounds instead of failing (possibly only on some
    # random draws) deep inside NumPy's sampling routines.
    if min_variants < 0 or min_variants > max_variants:
        raise ValueError(
            f"Require 0 <= min_variants <= max_variants, got {min_variants} and {max_variants}."
        )
    if max_variants > n_locations:
        raise ValueError(
            f"max_variants ({max_variants}) cannot exceed n_locations ({n_locations}): "
            "variant sites are sampled without replacement."
        )

    # The np.random module and RandomState expose the same sampling functions,
    # so a single code path serves both the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # 1. Define Ground Truth Cluster Signatures (K_true)
    print("1. Generating known mitochondrial cluster signatures (K_true)...")

    # --- Create a controllable distribution for the number of variants ---
    possible_counts = np.arange(min_variants, max_variants + 1)
    # The small constant avoids a division by zero when a count of 0 is allowed.
    probabilities = 1 / (possible_counts**mutation_shape + 1e-9)
    probabilities /= probabilities.sum()

    K_true = np.zeros((n_locations, n_clusters))
    for i in range(n_clusters):
        num_variants = rng.choice(possible_counts, p=probabilities)
        mut_sites = rng.choice(n_locations, num_variants, replace=False)
        K_true[mut_sites, i] = 1.0

    # 2. Define "True" Mean Proportions, biased by mutation count and proportion skew
    print("2. Simulating proportions, with skew for dominant clusters...")

    num_variants_per_cluster = np.sum(K_true, axis=0)

    # Create a base for proportions, inversely related to the number of variants
    base_proportions = 1 / ((num_variants_per_cluster + 1) ** skew_factor)

    # Create a skewed alpha vector for the Dirichlet distribution
    # This will generate a few large proportions and many small ones.
    dirichlet_alpha = base_proportions ** proportion_skew
    # Rescale so the concentrations sum to n_clusters (an average of 1 each).
    dirichlet_alpha = (dirichlet_alpha / dirichlet_alpha.sum()) * n_clusters

    # Draw the final proportions from this skewed Dirichlet distribution
    true_mean_proportions = rng.dirichlet(dirichlet_alpha)

    # 3. For each cell, simulate a slight variation from the mean proportions
    print("3. Simulating per-cell proportions around the mean...")

    # A Dirichlet concentration must be strictly positive, so exact zeros in
    # the mean are replaced by a tiny value before scaling.
    true_mean_props_safe = np.where(true_mean_proportions == 0, 1e-6, true_mean_proportions)
    # One vectorized draw: each row is an independent sample for one cell.
    P_cells_true = rng.dirichlet(homogeneity_alpha * true_mean_props_safe, size=n_cells)

    # 4. Generate "Observed" Data based on individual cells
    print("4. Generating 'observed' VAFs for each cell with simulated noise...")
    # Noiseless VAF of a cell at a location = total proportion of the
    # clusters that carry a variant at that location.
    C_true = P_cells_true @ K_true.T

    C_observed = C_true + rng.normal(0, noise_level, size=C_true.shape)
    C_observed = np.clip(C_observed, 0, 1)

    print("\n--- Data Simulation Complete ---")

    return {
        "K_true": K_true,
        "P_cells_true": P_cells_true,
        "C_observed": C_observed,
        "true_mean_proportions": true_mean_proportions,
        "num_variants_per_cluster": num_variants_per_cluster
    }


In [None]:
### Cell 3: Execute Simulation and Visualize

# Run the simulation with the notebook-level configuration from Cell 1.
simulated_data = generate_homogeneous_data(
    n_clusters=N_CLUSTERS,
    n_locations=N_LOCATIONS,
    n_cells=N_CELLS,
    min_variants=MIN_VARIANTS_PER_CLUSTER,
    max_variants=MAX_VARIANTS_PER_CLUSTER,
    homogeneity_alpha=HOMOGENEITY_ALPHA,
    noise_level=NOISE_LEVEL,
    skew_factor=SKEW_FACTOR,
    mutation_shape=MUTATION_DENSITY_SHAPE,
    proportion_skew=PROPORTION_SKEW,
)

# --- Visualization ---
# Four stacked panels; the first one is three times as tall as the others.
fig, axes = plt.subplots(
    nrows=4,
    ncols=1,
    figsize=(10, 30),
    gridspec_kw={'height_ratios': [3, 1, 1, 1]},
)
fig.suptitle('Overview of Simulated Data', fontsize=16)
ax_area, ax_bar, ax_variants, ax_props = axes

variant_counts = simulated_data['num_variants_per_cluster']
mean_props = simulated_data['true_mean_proportions']

# Panel 1: stacked area chart of every cell's cluster proportions.
area_labels = [f'Cluster {k}' for k in range(1, N_CLUSTERS + 1)]
p_df = pd.DataFrame(simulated_data['P_cells_true'], columns=area_labels)
p_df.plot(kind='area', stacked=True, ax=ax_area, colormap='viridis', alpha=0.8)
ax_area.set(
    title='Cluster Proportions Across the Cell Population',
    xlabel='Cell ID',
    ylabel='Proportion',
)
ax_area.legend(loc='upper right')
ax_area.set_ylim(0, 1)

# Panel 2: mean proportion of each cluster, annotated with its variant count.
sns.barplot(x=np.arange(N_CLUSTERS), y=mean_props, ax=ax_bar, palette="viridis")
ax_bar.set(
    title='Mean Proportion vs. Number of Variants per Cluster',
    xlabel='Cluster ID',
    ylabel='Mean Proportion',
)
for bar_idx, (count, prop) in enumerate(zip(variant_counts, mean_props)):
    # Place the label slightly above the top of the bar.
    ax_bar.text(bar_idx, prop + 0.005, f'{int(count)} vars', ha='center', va='bottom', fontsize=9)

# Panels 3 and 4: histograms of the variant counts and of the mean proportions.
histogram_panels = (
    (ax_variants, variant_counts, False,
     'Distribution of Variants per Cluster', 'Number of Variants in Signature'),
    (ax_props, mean_props, True,
     'Distribution of Mean Proportion Sizes', 'Mean Proportion Size'),
)
for panel, values, with_kde, panel_title, panel_xlabel in histogram_panels:
    sns.histplot(data=values, ax=panel, bins=10, kde=with_kde)
    panel.set(title=panel_title, xlabel=panel_xlabel, ylabel='Count of Clusters')


# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


In [None]:
### Cell 4: Create directory structure and save all files

# Every artefact of a run lives under a directory named after `run_id`
# (configured in Cell 1); the compressed bundle goes into a `numpy` sub-folder.
run_dir = f'../sc_mito_vars/sim_data/{run_id}'
numpy_dir = f'{run_dir}/numpy'

# Output locations, defined up front so later cells can report on them.
main_output_filename = f'{numpy_dir}/simulated_data_regression.npz'
variant_signatures_filename = f'{run_dir}/variant_signatures.npy'
cell_proportions_filename = f'{run_dir}/cell_proportions.npy'
variant_signatures_csv = f'{run_dir}/variant_signatures.csv'
cell_proportions_csv = f'{run_dir}/cell_proportions.csv'

# Create directories if they don't exist
for directory in (run_dir, numpy_dir):
    os.makedirs(directory, exist_ok=True)

print(f"Created run directory: {run_dir}")
print(f"Created numpy directory: {numpy_dir}")

# Bundle of every array returned by the simulation, in one compressed file.
np.savez_compressed(main_output_filename, **simulated_data)
print(f"\nMain simulated data saved to '{main_output_filename}'")

signature_matrix = simulated_data['K_true']
proportion_matrix = simulated_data['P_cells_true']

# Stand-alone .npy copies of the two ground-truth matrices.
np.save(variant_signatures_filename, signature_matrix)
print(f"Variant signatures saved to '{variant_signatures_filename}'")

np.save(cell_proportions_filename, proportion_matrix)
print(f"Cell proportions saved to '{cell_proportions_filename}'")

# CSV copies for easier inspection; both tables have one column per cluster.
csv_columns = [f'Cluster_{k}' for k in range(1, N_CLUSTERS + 1)]

signatures_table = pd.DataFrame(signature_matrix, columns=csv_columns)
signatures_table.to_csv(variant_signatures_csv, index=False)
print(f"Variant signatures CSV saved to '{variant_signatures_csv}'")

proportions_table = pd.DataFrame(proportion_matrix, columns=csv_columns)
proportions_table.to_csv(cell_proportions_csv, index=False)
print(f"Cell proportions CSV saved to '{cell_proportions_csv}'")


In [None]:
### Cell 5: Save simulation parameters and metadata
# NOTE(review): Cell 6 writes the same two files again with a superset of this
# metadata, so running both cells leaves Cell 6's versions on disk.

params_filename = f'{run_dir}/simulation_parameters.json'
metadata_filename = f'{run_dir}/metadata.json'

# Snapshot of the configuration from Cell 1, stamped with the current time.
saved_at = datetime.now().isoformat()
params = {
    'N_CLUSTERS': N_CLUSTERS,
    'N_LOCATIONS': N_LOCATIONS,
    'N_CELLS': N_CELLS,
    'MIN_VARIANTS_PER_CLUSTER': MIN_VARIANTS_PER_CLUSTER,
    'MAX_VARIANTS_PER_CLUSTER': MAX_VARIANTS_PER_CLUSTER,
    'HOMOGENEITY_ALPHA': HOMOGENEITY_ALPHA,
    'NOISE_LEVEL': NOISE_LEVEL,
    'SKEW_FACTOR': SKEW_FACTOR,
    'MUTATION_DENSITY_SHAPE': MUTATION_DENSITY_SHAPE,
    'PROPORTION_SKEW': PROPORTION_SKEW,
    'run_id': run_id,
    'timestamp': saved_at,
}

# JSON keeps the parameters human-readable.
with open(params_filename, 'w') as params_file:
    json.dump(params, params_file, indent=4)
print(f"\nSimulation parameters saved to '{params_filename}'")

# Per-cluster ground truth plus the shapes of the three main arrays.
metadata = {
    'num_variants_per_cluster': simulated_data['num_variants_per_cluster'].tolist(),
    'true_mean_proportions': simulated_data['true_mean_proportions'].tolist(),
    'data_shapes': {
        array_name: simulated_data[array_name].shape
        for array_name in ('K_true', 'P_cells_true', 'C_observed')
    },
}

with open(metadata_filename, 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=4)
print(f"Metadata saved to '{metadata_filename}'")


In [None]:
### Cell 6: Save simulation parameters, extended metadata and cluster summary

# json is already imported in Cell 1; this re-import is a harmless no-op.
import json

params_filename = f'{run_dir}/simulation_parameters.json'
metadata_filename = f'{run_dir}/metadata.json'
cluster_summary_filename = f'{run_dir}/cluster_summary.csv'

# Arrays that are read several times below.
signature_matrix = simulated_data['K_true']
cluster_variants = simulated_data['num_variants_per_cluster']
cluster_props = simulated_data['true_mean_proportions']

# Snapshot of the configuration from Cell 1, stamped with the current time.
saved_at = datetime.now().isoformat()
params = {
    'N_CLUSTERS': N_CLUSTERS,
    'N_LOCATIONS': N_LOCATIONS,
    'N_CELLS': N_CELLS,
    'MIN_VARIANTS_PER_CLUSTER': MIN_VARIANTS_PER_CLUSTER,
    'MAX_VARIANTS_PER_CLUSTER': MAX_VARIANTS_PER_CLUSTER,
    'HOMOGENEITY_ALPHA': HOMOGENEITY_ALPHA,
    'NOISE_LEVEL': NOISE_LEVEL,
    'SKEW_FACTOR': SKEW_FACTOR,
    'MUTATION_DENSITY_SHAPE': MUTATION_DENSITY_SHAPE,
    'PROPORTION_SKEW': PROPORTION_SKEW,
    'run_id': run_id,
    'timestamp': saved_at,
}

with open(params_filename, 'w') as params_file:
    json.dump(params, params_file, indent=4)
print(f"\nSimulation parameters saved to '{params_filename}'")

# Entropy of the mean proportions; the small constant keeps log() finite
# when a proportion is exactly zero.
proportion_entropy = -np.sum(cluster_props * np.log(cluster_props + 1e-10))

# NumPy scalars are converted to built-in int/float so json can encode them.
metadata = {
    'num_variants_per_cluster': cluster_variants.tolist(),
    'true_mean_proportions': cluster_props.tolist(),
    'data_shapes': {
        array_name: list(simulated_data[array_name].shape)
        for array_name in ('K_true', 'P_cells_true', 'C_observed')
    },
    'statistics': {
        'total_variants': int(signature_matrix.sum()),
        'mean_variants_per_cluster': float(cluster_variants.mean()),
        'std_variants_per_cluster': float(cluster_variants.std()),
        'min_variants_per_cluster': int(cluster_variants.min()),
        'max_variants_per_cluster': int(cluster_variants.max()),
        'proportion_entropy': float(proportion_entropy),
        # Clusters whose mean proportion exceeds 1%.
        'effective_clusters': int((cluster_props > 0.01).sum()),
    },
}

with open(metadata_filename, 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=4)
print(f"Metadata saved to '{metadata_filename}'")

# One row per cluster: zero-based id, variant count and mean proportion.
cluster_summary = pd.DataFrame({
    'cluster_id': np.arange(N_CLUSTERS),
    'num_variants': cluster_variants,
    'mean_proportion': cluster_props,
})
cluster_summary.to_csv(cluster_summary_filename, index=False)
print(f"Cluster summary saved to '{cluster_summary_filename}'")


In [None]:
### Cell 7: Create summary report and verification

# Arrays and the divider line that the report uses repeatedly.
divider = "=" * 50
signature_matrix = simulated_data['K_true']
cluster_variants = simulated_data['num_variants_per_cluster']
cluster_props = simulated_data['true_mean_proportions']

# Human-readable report, one list entry per output line.
summary_lines = [
    f"Simulation Run: {run_id}",
    divider,
    "",
    "Data Dimensions:",
    f"  - Number of clusters: {N_CLUSTERS}",
    f"  - Number of genomic locations: {N_LOCATIONS}",
    f"  - Number of cells: {N_CELLS}",
    "",
    "Simulation Parameters:",
    f"  - Homogeneity alpha: {HOMOGENEITY_ALPHA}",
    f"  - Noise level: {NOISE_LEVEL}",
    f"  - Skew factor: {SKEW_FACTOR}",
    f"  - Mutation density shape: {MUTATION_DENSITY_SHAPE}",
    f"  - Proportion skew: {PROPORTION_SKEW}",
    "",
    "Variant Statistics:",
    f"  - Total variants: {signature_matrix.sum():.0f}",
    f"  - Min variants per cluster: {cluster_variants.min():.0f}",
    f"  - Max variants per cluster: {cluster_variants.max():.0f}",
    f"  - Mean variants per cluster: {cluster_variants.mean():.2f}",
    f"  - Std variants per cluster: {cluster_variants.std():.2f}",
    "",
    "Proportion Statistics:",
    f"  - Min mean proportion: {cluster_props.min():.6f}",
    f"  - Max mean proportion: {cluster_props.max():.6f}",
    f"  - Std of mean proportions: {cluster_props.std():.6f}",
    f"  - Effective clusters (>1% proportion): {(cluster_props > 0.01).sum()}",
    "",
    "Files Generated:",
    f"  - Main data: {main_output_filename}",
    f"  - Variant signatures: {variant_signatures_filename}",
    f"  - Cell proportions: {cell_proportions_filename}",
    f"  - Variant signatures CSV: {variant_signatures_csv}",
    f"  - Cell proportions CSV: {cell_proportions_csv}",
    f"  - Cluster summary: {cluster_summary_filename}",
    f"  - Parameters: {params_filename}",
    f"  - Metadata: {metadata_filename}",
]

summary_text = "\n".join(summary_lines)
print(summary_text)

# Persist the same text next to the other run artefacts.
summary_filename = f'{run_dir}/summary.txt'
with open(summary_filename, 'w') as report_file:
    report_file.write(summary_text)
print(f"\nSummary report saved to '{summary_filename}'")

# Check that every file written by Cells 4-7 is present on disk.
print("\n" + divider)
print("Verification of saved files:")
print(divider)

files_to_check = [
    (main_output_filename, "Main compressed data"),
    (variant_signatures_filename, "Variant signatures (numpy)"),
    (cell_proportions_filename, "Cell proportions (numpy)"),
    (variant_signatures_csv, "Variant signatures (CSV)"),
    (cell_proportions_csv, "Cell proportions (CSV)"),
    (cluster_summary_filename, "Cluster summary"),
    (params_filename, "Simulation parameters"),
    (metadata_filename, "Metadata"),
    (summary_filename, "Summary report"),
]

missing_files = []
for path, label in files_to_check:
    if not os.path.exists(path):
        print(f"✗ {label}: NOT FOUND")
        missing_files.append(path)
        continue
    size_kb = os.path.getsize(path) / 1024  # bytes -> KB
    print(f"✓ {label}: {size_kb:.2f} KB")

all_files_exist = not missing_files

if all_files_exist:
    print("\n✓ All files successfully created!")
else:
    print("\n⚠ Some files are missing. Please check the output.")
