In [15]:
# Cell 1: Setup and Imports
"""
# Single-Cell Mitochondrial Variant Analysis from VCF
## Processing VAULT-generated VCF files and simulating additional cells

This notebook:
1. Reads VCF files from VAULT pipeline containing single-mitochondrion variant profiles
2. Applies quality filters (PASS, SNPs only, VAF, depth thresholds)
3. Clusters variants into representative groups
4. Generates simulated cell populations based on observed single-cell data
"""

import sys
import os

# Add the parent directory to path to import from src
sys.path.insert(0, os.path.abspath('../src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
import json
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from vcf_processor import VCFProcessor
from variant_clustering import VariantClusterer  
from cell_simulator import CellSimulator

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

print("Modules loaded successfully")
print(f"Working directory: {os.getcwd()}")
print(f"Python path includes: {sys.path[0]}")


Modules loaded successfully
Working directory: /mmfs1/research/hhakimjavadi/projects/MitoDemon/repositories/MitoDemon/notebooks
Python path includes: /mmfs1/research/hhakimjavadi/projects/MitoDemon/repositories/MitoDemon/src


In [16]:
"""
### Configuration Parameters
All parameters can be modified here for different analyses
"""
# Input/Output Configuration
sample="SRR12676843"
VCF_FILE = "../../../data/vault_pipeline_output/"+sample+"_output/snp/all_snp_from_perfect_umi.vcf"  # Path to input VCF
OUTPUT_DIR = "../sc_mito_vars/real_data"      # Output directory for results
# Quality Filtering Parameters (relaxed)
FILTER_PASS = True          # Keep only PASS filter variants
INCLUDE_INDELS = True       # Include INDELs (new)
MIN_VAF = 0.25              # Minimum variant allele frequency (relaxed)
MIN_DEPTH = 1               # Minimum total read depth (relaxed)
MIN_ALT_READS = 1           # Minimum reads supporting alternate allele (relaxed)
MAX_MISSING_RATE = 0.95     # Max missing rate (new: retain >=5% UMIs)
# Clustering Parameters
N_CLUSTERS = 20                     # Number of variant clusters to identify
CLUSTERING_METHOD = 'spectral'      # New default: spectral for sparse data
USE_PCA = True                      # Use PCA (new)
PCA_COMPONENTS = 50                 # PCA components (new)
CONTINUOUS = True                   # Use continuous VAF (new)
# Simulation Parameters (for generating additional cells)
N_CELLS = 200                       # Number of cells to simulate
HOMOGENEITY_ALPHA = 50.0           # Controls variation between cells (higher = more homogeneous)
NOISE_LEVEL = 0.01                  # Gaussian noise level
SKEW_FACTOR = 2.0                   # Controls skew towards less-mutated clusters
DROPOUT_RATE = 0.1                  # Probability of zero values
OVERDISPERSION = 0.05               # For more realistic noise modeling
POISSON_LAMBDA = 10.0               # Lambda for Poisson read noise (new)
UMI_BIAS_RATE = 0.1                 # UMI bias rate (new)
# Mitochondrial genome parameters
N_MITO_POSITIONS = 16569            # Standard human mitochondrial genome length
# Random seed for reproducibility
RANDOM_SEED = 42 
# Create output directory with timestamp
run_id = "imigseq_SRR12455630"
run_dir = f'{OUTPUT_DIR}/{run_id}'
os.makedirs(run_dir, exist_ok=True)
os.makedirs(f'{run_dir}/numpy', exist_ok=True)
print("=" * 60 )
print("CONFIGURATION SUMMARY")
print("=" * 60 )
print(f"Input VCF: {VCF_FILE}")
print(f"Output directory: {run_dir}")
print(f"\nFiltering parameters:")
print(f"  - Filter PASS only: {FILTER_PASS}")
print(f"  - Include INDELs: {INCLUDE_INDELS}")
print(f"  - Min VAF: {MIN_VAF}")
print(f"  - Min depth: {MIN_DEPTH}")
print(f"  - Min alt reads: {MIN_ALT_READS}")
print(f"  - Max missing rate: {MAX_MISSING_RATE}")
print(f"\nClustering parameters:")
print(f"  - Number of clusters: {N_CLUSTERS}")
print(f"  - Method: {CLUSTERING_METHOD}")
print(f"  - Use PCA: {USE_PCA} with {PCA_COMPONENTS} components")
print(f"  - Continuous VAF: {CONTINUOUS}")
print(f"\nSimulation parameters:")
print(f"  - Cells to simulate: {N_CELLS}")
print(f"  - Homogeneity alpha: {HOMOGENEITY_ALPHA}")
print(f"  - Noise level: {NOISE_LEVEL}")
print(f"  - Skew factor: {SKEW_FACTOR}")
print(f"  - Dropout rate: {DROPOUT_RATE}")
print(f"  - Overdispersion: {OVERDISPERSION}")
print(f"  - Poisson lambda: {POISSON_LAMBDA}")
print(f"  - UMI bias rate: {UMI_BIAS_RATE}")


CONFIGURATION SUMMARY
Input VCF: ../../../data/vault_pipeline_output/SRR12676843_output/snp/all_snp_from_perfect_umi.vcf
Output directory: ../sc_mito_vars/real_data/imigseq_SRR12455630

Filtering parameters:
  - Filter PASS only: True
  - Include INDELs: True
  - Min VAF: 0.25
  - Min depth: 1
  - Min alt reads: 1
  - Max missing rate: 0.95

Clustering parameters:
  - Number of clusters: 20
  - Method: spectral
  - Use PCA: True with 50 components
  - Continuous VAF: True

Simulation parameters:
  - Cells to simulate: 200
  - Homogeneity alpha: 50.0
  - Noise level: 0.01
  - Skew factor: 2.0
  - Dropout rate: 0.1
  - Overdispersion: 0.05
  - Poisson lambda: 10.0
  - UMI bias rate: 0.1


In [17]:
# Cell 3: Read and Process VCF File
"""
### Step 1: Read and Process VCF File
Extract variant information from VAULT-generated VCF
"""

print("=" * 60)
print("STEP 1: READING VCF FILE")
print("=" * 60)

# Initialize processor
processor = VCFProcessor(VCF_FILE, verbose=True)

# Read VCF
variants_df = processor.read_vcf()

print("\n--- Variant Statistics (Before Filtering) ---")
print(f"Total variants: {len(variants_df)}")
print(f"SNPs: {variants_df['is_snp'].sum()}")
print(f"Indels: {variants_df['is_indel'].sum()}")
print(f"Unique UMIs (mitochondrial molecules): {variants_df['umi_id'].nunique()}")

# Display sample of variants
print("\nSample of variants (first 10 rows):")
display(variants_df[['chrom', 'pos', 'ref', 'alt', 'umi_id', 'read_count', 
                     'vaf', 'total_depth', 'is_snp']].head(10))

# UMI statistics
umi_counts = variants_df['umi_id'].value_counts()
print(f"\nUMI statistics:")
print(f"  - Mean variants per UMI: {umi_counts.mean():.2f}")
print(f"  - Median variants per UMI: {umi_counts.median():.0f}")
print(f"  - Max variants per UMI: {umi_counts.max()}")


STEP 1: READING VCF FILE
Reading VCF file: ../../../data/vault_pipeline_output/SRR12676843_output/snp/all_snp_from_perfect_umi.vcf
Loaded 207634 variants from VCF file
Unique UMIs: 7885

--- Variant Statistics (Before Filtering) ---
Total variants: 207634
SNPs: 702
Indels: 206932
Unique UMIs (mitochondrial molecules): 7885

Sample of variants (first 10 rows):


Unnamed: 0,chrom,pos,ref,alt,umi_id,read_count,vaf,total_depth,is_snp
0,NC_005089.1,216,A,AC,2_5end_AGTATTGCTGCA,2,0.5,2,False
1,NC_005089.1,614,T,TAG,2_5end_AGTATTGCTGCA,2,0.5,2,False
2,NC_005089.1,1611,A,AAAG,2_5end_AGTATTGCTGCA,2,0.5,2,False
3,NC_005089.1,1715,CTTT,C,2_5end_AGTATTGCTGCA,2,1.0,1,False
4,NC_005089.1,2150,TA,T,2_5end_AGTATTGCTGCA,2,1.0,1,False
5,NC_005089.1,3861,T,TGA,2_5end_AGTATTGCTGCA,2,1.0,1,False
6,NC_005089.1,4722,T,TTAC,2_5end_AGTATTGCTGCA,2,1.0,1,False
7,NC_005089.1,5651,AT,A,2_5end_AGTATTGCTGCA,2,1.0,1,False
8,NC_005089.1,15573,CTTA,C,2_5end_AGTATTGCTGCA,2,0.5,2,False
9,NC_005089.1,15611,T,TG,2_5end_AGTATTGCTGCA,2,0.5,2,False



UMI statistics:
  - Mean variants per UMI: 26.33
  - Median variants per UMI: 27
  - Max variants per UMI: 78


In [18]:
"""
### Step 2: Apply Quality Filters
Filter variants based on quality criteria
"""
print("=" * 60 )
print("STEP 2: APPLYING QUALITY FILTERS")
print("=" * 60 )
# Apply filters (updated with relaxed params and INDEL inclusion)
filtered_variants = processor.apply_filters(
    filter_pass=FILTER_PASS,
    include_indels=INCLUDE_INDELS,
    min_vaf=MIN_VAF,
    min_depth=MIN_DEPTH,
    min_alt_reads=MIN_ALT_READS,
    max_missing_rate=MAX_MISSING_RATE
)
print("\n--- Filtering Summary ---")
print(f"Variants passing all filters: {len(filtered_variants)}")
print(f"Unique UMIs with variants: {filtered_variants['umi_id'].nunique()}")
print(f"Retention rate: {len(filtered_variants)/len(variants_df)*100 :.1f}%")


STEP 2: APPLYING QUALITY FILTERS
After PASS filter: 207634 variants (0 removed)
INDELs included (encoded as continuous features)
After VAF >= 0.25: 189572 variants (18062 removed)
After depth >= 1: 189572 variants (0 removed)
After alt reads >= 1: 189572 variants (0 removed)
After frequency filter (min 39 occurrences): 46903 variants (142669 removed)

Final: 46903 variants passed all filters (22.6%)
Unique UMIs remaining: 7545

--- Filtering Summary ---
Variants passing all filters: 46903
Unique UMIs with variants: 7545
Retention rate: 22.6%


In [None]:
# 4.1 Plot distributions
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# VAF distribution
axes[0, 0].hist(filtered_variants['vaf'], bins=30, edgecolor='black', alpha=0.7, color='blue')
axes[0, 0].axvline(MIN_VAF, color='red', linestyle='--', label=f'Min VAF = {MIN_VAF}')
axes[0, 0].set_xlabel('Variant Allele Frequency')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_title('VAF Distribution (Filtered Variants)')
axes[0, 0].legend()

# Read depth distribution
axes[0, 1].hist(filtered_variants['total_depth'], bins=30, edgecolor='black', alpha=0.7, color='green')
axes[0, 1].axvline(MIN_DEPTH, color='red', linestyle='--', label=f'Min Depth = {MIN_DEPTH}')
axes[0, 1].set_xlabel('Total Read Depth')
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_title('Read Depth Distribution (Filtered Variants)')
axes[0, 1].legend()

# Alt read distribution
axes[1, 0].hist(filtered_variants['alt_depth'], bins=30, edgecolor='black', alpha=0.7, color='orange')
axes[1, 0].axvline(MIN_ALT_READS, color='red', linestyle='--', label=f'Min Alt = {MIN_ALT_READS}')
axes[1, 0].set_xlabel('Alt Read Count')
axes[1, 0].set_ylabel('Count')
axes[1, 0].set_title('Alt Read Distribution (Filtered Variants)')
axes[1, 0].legend()

# Position distribution
axes[1, 1].hist(filtered_variants['pos'], bins=50, edgecolor='black', alpha=0.7, color='purple')
axes[1, 1].set_xlabel('Mitochondrial Position')
axes[1, 1].set_ylabel('Count')
axes[1, 1].set_title('Variant Position Distribution')

plt.tight_layout()
plt.savefig(f'{run_dir}/filtering_distributions.png', dpi=150)
plt.show()

In [20]:
"""
### Step 3: Create Variant Matrix
Convert filtered variants to matrix format
"""
print("=" * 60 )
print("STEP 3: CREATING VARIANT MATRIX")
print("=" * 60 )
# Create variant matrix (updated: continuous, with INDELs)
variant_matrix = processor.get_variant_matrix(
    n_positions=N_MITO_POSITIONS,
    continuous=CONTINUOUS,
    include_indels=INCLUDE_INDELS
)
print(f"\nVariant matrix shape: {variant_matrix.shape}")
print(f"  - Rows (features): {variant_matrix.shape[0 ]}")
print(f"  - Columns (UMIs): {variant_matrix.shape[1 ]}")
print(f"\nMatrix statistics:")
print(f"  - Total non-zero: {np.sum(variant_matrix > 0)}")
print(f"  - Sparsity: {1  - np.sum(variant_matrix > 0) / variant_matrix.size:.6f}")
print(f"  - Mean value per UMI: {variant_matrix.mean(axis=0).mean():.2f}")


STEP 3: CREATING VARIANT MATRIX
Encoding 46741 INDELs as 3 features each (presence, length, type)
Created variant matrix: (156792, 7545)
Total variants/features: 97726
Mean value per UMI: 0.00
Sparsity: 0.999917

Variant matrix shape: (156792, 7545)
  - Rows (features): 156792
  - Columns (UMIs): 7545

Matrix statistics:
  - Total non-zero: 97726
  - Sparsity: 0.999917
  - Mean value per UMI: 0.00


In [None]:

# Cell 5.2 Visualize variant matrix
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Heatmap of first 1000 positions
im1 = axes[0].imshow(variant_matrix[:1000, :], aspect='auto', cmap='binary', interpolation='nearest')
axes[0].set_xlabel('UMI Index')
axes[0].set_ylabel('Mitochondrial Position')
axes[0].set_title('Variant Matrix (First 1000 positions)')
plt.colorbar(im1, ax=axes[0], label='Variant Present')

# Variants per UMI
variants_per_umi = variant_matrix.sum(axis=0)
axes[1].bar(range(len(variants_per_umi)), variants_per_umi, color='steelblue')
axes[1].set_xlabel('UMI Index')
axes[1].set_ylabel('Number of Variants')
axes[1].set_title('Variants per UMI')

plt.tight_layout()
plt.savefig(f'{run_dir}/variant_matrix_visualization.png', dpi=150)
plt.show()

In [22]:
"""
### Step 4: Cluster Variants into Representative Groups
Identify K representative variant signatures
"""
print("=" * 60 )
print("STEP 4: CLUSTERING VARIANTS")
print("=" * 60 )
# Initialize clusterer (updated: spectral, PCA)
clusterer = VariantClusterer(
    method=CLUSTERING_METHOD, 
    n_clusters=N_CLUSTERS, 
    random_state=RANDOM_SEED,
    use_pca=USE_PCA,
    pca_components=PCA_COMPONENTS
)
# Perform clustering
print(f"Clustering using {CLUSTERING_METHOD} with K={N_CLUSTERS}...")
cluster_signatures, cluster_labels = clusterer.fit_predict(variant_matrix)
# Get clustering statistics
cluster_stats = clusterer.get_cluster_statistics(variant_matrix)
print(f"\nClustering Results:")
print(f"  - Number of clusters: {cluster_stats['n_clusters']}")
print(f"  - Cluster sizes: {cluster_stats['cluster_sizes']}")
print(f"  - Mean value per cluster: {cluster_stats['mean_variants_per_cluster']:.2f}")
print(f"  - Std value per cluster: {cluster_stats['std_variants_per_cluster']:.2f}")
# Calculate initial proportions from single cell
initial_proportions = np.bincount(cluster_labels, minlength=N_CLUSTERS) / len(cluster_labels)
initial_proportions = initial_proportions.reshape(1 , -1 )  # Shape: (1, n_clusters)
print(f"\nInitial cluster proportions (from single cell):")
for i, prop in enumerate(initial_proportions[0 ]):
    print(f"  Cluster {i}: {prop:.4f} ({prop*100 :.2f}%)")


STEP 4: CLUSTERING VARIANTS
Clustering using spectral with K=20...
Applied PCA: Reduced to 50 components

Clustering Results:
  - Number of clusters: 20
  - Cluster sizes: [ 212  100   61  117  105  120   61  161  106  138  136   98   83  188
 4596   99  848  122   12  182]
  - Mean value per cluster: 0.00
  - Std value per cluster: 0.00

Initial cluster proportions (from single cell):
  Cluster 0: 0.0281 (2.81%)
  Cluster 1: 0.0133 (1.33%)
  Cluster 2: 0.0081 (0.81%)
  Cluster 3: 0.0155 (1.55%)
  Cluster 4: 0.0139 (1.39%)
  Cluster 5: 0.0159 (1.59%)
  Cluster 6: 0.0081 (0.81%)
  Cluster 7: 0.0213 (2.13%)
  Cluster 8: 0.0140 (1.40%)
  Cluster 9: 0.0183 (1.83%)
  Cluster 10: 0.0180 (1.80%)
  Cluster 11: 0.0130 (1.30%)
  Cluster 12: 0.0110 (1.10%)
  Cluster 13: 0.0249 (2.49%)
  Cluster 14: 0.6091 (60.91%)
  Cluster 15: 0.0131 (1.31%)
  Cluster 16: 0.1124 (11.24%)
  Cluster 17: 0.0162 (1.62%)
  Cluster 18: 0.0016 (0.16%)
  Cluster 19: 0.0241 (2.41%)


In [None]:
# Cell 6.2 Visualize clustering results
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Cluster signatures heatmap
im = axes[0, 0].imshow(cluster_signatures[:1000, :], aspect='auto', cmap='RdBu_r', interpolation='nearest')
axes[0, 0].set_xlabel('Cluster ID')
axes[0, 0].set_ylabel('Mitochondrial Position (first 1000)')
axes[0, 0].set_title('Cluster Signatures')
plt.colorbar(im, ax=axes[0, 0])

# Cluster sizes
axes[0, 1].bar(range(N_CLUSTERS), cluster_stats['cluster_sizes'], color='coral')
axes[0, 1].set_xlabel('Cluster ID')
axes[0, 1].set_ylabel('Number of UMIs')
axes[0, 1].set_title('Cluster Sizes')

# Variants per cluster
axes[1, 0].bar(range(len(cluster_stats['variants_per_cluster'])), 
               cluster_stats['variants_per_cluster'], color='teal')
axes[1, 0].set_xlabel('Cluster ID')
axes[1, 0].set_ylabel('Total Variants')
axes[1, 0].set_title('Variants per Cluster')

# Initial proportions
axes[1, 1].bar(range(N_CLUSTERS), initial_proportions[0], color='purple')
axes[1, 1].set_xlabel('Cluster ID')
axes[1, 1].set_ylabel('Proportion')
axes[1, 1].set_title('Initial Cluster Proportions (Single Cell)')

plt.tight_layout()
plt.savefig(f'{run_dir}/clustering_results.png', dpi=150)
plt.show()

In [24]:
"""
### Step 5: Simulate Additional Cells with Realistic Noise
Generate a population of cells based on observed single-cell data
"""
print("=" * 60 )
print("STEP 5: SIMULATING ADDITIONAL CELLS WITH IMPROVED NOISE MODEL")
print("=" * 60 )
# Initialize simulator
simulator = CellSimulator(random_state=RANDOM_SEED)
# Simulate cell proportions for multiple cells
print(f"Simulating {N_CELLS} cells based on observed data...")
simulated_proportions = simulator.simulate_cell_proportions(
    observed_proportions=initial_proportions,
    n_cells=N_CELLS,
    homogeneity_alpha=HOMOGENEITY_ALPHA,
    skew_factor=SKEW_FACTOR
)
print(f"\nSimulated cell proportions shape: {simulated_proportions.shape}")
# Add noise (updated: with Poisson and UMI bias)
C_observed = simulator.add_noise(
    cell_proportions=simulated_proportions,
    cluster_signatures=cluster_signatures,
    noise_level=NOISE_LEVEL,
    poisson_lambda=POISSON_LAMBDA,
    umi_bias_rate=UMI_BIAS_RATE
)
print(f"\nObserved data matrix shape: {C_observed.shape}")


STEP 5: SIMULATING ADDITIONAL CELLS WITH IMPROVED NOISE MODEL
Simulating 200 cells based on observed data...

Simulated cell proportions shape: (200, 20)

Observed data matrix shape: (200, 156792)


In [25]:
"""
### Step 6: Save All Results
Save variant signatures, cell proportions, and metadata
"""
print("=" * 60 )
print("STEP 6: SAVING RESULTS")
print("=" * 60 )
# 1. Save main compressed file (updated with new params)
main_data = {
    'K_true': cluster_signatures,           
    'P_cells_true': simulated_proportions,  
    'C_observed': C_observed,               
    'true_mean_proportions': simulated_proportions.mean(axis=0 ),
    'num_variants_per_cluster': cluster_stats['variants_per_cluster']
}
main_output_filename = f'{run_dir}/numpy/vcf_processed_data.npz'
np.savez_compressed(main_output_filename, **main_data)
print(f"✓ Main data saved to: {main_output_filename}")
# 2. Save intermediate files
# Variant signatures
variant_signatures_filename = f'{run_dir}/variant_signatures.npy'
np.save(variant_signatures_filename, cluster_signatures)
print(f"✓ Variant signatures saved to: {variant_signatures_filename}")
# Cell proportions
cell_proportions_filename = f'{run_dir}/cell_proportions.npy'
np.save(cell_proportions_filename, simulated_proportions)
print(f"✓ Cell proportions saved to: {cell_proportions_filename}")
# Save as CSV for inspection
variant_signatures_csv = f'{run_dir}/variant_signatures.csv'
pd.DataFrame(cluster_signatures.T, 
             columns=[f'Pos_{i+1 }' for i in range(cluster_signatures.shape[0 ])],
             index=[f'Cluster_{i+1 }' for i in range(N_CLUSTERS)]).to_csv(variant_signatures_csv)
print(f"✓ Variant signatures CSV saved to: {variant_signatures_csv}")
cell_proportions_csv = f'{run_dir}/cell_proportions.csv'
pd.DataFrame(simulated_proportions, 
             columns=[f'Cluster_{i+1 }' for i in range(N_CLUSTERS)]).to_csv(cell_proportions_csv, index=False)
print(f"✓ Cell proportions CSV saved to: {cell_proportions_csv}")
# 3. Save filtered variants
filtered_variants_csv = f'{run_dir}/filtered_variants.csv'
filtered_variants.to_csv(filtered_variants_csv, index=False)
print(f"✓ Filtered variants saved to: {filtered_variants_csv}")
# Also save the variant matrix for use in deconvolution
variant_matrix_file = f'{run_dir}/variant_matrix.npy'
np.save(variant_matrix_file, variant_matrix)
print(f"✓ Variant matrix saved to: {variant_matrix_file}")
# 4. Save parameters (updated to match Cell 2: replace SNPS_ONLY with INCLUDE_INDELS, add new noise params)
params = {
    'input_file': VCF_FILE,
    'filter_pass': FILTER_PASS,
    'include_indels': INCLUDE_INDELS,
    'min_vaf': MIN_VAF,
    'min_depth': MIN_DEPTH,
    'min_alt_reads': MIN_ALT_READS,
    'max_missing_rate': MAX_MISSING_RATE,
    'n_clusters': N_CLUSTERS,
    'clustering_method': CLUSTERING_METHOD,
    'n_cells': N_CELLS,
    'homogeneity_alpha': HOMOGENEITY_ALPHA,
    'noise_level': NOISE_LEVEL,
    'dropout_rate': DROPOUT_RATE,  
    'overdispersion': OVERDISPERSION,  
    'skew_factor': SKEW_FACTOR,
    'n_mito_positions': N_MITO_POSITIONS,
    'random_seed': RANDOM_SEED,
    'run_id': run_id,
    'timestamp': datetime.now().isoformat(),
    'continuous': CONTINUOUS,
    'use_pca': USE_PCA,
    'pca_components': PCA_COMPONENTS,
    'poisson_lambda': POISSON_LAMBDA,
    'umi_bias_rate': UMI_BIAS_RATE
}
params_filename = f'{run_dir}/parameters.json'
with open(params_filename, 'w') as f:
    json.dump(params, f, indent=4 )
print(f"✓ Parameters saved to: {params_filename}")
# 5. Save metadata
metadata = {
    'vcf_stats': {
        'total_variants': len(variants_df),
        'filtered_variants': len(filtered_variants),
        'unique_umis_original': variants_df['umi_id'].nunique(),
        'unique_umis_filtered': filtered_variants['umi_id'].nunique()
    },
    'clustering_stats': {
        'n_clusters': int(cluster_stats['n_clusters']),
        'cluster_sizes': cluster_stats['cluster_sizes'].tolist(),
        'variants_per_cluster': cluster_stats['variants_per_cluster'].tolist(),
        'mean_variants_per_cluster': float(cluster_stats['mean_variants_per_cluster'])
    },
    'simulation_stats': {
        'n_cells_simulated': int(N_CELLS),
        'mean_proportions': simulated_proportions.mean(axis=0 ).tolist(),
        'std_proportions': simulated_proportions.std(axis=0 ).tolist()
    },
    'data_shapes': {
        'variant_signatures': list(cluster_signatures.shape),
        'cell_proportions': list(simulated_proportions.shape),
        'observed_data': list(C_observed.shape)
    }
}
metadata_filename = f'{run_dir}/metadata.json'
with open(metadata_filename, 'w') as f:
    json.dump(metadata, f, indent=4 )
print(f"✓ Metadata saved to: {metadata_filename}")
print("\n" + "=" * 60 )
print("ALL FILES SAVED SUCCESSFULLY")
print("=" * 60 )


STEP 6: SAVING RESULTS
✓ Main data saved to: ../sc_mito_vars/real_data/imigseq_SRR12455630/numpy/vcf_processed_data.npz
✓ Variant signatures saved to: ../sc_mito_vars/real_data/imigseq_SRR12455630/variant_signatures.npy
✓ Cell proportions saved to: ../sc_mito_vars/real_data/imigseq_SRR12455630/cell_proportions.npy
✓ Variant signatures CSV saved to: ../sc_mito_vars/real_data/imigseq_SRR12455630/variant_signatures.csv
✓ Cell proportions CSV saved to: ../sc_mito_vars/real_data/imigseq_SRR12455630/cell_proportions.csv
✓ Filtered variants saved to: ../sc_mito_vars/real_data/imigseq_SRR12455630/filtered_variants.csv
✓ Variant matrix saved to: ../sc_mito_vars/real_data/imigseq_SRR12455630/variant_matrix.npy
✓ Parameters saved to: ../sc_mito_vars/real_data/imigseq_SRR12455630/parameters.json
✓ Metadata saved to: ../sc_mito_vars/real_data/imigseq_SRR12455630/metadata.json

ALL FILES SAVED SUCCESSFULLY


In [26]:
"""
### Step 7: Generate Summary Report
Create a comprehensive summary of the analysis
"""
# Generate summary report
summary_lines = [
    f"VCF Processing and Simulation Report",
    f"Run ID: {run_id}",
    f"=" * 60,
    f"",
    f"INPUT DATA:",
    f"  Source: {VCF_FILE}",
    f"  Total variants: {len(variants_df)}",
    f"  Unique UMIs: {variants_df['umi_id'].nunique()}",
    f"",
    f"FILTERING:",
    f"  Variants passing filters: {len(filtered_variants)}",
    f"  Retention rate: {len(filtered_variants)/len(variants_df)*100 :.1f}%",
    f"  Filter criteria:",
    f"    - PASS only: {FILTER_PASS}",
    f"    - Include INDELs: {INCLUDE_INDELS}",
    f"    - Min VAF: {MIN_VAF}",
    f"    - Min depth: {MIN_DEPTH}",
    f"    - Min alt reads: {MIN_ALT_READS}",
    f"",
    f"CLUSTERING:",
    f"  Method: {CLUSTERING_METHOD}",
    f"  Number of clusters: {N_CLUSTERS}",
    f"  Mean variants per cluster: {cluster_stats['mean_variants_per_cluster']:.2f}",
    f"  Cluster sizes: {cluster_stats['cluster_sizes'].tolist()}",
    f"",
    f"SIMULATION:",
    f"  Cells simulated: {N_CELLS}",
    f"  Homogeneity alpha: {HOMOGENEITY_ALPHA}",
    f"  Noise level: {NOISE_LEVEL}",
    f"  Skew factor: {SKEW_FACTOR}",
    f"",
    f"OUTPUT FILES:",
    f"  Main data: {main_output_filename}",
    f"  Variant signatures: {variant_signatures_filename}",
    f"  Cell proportions: {cell_proportions_filename}",
    f"  Filtered variants: {filtered_variants_csv}",
    f"  Parameters: {params_filename}",
    f"  Metadata: {metadata_filename}"
]
summary_text = "\n".join(summary_lines)
print(summary_text)
# Save summary
summary_filename = f'{run_dir}/summary.txt'
with open(summary_filename, 'w') as f:
    f.write(summary_text)
print(f"\n✓ Summary report saved to: {summary_filename}")
print("\n" + "=" * 60)
print("ANALYSIS COMPLETE!")
print(f"All results saved to: {run_dir}")
print("=" * 60)


VCF Processing and Simulation Report
Run ID: imigseq_SRR12455630

INPUT DATA:
  Source: ../../../data/vault_pipeline_output/SRR12676843_output/snp/all_snp_from_perfect_umi.vcf
  Total variants: 207634
  Unique UMIs: 7885

FILTERING:
  Variants passing filters: 46903
  Retention rate: 22.6%
  Filter criteria:
    - PASS only: True
    - Include INDELs: True
    - Min VAF: 0.25
    - Min depth: 1
    - Min alt reads: 1

CLUSTERING:
  Method: spectral
  Number of clusters: 20
  Mean variants per cluster: 0.00
  Cluster sizes: [212, 100, 61, 117, 105, 120, 61, 161, 106, 138, 136, 98, 83, 188, 4596, 99, 848, 122, 12, 182]

SIMULATION:
  Cells simulated: 200
  Homogeneity alpha: 50.0
  Noise level: 0.01
  Skew factor: 2.0

OUTPUT FILES:
  Main data: ../sc_mito_vars/real_data/imigseq_SRR12455630/numpy/vcf_processed_data.npz
  Variant signatures: ../sc_mito_vars/real_data/imigseq_SRR12455630/variant_signatures.npy
  Cell proportions: ../sc_mito_vars/real_data/imigseq_SRR12455630/cell_proportio

In [27]:
# Cell 11: Visualization Functions Module
"""
### Visualization Functions
Functions to create plots similar to the simulation notebook
"""

def plot_variant_distributions(filtered_variants, cluster_signatures, cluster_labels, 
                               simulated_proportions, run_dir):
    """
    Create comprehensive visualizations of variant and cluster distributions
    Similar to the simulation notebook's visualization
    """
    
    # Calculate statistics
    n_clusters = cluster_signatures.shape[1]
    variants_per_cluster = cluster_signatures.sum(axis=0)
    mean_proportions = simulated_proportions.mean(axis=0)
    
    # Create the main figure with subplots
    fig, axes = plt.subplots(4, 1, figsize=(12, 20), 
                             gridspec_kw={'height_ratios': [3, 1, 1, 1]})
    fig.suptitle('Overview of VCF-Derived Data', fontsize=16)
    
    # Plot 1: Cluster Proportions Across the Cell Population (stacked area)
    p_df = pd.DataFrame(simulated_proportions, 
                       columns=[f'Cluster {i+1}' for i in range(n_clusters)])
    p_df.plot(kind='area', stacked=True, ax=axes[0], colormap='viridis', alpha=0.8)
    axes[0].set_title('Cluster Proportions Across the Simulated Cell Population')
    axes[0].set_xlabel('Cell ID')
    axes[0].set_ylabel('Proportion')
    axes[0].legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=2)
    axes[0].set_ylim(0, 1)
    
    # Plot 2: Mean Proportion vs. Number of Variants per Cluster
    colors = plt.cm.viridis(np.linspace(0, 1, n_clusters))
    bars = axes[1].bar(np.arange(n_clusters), mean_proportions, color=colors)
    axes[1].set_title('Mean Proportion vs. Number of Variants per Cluster')
    axes[1].set_xlabel('Cluster ID')
    axes[1].set_ylabel('Mean Proportion')
    
    # Add text labels for variant counts
    for i, (bar, count) in enumerate(zip(bars, variants_per_cluster)):
        height = bar.get_height()
        axes[1].text(bar.get_x() + bar.get_width()/2., height + 0.005,
                    f'{int(count)} vars', ha='center', va='bottom', fontsize=8)
    
    # Plot 3: Histogram of the number of variants per cluster
    axes[2].hist(variants_per_cluster, bins=min(10, n_clusters), 
                edgecolor='black', alpha=0.7, color='steelblue')
    axes[2].set_title('Distribution of Variants per Cluster')
    axes[2].set_xlabel('Number of Variants in Signature')
    axes[2].set_ylabel('Count of Clusters')
    axes[2].axvline(variants_per_cluster.mean(), color='red', linestyle='--', 
                   label=f'Mean: {variants_per_cluster.mean():.1f}')
    axes[2].legend()
    
    # Plot 4: Histogram of the mean proportion sizes
    axes[3].hist(mean_proportions, bins=min(10, n_clusters), 
                edgecolor='black', alpha=0.7, color='coral')
    axes[3].set_title('Distribution of Mean Proportion Sizes')
    axes[3].set_xlabel('Mean Proportion Size')
    axes[3].set_ylabel('Count of Clusters')
    
    # Add KDE if we have enough clusters
    if n_clusters > 5:
        from scipy import stats
        kde = stats.gaussian_kde(mean_proportions)
        x_range = np.linspace(0, mean_proportions.max() * 1.1, 100)
        axes[3].twinx().plot(x_range, kde(x_range), 'r-', alpha=0.5, label='KDE')
    
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.savefig(f'{run_dir}/variant_cluster_distributions.png', dpi=150, bbox_inches='tight')
    plt.show()
    
    return fig

def plot_vcf_specific_analysis(filtered_variants, variant_matrix, cluster_labels, run_dir):
    """
    Create VCF-specific visualizations
    """
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    fig.suptitle('VCF-Specific Analysis', fontsize=16)
    
    # 1. VAF distribution by cluster
    unique_umis = filtered_variants['umi_id'].unique()
    cluster_vafs = []
    for cluster_id in np.unique(cluster_labels):
        cluster_umis = unique_umis[cluster_labels == cluster_id]
        cluster_vars = filtered_variants[filtered_variants['umi_id'].isin(cluster_umis)]
        if len(cluster_vars) > 0:
            cluster_vafs.append(cluster_vars['vaf'].values)
    
    if cluster_vafs:
        bp = axes[0, 0].boxplot(cluster_vafs, labels=[f'C{i}' for i in range(len(cluster_vafs))])
        axes[0, 0].set_xlabel('Cluster')
        axes[0, 0].set_ylabel('VAF')
        axes[0, 0].set_title('VAF Distribution by Cluster')
        axes[0, 0].tick_params(axis='x', rotation=45)
    
    # 2. Variant density along mitochondrial genome
    positions = filtered_variants['pos'].values
    axes[0, 1].hist(positions, bins=50, edgecolor='black', alpha=0.7, color='green')
    axes[0, 1].set_xlabel('Mitochondrial Position')
    axes[0, 1].set_ylabel('Variant Count')
    axes[0, 1].set_title('Variant Density Along Mitochondrial Genome')
    
    # 3. Read depth vs VAF
    axes[0, 2].scatter(filtered_variants['total_depth'], filtered_variants['vaf'], 
                      alpha=0.5, s=20)
    axes[0, 2].set_xlabel('Total Read Depth')
    axes[0, 2].set_ylabel('VAF')
    axes[0, 2].set_title('Read Depth vs VAF')
    axes[0, 2].axhline(y=0.5, color='r', linestyle='--', alpha=0.3, label='VAF=0.5')
    axes[0, 2].legend()
    
    # 4. UMI read count distribution
    umi_read_counts = filtered_variants.groupby('umi_id')['read_count'].first()
    axes[1, 0].hist(umi_read_counts, bins=20, edgecolor='black', alpha=0.7, color='purple')
    axes[1, 0].set_xlabel('Reads per UMI')
    axes[1, 0].set_ylabel('Count')
    axes[1, 0].set_title('Distribution of Reads per UMI')
    
    # 5. Variants per UMI by cluster
    variants_per_umi = variant_matrix.sum(axis=0)
    cluster_colors = plt.cm.tab10(cluster_labels)
    axes[1, 1].scatter(range(len(variants_per_umi)), variants_per_umi, 
                      c=cluster_colors, alpha=0.6)
    axes[1, 1].set_xlabel('UMI Index')
    axes[1, 1].set_ylabel('Number of Variants')
    axes[1, 1].set_title('Variants per UMI (colored by cluster)')
    
    # 6. Cluster size distribution
    cluster_sizes = np.bincount(cluster_labels)
    axes[1, 2].pie(cluster_sizes, labels=[f'C{i}' for i in range(len(cluster_sizes))],
                   autopct='%1.1f%%', startangle=90)
    axes[1, 2].set_title('Cluster Size Distribution')
    
    plt.tight_layout()
    plt.savefig(f'{run_dir}/vcf_specific_analysis.png', dpi=150, bbox_inches='tight')
    plt.show()
    
    return fig

def plot_comparison_real_vs_simulated(initial_proportions, simulated_proportions, 
                                      cluster_signatures, run_dir):
    """
    Compare real (single-cell) data with simulated population
    """
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle('Real vs Simulated Data Comparison', fontsize=16)
    
    n_clusters = cluster_signatures.shape[1]
    
    # 1. Compare initial vs simulated mean proportions
    initial_props = initial_proportions.flatten()
    simulated_mean = simulated_proportions.mean(axis=0)
    
    x = np.arange(n_clusters)
    width = 0.35
    
    axes[0, 0].bar(x - width/2, initial_props, width, label='Original (Single Cell)', 
                   color='blue', alpha=0.7)
    axes[0, 0].bar(x + width/2, simulated_mean, width, label='Simulated Mean', 
                   color='red', alpha=0.7)
    axes[0, 0].set_xlabel('Cluster ID')
    axes[0, 0].set_ylabel('Proportion')
    axes[0, 0].set_title('Original vs Simulated Mean Proportions')
    axes[0, 0].legend()
    
    # 2. Proportion variance across simulated cells
    prop_std = simulated_proportions.std(axis=0)
    axes[0, 1].bar(range(n_clusters), prop_std, color='orange', alpha=0.7)
    axes[0, 1].set_xlabel('Cluster ID')
    axes[0, 1].set_ylabel('Standard Deviation')
    axes[0, 1].set_title('Proportion Variability Across Simulated Cells')
    axes[0, 1].axhline(y=prop_std.mean(), color='red', linestyle='--', 
                       label=f'Mean SD: {prop_std.mean():.3f}')
    axes[0, 1].legend()
    
    # 3. Heatmap of simulated proportions
    im = axes[1, 0].imshow(simulated_proportions[:20].T, aspect='auto', cmap='YlOrRd')
    axes[1, 0].set_xlabel('Cell Index (first 20)')
    axes[1, 0].set_ylabel('Cluster ID')
    axes[1, 0].set_title('Simulated Cell Proportions Heatmap')
    plt.colorbar(im, ax=axes[1, 0], label='Proportion')
    
    # 4. Cluster correlation matrix
    if n_clusters <= 20:  # Only show if reasonable number of clusters
        corr_matrix = np.corrcoef(simulated_proportions.T)
        im2 = axes[1, 1].imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
        axes[1, 1].set_xlabel('Cluster ID')
        axes[1, 1].set_ylabel('Cluster ID')
        axes[1, 1].set_title('Cluster Proportion Correlations')
        plt.colorbar(im2, ax=axes[1, 1], label='Correlation')
    else:
        # If too many clusters, show distribution of correlations
        corr_matrix = np.corrcoef(simulated_proportions.T)
        upper_tri = corr_matrix[np.triu_indices_from(corr_matrix, k=1)]
        axes[1, 1].hist(upper_tri, bins=30, edgecolor='black', alpha=0.7)
        axes[1, 1].set_xlabel('Correlation Coefficient')
        axes[1, 1].set_ylabel('Count')
        axes[1, 1].set_title('Distribution of Cluster Correlations')
    
    plt.tight_layout()
    plt.savefig(f'{run_dir}/real_vs_simulated_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()
    
    return fig


In [None]:
# Cell 12: Execute Comprehensive Visualizations
"""
### Execute Comprehensive Visualizations
Generate all visualization plots similar to the simulation notebook
"""

print("=" * 60)
print("GENERATING COMPREHENSIVE VISUALIZATIONS")
print("=" * 60)

# 1. Main distribution plots (similar to simulation notebook)
print("\n1. Creating variant and cluster distribution plots...")
fig1 = plot_variant_distributions(
    filtered_variants=filtered_variants,
    cluster_signatures=cluster_signatures,
    cluster_labels=cluster_labels,
    simulated_proportions=simulated_proportions,
    run_dir=run_dir
)

# 2. VCF-specific analysis plots
print("\n2. Creating VCF-specific analysis plots...")
fig2 = plot_vcf_specific_analysis(
    filtered_variants=filtered_variants,
    variant_matrix=variant_matrix,
    cluster_labels=cluster_labels,
    run_dir=run_dir
)

# 3. Real vs simulated comparison
print("\n3. Creating real vs simulated comparison plots...")
fig3 = plot_comparison_real_vs_simulated(
    initial_proportions=initial_proportions,
    simulated_proportions=simulated_proportions,
    cluster_signatures=cluster_signatures,
    run_dir=run_dir
)

print("\n✓ All visualizations completed and saved")
