# NB 03: Robust ICA Module Decomposition

Decompose each pilot organism's fitness matrix into stable independent
components (modules) using the robust ICA pipeline.

Algorithm (Borchert et al. 2019):
1. Standardize: center each gene row to mean 0, scale by std
2. Determine n_components via PCA (Marchenko-Pastur threshold)
3. Run FastICA 100× with different random seeds
4. Cluster all components by |cosine similarity| (DBSCAN)
5. Stable clusters = modules; compute gene weights
6. Threshold membership via D'Agostino K² normality test

**Run locally** — no Spark needed.

In [None]:
import pandas as pd
import numpy as np
import json
import sys
from pathlib import Path

# Add src to path for pipeline import
sys.path.insert(0, str(Path('../src').resolve()))
from ica_pipeline import (
    standardize_matrix, select_n_components, robust_ica,
    compute_gene_weights, threshold_membership
)

# Data layout (relative to the notebook's working directory):
#   ../data/matrices  - per-organism fitness matrices read by this notebook
#   ../data/modules   - ICA outputs written by this notebook
DATA_DIR = Path('..') / 'data'
MATRIX_DIR = DATA_DIR.joinpath('matrices')
MODULE_DIR = DATA_DIR.joinpath('modules')

# Only the output directory is created here; the inputs must already exist.
MODULE_DIR.mkdir(exist_ok=True, parents=True)

# Ensure dependencies: make sure scikit-learn is importable, installing it
# with pip into the running interpreter's environment when it is missing.
# NOTE(review): ica_pipeline is imported before this check runs; if that
# module imports scikit-learn at import time, a missing package would fail
# there first -- confirm and move this check earlier if so.
try:
    from sklearn.decomposition import FastICA
    print("scikit-learn available")
except ImportError:
    import importlib
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'scikit-learn'])
    # A package installed while the interpreter is running is not guaranteed
    # to be seen by the import system until its finder caches are reset.
    importlib.invalidate_caches()
    # Bind FastICA here too so both branches leave the same names defined.
    from sklearn.decomposition import FastICA
    print("Installed scikit-learn")

In [None]:
# Pilot organisms: the 'orgId' column lists every organism processed below.
pilot_table_path = DATA_DIR / 'pilot_organisms.csv'
pilots = pd.read_csv(pilot_table_path)
pilot_ids = pilots.loc[:, 'orgId'].to_list()
print(f"Pilot organisms: {pilot_ids}")

# Matrix summary table; left as the final expression so the notebook
# renders it as the cell output.
matrix_summary_path = MATRIX_DIR / 'matrix_summary.csv'
summary = pd.read_csv(matrix_summary_path)
summary

## 1. PCA Dimensionality Selection

In [None]:
import matplotlib.pyplot as plt

# Select the ICA dimensionality for every pilot organism and plot each
# eigenvalue spectrum with the selected cutoff marked. The selections are
# kept in n_components_selected (orgId -> n_components) for the ICA cell.
if not pilot_ids:
    raise ValueError("pilot_ids is empty; there is nothing to analyse or plot")

# savefig does not create missing directories, so make sure the target exists.
figure_dir = Path('../figures')
figure_dir.mkdir(parents=True, exist_ok=True)

# squeeze=False always returns a 2-D array of axes, so a single organism
# needs no special-casing before zipping axes with organism IDs.
fig, axes = plt.subplots(
    1, len(pilot_ids), figsize=(4 * len(pilot_ids), 4), squeeze=False
)
axes = axes.ravel()

n_components_selected = {}

for ax, org_id in zip(axes, pilot_ids):
    fit_matrix = pd.read_csv(MATRIX_DIR / f'{org_id}_fitness_matrix.csv', index_col=0)
    X = fit_matrix.values
    X_std, _, _ = standardize_matrix(X)

    n_comp, eigenvalues = select_n_components(X_std, method='marchenko_pastur')
    # Store a built-in int: the value is later written to JSON, which cannot
    # serialize NumPy integer scalars (presumably what the helper returns).
    n_comp = int(n_comp)
    n_components_selected[org_id] = n_comp

    ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, 'b.-')
    ax.axvline(n_comp, color='r', linestyle='--', label=f'n={n_comp}')
    ax.set_xlabel('Component')
    ax.set_ylabel('Eigenvalue')
    ax.set_title(f'{org_id}')
    ax.legend()
    # Show at most the first 100 eigenvalues of the spectrum.
    ax.set_xlim(0, min(100, len(eigenvalues)))

    print(f"{org_id}: matrix {X.shape}, selected {n_comp} components")

plt.tight_layout()
plt.savefig(figure_dir / 'pca_eigenvalues.png', dpi=150, bbox_inches='tight')
plt.show()

## 2. Robust ICA

In [None]:
# Robust ICA settings passed straight through to robust_ica():
N_RUNS = 100       # number of ICA runs requested
EPS = 0.15         # eps forwarded to the clustering step
MIN_SAMPLES = 50   # min_samples forwarded to the clustering step

# For each pilot organism: standardize its fitness matrix, run robust ICA,
# derive gene weights and thresholded membership, then write four outputs
# to MODULE_DIR (module profiles, gene weights, gene membership, params).
for org_id in pilot_ids:
    profile_file = MODULE_DIR / f'{org_id}_module_profiles.csv'
    weights_file = MODULE_DIR / f'{org_id}_gene_weights.csv'
    member_file = MODULE_DIR / f'{org_id}_gene_membership.csv'
    params_file = MODULE_DIR / f'{org_id}_ica_params.json'

    # Treat an organism as cached only when every output is present and
    # non-empty. Checking the profiles file alone would permanently skip an
    # organism whose earlier run failed after writing just that first file.
    output_files = (profile_file, weights_file, member_file, params_file)
    if all(p.exists() and p.stat().st_size > 0 for p in output_files):
        print(f"CACHED: {org_id}")
        continue

    print(f"\n{'='*60}")
    print(f"Processing {org_id}")
    print(f"{'='*60}")

    # Load and standardize
    fit_matrix = pd.read_csv(MATRIX_DIR / f'{org_id}_fitness_matrix.csv', index_col=0)
    X = fit_matrix.values
    X_std, means, stds = standardize_matrix(X)
    # Coerce to a built-in int so the value can be written to JSON below;
    # json cannot serialize NumPy integer scalars.
    n_comp = int(n_components_selected[org_id])

    print(f"Matrix: {X.shape[0]} genes × {X.shape[1]} experiments")
    print(f"Components: {n_comp}")
    print(f"Running {N_RUNS} ICA iterations...")

    # Run robust ICA
    modules, labels, all_components, metadata = robust_ica(
        X_std, n_comp, n_runs=N_RUNS, eps=EPS, min_samples=MIN_SAMPLES
    )

    print(f"  Converged: {metadata['n_converged']}/{N_RUNS}")
    print(f"  Stable modules: {metadata['n_stable_modules']}")
    print(f"  Noise components: {metadata['n_noise_components']}")

    # Nothing is written for this organism, so it is retried on the next run.
    if metadata['n_stable_modules'] == 0:
        print(f"  WARNING: No stable modules found for {org_id}. Try different eps/min_samples.")
        continue

    # Compute gene weights
    weights = compute_gene_weights(X_std, modules)

    # Threshold membership
    membership, thresholds = threshold_membership(weights)

    # Module summary: column sums of the membership matrix
    n_members_per_module = membership.sum(axis=0)
    print(f"  Members per module: min={n_members_per_module.min()}, "
          f"median={np.median(n_members_per_module):.0f}, "
          f"max={n_members_per_module.max()}")

    # Save module profiles (module × experiment)
    module_names = [f'M{i:03d}' for i in range(len(modules))]
    profiles_df = pd.DataFrame(modules, index=module_names, columns=fit_matrix.columns)
    profiles_df.to_csv(profile_file)

    # Save gene weights (gene × module)
    weights_df = pd.DataFrame(weights, index=fit_matrix.index, columns=module_names)
    weights_df.to_csv(weights_file)

    # Save binary membership (gene × module)
    member_df = pd.DataFrame(membership, index=fit_matrix.index, columns=module_names)
    member_df.to_csv(member_file)

    # Save parameters last, after the three CSV outputs have been written.
    params = {
        'orgId': org_id,
        'n_genes': int(X.shape[0]),
        'n_experiments': int(X.shape[1]),
        'n_components': n_comp,
        'n_runs': N_RUNS,
        'eps': EPS,
        'min_samples': MIN_SAMPLES,
        'n_stable_modules': int(metadata['n_stable_modules']),
        'n_converged': int(metadata['n_converged']),
        'members_per_module': n_members_per_module.tolist(),
        'thresholds': thresholds.tolist()
    }
    with open(params_file, 'w', encoding='utf-8') as f:
        json.dump(params, f, indent=2)

    print(f"  Saved to {MODULE_DIR}/")

## 3. Module Quality Summary

In [None]:
# Summary across all organisms: collect the saved ICA parameter files.
# Organisms without a params file are skipped.
all_params = []
for org_id in pilot_ids:
    params_file = MODULE_DIR / f'{org_id}_ica_params.json'
    if params_file.exists():
        with open(params_file, encoding='utf-8') as f:
            all_params.append(json.load(f))

params_df = pd.DataFrame(all_params)
summary_columns = [
    'orgId', 'n_genes', 'n_experiments', 'n_components',
    'n_stable_modules', 'n_converged',
]

print("ICA Module Summary:")
if params_df.empty:
    print("  (no ICA parameter files found - run the Robust ICA cell first)")

# reindex rather than [[...]] so that an empty result is displayed as an
# empty table with the expected columns instead of raising KeyError.
params_df.reindex(columns=summary_columns)

In [None]:
# Visualize module size distributions: one panel per pilot organism showing
# the number of member genes per module, sorted from largest to smallest.
if not pilot_ids:
    raise ValueError("pilot_ids is empty; there is nothing to plot")

# savefig does not create missing directories, so make sure the target exists.
figure_dir = Path('../figures')
figure_dir.mkdir(parents=True, exist_ok=True)

# squeeze=False always returns a 2-D array of axes, so a single organism
# needs no special-casing before zipping axes with organism IDs.
fig, axes = plt.subplots(
    1, len(pilot_ids), figsize=(4 * len(pilot_ids), 4), squeeze=False
)
axes = axes.ravel()

for ax, org_id in zip(axes, pilot_ids):
    member_file = MODULE_DIR / f'{org_id}_gene_membership.csv'
    if not member_file.exists():
        # Label the panel so a missing result is not mistaken for a blank plot.
        ax.set_title(f'{org_id} (no modules)')
        continue
    membership = pd.read_csv(member_file, index_col=0)
    # Column sums of the membership table = genes per module
    module_sizes = membership.sum(axis=0)
    ax.bar(range(len(module_sizes)), sorted(module_sizes, reverse=True))
    ax.set_xlabel('Module rank')
    ax.set_ylabel('Number of genes')
    ax.set_title(f'{org_id} ({len(module_sizes)} modules)')

plt.tight_layout()
plt.savefig(figure_dir / 'module_size_distribution.png', dpi=150, bbox_inches='tight')
plt.show()