# Exploratory Data Analysis

This notebook performs initial exploration and quality control of single-cell data.

## Steps:
1. Load data
2. Quality control metrics
3. Basic visualizations
4. Data summary statistics


In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path
sys.path.insert(0, str(Path.cwd().parent))

from src.preprocess import run_qc, calculate_qc_metrics
from src.utils import load_adata

# Reset matplotlib to its default style FIRST. plt.style.use('default')
# restores the default rcParams, so calling it after set_figure_params
# (as this cell previously did) discards the dpi/facecolor configured there.
plt.style.use('default')

# Set scanpy settings (applied on top of the freshly reset defaults)
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Set the seaborn palette last, as before, so it is the final styling step
sns.set_palette("husl")


In [None]:
# Load data
data_file = Path("../data/synthetic/synthetic_combined.h5ad")

# Fail fast when the input is missing: every later cell reads `adata`, so
# merely printing a hint would surface as a NameError further down, or let a
# stale `adata` from an earlier kernel session be analysed silently.
if not data_file.exists():
    raise FileNotFoundError(
        f"Data file not found: {data_file}. "
        "Please run: python scripts/generate_synthetic_sc.py"
    )

adata = load_adata(str(data_file))
print(f"Loaded data: {adata.n_obs:,} cells, {adata.n_vars:,} genes")
print("\nObservations (cells):")
print(adata.obs.head())
print(f"\nVariables (genes): {adata.n_vars}")


In [None]:
# Calculate QC metrics
adata = calculate_qc_metrics(adata)

# Visualize QC metrics on a 2x2 grid: three histograms plus one scatter panel
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# One entry per histogram: (target axes, obs column, x-axis label, panel title)
histogram_panels = [
    (axes[0, 0], 'total_counts', 'Total counts', 'Distribution of total counts'),
    (axes[0, 1], 'n_genes_by_counts', 'Number of genes', 'Distribution of genes per cell'),
    (axes[1, 0], 'pct_counts_mt', 'Mitochondrial %', 'Distribution of mitochondrial %'),
]
for panel, obs_column, x_label, panel_title in histogram_panels:
    panel.hist(adata.obs[obs_column], bins=50, edgecolor='black')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Number of cells')
    panel.set_title(panel_title)

# Scatter: total counts vs genes (bottom-right panel)
scatter_panel = axes[1, 1]
scatter_panel.scatter(
    adata.obs['total_counts'],
    adata.obs['n_genes_by_counts'],
    alpha=0.3,
    s=1,
)
scatter_panel.set_xlabel('Total counts')
scatter_panel.set_ylabel('Number of genes')
scatter_panel.set_title('Total counts vs Genes')

plt.tight_layout()
plt.show()


In [None]:
# Run QC filtering; run_qc returns the filtered object and a dict of statistics
adata_filtered, qc_stats = run_qc(adata, remove_doublets=True)

# Assemble the before/after report, then emit it in one go
qc_report = [
    "QC Statistics:",
    f"  Cells before: {qc_stats['n_cells_before']:,}",
    f"  Cells after: {qc_stats['n_cells_after']:,}",
    f"  Cells removed: {qc_stats['n_removed']:,} ({qc_stats['pct_removed']:.1f}%)",
    f"  Genes before: {qc_stats['n_genes_before']:,}",
    f"  Genes after: {qc_stats['n_genes_after']:,}",
]
print("\n".join(qc_report))

# Save filtered data
filtered_path = "../data/synthetic/adata_filtered.h5ad"
adata_filtered.write(filtered_path)
print("\nFiltered data saved.")


In [None]:
# Summary statistics by patient and timepoint
group_cols = ['patient_id', 'timepoint']
missing_cols = [col for col in group_cols if col not in adata_filtered.obs.columns]

if missing_cols:
    # Say why the table is absent instead of silently producing no output.
    print(f"Skipping summary by patient and timepoint; missing obs columns: {missing_cols}")
else:
    # observed=True: if the grouping columns are categorical, keep only the
    # patient/timepoint combinations that actually occur (no all-NaN rows for
    # unobserved pairs) and avoid the pandas FutureWarning about the changing
    # default. It has no effect on non-categorical columns.
    summary = (
        adata_filtered.obs
        .groupby(group_cols, observed=True)
        .agg({
            'total_counts': ['mean', 'std'],
            'n_genes_by_counts': ['mean', 'std'],
            'pct_counts_mt': ['mean', 'std'],
        })
        .round(2)
    )
    print("Summary by patient and timepoint:")
    print(summary)
