# 07 - Prepare Shiny App Data

**COVID-19 GSE171524 Single-Cell Analysis**

This notebook generates all data files needed by the interactive R Shiny app.

## Output Files
1. `shiny_app_data_umap_sampled.csv` - Subsampled UMAP + metadata (~20k cells, stratified by cell type and condition)
2. `shiny_app_data_umap_metadata.csv` - Metadata (plus UMAP coordinates, when available) for all cells
3. `shiny_app_data_gene_expression.csv` - Expression of key genes for the subsampled cells
4. `shiny_app_data_de_results.csv` - Global DE results
5. `shiny_app_data_de_celltype.csv` - Cell type-specific DE results
6. `shiny_app_data_interactions.csv` - Ligand-receptor interactions

In [None]:
import sys
import warnings
import numpy as np
import pandas as pd
import scanpy as sc
from pathlib import Path
from scipy import sparse

# Suppress every warning for the rest of the session. Note that this also hides
# deprecation/future warnings raised by pandas and scanpy in later cells.
warnings.filterwarnings('ignore')
# Put ../scripts at the front of the import path so the `markers` import below
# can be resolved (presumably from ../scripts/markers.py - verify location).
sys.path.insert(0, '../scripts')
# Gene collections from the project-local `markers` module. Later cells merge
# them into the set of key genes whose expression is exported for the app.
from markers import (
    MAJOR_CELL_TYPES, COVID_LIGANDS, COVID_RECEPTORS,
    MYELOID_DYSFUNCTION, DATP_SIGNATURE, FIBROSIS_SIGNATURE,
    EXHAUSTION_SIGNATURE, IFN_RESPONSE_SIGNATURE, INFLAMMATORY_CYTOKINES
)

# Record the scanpy version used for this run.
print(f"Scanpy: {sc.__version__}")

In [None]:
# Locations: annotated AnnData produced upstream, and the Shiny app's data folder
INPUT_PATH = Path('../data/processed_data/adata_annotated.h5ad')
OUTPUT_DIR = Path('../covid19-lung-atlas/data')

# Create the output folder (and any missing parents); no error if it already exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Read the annotated dataset into memory and report its dimensions
print(f"Loading: {INPUT_PATH}")
adata = sc.read_h5ad(INPUT_PATH)
print(f"Loaded: {adata.n_obs:,} cells, {adata.n_vars:,} genes")

## 1. UMAP + Metadata (Subsampled)

In [None]:
# Subsample for Shiny app performance (~20k cells, stratified by cell type & condition).
# N_SAMPLE is a target, not an exact count: per-stratum quotas are truncated to
# integers and every non-empty stratum contributes at least one cell.
N_SAMPLE = 20000

# Seed NumPy's global RNG so the same cells are drawn on every run
np.random.seed(42)

# Stratified sampling.
# observed=True limits the groups to (cell_type, condition) combinations that
# actually occur. Without it, categorical columns enumerate every category
# combination (including empty ones) and newer pandas emits a FutureWarning.
groups = adata.obs.groupby(['cell_type', 'condition'], observed=True)
sampled_indices = []

for _, group in groups:
    # Quota proportional to the stratum's share of all cells, floored at 1
    # and capped at the stratum size (sampling is without replacement).
    n = max(1, int(len(group) / adata.n_obs * N_SAMPLE))
    n = min(n, len(group))
    sampled_indices.extend(np.random.choice(group.index, size=n, replace=False))

adata_sampled = adata[sampled_indices].copy()
print(f"Sampled {adata_sampled.n_obs:,} cells")

# Create UMAP dataframe (one row per sampled cell, indexed by cell barcode/obs name)
umap_df = pd.DataFrame(
    adata_sampled.obsm['X_umap'],
    index=adata_sampled.obs_names,
    columns=['UMAP1', 'UMAP2']
)

# Add metadata columns; columns absent from obs are skipped rather than failing
for col in ['cell_type', 'condition', 'sample_id', 'patient_id']:
    if col in adata_sampled.obs.columns:
        umap_df[col] = adata_sampled.obs[col].values

# Add leiden if available: use the first obs column whose name contains
# 'leiden' and has no underscore (underscore-suffixed variants are ignored).
leiden_col = next(
    (col for col in adata_sampled.obs.columns if 'leiden' in col and '_' not in col),
    None
)
if leiden_col is not None:
    umap_df['leiden'] = adata_sampled.obs[leiden_col].values

# Save
output_path = OUTPUT_DIR / 'shiny_app_data_umap_sampled.csv'
umap_df.to_csv(output_path)
print(f"Saved: {output_path} ({umap_df.shape})")
print(f"\nColumns: {umap_df.columns.tolist()}")
print(f"\nCell types: {umap_df['cell_type'].value_counts().to_dict()}")

In [None]:
# Full metadata (no expression, for summary stats).
# Only export the metadata columns that exist in adata.obs, mirroring the
# guarded column handling used for the subsampled UMAP table; a hard
# adata.obs[[...]] lookup would raise KeyError if e.g. 'patient_id' is absent.
wanted_meta_cols = ['cell_type', 'condition', 'sample_id', 'patient_id']
present_meta_cols = [col for col in wanted_meta_cols if col in adata.obs.columns]
missing_meta_cols = [col for col in wanted_meta_cols if col not in adata.obs.columns]
if missing_meta_cols:
    print(f"Warning: metadata columns not found and skipped: {missing_meta_cols}")

meta_df = adata.obs[present_meta_cols].copy()

# Attach UMAP coordinates for every cell when an embedding is stored
if 'X_umap' in adata.obsm:
    meta_df['UMAP1'] = adata.obsm['X_umap'][:, 0]
    meta_df['UMAP2'] = adata.obsm['X_umap'][:, 1]

output_path = OUTPUT_DIR / 'shiny_app_data_umap_metadata.csv'
meta_df.to_csv(output_path)
print(f"Saved: {output_path} ({meta_df.shape})")

## 2. Gene Expression for Key Genes

In [None]:
# Assemble the app's gene panel as the union of every marker/signature collection:
# cell type markers, COVID-related signatures, ligands/receptors, and a few
# individually chosen genes of interest.
gene_collections = [
    *MAJOR_CELL_TYPES.values(),
    *MYELOID_DYSFUNCTION.values(),
    DATP_SIGNATURE,
    FIBROSIS_SIGNATURE,
    EXHAUSTION_SIGNATURE,
    IFN_RESPONSE_SIGNATURE,
    INFLAMMATORY_CYTOKINES,
    COVID_LIGANDS,
    COVID_RECEPTORS,
    ['ACE2', 'TMPRSS2', 'SFTPC', 'SFTPA1', 'AGER', 'CTHRC1'],
]
key_genes = set().union(*gene_collections)

# Keep only genes that are present in the dataset, in alphabetical order
dataset_genes = set(adata_sampled.var_names)
available_genes = sorted(gene for gene in key_genes if gene in dataset_genes)
print(f"Key genes available: {len(available_genes)} / {len(key_genes)}")

# Pull the expression matrix for the subsampled cells, densifying if stored sparse
X_expr = adata_sampled[:, available_genes].X
X_expr = X_expr.toarray() if sparse.issparse(X_expr) else X_expr

# Cells as rows (indexed by obs name), genes as columns
expr_df = pd.DataFrame(X_expr, index=adata_sampled.obs_names, columns=available_genes)

# Write the table for the Shiny app
output_path = OUTPUT_DIR / 'shiny_app_data_gene_expression.csv'
expr_df.to_csv(output_path)
print(f"Saved: {output_path} ({expr_df.shape})")

## 3. Differential Expression Results

In [None]:
# Global COVID-vs-Control DE: reuse the table from the upstream analysis when it
# exists, otherwise compute it here with a Wilcoxon rank-sum test.
TABLES_DIR = Path('../results/tables')
de_global_path = TABLES_DIR / 'de_global_covid_vs_control.csv'

if not de_global_path.exists():
    print("Running global DE analysis...")
    # Rank all genes (n_genes = every variable) for COVID against the Control reference
    sc.tl.rank_genes_groups(
        adata,
        groupby='condition',
        groups=['COVID'],
        reference='Control',
        method='wilcoxon',
        n_genes=adata.n_vars
    )
    # Rename scanpy's column names to the ones written to the app's CSV
    de_column_names = {'names': 'gene', 'logfoldchanges': 'log2FC', 'pvals_adj': 'padj'}
    de_global = sc.get.rank_genes_groups_df(adata, group='COVID').rename(columns=de_column_names)
    print(f"Computed global DE: {len(de_global)} genes")
else:
    de_global = pd.read_csv(de_global_path)
    print(f"Loaded global DE: {len(de_global)} genes")

# Write the table for the Shiny app (row index omitted)
output_path = OUTPUT_DIR / 'shiny_app_data_de_results.csv'
de_global.to_csv(output_path, index=False)
print(f"Saved: {output_path}")

In [None]:
# Cell type-specific DE: reuse the upstream table when present, otherwise run a
# COVID-vs-Control Wilcoxon test separately within each cell type.
de_ct_path = TABLES_DIR / 'de_by_celltype.csv'
if de_ct_path.exists():
    de_celltype = pd.read_csv(de_ct_path)
    print(f"Loaded cell type DE: {len(de_celltype)} rows")
else:
    # Run DE per cell type
    print("Running cell type-specific DE...")
    de_results = []
    for ct in adata.obs['cell_type'].unique():
        ct_mask = adata.obs['cell_type'] == ct
        # Count cells per condition from obs alone, so the (potentially large)
        # subset is only copied for cell types that will actually be tested.
        ct_conditions = adata.obs.loc[ct_mask, 'condition']
        n_covid = (ct_conditions == 'COVID').sum()
        n_ctrl = (ct_conditions == 'Control').sum()
        # Require at least 50 cells in each condition
        if n_covid < 50 or n_ctrl < 50:
            continue
        adata_ct = adata[ct_mask].copy()
        sc.tl.rank_genes_groups(
            adata_ct,
            groupby='condition',
            groups=['COVID'],
            reference='Control',
            method='wilcoxon',
            n_genes=adata_ct.n_vars
        )
        de_ct = sc.get.rank_genes_groups_df(adata_ct, group='COVID')
        de_ct = de_ct.rename(columns={'names': 'gene', 'logfoldchanges': 'log2FC', 'pvals_adj': 'padj'})
        de_ct['cell_type'] = ct
        de_results.append(de_ct)
        print(f"  {ct}: {len(de_ct)} genes")

    if de_results:
        de_celltype = pd.concat(de_results, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; fall back to an empty
        # table with the same columns the per-cell-type results would carry.
        print("  No cell type has >= 50 cells in both conditions; writing an empty table")
        de_celltype = pd.DataFrame(
            columns=['gene', 'scores', 'log2FC', 'pvals', 'padj', 'cell_type']
        )

# Save for Shiny
output_path = OUTPUT_DIR / 'shiny_app_data_de_celltype.csv'
de_celltype.to_csv(output_path, index=False)
print(f"Saved: {output_path}")

## 4. Cell-Cell Interactions (Curated L-R Pairs, Expression-Based Scores)

In [None]:
# Generate ligand-receptor interaction data
# Based on known COVID-relevant pathways.
#
# Each (ligand, receptor) pair is scored for every sender -> receiver cell type
# combination, separately per condition, as
#     mean ligand expression in sender * mean receptor expression in receiver.
#
# Receptor assignments follow established pairings: CXCL8 signals through
# CXCR1 (not CXCR4), CXCL10 through CXCR3 (not CXCR4), and WNT5A through
# Frizzled receptors (not FGFR1).
lr_pairs = {
    'TGFb': [('TGFB1', 'TGFBR1'), ('TGFB1', 'TGFBR2'), ('TGFB2', 'TGFBR1')],
    'IL1': [('IL1B', 'IL1R1'), ('IL1A', 'IL1R1')],
    'IL6': [('IL6', 'IL6R'), ('IL6', 'IL6ST')],
    'TNF': [('TNF', 'TNFRSF1A')],
    'Chemokine': [('CCL2', 'CCR2'), ('CXCL8', 'CXCR1'), ('CXCL10', 'CXCR3'), ('CXCL12', 'CXCR4')],
    'PDGF': [('PDGFA', 'PDGFRA'), ('PDGFB', 'PDGFRB')],
    'FGF': [('FGF2', 'FGFR1'), ('FGF7', 'FGFR2')],
    'Wnt': [('WNT5A', 'FZD5')],
    'Efferocytosis': [('GAS6', 'AXL'), ('GAS6', 'MERTK')],
}

lr_conditions = ['COVID', 'Control']
lr_min_cells = 10  # groups with fewer cells are too small for a stable mean
lr_index_cols = ['pathway', 'ligand', 'receptor', 'sender', 'receiver']

cell_types = adata.obs['cell_type'].unique().tolist()

# Boolean cell masks for every (cell type, condition) group that is large enough.
# Built once instead of once per pair x sender x receiver.
group_masks = {}
for ct in cell_types:
    for cond in lr_conditions:
        mask = (adata.obs['cell_type'] == ct) & (adata.obs['condition'] == cond)
        if mask.sum() >= lr_min_cells:
            group_masks[(ct, cond)] = mask

# Cache of mean expression keyed by (gene, cell type, condition), so each mean
# is computed from the matrix only once however many pairs reuse it.
group_mean_cache = {}


def _group_mean(gene, ct, cond):
    """Return the mean expression of `gene` over the cells of (ct, cond), cached."""
    key = (gene, ct, cond)
    if key not in group_mean_cache:
        X = adata[group_masks[(ct, cond)], gene].X
        if sparse.issparse(X):
            X = X.toarray()
        group_mean_cache[key] = float(np.mean(X))
    return group_mean_cache[key]


interactions = []
for pathway, pairs in lr_pairs.items():
    for ligand, receptor in pairs:
        # Skip pairs where either gene is absent from the dataset
        if ligand not in adata.var_names or receptor not in adata.var_names:
            continue
        for sender in cell_types:
            for receiver in cell_types:
                for cond in lr_conditions:
                    # Both the sender and the receiver group must be large enough
                    if (sender, cond) not in group_masks or (receiver, cond) not in group_masks:
                        continue

                    lig_mean = _group_mean(ligand, sender, cond)
                    rec_mean = _group_mean(receptor, receiver, cond)
                    score = lig_mean * rec_mean  # Product-based interaction score

                    interactions.append({
                        'pathway': pathway,
                        'ligand': ligand,
                        'receptor': receptor,
                        'sender': sender,
                        'receiver': receiver,
                        'condition': cond,
                        'score': score
                    })

interactions_df = pd.DataFrame(
    interactions,
    columns=lr_index_cols + ['condition', 'score']
)
print(f"Raw interactions: {len(interactions_df)}")

# Pivot to get COVID vs Control side by side
if interactions_df.empty:
    # Nothing to pivot (no pair present / no group large enough): emit an empty
    # table with the expected columns instead of failing inside pivot_table.
    interactions_wide = pd.DataFrame(columns=lr_index_cols + lr_conditions).astype(
        {cond: float for cond in lr_conditions}
    )
else:
    interactions_wide = interactions_df.pivot_table(
        index=lr_index_cols,
        columns='condition',
        values='score',
        fill_value=0
    ).reset_index()
    # If one condition produced no rows at all, its column is missing after the
    # pivot; add it as zeros so the log2FC and filter below do not raise KeyError.
    for cond in lr_conditions:
        if cond not in interactions_wide.columns:
            interactions_wide[cond] = 0.0
    interactions_wide = interactions_wide[lr_index_cols + lr_conditions]

# Calculate log2FC (small pseudo-count avoids division by zero / log of zero)
interactions_wide['log2FC'] = np.log2(
    (interactions_wide['COVID'] + 1e-6) / (interactions_wide['Control'] + 1e-6)
)

# Filter to meaningful interactions (score above 0.001 in at least one condition)
interactions_wide = interactions_wide[
    (interactions_wide['COVID'] > 0.001) | (interactions_wide['Control'] > 0.001)
]

print(f"Filtered interactions: {len(interactions_wide)}")

# Save
output_path = OUTPUT_DIR / 'shiny_app_data_interactions.csv'
interactions_wide.to_csv(output_path, index=False)
print(f"Saved: {output_path}")

## Summary

In [None]:
# List every generated Shiny data file with its size on disk
print("Generated Shiny app data files:")
print("=" * 60)
generated_files = sorted(OUTPUT_DIR.glob('shiny_app_data_*.csv'))
for csv_file in generated_files:
    size_mb = csv_file.stat().st_size / 1e6  # bytes -> megabytes
    print(f"  {csv_file.name}: {size_mb:.1f} MB")

# Where the files went, and how to launch the app that reads them
print(f"\nAll files saved to: {OUTPUT_DIR}")
print("\nTo run the Shiny app:")
print("  cd covid19-lung-atlas")
print("  R -e 'shiny::runApp(\"shiny_app.R\")' ")