# 06 - Myeloid Cell Analysis

**COVID-19 GSE171524 Single-Cell Analysis**

This notebook analyzes myeloid cell dysregulation in COVID-19 lungs.

## Key Paper Findings (Melms et al. 2021, Fig 2)
1. **NEAT1/MALAT1 high** in COVID macrophages (lncRNA dysregulation)
2. **AXL/MERTK low** (impaired efferocytosis)
3. **Monocyte-derived macrophages (MDM)** accumulation vs alveolar macrophages (AM)
4. **IL-1β expression** primarily in myeloid cells

## Objectives
1. Subset and re-cluster myeloid cells
2. Identify AM vs MDM populations
3. Validate NEAT1/MALAT1/AXL expression patterns
4. Analyze inflammatory cytokine sources
5. Compare COVID vs Control myeloid states

In [None]:
# Import libraries
import os
import sys
import warnings
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats, sparse

warnings.filterwarnings('ignore')

# Add scripts to path
sys.path.insert(0, '../scripts')
from markers import MYELOID_SUBTYPES, MYELOID_DYSFUNCTION, INFLAMMATORY_CYTOKINES
from plotting import COVID_COLORS

# Scanpy configuration. The figure defaults and the logging verbosity are
# independent settings, so the order in which they are applied does not matter.
sc.settings.set_figure_params(facecolor='white', dpi=100)
sc.settings.verbosity = 3

# Record the scanpy version in the cell output.
scanpy_version_line = f"Scanpy: {sc.__version__}"
print(scanpy_version_line)

In [None]:
# Figure output directory for this notebook (created if it does not exist yet)
FIGURE_DIR = Path('../results/figures/myeloid')
FIGURE_DIR.mkdir(parents=True, exist_ok=True)

# Input (annotated AnnData) and the directory for processed outputs
INPUT_PATH = Path('../data/processed_data/adata_annotated.h5ad')
OUTPUT_DIR = Path('../data/processed_data')

# Read the annotated dataset and report how many cells it contains
print(f"Loading: {INPUT_PATH}")
adata_full = sc.read_h5ad(INPUT_PATH)
n_cells_loaded = adata_full.n_obs
print(f"Loaded: {n_cells_loaded:,} cells")

In [None]:
# Keep only the cells labelled 'Myeloid' in obs['cell_type'].
# .copy() turns the AnnData view into an independent object so that later
# writes to adata do not touch adata_full.
myeloid_mask = adata_full.obs['cell_type'] == 'Myeloid'
adata = adata_full[myeloid_mask, :].copy()

# Report subset size and the per-condition cell counts
n_myeloid = adata.n_obs
condition_counts = adata.obs['condition'].value_counts()
print(f"Myeloid cells: {n_myeloid:,}")
print(f"\nBy condition:")
print(condition_counts)

## Re-cluster Myeloid Cells

In [None]:
# Rebuild the neighbor graph on the myeloid subset. When an scVI embedding is
# stored in .obsm it is used as the representation; otherwise scanpy picks its
# default representation.
neighbor_kwargs = {'n_neighbors': 15}
if 'X_scVI' in adata.obsm:
    neighbor_kwargs['use_rep'] = 'X_scVI'
sc.pp.neighbors(adata, **neighbor_kwargs)

# Embed and cluster on the freshly computed graph; cluster labels are written
# to obs['myeloid_cluster'].
sc.tl.umap(adata, min_dist=0.3)
sc.tl.leiden(adata, key_added='myeloid_cluster', resolution=0.5)

n_subclusters = adata.obs['myeloid_cluster'].nunique()
print(f"Myeloid subclusters: {n_subclusters}")

In [None]:
# Side-by-side UMAPs of the myeloid subset: subclusters (left) and
# condition (right).
fig, (ax_cluster, ax_condition) = plt.subplots(1, 2, figsize=(14, 5))

sc.pl.umap(
    adata,
    color='myeloid_cluster',
    title='Myeloid Subclusters',
    legend_loc='on data',
    ax=ax_cluster,
    show=False,
)
sc.pl.umap(
    adata,
    color='condition',
    palette=COVID_COLORS,
    title='Condition',
    ax=ax_condition,
    show=False,
)

# Save to the notebook's figure directory, then display
umap_png = FIGURE_DIR / 'umap_myeloid.png'
plt.tight_layout()
plt.savefig(umap_png, dpi=150)
plt.show()

## Identify AM vs MDM

In [None]:
# Score every myeloid subtype signature, using only the signature genes that
# are present in adata.var_names. Each score is stored as obs['<subtype>_score'].
for subtype, genes in MYELOID_SUBTYPES.items():
    present = [gene for gene in genes if gene in adata.var_names]
    if not present:
        # No signature gene is in the data: no score column is written.
        continue
    sc.tl.score_genes(adata, present, score_name=f'{subtype}_score')
    print(f"{subtype}: {len(present)} genes")

In [None]:
# Three panels: AM score on the UMAP, MDM score on the UMAP, and a per-cell
# AM-vs-MDM scatter. Requires obs['AM_score'] and obs['MDM_score'].
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

score_panels = [
    ('AM_score', 'Alveolar Macrophage Score'),
    ('MDM_score', 'Monocyte-Derived Macrophage Score'),
]
# zip stops after the two score panels, leaving axes[2] for the scatter
for panel_ax, (score_key, panel_title) in zip(axes, score_panels):
    sc.pl.umap(adata, color=score_key, cmap='RdYlBu_r', ax=panel_ax, show=False,
               title=panel_title)

# One point per cell, colored by looking its condition up in COVID_COLORS
scatter_ax = axes[2]
colors = [COVID_COLORS[condition] for condition in adata.obs['condition']]
scatter_ax.scatter(adata.obs['AM_score'], adata.obs['MDM_score'],
                   c=colors, alpha=0.3, s=5)
scatter_ax.set_xlabel('AM Score')
scatter_ax.set_ylabel('MDM Score')
# NOTE(review): the title asserts Red=COVID; confirm COVID_COLORS['COVID'] is red.
scatter_ax.set_title('AM vs MDM (Red=COVID)')

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'am_vs_mdm_scores.png', dpi=150)
plt.show()

In [None]:
# Label each cell by whichever signature score is strictly larger.
# Cells where neither comparison holds (e.g. equal scores) stay 'Other'.
am_wins = adata.obs['AM_score'] > adata.obs['MDM_score']
mdm_wins = adata.obs['MDM_score'] > adata.obs['AM_score']

subtype_labels = pd.Series('Other', index=adata.obs.index)
subtype_labels = subtype_labels.mask(am_wins, 'AM')
subtype_labels = subtype_labels.mask(mdm_wins, 'MDM')
adata.obs['myeloid_subtype'] = subtype_labels

subtype_counts = adata.obs['myeloid_subtype'].value_counts()
print("Myeloid subtype distribution:")
print(subtype_counts)

In [None]:
# Percentage of each myeloid subtype within each condition
# (normalize='index' makes every condition row sum to 100 after scaling).
subtype_props = 100 * pd.crosstab(
    index=adata.obs['condition'],
    columns=adata.obs['myeloid_subtype'],
    normalize='index',
)

print("AM/MDM proportions (%):")
print(subtype_props)

# Grouped bar chart, one group per condition.
# NOTE(review): colors are assigned by column position, not by subtype name;
# this presumably relies on the columns being ordered AM, MDM, Other.
bar_colors = ['#3498DB', '#E74C3C', '#95A5A6']
fig, ax = plt.subplots(figsize=(8, 5))
subtype_props.plot.bar(ax=ax, color=bar_colors)
ax.set(ylabel='Percentage', xlabel='', title='Macrophage Subtype Proportions')
ax.legend(title='Subtype')
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig(FIGURE_DIR / 'am_mdm_proportions.png', dpi=150)
plt.show()

## NEAT1/MALAT1/AXL Analysis

In [None]:
# Genes highlighted in the paper; keep only those present in adata.var_names
paper_genes = ['NEAT1', 'MALAT1', 'AXL', 'MERTK', 'GAS6']
measured_genes = adata.var_names
available = [gene for gene in paper_genes if gene in measured_genes]

print(f"Available genes: {available}")

In [None]:
# Plot NEAT1, MALAT1, AXL, MERTK and GAS6 expression on the myeloid UMAP.
# Only genes present in adata.var_names get a panel; they are packed into
# consecutive panels so a missing gene does not leave an empty framed axis.
umap_genes = [
    gene for gene in ['NEAT1', 'MALAT1', 'AXL', 'MERTK', 'GAS6']
    if gene in adata.var_names
]

fig, axes = plt.subplots(2, 3, figsize=(15, 9))
axes = axes.flatten()

for ax, gene in zip(axes, umap_genes):
    sc.pl.umap(adata, color=gene, cmap='viridis', ax=ax, show=False)

# Hide every panel that did not receive a gene (at least the sixth one)
for ax in axes[len(umap_genes):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'umap_key_genes.png', dpi=150)
plt.show()

In [None]:
# Per-condition violins for the key genes, one panel per gene.
# genes_to_plot is reused by the statistical comparison cell.
genes_to_plot = ['NEAT1', 'MALAT1', 'AXL', 'MERTK']
fig, axes = plt.subplots(1, 4, figsize=(16, 4))

violin_style = dict(groupby='condition', palette=COVID_COLORS, show=False)
for panel, gene in zip(axes, genes_to_plot):
    if gene not in adata.var_names:
        # Gene absent from the data: its panel is left empty.
        continue
    sc.pl.violin(adata, gene, ax=panel, **violin_style)
    panel.set_title(gene)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'violin_key_genes.png', dpi=150)
plt.show()

In [None]:
# Statistical comparison of the key genes between COVID and Control myeloid
# cells: per-gene group means, ratio of means, and a two-sided Mann-Whitney U
# test computed over individual cells.
print("COVID vs Control expression (myeloid):")
print("="*50)

# The group masks do not depend on the gene, so build them once.
covid_mask = (adata.obs['condition'] == 'COVID').values
ctrl_mask = (adata.obs['condition'] == 'Control').values

for gene in genes_to_plot:
    if gene not in adata.var_names:
        continue

    # Extract the gene's expression column from adata.X as a flat dense vector
    X_gene = adata[:, gene].X
    if sparse.issparse(X_gene):
        X_gene = X_gene.toarray()
    X_gene = np.asarray(X_gene).flatten()

    covid_expr = X_gene[covid_mask]
    ctrl_expr = X_gene[ctrl_mask]

    print(f"\n{gene}:")

    # With an empty group neither the means nor the rank test are meaningful
    # (the test either errors or yields NaN, depending on the SciPy version),
    # so report the situation instead of failing mid-loop.
    if covid_expr.size == 0 or ctrl_expr.size == 0:
        print(f"  Skipped: COVID n={covid_expr.size}, Control n={ctrl_expr.size}")
        continue

    pval = stats.mannwhitneyu(covid_expr, ctrl_expr, alternative='two-sided').pvalue

    print(f"  COVID mean: {covid_expr.mean():.3f}")
    print(f"  Control mean: {ctrl_expr.mean():.3f}")
    # The control mean is floored at 0.001 to avoid division by zero.
    # NOTE(review): this is a ratio of means of whatever adata.X holds; if X is
    # log-transformed it is not a fold change on the count scale - confirm.
    print(f"  Fold change: {covid_expr.mean() / max(ctrl_expr.mean(), 0.001):.2f}")
    print(f"  P-value: {pval:.2e}")

## Inflammatory Cytokine Analysis

In [None]:
# Cytokine genes to check (IL-1β first), restricted to those present in
# adata.var_names
cytokines = ['IL1B', 'IL6', 'TNF', 'CCL2', 'CCL3', 'CXCL8']
measured_genes = adata.var_names
available_cyto = [cytokine for cytokine in cytokines if cytokine in measured_genes]

print(f"Available cytokines: {available_cyto}")

In [None]:
# Per-condition violins for each available cytokine on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

# zip stops at the shorter sequence, so at most len(axes) genes are drawn
for panel, gene in zip(axes, available_cyto):
    sc.pl.violin(
        adata,
        gene,
        groupby='condition',
        palette=COVID_COLORS,
        ax=panel,
        show=False,
    )
    panel.set_title(gene)

# Blank out the panels beyond the last plotted gene
for unused_panel in axes[len(available_cyto):]:
    unused_panel.set_visible(False)

plt.tight_layout()
plt.savefig(FIGURE_DIR / 'cytokine_expression.png', dpi=150)
plt.show()

In [None]:
# IL1B expression by myeloid subtype and condition
if 'IL1B' in adata.var_names:
    fig, ax = plt.subplots(figsize=(10, 5))

    # Create combined grouping "<subtype>_<condition>".
    # Both columns are cast to str first: obs columns are categorical after
    # loading from h5ad and after scanpy plotting functions sanitize obs, and
    # pandas does not support `+` on categorical Series.
    adata.obs['subtype_condition'] = (
        adata.obs['myeloid_subtype'].astype(str)
        + '_'
        + adata.obs['condition'].astype(str)
    )

    sc.pl.violin(
        adata,
        'IL1B',
        groupby='subtype_condition',
        rotation=45,
        ax=ax,
        show=False
    )
    ax.set_title('IL1B Expression by Myeloid Subtype and Condition')

    plt.tight_layout()
    plt.savefig(FIGURE_DIR / 'il1b_by_subtype.png', dpi=150)
    plt.show()

## Myeloid Dysfunction Score

In [None]:
# Score the three myeloid dysfunction marker sets. For each set, only genes
# present in adata.var_names are used, and a score column is written only when
# at least one gene is available.
dysfunction_axes = (
    ('lncRNA_high', 'lncRNA_score'),                # high lncRNA
    ('efferocytosis_low', 'efferocytosis_score'),   # low efferocytosis
    ('pro_inflammatory', 'inflammatory_score'),     # pro-inflammatory
)

present_markers = {}
for marker_key, score_name in dysfunction_axes:
    marker_genes = [g for g in MYELOID_DYSFUNCTION[marker_key] if g in adata.var_names]
    present_markers[marker_key] = marker_genes
    if marker_genes:
        sc.tl.score_genes(adata, marker_genes, score_name=score_name)

# Expose the per-set gene lists under their individual names
lncrna_genes = present_markers['lncRNA_high']
efferocytosis_genes = present_markers['efferocytosis_low']
inflamm_genes = present_markers['pro_inflammatory']

print("Myeloid dysfunction scores computed")

In [None]:
# Plot dysfunction scores by condition, one violin panel per score column
# that was actually computed.
score_cols = ['lncRNA_score', 'efferocytosis_score', 'inflammatory_score']
score_cols = [c for c in score_cols if c in adata.obs.columns]

if score_cols:
    # squeeze=False always returns a 2-D axes array, so one panel and several
    # panels are handled the same way.
    fig, axes = plt.subplots(
        1, len(score_cols), figsize=(5*len(score_cols), 4), squeeze=False
    )
    axes = axes.ravel()

    for ax, col in zip(axes, score_cols):
        sc.pl.violin(
            adata,
            col,
            groupby='condition',
            palette=COVID_COLORS,
            ax=ax,
            show=False
        )
        ax.set_title(col.replace('_', ' ').title())

    plt.tight_layout()
    plt.savefig(FIGURE_DIR / 'dysfunction_scores.png', dpi=150)
    plt.show()
else:
    # plt.subplots cannot create a grid with zero columns; skip the figure.
    print("No dysfunction score columns found in adata.obs; skipping plot")

In [None]:
# Combined dysfunction score: high lncRNA + high inflammatory + low efferocytosis.
# Computed and plotted only when all three component scores exist in obs.
component_scores = ('lncRNA_score', 'efferocytosis_score', 'inflammatory_score')
missing_components = [name for name in component_scores if name not in adata.obs.columns]

if not missing_components:
    # Efferocytosis enters with a negative sign; the raw scores are combined
    # without rescaling.
    obs = adata.obs
    adata.obs['myeloid_dysfunction_score'] = (
        obs['lncRNA_score']
        + obs['inflammatory_score']
        - obs['efferocytosis_score']
    )

    fig, (ax_umap, ax_violin) = plt.subplots(1, 2, figsize=(12, 4))

    sc.pl.umap(
        adata,
        color='myeloid_dysfunction_score',
        cmap='RdYlBu_r',
        title='Myeloid Dysfunction Score',
        ax=ax_umap,
        show=False,
    )

    sc.pl.violin(
        adata,
        'myeloid_dysfunction_score',
        groupby='condition',
        palette=COVID_COLORS,
        ax=ax_violin,
        show=False,
    )
    ax_violin.set_title('Dysfunction by Condition')

    plt.tight_layout()
    plt.savefig(FIGURE_DIR / 'myeloid_dysfunction_combined.png', dpi=150)
    plt.show()

## Differential Expression in Myeloid Cells

In [None]:
# DE in myeloid cells: COVID vs Control, Wilcoxon rank-sum test.
# n_genes is left at its default so that every gene is kept in the result.
# Truncating to the top-ranked genes would drop the genes ranked lowest
# (downregulated in COVID, e.g. the efferocytosis genes this notebook
# examines) from the saved table.
sc.tl.rank_genes_groups(
    adata,
    groupby='condition',
    groups=['COVID'],
    reference='Control',
    method='wilcoxon'
)

# Extract results for the COVID group as a DataFrame
de_myeloid = sc.get.rank_genes_groups_df(adata, group='COVID')

print("Top upregulated in COVID myeloid:")
print(de_myeloid.head(20)[['names', 'logfoldchanges', 'pvals_adj']])

In [None]:
# Save myeloid subset (gzip-compressed h5ad) into OUTPUT_DIR
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
output_path = OUTPUT_DIR / 'adata_myeloid.h5ad'
adata.write_h5ad(output_path, compression='gzip')
print(f"Saved: {output_path}")

# Save DE results. The tables directory is not created anywhere else in this
# notebook, so create it here; to_csv does not create missing parent folders.
de_table_path = Path('../results/tables/de_myeloid.csv')
de_table_path.parent.mkdir(parents=True, exist_ok=True)
de_myeloid.to_csv(de_table_path, index=False)
print(f"Saved: {de_table_path}")

## Summary

### Key Findings (Reproducing Paper)

1. **NEAT1/MALAT1**: Expected to be elevated in COVID myeloid cells
2. **AXL/MERTK**: Expected to be reduced in COVID myeloid cells (impaired efferocytosis)
3. **MDM accumulation**: Expected to show a higher proportion of monocyte-derived macrophages in COVID
4. **IL-1β**: Myeloid cells expected to be the primary source

### Output
- `data/processed_data/adata_myeloid.h5ad` - Myeloid subset
- `results/tables/de_myeloid.csv` - Myeloid DE results
- `results/figures/myeloid/*.png` - Figures saved by the plotting cells

### Next Steps
→ **07_epithelial_trajectory.ipynb**: AT2→DATP→AT1 trajectory analysis

In [None]:
# Session info: report the scanpy version used for this run
session_lines = ["\n=== Session Info ===", f"Scanpy: {sc.__version__}"]
print(*session_lines, sep="\n")