In [None]:
# Notebook 2: RNA Clustering, Marker Genes, and Cell Type Annotation

In [1]:
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:

# Load preprocessed RNA data
adata = sc.read_h5ad("data/filtered_rna_subset.h5ad")

# Scale every cell to 10,000 total counts, then apply log(1 + x)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Per-cell sum of the transformed matrix, used as a post-normalization sanity check.
# Note: this is a sum of log1p-transformed values, not of raw counts.
# np.asarray(...).ravel() handles both a scipy sparse matrix (whose row sum is an
# np.matrix) and a dense ndarray; `.A1` exists only on np.matrix and would raise
# AttributeError for dense input.
adata.obs['total_counts_post'] = np.asarray(adata.X.sum(axis=1)).ravel()

plt.figure(figsize=(8, 4))
sns.histplot(adata.obs['total_counts_post'], bins=50)
plt.title("Total Counts per Cell After Normalization")
plt.xlabel("Total Counts")
plt.tight_layout()
plt.savefig("results/plots/ag_total_counts_post_norm.png", dpi=300)
plt.close()

In [22]:

# Highly Variable Genes

# Flag mitochondrial genes by name prefix (case-insensitive "MT-", e.g. MT-CO1, MT-ND1)
adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')  # for human MT-CO1, MT-ND1 etc.

# NOTE(review): if the normalization cell has already run, adata.X holds
# normalized, log-transformed values here, so the metrics below (including the
# mitochondrial percentage) are computed on transformed values rather than raw
# counts. Confirm this is intended.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True
)

# Select the 2000 most variable genes and save the diagnostic plot
sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000)
sc.pl.highly_variable_genes(adata, show=False)
plt.savefig("results/plots/ag_highly_variable_genes.png", dpi=300)
plt.close()

# Slicing an AnnData returns a view of the parent object. Take an explicit copy so
# that later in-place steps (PCA, neighbors, new obs columns) operate on a real
# object instead of forcing an implicit copy of the view.
adata = adata[:, adata.var['highly_variable']].copy()


In [None]:
# PCA: compute principal components with the ARPACK solver, then save the
# log-scaled variance-ratio plot for the first 50 components.
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca_variance_ratio(
    adata,
    n_pcs=50,
    log=True,
    show=False,
)
plt.savefig("results/plots/ag_pca_variance_ratio.png", dpi=300)
plt.close()


In [24]:
# Clustering with Multiple Resolutions
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

# The UMAP embedding is derived from the neighbor graph only; it does not depend on
# the Leiden labels. Compute it once instead of once per resolution.
sc.tl.umap(adata)

for res in [0.3, 0.5, 0.8]:
    # Each resolution is stored under its own obs column, e.g. 'leiden_0.5'
    sc.tl.leiden(adata, resolution=res, key_added=f'leiden_{res}')
    sc.pl.umap(adata, color=f'leiden_{res}', title=f"Leiden Clusters (res={res})", show=False)
    plt.savefig(f"results/plots/ag_umap_leiden_{res}.png", dpi=300)
    plt.close()

# Choose best resolution and continue
adata.obs['leiden'] = adata.obs['leiden_0.5']


In [None]:
# Marker Genes per Cluster: Wilcoxon rank-sum test of each Leiden cluster against the rest
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=10, sharey=False, show=False)
plt.savefig("results/plots/ag_top_marker_genes_per_cluster.png", dpi=300)
plt.close()

# Save top 3 genes per cluster.
# 'names' is a structured array with one field per cluster, so it converts
# directly into a DataFrame with one column per cluster.
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
top_genes_df = pd.DataFrame(result['names']).head(3)
top_genes_df.to_csv("results/files/ag_top_marker_genes.csv")
display(top_genes_df)


In [26]:
# Marker Gene UMAP Expression
top_markers = top_genes_df.values.flatten()
# Remove duplicates while keeping first-seen order. A plain set() has arbitrary
# iteration order for strings, so the panel layout of the saved figure could
# change from one kernel session to the next.
top_markers = list(dict.fromkeys(top_markers))
sc.pl.umap(adata, color=top_markers, ncols=3, show=False)
plt.savefig("results/plots/ag_umap_marker_gene_expression.png", dpi=300)
plt.close()


In [27]:

# Mito Quality Check by Cluster: one violin per Leiden cluster for each QC metric.
# NOTE(review): 'n_counts' is not created by any cell visible in this notebook;
# presumably it is already present in the loaded .h5ad. Verify.
sc.pl.violin(
    adata,
    keys=['n_counts', 'n_genes_by_counts', 'pct_counts_mt'],
    groupby='leiden',
    show=False,
    stripplot=False,
)
plt.savefig("results/plots/ag_qc_metrics_by_cluster.png", dpi=300)
plt.close()

In [28]:
# Manual Cell Type Annotation
# Maps Leiden cluster ids (strings) to hand-assigned cell type names.
cluster_labels = {
    "0": "Naive T cells",
    "1": "Monocytes",
    "2": "Cytotoxic T/NK",
    "3": "Memory T cells",
    "4": "Dendritic cells",
    "5": "B cells",
    "6": "Monocytes / Neutrophils",
    "7": "NK cells",
    "8": "Mito-rich/Unknown",  # (speculation) flag for investigation 
    "9": "pDCs or DC subtype"
}

# Series.map leaves NaN for any cluster id missing from the dict. Fail loudly so a
# changed cluster count cannot silently produce unlabeled cells.
unlabeled_clusters = sorted(set(adata.obs['leiden'].astype(str)) - set(cluster_labels))
assert not unlabeled_clusters, f"No cell type label for Leiden cluster(s): {unlabeled_clusters}"

adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_labels)

In [29]:
# Annotated UMAP: cell type names are drawn directly on the embedding
sc.pl.umap(
    adata,
    color='cell_type',
    title="Cell Type Annotations",
    legend_loc='on data',
    show=False,
)
plt.savefig("results/plots/ag_umap_cell_type_annotations.png", dpi=300)
plt.close()

In [30]:
# Cell Type Proportions: fraction of cells per annotated type, ordered by label
cell_counts = adata.obs['cell_type'].value_counts(normalize=True).sort_index()

plt.figure(figsize=(8, 4))
sns.barplot(y=cell_counts.values, x=cell_counts.index)
plt.title("Cell Type Proportions")
plt.ylabel("Proportion")
plt.xticks(rotation=90)  # long labels are rotated so they stay readable
plt.tight_layout()
plt.savefig("results/plots/ag_cell_type_proportions.png", dpi=300)
plt.close()


In [31]:

# Save Annotated Dataset
adata.write("results/files/ag_rna_annotated.h5ad")

# Also save useful metadata for ML downstream: cluster/cell-type labels per cell
adata.obs[['leiden', 'cell_type']].to_csv("results/files/ag_rna_labels.csv")

# Export each embedding as a CSV indexed by cell barcode
for obsm_key, csv_path in (
    ('X_pca', "results/files/ag_rna_pca.csv"),
    ('X_umap', "results/files/ag_rna_umap.csv"),
):
    pd.DataFrame(adata.obsm[obsm_key], index=adata.obs_names).to_csv(csv_path)

In [32]:

# Cluster 8 was characterized by high mitochondrial gene content and low gene complexity,
# a signature of apoptotic or dying cells.
# Such cells are often excluded from downstream biological and ML analyses to avoid
# noise from low-quality data.

# Leiden id of the cluster to drop (ids are stored as strings)
LOW_QUALITY_CLUSTER = '8'

# Load the previously annotated RNA data (scanpy is already imported at the top of the notebook)
adata_annotated = sc.read_h5ad("results/files/ag_rna_annotated.h5ad")

# Remove the low-quality cluster; .copy() turns the filtered view into an independent object
adata_cleaned = adata_annotated[adata_annotated.obs['leiden'] != LOW_QUALITY_CLUSTER].copy()

# Drop unused categories so the removed cluster no longer appears in the metadata
adata_cleaned.obs['leiden'] = adata_cleaned.obs['leiden'].cat.remove_unused_categories()
adata_cleaned.obs['cell_type'] = adata_cleaned.obs['cell_type'].cat.remove_unused_categories()

# Save the cleaned file
adata_cleaned.write("results/files/ag_rna_annotated_cleaned.h5ad")


In [33]:
# Re-cluster and visualize the cleaned data (after removing Cluster 8) at resolution 0.5

# Re-compute neighbors and Leiden clustering at resolution=0.5
sc.pp.neighbors(adata_cleaned, n_neighbors=10, n_pcs=40)
sc.tl.leiden(adata_cleaned, resolution=0.5, key_added="leiden_0.5")

# Keep the cell type assigned before filtering; 'cell_type' is overwritten below
previous_cell_type = adata_cleaned.obs['cell_type'].astype(str)

adata_cleaned.obs['leiden'] = adata_cleaned.obs['leiden_0.5']

# Annotate clusters based on chosen resolution.
# The Leiden run above assigns cluster ids afresh, so the ids from the earlier
# clustering no longer identify the same groups of cells. A label map keyed on the
# old ids would mislabel clusters and leave NaN for ids it does not contain.
# Instead, each new cluster inherits the most common previous cell type among its cells.
label_votes = pd.crosstab(adata_cleaned.obs['leiden'].astype(str), previous_cell_type)
cluster_labels_05 = label_votes.idxmax(axis=1).to_dict()
adata_cleaned.obs['cell_type'] = adata_cleaned.obs['leiden'].astype(str).map(cluster_labels_05)

# Store both label columns as categoricals
adata_cleaned.obs['leiden'] = adata_cleaned.obs['leiden'].astype("category")
adata_cleaned.obs['cell_type'] = adata_cleaned.obs['cell_type'].astype("category")

# Recompute the UMAP on the new neighbor graph BEFORE saving, so the written file
# does not carry the embedding computed with the removed cluster still present.
sc.tl.umap(adata_cleaned)

# Save updated object
adata_cleaned.write("results/files/ag_rna_final_cleaned_annotated.h5ad")

# Plot final UMAP
sc.pl.umap(adata_cleaned, color="cell_type", legend_loc="on data", title="Final Cell Type Annotations", show=False)
plt.savefig("results/plots/ag_umap_final_cell_type_annotations.png", dpi=300)
plt.close()