In [1]:
!uv pip install 'scanpy @ git+https://github.com/keller-mark/scanpy@af55e9d'

[2K[2mResolved [1m36 packages[0m [2min 229ms[0m[0m                                        [0m
[2mAudited [1m36 packages[0m [2min 0.20ms[0m[0m


In [1]:
from anndata import read_zarr, read_h5ad, AnnData
from mudata import MuData
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
from compasce.io import dir_name_to_str

In [3]:
def get_adata(
    path="KPMP_PREMIERE_SC_version1.5_ForExplorer_withRC.032624.h5ad",
    should_subset=True,
    n_obs=25_000,
    n_vars=10_000,
    seed=1,
):
    """Load an h5ad file and expose its ``raw`` matrix as a dense "counts" layer.

    With the default arguments this behaves exactly like the original
    zero-argument version.

    Args:
        path: Path of the h5ad file to read.
        should_subset: When true, keep a random sample of observations and the
            first ``n_vars`` variables; when false, keep everything.
        n_obs: Number of observations to sample. Clamped to the number of
            observations available in the file.
        n_vars: Number of leading variables to keep.
        seed: Seed passed to ``np.random.seed`` before sampling. Note that this
            reseeds NumPy's global random state.

    Returns:
        An AnnData object whose ``layers["counts"]`` holds the densified
        ``raw.X`` values and whose ``raw`` attribute has been cleared.
    """
    adata = read_h5ad(path)

    if should_subset:
        # Subset using a random sample (rather than the leading rows) so that multiple
        # sample groups are represented to enable comparison.
        np.random.seed(seed)
        # Sampling without replacement raises if more items are requested than exist,
        # so never ask for more observations than the file contains.
        sample_size = min(n_obs, adata.n_obs)
        obs_subset = np.random.choice(
            adata.obs.index.tolist(), size=sample_size, replace=False
        ).tolist()
        var_slice = slice(0, n_vars)
        adata = adata[obs_subset, var_slice].copy()
        # NOTE(review): the same positional slice is applied to `adata.raw`; this assumes
        # `raw` lists its variables in the same order as the main matrix — confirm.
        adata.layers["counts"] = adata.raw[obs_subset, var_slice].X.todense()
    else:
        adata.layers["counts"] = adata.raw.X.todense()
    # The raw values now live in the "counts" layer, so drop the separate copy.
    adata.raw = None
    return adata

In [4]:
# Settings for KPMP_PREMIERE....h5ad
# Name of the obs column holding the sample identifier.
sample_id_col = "SampleID"
# Each entry is (obs column, (value, value)); presumably the pairs of sample groups
# to compare against each other — verify once the comparison code is added.
sample_group_pairs = [
    ("diseasetype", groups)
    for groups in (
        ("Reference", "AKI"),
        ("CKD", "AKI"),
        ("Reference", "CKD"),
    )
]

In [5]:
# Load the dataset and start with an empty "comparison_metadata" mapping in `uns`;
# the per-cell-type loop further down adds one entry per comparison.
adata = get_adata()
adata.uns["comparison_metadata"] = {}



In [6]:
# Show the AnnData summary after loading (the "counts" layer should be listed).
adata

AnnData object with n_obs × n_vars = 25000 × 10000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'SpecimenID', 'LibraryID', 'SampleType', 'SampleID', 'Run', 'clusterNumber', 'subclass.l1', 'subclass.l2', 'dataSource', 'diseasetype', 'umap_1', 'umap_2', 'Age', 'Gender', 'Race', 'clusterClass', 'organism', 'tissue', 'assay', 'EnrollementCategory', 'disease', 'donor_id', 'suspension_type', 'tissue_type', 'development_stage_ontology_term_id', 'development_stage', 'cell_type', 'celltype', 'PrimaryAdjudicatedCategory', 'diabetes_history', 'hypertension', 'eGFR', 'is_primary_data', 'disease_category'
    var: 'features'
    uns: 'comparison_metadata'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts'

In [7]:
# Build the "logcounts" layer: start from a copy of the counts so the "counts" layer
# itself is left untouched, rescale it with normalize_total (target_sum=1e6), then
# apply log1p to the rescaled values.
adata.layers["logcounts"] = adata.layers["counts"].copy()
sc.pp.normalize_total(adata, target_sum = 1e6, layer="logcounts", inplace=True)
adata.layers["logcounts"] = np.log1p(adata.layers["logcounts"])

In [8]:
# Show the AnnData summary again to confirm the "logcounts" layer was added.
adata

AnnData object with n_obs × n_vars = 25000 × 10000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'SpecimenID', 'LibraryID', 'SampleType', 'SampleID', 'Run', 'clusterNumber', 'subclass.l1', 'subclass.l2', 'dataSource', 'diseasetype', 'umap_1', 'umap_2', 'Age', 'Gender', 'Race', 'clusterClass', 'organism', 'tissue', 'assay', 'EnrollementCategory', 'disease', 'donor_id', 'suspension_type', 'tissue_type', 'development_stage_ontology_term_id', 'development_stage', 'cell_type', 'celltype', 'PrimaryAdjudicatedCategory', 'diabetes_history', 'hypertension', 'eGFR', 'is_primary_data', 'disease_category'
    var: 'features'
    uns: 'comparison_metadata'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts', 'logcounts'

In [9]:
# Rank genes per cell type with a Wilcoxon test on the "logcounts" layer.
# The results are stored in adata.uns under `key_added`.
cell_type_col = "cell_type"
key_added = "rank_genes_groups"
sc.tl.rank_genes_groups(
    adata,
    groupby=cell_type_col,
    method="wilcoxon",
    layer="logcounts",
    key_added=key_added,
)

# Distinct cell type labels present in obs, with missing values filtered out.
cell_types = [
    label
    for label in adata.obs[cell_type_col].unique().tolist()
    if pd.notna(label)
]

  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group

In [11]:
import posixpath
from os.path import join

In [12]:
class ComparisonMetadata:
    """Collects metadata records for the result tables of a single comparison.

    Each record describes one table: where it is stored, which analysis produced
    it, the parameters used, and the associated coordination values.
    """

    def __init__(self, comparison_key):
        """Create an empty collection for one comparison.

        Args:
            comparison_key: Identifier of the comparison. It is converted to a
                string with ``dir_name_to_str`` and that string prefixes every
                table key produced by this object.
        """
        self.comparison_key = comparison_key
        self.comparison_key_str = dir_name_to_str(comparison_key)
        # Metadata records, in the order they were appended.
        self.items = []

    def get_df_key(self, df_type):
        """Return the storage key for a table of the given type.

        Args:
            df_type: Name of the analysis that produced the table.

        Returns:
            ``"<comparison key string>.<df_type>"``.
        """
        return f"{self.comparison_key_str}.{df_type}"

    def append_df(self, adata_key, df_type, df_params, df_c_vals):
        """Record metadata for one table and return the key to store it under.

        Args:
            adata_key: Name of the AnnData element that will hold the table
                (the visible callers pass ``"uns"``).
            df_type: Name of the analysis that produced the table.
            df_params: Parameters of the analysis, stored as ``analysis_params``.
            df_c_vals: Values stored as ``coordination_values``.

        Returns:
            The table key, identical to ``get_df_key(df_type)``.
        """
        df_key = self.get_df_key(df_type)
        self.items.append({
            # This is a path inside the AnnData object, not a filesystem path, so it
            # must always use "/" as separator regardless of the operating system.
            "path": posixpath.join(adata_key, df_key),
            "coordination_values": df_c_vals,
            "analysis_type": df_type,
            "analysis_params": df_params,
        })
        return df_key

    def get_dict(self):
        """Return ``{comparison key string: list of records}``.

        The list is the live ``items`` list, not a copy.
        """
        return {
            self.comparison_key_str: self.items
        }

In [13]:
# Thresholds for the enrichment query, defined once so that the values passed to
# `sc.queries.enrich` and the values recorded in the metadata cannot drift apart.
enrich_thresholds = {"log2fc_min": 2, "pval_cutoff": .01}

for cell_type in cell_types:
    # One metadata collection per "cell type vs. rest" comparison.
    cmdata = ComparisonMetadata([("compare", cell_type_col), ("val", cell_type), "__rest__"])

    # Ranked-genes table for this cell type.
    df = sc.get.rank_genes_groups_df(adata, group=cell_type, key=key_added)
    # NOTE(review): descending order puts the largest adjusted p-values first —
    # confirm this is the intended ordering.
    df = df.sort_values(by="pvals_adj", ascending=False)

    uns_key = cmdata.append_df("uns", "rank_genes_groups", {
        "rank_genes_groups": adata.uns[key_added]["params"],
        "rank_genes_groups_df": {
            "group": cell_type,
        },
    }, {
        "obsType": "cell",
        "featureType": "gene",
        "obsSetSelection": [[cell_type_col, cell_type]],
    })
    adata.uns[uns_key] = df

    # Enrichment tests
    # `key` is passed explicitly so the query reads the same ranking results as the
    # table above instead of relying on `key_added` matching scanpy's default key.
    enrichment_df = sc.queries.enrich(
        adata, group=cell_type, key=key_added, **enrich_thresholds
    )
    enrichment_df = enrichment_df.drop(columns=["query", "parents"])

    uns_key = cmdata.append_df("uns", "enrich", {
        "rank_genes_groups": adata.uns[key_added]["params"],
        "enrich": {
            "group": cell_type,
            **enrich_thresholds,
        },
    }, {
        "obsType": "cell",
        "featureType": "pathway",
        "obsSetSelection": [[cell_type_col, cell_type]],
    })
    adata.uns[uns_key] = enrichment_df

    # Register both tables of this comparison in the notebook-wide metadata mapping.
    adata.uns["comparison_metadata"].update(cmdata.get_dict())

In [17]:
# TODO: LEMUR results
# TODO: within-celltype healthy vs. disease differential expression tests

In [16]:
import json

In [25]:
# Store the comparison metadata as a single JSON string (presumably because the
# nested list-of-dicts structure cannot be written to h5ad directly — confirm).
# The guard keeps this cell idempotent: calling json.dumps on an already-serialized
# string would wrap it in another layer of quoting when the cell is re-run.
if not isinstance(adata.uns["comparison_metadata"], str):
    adata.uns["comparison_metadata"] = json.dumps(adata.uns["comparison_metadata"])

In [19]:
# Write the AnnData object, including the comparison tables and metadata placed in `uns`.
# NOTE(review): assumes the "data" directory already exists — confirm.
adata.write_h5ad("data/kpmp_premiere_with_comparisons.h5ad")

In [21]:
# Read the written file back, presumably as a round-trip check of the output.
test_adata = read_h5ad("data/kpmp_premiere_with_comparisons.h5ad")

In [24]:
#json.loads(test_adata.uns["comparison_metadata"])