In [None]:
# Install the third-party packages imported in the next cell.
# Each %pip line is a separate, unpinned install.
%pip install scanpy
%pip install anndata
%pip install MuData
%pip install loompy

In [None]:
from mudata import MuData
import numpy as np
from tqdm import tqdm
import pandas as pd
import anndata
import scanpy
import csv
import sys
import loompy
import logging
import os
from matplotlib import pyplot as plt

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:


In [None]:
# mount drive if necessary
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the processed .h5ad file (placeholder value; point it at the real file).
h5ad_path = "path/to/my/processed/h5ad"
# Load the file as `adata`; the cells below subset and analyse this object.
adata = scanpy.read_h5ad(h5ad_path)

In [None]:
# extract a specific donor
donor_cluster = 0  # or 1

# Build the selection mask first and validate it before rebinding `adata`, so a
# bad donor id does not replace the loaded data with an empty subset.
donor_mask = adata.obs['genotype_cluster'] == donor_cluster
if not donor_mask.any():
    # Typical causes: an id that does not exist, or labels stored as strings
    # (e.g. '0') while `donor_cluster` is an int.
    raise ValueError(
        f"No cells with genotype_cluster == {donor_cluster!r}; "
        "check the donor id and the dtype of adata.obs['genotype_cluster']"
    )
adata = adata[donor_mask]

AnnData object with n_obs × n_vars = 56783 × 1276
    obs: 'n_counts', 'intersecting', 'mutant_type', 'batch', 'genotype_cluster', 'stimulation'
    uns: 'batch_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'raw'
    obsp: 'connectivities', 'distances'

In [None]:
output_path = "where/to/write/mutant_de_file.txt"

# Group that every mutant is compared against in each stimulation.
reference_variant = 'AAVS'

stims = np.unique(adata.obs['stimulation'].values)
all_variants = np.unique(adata.obs['mutant_type'].values)

# results[gene]["<stim>_<variant>_pval"]  -> adjusted p-value (scanpy's 'pvals_adj')
# results[gene]["<stim>_<variant>_logfc"] -> log fold change vs. the reference
results = {}
for stim in stims:
    # Work on a copy so rank_genes_groups can store its output in .uns
    # without touching `adata`.
    tmp_adata = adata[adata.obs['stimulation'] == stim].copy()

    # Only keep variants with more than one cell in this stimulation.
    valid_groups = [group for group in all_variants if np.sum(tmp_adata.obs['mutant_type'] == group) > 1]

    # Without a usable reference group (or without any mutant to compare to it)
    # rank_genes_groups cannot run; skip this stimulation instead of aborting
    # the whole loop.
    if reference_variant not in valid_groups or len(valid_groups) < 2:
        logging.warning(
            "Skipping stimulation %r: need more than one %r cell and at least one other variant",
            stim, reference_variant,
        )
        continue

    scanpy.tl.rank_genes_groups(tmp_adata, 'mutant_type', groups=valid_groups, reference=reference_variant, method='wilcoxon')
    de_result = tmp_adata.uns['rank_genes_groups']
    tested_variants = set(de_result['pvals_adj'].dtype.names)

    for variant in all_variants:
        # Variants that were filtered out (and the reference itself) have no result column.
        if variant not in tested_variants:
            continue
        key = f"{stim}_{variant}"
        names = de_result['names'][variant]
        pvals_adj = de_result['pvals_adj'][variant]
        logfoldchanges = de_result['logfoldchanges'][variant]

        for gene, pval, logfc in zip(names, pvals_adj, logfoldchanges):
            gene_row = results.setdefault(gene, {})
            gene_row[f"{key}_pval"] = pval
            gene_row[f"{key}_logfc"] = logfc

# Persist the table: one row per gene, one pval/logfc column pair per
# stimulation/variant combination. Tab-separated with a 'gene_names' header and
# 'nan' for missing combinations, matching the stimulation DE file.
mutant_de_table = pd.DataFrame.from_dict(results, orient='index')
mutant_de_table.index.name = 'gene_names'
mutant_de_table.to_csv(output_path, sep='\t', na_rep='nan')

In [None]:
output_path = "where/to/write/stim_de_file.txt"

# Keep only the cells whose mutant_type is 'unannotated', so edited cells are not
# part of the stimulation comparison. The subset is copied explicitly because
# rank_genes_groups stores its results in .uns, and modifying a view makes
# anndata copy it implicitly (with a warning).
subdata = adata[adata.obs['mutant_type'] == 'unannotated'].copy()

# use scanpy to do the DE: every stimulation against the 'Ctrl' stimulation
scanpy.tl.rank_genes_groups(subdata, 'stimulation', reference='Ctrl', method='wilcoxon')
de_result = subdata.uns['rank_genes_groups']

# stimulations with the exception of the control stimulation
all_stims = [stim for stim in subdata.obs['stimulation'].cat.categories if stim != 'Ctrl']

n_genes = len(subdata.var_names)
n_stims = len(all_stims)

# Initialize matrices (genes x stimulations) for adjusted p-values and logFC
pval_matrix = np.full((n_genes, n_stims), np.nan)  # Use np.nan to indicate missing values
logfc_matrix = np.full((n_genes, n_stims), np.nan)

# Create a dictionary to map gene names to their indices in subdata.var_names
gene_to_index = {gene: index for index, gene in enumerate(subdata.var_names)}

# Loop through the stimulations and populate the p-value and logFC matrices from
# the DE results. The per-stimulation gene lists are re-indexed through
# gene_to_index so that every row follows the subdata.var_names order.
for j, stim in enumerate(all_stims):
    de_genes = de_result['names'][stim]
    pvals_adj = de_result['pvals_adj'][stim]
    logfc = de_result['logfoldchanges'][stim]

    for de_gene, pval, lf in zip(de_genes, pvals_adj, logfc):
        idx = gene_to_index[de_gene]
        pval_matrix[idx, j] = pval
        logfc_matrix[idx, j] = lf

# Write matrices to a tab-separated .txt file: one row per gene, and for each
# stimulation a <stim>_pval_adj column followed by a <stim>_logfc column.
with open(output_path, "w") as f:
    # Header
    headers = ["gene_names"] + [f"{stim}_{metric}" for stim in all_stims for metric in ['pval_adj', 'logfc']]
    f.write("\t".join(headers) + "\n")

    for i, gene in enumerate(subdata.var_names):
        values = []
        for j in range(n_stims):
            values.extend([str(pval_matrix[i, j]), str(logfc_matrix[i, j])])
        f.write(gene + "\t" + "\t".join(values) + "\n")