In [1]:
# Filesystem locations used by the rest of the notebook.
# Project root; the notebook changes into this directory before importing.
working_dir = "/data/projects/dschaub/ANCA-GN_transcriptomics"
# Root folder the input .h5mu file is read from.
data_dir = "/data/projects/dschaub/anca-project/data"
# Destination folder for the cleaned .h5mu file.
save_dir = "/data/projects/dschaub/ANCA-GN_transcriptomics/data/single-cell/exploratory"

In [2]:
# Reload edited modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

import sys
import os

# Switch to the project root before the remaining imports.
# NOTE(review): presumably required so the local `utils` package below resolves
# from the current working directory — confirm.
os.chdir(working_dir)

import yaml
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
import mudata as md
import muon as mu
import numpy as np
import scanpy as sc
import scanpy.external as sce
from scipy import sparse
from matplotlib.colors import LinearSegmentedColormap
import matplotlib as mpl

# Project-local helpers; star imports make it impossible to tell from this
# notebook alone which names they add to the namespace.
from utils.utils import *
from utils.plotting import *

# Set scanpy's logging verbosity to level 0.
sc.settings.verbosity = 0

Global seed set to 0
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)


In [3]:
# Original patient ID -> exploratory ID.
# The exploratory IDs are consecutive ("E001" ... "E034"), so the mapping is
# generated from the original IDs listed in exploratory-ID order.
exploratory_mapping = {
    original_id: f"E{position:03d}"
    for position, original_id in enumerate(
        (
            "P089", "P060", "P028", "P059", "P088", "P108", "P118", "P029",
            "P129", "P019", "P115", "P067", "P007", "P018", "P139", "P005",
            "P025", "P023", "P126", "P100", "P070", "P015", "P103", "P053",
            "P008", "P055", "P137", "P050", "P143", "P020", "P144", "P105",
            "P068", "P004",
        ),
        start=1,
    )
}

## Load data

In [4]:
# Read the multimodal object from disk.
path = f"{data_dir}/anca/exploratory/ANCA_27PK27PB_T_harmony_r.h5mu"
mudata = md.read_h5mu(path)

# Keep a direct handle on each modality for the cleaning steps that follow.
mod_rna, mod_cite = mudata.mod["rna"], mudata.mod["cite"]

# Display the container summary as the cell output.
mudata

In [5]:
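# NOTE: disabled earlier variant of the cleaning step; unlike the active cell
# below it left `patient` untouched and dropped the `cell_type_yu` column.
# # clean rna information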

# tmp = mod_rna.obs["patient"].apply(lambda x: exploratory_mapping[x])
# mod_rna.obs["sample"] = tmp.astype(str) + mod_rna.obs["tissue"].astype(str)
# mod_rna.obs.index = mod_rna.obs["sample"].astype(str) + "_" + mod_rna.obs.index.str[6:]
# mod_rna.obs.drop(columns=["cell_type_yu"], inplace=True)

# # clean cite information

# tmp = mod_cite.obs["patient"].apply(lambda x: exploratory_mapping[x])

# mod_cite.obs["sample"] = tmp.astype(str) + mod_cite.obs["tissue"].astype(str)
# mod_cite.obs.index = (
#     mod_cite.obs["sample"].astype(str) + "_" + mod_cite.obs.index.str[6:]
# )
# mod_cite.obs.drop(columns=["cell_type_yu"], inplace=True)

In [7]:
# Clean the RNA and CITE observation tables:
#   1. replace the original patient IDs with their exploratory IDs,
#   2. derive `sample` as patient + tissue,
#   3. prefix every obs name with the sample label.
#
# Both modalities get the same treatment, so they are handled in one loop.
# The relabelling tolerates being re-run: indexing `exploratory_mapping[x]`
# directly raised a KeyError on a second execution, because `patient` then
# already contained the exploratory ("E...") IDs.
already_relabelled = set(exploratory_mapping.values())

for modality in (mod_rna, mod_cite):
    # Fail loudly on IDs that are neither an original nor an exploratory ID,
    # instead of silently passing them through below.
    unknown_ids = (
        set(modality.obs["patient"].dropna().unique())
        - set(exploratory_mapping)
        - already_relabelled
    )
    if unknown_ids:
        raise KeyError(
            f"patient IDs missing from exploratory_mapping: {sorted(unknown_ids)}"
        )

    # Map original IDs; IDs that are already exploratory are kept as they are.
    modality.obs["patient"] = modality.obs["patient"].apply(
        lambda x: exploratory_mapping.get(x, x)
    )
    modality.obs["sample"] = modality.obs["patient"].astype(str) + modality.obs[
        "tissue"
    ].astype(str)
    # Replace the first six characters of each obs name with "<sample>_".
    # NOTE(review): assumes the existing prefix is exactly six characters; the
    # relabelled prefix (e.g. "E016K_") is also six characters as long as
    # `tissue` is a single character — confirm for all tissues.
    modality.obs.index = (
        modality.obs["sample"].astype(str) + "_" + modality.obs.index.str[6:]
    )

In [8]:
# Display the RNA modality summary after the cleaning step.
mod_rna

AnnData object with n_obs × n_vars = 72416 × 21419
    obs: 'patient', 'sample', 'tissue', 'cell_type_yu', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden_0.3', 'leiden_0.4', 'leiden_0.5', 'leiden_0.6', 'leiden_0.7', 'leiden_0.8', 'leiden_0.9', 'leiden_1.0', 'leiden_1.1', 'leiden_1.2', 'leiden_1.3', 'leiden_1.4', 'leiden_1.5', 'leiden_1.6', 'leiden_1.7', 'leiden_1.8', 'leiden_1.9'
    var: 'n_cells', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std'
    uns: 'cell_type_yu_colors', 'leiden', 'leiden_0.3_colors', 'leiden_0.4_colors', 'leiden_0.5_colors', 'leiden_0.6_colors', 'leiden_0.7_colors', 'leiden_0.8_colors', 'leiden_0.9_colors', 'leiden_1.0_colors', 'leiden_1.1_colors', 'leiden_1.2_colors', 'leiden_1.3_colors', 'leiden_1.4_colors', 'leiden_1.5_colors', 'leiden_1.6_colors', 'leiden_1.7_colors', 'leiden_1.8_colors', 'leiden_1.9_colors', 'log1p',

In [9]:
# Preview the first RNA observation rows to check the relabelled patient,
# sample and obs-name values.
mod_rna.obs.head()

Unnamed: 0,patient,sample,tissue,cell_type_yu,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,total_counts_ribo,...,leiden_1.0,leiden_1.1,leiden_1.2,leiden_1.3,leiden_1.4,leiden_1.5,leiden_1.6,leiden_1.7,leiden_1.8,leiden_1.9
E016K_AAACGGGAGATCGGGT,E016,E016K,K,CD4/CD8 stressed,580,580,1136.0,18.0,1.584507,259.0,...,7,7,7,7,7,9,9,9,8,8
E016K_AAAGATGCACATCCAA,E016,E016K,K,CD8 EM/RM,750,749,1546.0,46.0,2.97542,367.0,...,5,5,5,4,4,4,4,4,4,4
E016K_AAAGATGTCCTTCAAT,E016,E016K,K,CD8 EM/RM,657,657,1452.0,29.0,1.997245,384.0,...,5,5,5,4,4,4,4,4,4,4
E016K_AACCATGGTGACTCAT,E016,E016K,K,CD8 EM/RM,887,887,2068.0,58.0,2.804642,553.0,...,5,5,5,4,4,4,4,4,4,4
E016K_AACCGCGGTTACCAGT,E016,E016K,K,CD8 EM/RM,818,818,2056.0,67.0,3.258755,564.0,...,5,5,5,4,14,15,16,16,16,16


In [12]:
# Preview the first CITE observation rows to check the relabelled patient,
# sample and obs-name values.
mod_cite.obs.head()

Unnamed: 0,patient,sample,tissue,cell_type_yu,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts
E016K_AAACGGGAGATCGGGT,E016,E016K,K,CD4/CD8 stressed,0,0.0,0.0,0.0
E016K_AAAGATGCACATCCAA,E016,E016K,K,CD8 EM/RM,0,0.0,0.0,0.0
E016K_AAAGATGTCCTTCAAT,E016,E016K,K,CD8 EM/RM,0,0.0,0.0,0.0
E016K_AACCATGGTGACTCAT,E016,E016K,K,CD8 EM/RM,0,0.0,0.0,0.0
E016K_AACCGCGGTTACCAGT,E016,E016K,K,CD8 EM/RM,0,0.0,0.0,0.0


In [13]:
# Check that both modalities carry identical obs names in identical order
# (element-wise comparison, reduced with `.all()`).
(mod_rna.obs_names == mod_cite.obs_names).all()

True

## Save everything

In [14]:
# Bundle copies of both cleaned modalities into a new container and write it
# to the exploratory output folder.
mudata = md.MuData(dict(rna=mod_rna.copy(), cite=mod_cite.copy()))
mudata.write(f"{save_dir}/ANCA_exploratory_27PK27PB_Tcells_HarmonyR.h5mu")

... storing 'sample' as categorical
... storing 'sample' as categorical


## Everything combined

In [None]:
# Single-cell version of the cleaning + saving steps of this notebook.
# Expects `mod_rna`, `mod_cite`, `md` and `save_dir` to be defined already.

# Original patient ID -> exploratory ID.
exploratory_mapping = {
    "P089": "E001",
    "P060": "E002",
    "P028": "E003",
    "P059": "E004",
    "P088": "E005",
    "P108": "E006",
    "P118": "E007",
    "P029": "E008",
    "P129": "E009",
    "P019": "E010",
    "P115": "E011",
    "P067": "E012",
    "P007": "E013",
    "P018": "E014",
    "P139": "E015",
    "P005": "E016",
    "P025": "E017",
    "P023": "E018",
    "P126": "E019",
    "P100": "E020",
    "P070": "E021",
    "P015": "E022",
    "P103": "E023",
    "P053": "E024",
    "P008": "E025",
    "P055": "E026",
    "P137": "E027",
    "P050": "E028",
    "P143": "E029",
    "P020": "E030",
    "P144": "E031",
    "P105": "E032",
    "P068": "E033",
    "P004": "E034",
}

# Clean the RNA and CITE observation tables in one loop. The relabelling
# passes already-mapped ("E...") IDs through unchanged, so this cell also works
# when the modalities were relabelled by an earlier cell; indexing
# `exploratory_mapping[x]` directly raised a KeyError in that situation.
already_relabelled = set(exploratory_mapping.values())

for modality in (mod_rna, mod_cite):
    # Fail loudly on IDs that are neither an original nor an exploratory ID.
    unknown_ids = (
        set(modality.obs["patient"].dropna().unique())
        - set(exploratory_mapping)
        - already_relabelled
    )
    if unknown_ids:
        raise KeyError(
            f"patient IDs missing from exploratory_mapping: {sorted(unknown_ids)}"
        )

    modality.obs["patient"] = modality.obs["patient"].apply(
        lambda x: exploratory_mapping.get(x, x)
    )
    modality.obs["sample"] = modality.obs["patient"].astype(str) + modality.obs[
        "tissue"
    ].astype(str)
    # Replace the first six characters of each obs name with "<sample>_".
    # NOTE(review): assumes the existing prefix is exactly six characters; the
    # relabelled prefix (e.g. "E016K_") is also six characters as long as
    # `tissue` is a single character — confirm for all tissues.
    modality.obs.index = (
        modality.obs["sample"].astype(str) + "_" + modality.obs.index.str[6:]
    )

# Both modalities must describe the same cells in the same order. The result
# is still printed, but a mismatch now stops the cell before anything is
# written instead of saving an inconsistent object.
obs_names_match = mod_rna.obs_names.equals(mod_cite.obs_names)
print(obs_names_match)
if not obs_names_match:
    raise ValueError("obs names of the rna and cite modalities differ; nothing written")

mudata = md.MuData(
    {
        "rna": mod_rna.copy(),
        "cite": mod_cite.copy(),
    }
)
# Write to the same target as the stand-alone save cell. The former target
# (data_dir/exploratory/ANCA_exploratory_27PK27PB_CD4Teff_TotalVI.h5mu) names a
# different dataset than the T-cell Harmony object processed in this notebook.
mudata.write(f"{save_dir}/ANCA_exploratory_27PK27PB_Tcells_HarmonyR.h5mu")