In [1]:
# Project root; the notebook changes into this directory before importing project code.
working_dir = "/data/projects/dschaub/ANCA-GN_transcriptomics"
# Directory with the ustekinumab single-cell files, expressed relative to the project root.
data_dir = f"{working_dir}/data/single-cell/ustekinumab"

In [2]:
# Notebook setup: autoreload, imports, working directory and plotting defaults.
# Statement order matters in this cell (see the notes below).

# Enable IPython's autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

import sys
import os

# Change into the project root before the project-local imports further down.
# NOTE(review): presumably needed so that the `utils` package resolves relative
# to working_dir — confirm.
os.chdir(working_dir)

import yaml
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
import mudata as md
import muon as mu
import numpy as np
import scanpy as sc
from matplotlib import gridspec
import scanpy.external as sce
from scipy import sparse
from matplotlib.colors import LinearSegmentedColormap
import matplotlib as mpl

# NOTE(review): star imports hide which helper names this notebook actually uses;
# consider explicit imports once the required names are known.
from utils.utils import *
from utils.plotting import *

# Global logging / plotting configuration. The scanpy figure parameters are set
# first and the seaborn style is applied afterwards, so keep this order.
sc.settings.verbosity = 0
sc.set_figure_params(dpi=80)
sns.set(style="white")

Global seed set to 0
  jax.tree_util.register_keypaths(data_clz, keypaths)
  jax.tree_util.register_keypaths(data_clz, keypaths)


In [3]:
# Read the CD4 Teff MuData object from data_dir and echo its summary.
input_file = "ANCA_ustekinumab_4PK4PB_CD4Teff.h5mu"
path = os.path.join(data_dir, input_file)
anca_data = md.read_h5mu(path)
anca_data

In [4]:
# Display the global per-cell metadata table of the loaded MuData object.
anca_data.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,nCount_CITE,nFeature_CITE,frac.mito,frac.ribo,Feature_Count_ratio,CD3_count,RNA_snn_res.0.1,...,RNA_snn_res.0.9,RNA_snn_res.1,cell_type,Trm_score1,non_Trm_score2,CD4_Trm_score3,CD4_non_Trm_score4,CD8_Trm_score5,CD8_non_Trm_score6,Proinflam_cytokines_score7
P069K_AAACCTGAGAAAGTGG,P069,5842.0,2177,2466.0,27,0.018658,0.263266,0.372646,13.0,0,...,2,1,CD4TEM,0.062449,0.007477,-0.004029,0.009606,0.075980,-0.053810,0.062070
P069K_AAACCTGCACCCATGG,P069,2044.0,1235,3865.0,31,0.045010,0.073386,0.604207,6.0,0,...,3,3,CD4TEM,-0.045415,0.320776,-0.067211,0.034241,-0.071112,0.051801,-0.080551
P069K_AAACGGGAGGGTGTGT,P069,2941.0,1335,2548.0,29,0.042843,0.258075,0.453927,13.0,0,...,2,1,CD4TEM,-0.106338,-0.083652,-0.050932,-0.034059,-0.043653,-0.055951,-0.070555
P069K_AAACGGGCAAGGTTTC,P069,5139.0,2441,4982.0,30,0.045729,0.031524,0.474995,3.0,0,...,1,10,CD4TEM,0.110106,-0.091697,0.290357,-0.037737,0.044854,-0.058554,-0.153249
P069K_AAAGATGCATGCATGT,P069,6801.0,2272,3675.0,27,0.019997,0.231290,0.334069,16.0,0,...,0,0,CD4TEM,-0.017995,0.282115,-0.033006,0.187927,0.060232,0.072804,-0.097354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P140B_TTTATGCGTACATCCA-1,P140B,2546.0,1039,932.0,66,0.023566,0.335821,0.408091,4.0,0,...,2,1,CD4TEM,-0.186400,0.292414,-0.136548,0.046900,-0.025016,0.056470,-0.071519
P140B_TTTCCTCGTACGCTGC-1,P140B,4213.0,1943,1147.0,67,0.041775,0.191075,0.461192,13.0,0,...,2,1,CD4TEM,-0.081983,0.303853,-0.137213,0.039798,-0.116263,0.052634,-0.118028
P140B_TTTCCTCGTGTTTGTG-1,P140B,6310.0,2038,1245.0,108,0.036926,0.330586,0.322979,7.0,2,...,6,6,TRM17,-0.045223,0.014825,-0.037184,0.045153,-0.018667,0.017194,0.081508
P140B_TTTGGTTAGCTAACAA-1,P140B,4342.0,1802,1142.0,63,0.026025,0.283740,0.415016,12.0,0,...,2,1,CD4TEM,-0.007244,0.062604,-0.018739,0.009838,-0.023909,-0.006128,-0.108134


In [6]:
# Reduce the global per-cell metadata to the columns that are carried over into
# the cleaned object.
# NOTE(review): "patient", "sample", "tissue" and "case" are not among the columns
# visible in the preceding `anca_data.obs` output (its middle columns are elided),
# and execution count 5 is absent from this notebook — confirm these columns exist
# after a fresh Restart & Run All.
relevant_obs = [
    "patient",
    "sample",
    "tissue",
    "case",
    "orig.ident",
    "cell_type",
    "RNA_snn_res.0.3",
]

# Fail with an explicit message (same exception type pandas would raise) when a
# required column is absent, instead of an opaque indexing error.
missing_obs = [col for col in relevant_obs if col not in anca_data.obs.columns]
if missing_obs:
    raise KeyError(f"anca_data.obs is missing required column(s): {missing_obs}")

# .copy() makes the reduced table an independent frame rather than a selection
# that still refers to the original metadata.
anca_data.obs = anca_data.obs[relevant_obs].copy()

In [7]:
# Build a minimal AnnData for the RNA modality: raw counts as X and as the
# "counts" layer, variable names without annotation columns, and the harmony /
# UMAP embeddings taken from the MuData-level obsm.
# obs is not passed here; it is assigned afterwards from anca_data.obs.
mod = "RNA"
rna_source = anca_data.mod[mod]
rna_embeddings = {
    key: anca_data.obsm[key].copy() for key in ("X_harmony", "X_umap")
}
rna_data = ad.AnnData(
    rna_source.layers["counts"].copy(),
    var=rna_source.var[[]],
    layers={"counts": rna_source.layers["counts"].copy()},
    obsm=rna_embeddings,
)

In [8]:
# Build a minimal AnnData for the CITE modality: raw counts as X and as the
# "counts" layer, variable names without annotation columns, and the harmony /
# UMAP embeddings taken from the MuData-level obsm.
# obs is not passed here; it is assigned afterwards from anca_data.obs.
mod = "CITE"
cite_source = anca_data.mod[mod]
cite_embeddings = {
    key: anca_data.obsm[key].copy() for key in ("X_harmony", "X_umap")
}
cite_data = ad.AnnData(
    cite_source.layers["counts"].copy(),
    var=cite_source.var[[]],
    layers={"counts": cite_source.layers["counts"].copy()},
    obsm=cite_embeddings,
)

In [9]:
# Attach the reduced global metadata to both per-modality objects.
# The count matrices come from anca_data.mod[...] while the metadata comes from
# the MuData-level obs table, and the assignment below is positional. Verify that
# every modality lists the cells in the same order as the global table so that
# metadata cannot be silently paired with the wrong cells.
misaligned_mods = [
    name
    for name in ("RNA", "CITE")
    if not anca_data.mod[name].obs_names.equals(anca_data.obs.index)
]
if misaligned_mods:
    raise ValueError(
        "Cell order of modality/modalities "
        f"{misaligned_mods} differs from anca_data.obs; "
        "cannot assign the global metadata positionally."
    )

# Each object gets its own copy so later edits to one do not affect the other.
rna_data.obs = anca_data.obs.copy()
cite_data.obs = anca_data.obs.copy()

In [10]:
# List the distinct cluster ids of the resolution-0.3 clustering column.
cluster_column = "RNA_snn_res.0.3"
rna_data.obs[cluster_column].unique().tolist()

['1', '0', '4', '2', '3', '5']

In [11]:
# Translate the cluster ids in "RNA_snn_res.0.3" into fine-grained cell-type
# labels. Clusters "2" and "5" are deliberately merged into the same label.
# Only rna_data is annotated here; cite_data.obs does not receive this column.
cluster_to_cell_type = {
    "0": "TH1-like CD4+ EM",
    "1": "other CD4+ EM",
    "2": "TFH",
    "3": "TRM1",
    "4": "TRM17",
    "5": "TFH",
}

# Work on plain string labels: Series.replace on a categorical column is
# deprecated in recent pandas when it changes the categories, which this
# many-to-one mapping does.
cluster_labels = rna_data.obs["RNA_snn_res.0.3"].astype(str)

# Refuse to continue if a cluster has no label; otherwise the raw cluster id
# would silently end up among the cell-type names.
unmapped_clusters = sorted(set(cluster_labels) - set(cluster_to_cell_type))
if unmapped_clusters:
    raise ValueError(
        f"No cell-type label defined for cluster(s): {unmapped_clusters}"
    )

# Store the result as a categorical whose categories follow the order in which
# the labels first appear in the mapping above.
rna_data.obs["cell_type_fine"] = (
    cluster_labels.map(cluster_to_cell_type)
    .astype("category")
    .cat.set_categories(list(dict.fromkeys(cluster_to_cell_type.values())))
)

In [12]:
# Combine independent copies of the two per-modality objects into one MuData,
# keyed by lower-case modality names.
clean_modalities = {"rna": rna_data.copy(), "cite": cite_data.copy()}
anca_data_clean = md.MuData(clean_modalities)

In [13]:
# Write the annotated, cleaned MuData object next to the input file in data_dir.
output_file = "ANCA_ustekinumab_4PK4PB_CD4Teff_annotated.h5mu"
save_path = os.path.join(data_dir, output_file)
anca_data_clean.write_h5mu(save_path)

... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'tissue' as categorical
... storing 'case' as categorical
... storing 'orig.ident' as categorical
... storing 'patient' as categorical
... storing 'sample' as categorical
... storing 'tissue' as categorical
... storing 'case' as categorical
... storing 'orig.ident' as categorical
