In [None]:
# Project root; the notebook changes into this directory before importing local helpers.
working_dir = "/data/projects/dschaub/ANCA-GN_transcriptomics"
# Folder holding the exploratory single-cell input file and the processed output file.
data_dir = "/data/projects/dschaub/ANCA-GN_transcriptomics/data/single-cell/exploratory"

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the project-local helpers imported below) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
import os

# Switch to the project root before the remaining imports run
# (presumably so the project-local `utils` package imported below resolves; TODO confirm).
os.chdir(working_dir)

import yaml
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
import mudata as md
import muon as mu
import numpy as np
import scanpy as sc
from matplotlib import gridspec
import scanpy.external as sce
from scipy import sparse
import matplotlib as mpl

from utils.utils import *
from utils.plotting import *

# Notebook-wide logging and plotting defaults.
sc.settings.verbosity = 0  # keep scanpy's logging output to a minimum
sc.set_figure_params(dpi=100)
# `sns.set` is only an alias of `sns.set_theme`; call the preferred interface directly.
# Kept after `sc.set_figure_params` because both calls touch matplotlib rcParams,
# so their order decides which settings win.
sns.set_theme(style="white")

## Load data

In [None]:
# Read the multimodal input file from the data directory.
h5mu_filename = "ANCA_exploratory_27PK27PB_Tcells.h5mu"
path = os.path.join(data_dir, h5mu_filename)
mudata = md.read_h5mu(path)

## Preprocessing

In [None]:
# Keep only the per-cell annotation columns that the rest of the notebook uses.
relevant_obs = ["patient", "sample", "tissue", "cell_type_yu"]
obs_subset = mudata.obs.loc[:, relevant_obs]
mudata.obs = obs_subset.copy()

### Preprocess RNA

In [None]:
# Pull the RNA modality out of the MuData container and display its summary.
mod_rna = mudata.mod["RNA"]
mod_rna

In [None]:
# Rebuild the RNA modality as a fresh AnnData object:
#   X      -> copy of the "counts" layer
#   obs    -> copy of the trimmed global obs table
#   var    -> the original var frame with all columns dropped (index only)
#   layers -> a separate copy of the counts kept under "counts"
rna_counts = mod_rna.layers["counts"]
rna_var_index_only = mod_rna.var[[]]
mod_rna = ad.AnnData(
    X=rna_counts.copy(),
    obs=mudata.obs.copy(),
    var=rna_var_index_only,
    layers={"counts": rna_counts.copy()},
)

In [None]:
# Plot the 20 most highly expressed genes in the RNA modality.
top_gene_count = 20
sc.pl.highest_expr_genes(mod_rna, n_top=top_gene_count)

In [None]:
# Basic RNA filtering, applied in place. Order matters: cells with fewer than
# 200 detected genes are dropped first, then genes detected in fewer than 10
# of the remaining cells.
sc.pp.filter_cells(mod_rna, min_genes=200)
sc.pp.filter_genes(mod_rna, min_cells=10)

In [None]:
# QC overview via the project helper `plot_qc` (star-imported from the utils package).
# NOTE(review): later cells read `n_genes_by_counts`, `total_counts` and `pct_counts_mt`
# from `mod_rna.obs`; presumably this helper computes them. Confirm in utils.
plot_qc(mod_rna, cell_type_key="cell_type_yu")

In [None]:
# Distributions of the per-cell RNA QC metrics. The last panel zooms in on
# cells with fewer than 1500 total counts.
rna_obs = mod_rna.obs
rna_qc_series = (
    rna_obs.n_genes_by_counts,
    rna_obs.total_counts,
    rna_obs.total_counts[rna_obs.total_counts < 1500],
)
for qc_values in rna_qc_series:
    sns.displot(qc_values)
    plt.show()

In [None]:
# Patients ordered by their number of cells, ascending.
# NOTE(review): `patient_order` is not used by any cell visible in this part of the notebook.
cells_per_patient = mod_rna.obs.groupby("patient").size()
patient_order = cells_per_patient.sort_values().index
patient_order

In [None]:
# Total RNA counts per cell, one box per patient.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(data=mod_rna.obs, x="patient", y="total_counts", ax=ax)

In [None]:
# Number of detected genes per cell, one box per patient.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(data=mod_rna.obs, x="patient", y="n_genes_by_counts", ax=ax)

In [None]:
# Scatter total counts against the mitochondrial percentage and against the
# number of detected genes, one figure each.
for y_metric in ("pct_counts_mt", "n_genes_by_counts"):
    sc.pl.scatter(mod_rna, x="total_counts", y=y_metric)

In [None]:
# Remove RNA count outliers: keep only cells strictly below the (1 - x) quantile
# of total counts. `x` is a fraction (1e-3 == 0.1 % of cells), not a percentage.
# NOTE: this cell is not idempotent; re-running it trims a further top fraction.
x = 1e-3
# Compute the cutoff once so the reported mask and the applied filter cannot disagree.
rna_total_counts_cutoff = mod_rna.obs.total_counts.quantile(1 - x)
total_counts_max_mask = mod_rna.obs.total_counts > rna_total_counts_cutoff
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print(f"RNA total_counts cutoff: {rna_total_counts_cutoff}")
print(f"Cells strictly above the cutoff: {total_counts_max_mask.sum()}")

mod_rna = mod_rna[mod_rna.obs.total_counts < rna_total_counts_cutoff, :].copy()
# Re-apply the gene filter, since removing cells can push genes below the threshold.
sc.pp.filter_genes(mod_rna, min_cells=10)
mod_rna.shape

In [None]:
# Normalize the RNA data with the project helper `log_normalize` (star-imported from utils).
# NOTE(review): presumably count normalization followed by a log transform, applied in place,
# since the return value is discarded. Confirm against the helper's implementation.
log_normalize(mod_rna)

In [None]:
# Display summary statistics of the RNA modality via the project helper
# `get_adata_stats` (star-imported from utils; exact output not visible here).
get_adata_stats(mod_rna)

In [None]:
# Same QC overview as before, now on the filtered and normalized RNA data.
plot_qc(mod_rna, cell_type_key="cell_type_yu")

### Preprocess CITE

In [None]:
# Take the CITE modality and keep only the cells that survived RNA filtering,
# in the same order as `mod_rna`; then display its summary.
rna_cell_ids = mod_rna.obs_names
mod_cite = mudata.mod["CITE"][rna_cell_ids, :].copy()
mod_cite

AnnData object with n_obs × n_vars = 72453 × 210
    layers: 'counts'

In [None]:
# Rebuild the CITE modality as a fresh AnnData object, mirroring the RNA rebuild:
# counts as X, the global obs rows for the retained cells, an index-only var
# frame, and a separate copy of the counts in the "counts" layer.
cite_counts = mod_cite.layers["counts"]
cite_obs = mudata.obs.loc[mod_cite.obs_names, :]
cite_var_index_only = mod_cite.var[[]]
mod_cite = ad.AnnData(
    X=cite_counts.copy(),
    obs=cite_obs.copy(),
    var=cite_var_index_only,
    layers={"counts": cite_counts.copy()},
)

In [None]:
# Keep every CITE feature whose name does not contain "Hash"
# (presumably the cell-hashing antibodies; confirm against the panel).
var_names = [name for name in mod_cite.var_names if "Hash" not in name]
mod_cite = mod_cite[:, var_names].copy()

In [None]:
# Compute QC metrics for the CITE modality and store them on the object
# (inplace=True); percent_top=None disables the "percent of counts in the
# top-N features" columns.
sc.pp.calculate_qc_metrics(mod_cite, inplace=True, percent_top=None)

In [None]:
# Distributions of the CITE QC metrics, each followed by a zoomed-in view of
# the lower range.
cite_obs = mod_cite.obs
sns.displot(cite_obs.n_genes_by_counts)
plt.show()
sns.displot(cite_obs.n_genes_by_counts[cite_obs.n_genes_by_counts < 60])
plt.show()
sns.displot(cite_obs.total_counts)
plt.show()
sns.displot(cite_obs.query("total_counts>0 and total_counts<10000").total_counts)
plt.show()
# Stacked per-patient histogram on a 1 % subsample of cells (log-scaled x axis).
# The fixed random_state makes the subsample, and therefore the figure,
# reproducible across kernel restarts.
sns.displot(
    cite_obs.sample(frac=0.01, random_state=0).query("total_counts>10"),
    x="total_counts",
    log_scale=True,
    hue="patient",
    multiple="stack",
)
plt.show()

In [None]:
# Cutoff for CITE count outliers: with x = 5e-4 this is the 99.95 % quantile of
# total counts, so roughly the top 0.05 % of cells lie at or above it.
x = 5e-4
total_counts_max_cutoff = mod_cite.obs.total_counts.quantile(1 - x)
# Despite its name, this mask is True for the cells to KEEP (strictly below the cutoff).
total_counts_max_mask = mod_cite.obs.total_counts < total_counts_max_cutoff
print(total_counts_max_mask.sum())  # number of cells that pass the cutoff
print(total_counts_max_cutoff)

In [None]:
# Keep only the cells flagged True in the mask built in the previous cell
# (total counts strictly below the quantile cutoff).
mod_cite = mod_cite[total_counts_max_mask, :].copy()

In [None]:
# Drop CITE features detected in fewer than 10 cells (in place), then show the
# resulting (cells, features) shape.
sc.pp.filter_genes(mod_cite, min_cells=10)
mod_cite.shape

In [None]:
# Total CITE counts per cell, one box per sample.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(data=mod_cite.obs, x="sample", y="total_counts", ax=ax)

In [None]:
# Number of detected CITE features per cell, one box per patient.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(data=mod_cite.obs, x="patient", y="n_genes_by_counts", ax=ax)

In [None]:
# Normalize the CITE data with the project helper `protein_clr` (star-imported from utils).
# NOTE(review): presumably a centered log-ratio transform applied in place, since the
# return value is discarded. Confirm against the helper's implementation.
protein_clr(mod_cite)

## Harmonize RNA and CITE data

In [None]:
# Report how many cells each modality holds after its own filtering.
n_cells_rna = mod_rna.n_obs
n_cells_cite = mod_cite.n_obs
print(f"Number of cells in RNA: {n_cells_rna}")
print(f"Number of cells in CITE: {n_cells_cite}")

In [None]:
# Restrict both modalities to the cells present in both, keeping the RNA cell order.
rna_cell_ids = mod_rna.obs_names
in_both_modalities = rna_cell_ids.isin(mod_cite.obs_names)
shared_obs_names = list(rna_cell_ids[in_both_modalities])
mod_rna = mod_rna[shared_obs_names, :].copy()
mod_cite = mod_cite[shared_obs_names, :].copy()

## Save data

In [None]:
# Bundle the processed modalities into a new MuData object and write it to disk.
# NOTE: this rebinds `mudata`, and the modality keys are lowercase ("rna"/"cite")
# while the input file used "RNA"/"CITE". Earlier cells that read
# `mudata.mod["RNA"]` / `mudata.mod["CITE"]` therefore need the input reloaded
# before they can be re-run.
processed_modalities = {"rna": mod_rna.copy(), "cite": mod_cite.copy()}
mudata = md.MuData(processed_modalities)
save_path = os.path.join(data_dir, "ANCA_exploratory_27PK27PB_Tcells_processed.h5mu")
mudata.write_h5mu(save_path)