In [None]:
# Project root on the analysis server; the notebook changes into it below.
working_dir = "/data/projects/dschaub/ANCA-GN_transcriptomics"
# Exploratory single-cell inputs live in a fixed sub-folder of the project root.
data_dir = f"{working_dir}/data/single-cell/exploratory"

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
# Switch to the project root: the relative "figures/..." save paths used by the
# plotting cells are resolved against it (presumably the `utils` package
# imported below lives there too - confirm).
os.chdir(working_dir)

import yaml
import anndata as ad
import matplotlib.pyplot as plt
import mudata as md
import numpy as np
import scanpy as sc
from scipy import sparse
from matplotlib.colors import LinearSegmentedColormap
import matplotlib as mpl
from matplotlib import font_manager
import pandas as pd

from utils.utils import *
from utils.plotting import *

# Silence scanpy logging.
sc.settings.verbosity = 0
# NOTE(review): private scanpy setting; presumably affects how plots are
# rendered for vector (PDF) export - confirm against the scanpy version in use.
sc.settings._vector_friendly = True

# Register every font file found in the Microsoft core fonts folder so that
# matplotlib can resolve the family requested below.
font_dirs = "/usr/share/fonts/truetype/msttcorefonts/"
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
for font_file in font_files:
    font_manager.fontManager.addfont(font_file)

# Global figure typography.
plt.rcParams.update({"font.family": "arial", "font.size": 12})

## Load data

In [None]:
def _load_h5mu(filename):
    """Read one ``.h5mu`` file from ``data_dir``.

    Parameters
    ----------
    filename : str
        File name relative to ``data_dir``.

    Returns
    -------
    tuple
        ``(mudata, rna, cite)``: the loaded MuData object together with its
        ``"rna"`` and ``"cite"`` modalities.
    """
    loaded = md.read_h5mu(os.path.join(data_dir, filename))
    return loaded, loaded.mod["rna"], loaded.mod["cite"]


# Main object (file name: T cells, Harmony-integrated, annotated).
mudata, mod_rna, mod_cite = _load_h5mu(
    "ANCA_exploratory_27PK27PB_Tcells_HarmonyR_annotated.h5mu"
)

# CD4 effector sub-object; supplies the "cell_type_fine" labels used below.
mudata_cd4, mod_rna_cd4, mod_cite_cd4 = _load_h5mu(
    "ANCA_exploratory_27PK27PB_CD4Teff_annotated.h5mu"
)

# CD8 effector sub-object; supplies the "cell_type_fine" labels used below.
mudata_cd8, mod_rna_cd8, mod_cite_cd8 = _load_h5mu(
    "ANCA_exploratory_27PK27PB_CD8Teff_annotated.h5mu"
)

In [None]:
# Seed the merged annotation with the coarse labels (as plain strings so that
# new label values can be written into the column).
mod_rna.obs["cell_type_v1_merged"] = mod_rna.obs["cell_type_v1"].astype(str)

# Overwrite the cells present in each effector sub-object with their
# fine-grained label; CD4 first, then CD8, matching on cell barcode.
for teff_subset in (mod_rna_cd4, mod_rna_cd8):
    fine_labels = teff_subset.obs["cell_type_fine"].astype(str)
    mod_rna.obs.loc[teff_subset.obs_names, "cell_type_v1_merged"] = fine_labels

# Store the final annotation as a categorical column.
mod_rna.obs["cell_type_v1_merged"] = mod_rna.obs["cell_type_v1_merged"].astype(
    "category"
)

In [None]:
# Barcodes of all cells carrying a coarse effector label; used in the next cell
# to find effector cells that never received a fine-grained sub-label.
is_teff = mod_rna.obs["cell_type_v1"].isin(["CD4+ Teff", "CD8+ Teff"])
teff_obs_names = mod_rna.obs_names[is_teff.to_numpy()]

In [None]:
# Effector cells found in neither the CD4 nor the CD8 sub-object have no
# fine-grained label, so they are removed from the main object.
extra_obs_names = list(
    set(teff_obs_names).difference(mod_rna_cd4.obs_names, mod_rna_cd8.obs_names)
)
is_extra = mod_rna.obs_names.isin(extra_obs_names)
mod_rna = mod_rna[~is_extra].copy()

In [None]:
# Preview the per-cell metadata after merging labels and filtering.
mod_rna.obs.head(n=5)

## Load subtype info

In [None]:
# Read the "scRNAseq" sheet of the MPO/PR3 annotation workbook.
path = os.path.join(data_dir, "..", "MPO_vs_PR3.xlsx")
sheet_raw = pd.read_excel(path, sheet_name="scRNAseq", header=0)

# The usable column names sit in the first data row: promote that row to the
# header and keep the remaining rows under a fresh 0-based index.
df_1 = sheet_raw.iloc[1:].reset_index(drop=True)
df_1.columns = sheet_raw.iloc[0]

# Patients marked "yes" in the PR3 column are PR3; every other row is MPO.
df_1["type"] = np.where(df_1["PR3"] == "yes", "PR3", "MPO")

# Index by patient number, with embedded spaces removed from the IDs.
df_1 = df_1.set_index("Pat.Nr.")
df_1.index = df_1.index.str.replace(" ", "")

In [None]:
# Read the "Tabelle2" sheet of the same workbook (`path` is set in the cell above).
sheet2_raw = pd.read_excel(path, sheet_name="Tabelle2", header=0)

# As for the first sheet, the usable column names are in the first data row:
# promote it to the header and re-index the remaining rows from 0.
df_2 = sheet2_raw.iloc[1:].reset_index(drop=True)
df_2.columns = sheet2_raw.iloc[0]

# Patients marked "yes" in the PR3 column are PR3; every other row is MPO.
df_2["type"] = np.where(df_2["PR3"] == "yes", "PR3", "MPO")

# Index by patient number, with embedded spaces removed from the IDs.
df_2 = df_2.set_index("Pat.Nr.")
df_2.index = df_2.index.str.replace(" ", "")

In [None]:
# Preview the parsed "scRNAseq" sheet.
df_1.head(n=5)

## Join information

In [None]:
# Patients listed in the annotation sheet that have no cells in the dataset.
sheet_patients = set(df_1.index.unique())
data_patients = set(mod_rna.obs["patient"].unique())
sheet_patients - data_patients
# Reverse check (patients in the dataset that are missing from the sheet):
# data_patients - sheet_patients

In [None]:
# Patient ID -> ANCA type ("MPO" / "PR3") lookup taken from the annotation sheet.
# Series.to_dict() always yields one scalar per patient ID; the previous
# per-label `.loc` lookup returned a whole Series whenever an ID occurred more
# than once in the sheet. With repeated IDs the last row now wins.
mapping = df_1["type"].to_dict()

In [None]:
# Annotate every cell with the ANCA type of its patient; patients that are not
# in `mapping` end up as missing values (counted in a later cell).
patient_ids = mod_rna.obs["patient"]
mod_rna.obs["type"] = patient_ids.map(mapping)

In [None]:
# Check that the new "type" column was added to the per-cell metadata.
mod_rna.obs.head(n=5)

In [None]:
# Number of cells whose patient could not be assigned an ANCA type.
mod_rna.obs["type"].isna().sum()

## Overview

In [None]:
# UMAP of the retained cells coloured by ANCA type, saved as a supplementary figure.
umap_kwargs = dict(
    color="type",
    figsize=(5.5, 5),
    title="",
    size=3,
    axes_fraction=1.0,
    save_path="figures/supp/exploratory_anca_type.pdf",
)
plot_umap(mod_rna, **umap_kwargs)

## Analyze T cell composition per subtype

In [None]:
celltype_col = "cell_type_v1"

# Lookup from each cell's "cell_type" label to its label in `celltype_col`,
# built from the per-cell annotation pairs.
celltype_map = dict(zip(mod_rna.obs["cell_type"], mod_rna.obs[celltype_col]))

# Palette keyed by the "cell_type" labels.
palette_by_cell_type = {
    "CD4+ naive": "#1f77b4",
    "CD4+/CD8+ CM": "#ff7f0e",
    "NKT/CTL": "#279e68",
    "CD4+ EM/RM": "#d62728",
    "CD8+ EM/RM": "#aa40fc",
    "CD8+ CM/EM": "#8c564b",
    "CD4+/CD8+ stressed": "#e377c2",
    "Treg": "#b5bd61",
    "CD8+ naive": "#17becf",
    "MAIT": "#ffbb78",
    "Prolif.": "#98df8a",
    "NKT": "#aec7e8",
    "gdT": "#c5b0d5",
    "CD4+ CM": "#ff9896",
    "NK": "#c49c94",
}
# Re-key the palette by the corresponding `celltype_col` labels.
color_map = {
    celltype_map[label]: hex_code for label, hex_code in palette_by_cell_type.items()
}

# Display order, written with the "cell_type" labels ...
order_by_cell_type = [
    "CD4+ EM/RM",
    "CD8+ EM/RM",
    "CD4+ naive",
    "CD4+ CM",
    "CD8+ naive",
    "CD8+ CM/EM",
    "CD4+/CD8+ CM",
    "CD4+/CD8+ stressed",
    "Treg",
    "gdT",
    "MAIT",
    "NKT/CTL",
    "NKT",
    "NK",
    "Prolif.",
]
# ... and translated to `celltype_col` labels, with colours in the same order.
order = [celltype_map[label] for label in order_by_cell_type]
colors = [color_map[label] for label in order]

# Snapshot of the annotation column as it is before the plotting column is added.
orig_col = mod_rna.obs[celltype_col].copy()

# Ordered categorical used for plotting, so categories follow `order`.
mod_rna.obs["cell_type_umap"] = pd.Categorical(
    mod_rna.obs[celltype_col], categories=order, ordered=True
)

In [None]:
# Cell type composition per ANCA type, saved as a supplementary figure.
composition_kwargs = dict(
    order=order,
    figsize=(1.5, 6),
    bbox_to_anchor=(1, 1),
    xlabel=None,
    ylabel="Cell type composition",
    save_path="figures/supp/exploratory_anca_type_composition.pdf",
)
plot_key_compositon_for_groups(
    mod_rna, "type", "cell_type_umap", colors, **composition_kwargs
)

## Teff subtype composition

In [None]:
# Colour for each fine-grained effector label.
cmap_teff = {
    "Tc1": "#6600B4",
    "Tc17-like": "#840AE2",
    "Tc1-like": "#A228FF",
    "NKT/CTL": "#D2D2D2",
    "Th17": "#7B1717",
    "Th1": "#A91F1F",
    "Tfh": "#E0B093",
    "Th1-like": "#D62728",
}

# Display order of the effector labels.
labels_teff = [
    "Th17",
    "Th1",
    "Th1-like",
    "Tfh",
    "Tc1",
    "Tc1-like",
    "Tc17-like",
    "NKT/CTL",
]
colors_teff = [cmap_teff[label] for label in labels_teff]

# Restrict to cells whose coarse label is one of the two effector populations.
is_teff_cell = mod_rna.obs["cell_type_v1"].isin(["CD8+ Teff", "CD4+ Teff"])
mod_rna_sub = mod_rna[is_teff_cell].copy()

# Ordered categorical of the merged fine labels; any label that is not listed
# in `labels_teff` becomes a missing value.
mod_rna_sub.obs["cell_type_umap"] = pd.Categorical(
    mod_rna_sub.obs["cell_type_v1_merged"], categories=labels_teff, ordered=True
)

In [None]:
# Effector subset composition per ANCA type, saved as a main figure; the
# helper's return value is kept in `comp_data`.
teff_composition_kwargs = dict(
    order=labels_teff,
    figsize=(1.5, 6),
    bbox_to_anchor=(1, 1),
    xlabel=None,
    ylabel="Teff subset composition",
    save_path="figures/main/exploratory_anca_type_composition_teff.pdf",
)
comp_data = plot_key_compositon_for_groups(
    mod_rna_sub, "type", "cell_type_umap", colors_teff, **teff_composition_kwargs
)