In [None]:
# Global variables
# Project tag; used as prefix for checkpoint and figure file names further down.
PROJECT_NAME = "ANCA"


# Import section
import logging
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)

import warnings
# Ignore user/future/deprecation warnings for the rest of the session.
# The filters are installed before the third-party imports below are executed.
warnings.simplefilter("ignore", category=UserWarning)
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=DeprecationWarning)

import os
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from anndata import AnnData
import scanpy as sc
import scanpy.external as sce

# Presumably prints the scanpy/dependency versions for the record -- confirm in scanpy docs.
sc.logging.print_header()

import scvelo as scv
import scirpy as ir

# scVelo session settings: print its version, raise verbosity and select the
# 'scvelo' figure style.
# NOTE(review): exact effect of `presenter_view` is not visible here -- confirm in scVelo docs.
scv.logging.print_version()
scv.settings.verbosity = 3
scv.settings.presenter_view = True
scv.set_figure_params('scvelo')


# Every data location is derived from the directory the kernel was started in.
BASE_DIR = os.getcwd()

# data/
#   checkpoints/      intermediate AnnData files (.h5ad)
#   raw/              raw input data
#   processed/pdf/    exported figures
DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DATA_DIR = os.path.join(DATA_DIR, "raw")
PROJECT_CHECKPOINT_DIR = os.path.join(DATA_DIR, "checkpoints")
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")
PDF_DIR = os.path.join(DATA_DIR, "processed", "pdf")


# Identifiers of the samples to load; also used as batch labels after concatenation.
PROJECTS = [
    "P053", "P067", "P070", "P129",
    "P137", "P138", "P139", "P140",
    "P143", "P150", "P152",
]


# Checkpoint handling functions

def save_checkpoint(adata_obj, filename, overwrite=False):
    """Write ``adata_obj`` as an h5ad file into the checkpoint directory.

    Parameters
    ----------
    adata_obj
        Object providing ``write_h5ad`` (an AnnData in this notebook).
    filename
        File name, joined onto ``PROJECT_CHECKPOINT_DIR``.
    overwrite
        When False (default) an existing file is never replaced.

    Raises
    ------
    FileExistsError
        If the target file exists and ``overwrite`` is False.
    """
    target_path = os.path.join(PROJECT_CHECKPOINT_DIR, filename)
    already_present = os.path.isfile(target_path)
    if already_present and not overwrite:
        raise FileExistsError(f"File '{target_path}' already exists")
    adata_obj.write_h5ad(target_path)

def load_checkpoint(filename):
    """Read an h5ad checkpoint from the checkpoint directory.

    ``filename`` is joined onto ``PROJECT_CHECKPOINT_DIR``; the result of
    ``sc.read_h5ad`` is returned.

    Raises
    ------
    FileNotFoundError
        If no regular file exists at the resolved path.
    """
    checkpoint_path = os.path.join(PROJECT_CHECKPOINT_DIR, filename)
    if os.path.isfile(checkpoint_path):
        return sc.read_h5ad(checkpoint_path)
    raise FileNotFoundError(f"Cant find file '{checkpoint_path}'")

def list_checkpoints():
    """Return the names of all entries in the checkpoint directory.

    Prints how many entries were found and returns their base names
    (without the directory part) in the order ``glob`` yields them.
    """
    pattern = os.path.join(PROJECT_CHECKPOINT_DIR, "*")
    checkpoint_names = [os.path.basename(path) for path in glob(pattern)]
    print(f"Found {len(checkpoint_names)} checkpoint files in dir '{PROJECT_CHECKPOINT_DIR}'")
    return checkpoint_names

def sfile(filename):
    """Build the output path for a figure and announce it.

    The returned path is ``PDF_DIR/<PROJECT_NAME>-<filename>``. The figure
    directory is created if it is missing, because ``fig.savefig`` does not
    create parent directories and would otherwise fail on a fresh checkout.

    Parameters
    ----------
    filename
        File name of the figure (e.g. ``"umap.pdf"``).

    Returns
    -------
    str
        Full path the caller should save to.
    """
    os.makedirs(PDF_DIR, exist_ok=True)
    _fname = os.path.join(PDF_DIR, f"{PROJECT_NAME}-{filename}")
    print(f"File save at '{_fname}'")
    return _fname


In [None]:
# Load checkpoints to AnnData Objects
# One preprocessed checkpoint per sample, keyed by sample id (insertion order == PROJECTS order).
adata_objs = {
    patient: load_checkpoint(f"{PROJECT_NAME}-{patient}-preprocessed.h5ad")
    for patient in PROJECTS
}

In [None]:
# Concat AnnData Objects
# The first object is the concatenation anchor, the remaining ones are appended;
# PROJECTS supplies the labels written to the batch annotation.
patient_adatas = list(adata_objs.values())
adata_concat = patient_adatas[0].concatenate(
    patient_adatas[1:],
    batch_categories=PROJECTS,
)

# Drop every reference to the per-sample objects so their memory can be reclaimed.
del adata_objs, patient_adatas

In [None]:
# Rerun concat dataset PCA and neighbor analysis
n_pcs = 40        # principal components computed and used for the graph
n_neighbors = 40  # neighbourhood size of the kNN graph

sc.pp.pca(adata_concat, n_comps=n_pcs, svd_solver="arpack")
sc.pp.neighbors(adata_concat, n_neighbors=n_neighbors, n_pcs=n_pcs)
sc.tl.leiden(adata_concat, resolution=0.5)
sc.tl.umap(adata_concat)

In [None]:
# Show batch effect (this cell runs before the Harmony integration below)
# Maps each coloured obs column to its panel title.
umap_panels = {
    "batch": "Batches",
    "leiden": "UMAP with leiden clustering",
}
plot = sc.pl.umap(
    adata_concat,
    color=list(umap_panels.keys()),
    title=list(umap_panels.values()),
    frameon=False,
    show=False,
)

In [None]:
# Run harmony
# Integrate over the "batch" annotation; the corrected embedding is stored in .obsm.
harmony_basis = "X_pca_harmony"
sce.pp.harmony_integrate(adata_concat, "batch", adjusted_basis=harmony_basis)
assert harmony_basis in adata_concat.obsm

In [None]:
# Rebuild the neighbour graph on the Harmony embedding, then refresh UMAP and clustering.
sc.pp.neighbors(adata_concat, use_rep="X_pca_harmony")
sc.tl.umap(adata_concat)
sc.tl.leiden(adata_concat, key_added="leiden", resolution=0.5)

In [None]:
# Batch / cluster overview after Harmony integration.
# Maps each coloured obs column to its panel title.
umap_panels = {
    "batch": "Batches",
    "leiden": "UMAP with leiden clustering",
}
plot = sc.pl.umap(
    adata_concat,
    color=list(umap_panels.keys()),
    title=list(umap_panels.values()),
    frameon=False,
    show=False,
)

In [None]:
def exclude_cluster(adata_obj: AnnData, cluster: str, obs_name: str = "leiden") -> AnnData:
    """Return ``adata_obj`` without the cells assigned to ``cluster``.

    Parameters
    ----------
    adata_obj
        Annotated data matrix to filter.
    cluster
        Label in ``adata_obj.obs[obs_name]`` whose cells are dropped.
    obs_name
        Name of the obs column holding the cluster labels.

    Returns
    -------
    AnnData
        A standalone copy holding only the kept cells. Slicing alone would
        return a view onto the unfiltered parent; later cells modify the
        result in place (neighbors, UMAP, leiden, category renaming), so the
        subset is materialised here, which also lets the parent be released.
    """
    total_cells = len(adata_obj.obs)
    keep_mask = adata_obj.obs[obs_name] != cluster
    filtered = adata_obj[keep_mask, :].copy()

    kept_cells = len(filtered.obs)
    # Avoid a ZeroDivisionError when an empty object is passed in.
    kept_percent = round(kept_cells / total_cells * 100, 2) if total_cells else 0.0
    print(f"{kept_cells} / {total_cells} cells kept ({kept_percent} %).")
    return filtered

In [None]:
# Drop leiden cluster "13" (label refers to the clustering computed directly above).
adata_concat = exclude_cluster(adata_concat, cluster="13")

In [None]:
# Recompute graph, embedding and clustering on the filtered cells
# (still based on the Harmony embedding).
sc.pp.neighbors(adata_concat, use_rep="X_pca_harmony")
sc.tl.umap(adata_concat)
sc.tl.leiden(adata_concat, key_added="leiden", resolution=0.5)

In [None]:
# Batch / cluster overview after removing the first cluster and re-clustering.
# Maps each coloured obs column to its panel title.
umap_panels = {
    "batch": "Batches",
    "leiden": "UMAP with leiden clustering",
}
plot = sc.pl.umap(
    adata_concat,
    color=list(umap_panels.keys()),
    title=list(umap_panels.values()),
    frameon=False,
    show=False,
)

In [None]:
# Remove three more clusters one after another; each call reports how many cells remain.
for cluster_id in ("11", "12", "9"):
    adata_concat = exclude_cluster(adata_concat, cluster=cluster_id)

In [None]:
# Batch / cluster overview of the remaining cells (no re-clustering after the last removal).
# Maps each coloured obs column to its panel title.
umap_panels = {
    "batch": "Batches",
    "leiden": "UMAP with leiden clustering",
}
plot = sc.pl.umap(
    adata_concat,
    color=list(umap_panels.keys()),
    title=list(umap_panels.values()),
    frameon=False,
    show=False,
)

In [None]:
# Rename clusters
# Names are applied positionally to the categories of obs["leiden"].
# NOTE(review): this assumes exactly ten remaining leiden categories in this order -- confirm.
cluster_names = [
    "CD4+", "Naive CD4+ (SELL-high)", "CD4 NFATC2+",
    "Th17 1", "Th17 2",
    "pathogenic Th1 (GZM high)", "Th1 1",
    "Follikular T helper", "Treg", "Th1 2",
]

adata_concat.rename_categories("leiden", cluster_names)

In [None]:
# Marker genes shown in the matrix plot, in display order.
marker_gene_list = [
    "TCF7", "CCR7", "SELL", "ITGA2", "RORA", "IL2", "TNF", "TGFBR2",
    "IL17RA", "IL17A", "RORC", "CCR6", "S1PR1", "NFATC2", "TGFBR3",
    "TBX21", "IFNG", "GZMA", "GZMB", "PRF1", "EOMES", "CXCR3",
    "FOXP3", "IL10", "CTLA4", "TIGIT", "IKZF2", "IL2RA", "CCR8",
    "AHR", "BCL6",
]

fig, ax = plt.subplots(figsize=(7, 15))

# Genes on rows (swap_axes), clusters on columns; values scaled per gene to [0, 1].
ax = sc.pl.matrixplot(
    adata_concat,
    marker_gene_list,
    groupby="leiden",
    standard_scale="var",
    vmin=0,
    vmax=1,
    cmap="viridis",
    swap_axes=True,
    title="Differential Expression",
    ax=ax,
    show=False,
)

fig.savefig(sfile("diff_expression_matrix_plot.pdf"), transparent=True)

In [None]:
# Square UMAP coloured by cluster, exported as PDF.
fig, ax = plt.subplots(figsize=(7, 7))
ax.set_aspect("equal", "box")

ax = sc.pl.umap(
    adata_concat,
    color="leiden",
    title="UMAP Leiden",
    ax=ax,
    frameon=False,
    show=False,
)

# Re-apply the aspect on the Axes returned by scanpy.
ax.set_aspect("equal", "box")

fig.tight_layout()
fig.savefig(
    sfile("umap_leiden_cluster_numbers.pdf"),
    transparent=True,
    dpi=600,
)

### PAGA

In [None]:
# Mirror the graph matrices from .obsp into .uns['neighbors'].
# NOTE(review): presumably needed because scVelo reads the graphs from there -- confirm.
for graph_key in ("distances", "connectivities"):
    adata_concat.uns["neighbors"][graph_key] = adata_concat.obsp[graph_key]

scv.tl.paga(adata_concat, groups="leiden")

In [None]:
# PAGA graph overlaid on the UMAP, exported as a square PDF.
fig, ax = plt.subplots(figsize=(8, 8))

scv.pl.paga(
    adata_concat,
    basis="umap",
    color="leiden",
    size=40,
    alpha=0.1,
    min_edge_width=2,
    node_size_scale=1,
    show=False,
    frameon=True,
    ax=ax,
)

# show=False (as in the other figure cells) keeps scanpy from rendering the
# figure before the legend/title/label clean-up below has been applied.
sc.pl.umap(
    adata_concat,
    ax=ax,
    color="leiden",
    alpha=0.3,
    title="PAGA graph",
    show=False,
)

# Strip legend, title and axis labels for the exported figure.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("")
ax.set_xlabel("")
ax.set_ylabel("")

fig.tight_layout()
fig.savefig(sfile("paga-umap-square.pdf"), transparent=True, dpi=600)


### TCR analysis

In [None]:
def get_clone_ids_by_cluster(adata_obj, cluster) -> list:
    """Return the distinct clone ids found in one cluster.

    Considers the rows of ``adata_obj.obs`` whose ``"leiden"`` value equals
    ``cluster`` and whose ``"clone_id"`` is not missing. The result comes
    from ``np.unique`` and is therefore a sorted NumPy array of unique ids.
    """
    obs = adata_obj.obs
    in_cluster = obs["leiden"] == cluster
    has_clone_id = obs["clone_id"].notna()
    clone_ids = obs.loc[in_cluster & has_clone_id, "clone_id"]
    return np.unique(clone_ids.tolist())

def inner_join(list1, list2) -> list:
    """Return the elements present in both inputs, without duplicates.

    The order of the returned list is that of the underlying set and is
    therefore not guaranteed.
    """
    left, right = set(list1), set(list2)
    return list(left.intersection(right))


def get_overlapping_clone_ids(cluster_1, cluster_2):
    """Return the clone ids that occur in both clusters of ``adata_concat``.

    Reads the notebook-level ``adata_concat`` object; the result is an
    unordered list (see ``inner_join``).
    """
    clone_ids_1 = get_clone_ids_by_cluster(adata_concat, cluster_1)
    clone_ids_2 = get_clone_ids_by_cluster(adata_concat, cluster_2)
    return inner_join(clone_ids_1, clone_ids_2)

def sanitize_cluster_name(value):
    """Turn a cluster label into a file-name friendly token.

    Spaces become underscores, ``+`` becomes ``pos`` and parentheses are
    removed; every other character is kept as is.
    """
    substitutions = (
        (" ", "_"),
        ("+", "pos"),
        ("(", ""),
        (")", ""),
    )
    sanitized = value
    for old, new in substitutions:
        sanitized = sanitized.replace(old, new)
    return sanitized

In [None]:
# Cluster labels in the order of the leiden categories; the position is also
# used to look up the matching colour in uns["leiden_colors"].
cluster_names = [
    "CD4+",
    "Naive CD4+ (SELL-high)",
    "CD4 NFATC2+",
    "Th17 1",
    "Th17 2",
    "pathogenic Th1 (GZM high)",
    "Th1 1",
    "Follikular T helper",
    "Treg",
    "Th1 2",
]

# One figure per cluster: highlight every cell (in any cluster) whose clone id
# also occurs in that cluster.
for cluster_idx, cluster_name in enumerate(cluster_names):
    fig, ax = plt.subplots(figsize=(7, 7))

    # NOTE(review): both arguments are the same cluster, so this is simply the
    # set of clone ids present in `cluster_name` -- confirm this is intended.
    clone_ids_in_cluster = get_overlapping_clone_ids(cluster_name, cluster_name)

    # String flag ("True"/"False") so it can be used as a categorical colour key.
    adata_concat.obs["clone_id_san"] = (
        adata_concat.obs["clone_id"].isin(clone_ids_in_cluster).astype(str)
    )

    ax = sc.pl.umap(
        adata_concat,
        color=["clone_id_san"],
        title=f"Plastic clones - {cluster_name}",
        palette={
            "True": adata_concat.uns["leiden_colors"][cluster_idx],
            "False": "lightgrey",
        },
        groups=["True"],
        legend_loc=None,
        # Highlighted cells are drawn larger than the background cells.
        size=[
            40 if flag == "True" else 15 for flag in adata_concat.obs["clone_id_san"]
        ],
        ax=ax,
        show=False,
    )

    fig.tight_layout()
    fig.savefig(sfile(f"umap-plastic-clones-square-{sanitize_cluster_name(cluster_name)}.pdf"))

In [None]:
# One figure per clone for the 20 largest clones: the full UMAP as background
# with the cells of that clone drawn on top.
top_clones = adata_concat.obs["clone_id"].value_counts().head(20).to_dict()

for clone_id, expansion in top_clones.items():

    fig, ax = plt.subplots(figsize=(7, 7))

    # Pick the clone's UMAP coordinates with a boolean mask instead of
    # building a sliced AnnData just to read .obsm.
    is_clone = (adata_concat.obs["clone_id"] == clone_id).to_numpy()
    umap_coords = adata_concat.obsm["X_umap"][is_clone]

    ax = sc.pl.umap(
        adata_concat,
        show=False,
        title=f"Clone {clone_id} ({expansion} cells)",
        ax=ax,
    )

    # Draw on the explicit Axes rather than on pyplot's implicit current axes.
    ax.scatter(
        x=np.array(umap_coords[:, 0]),
        y=np.array(umap_coords[:, 1]),
        c="#db2777",
        s=3,
    )
    fig.tight_layout()
    fig.savefig(sfile(f"highlight_clone_id_{clone_id}.pdf"))


In [None]:
# Clonotype repertoire overlap between the leiden clusters.
ir.tl.repertoire_overlap(adata_concat, "leiden")

# No `ax` is passed: the previous code handed over whatever Axes was left in
# the kernel by an earlier cell (hidden state), while the returned object is
# used below as a cluster grid that owns its own figure.
# NOTE(review): the extra keyword arguments are presumably forwarded to
# seaborn's clustermap, where an `ax` keyword would not be valid -- confirm
# against the installed scirpy version.
cg = ir.pl.repertoire_overlap(
    adata_concat,
    "leiden",
    yticklabels=True,
    xticklabels=True,
    row_cluster=False,
    col_cluster=False,
)

# Clustering is disabled above, so hide the column dendrogram panel.
cg.ax_col_dendrogram.set_visible(False)

cg.fig.savefig(sfile("repertoire_overlap.pdf"))

In [None]:
# UMAP coloured by batch, exported as PDF.
fig, ax = plt.subplots(figsize=(8, 8))

# show=False (as in the other figure cells) postpones rendering until the
# layout has been tightened; `ax` is no longer overwritten with the call's
# return value.
sc.pl.umap(adata_concat, ax=ax, color="batch", title="UMAP", show=False)

fig.tight_layout()
fig.savefig(sfile("umap-batches.pdf"), transparent=True, dpi=600)

In [None]:
# Number of cells per batch label in the merged, filtered dataset.
adata_concat.obs["batch"].value_counts()

### Save checkpoint

In [None]:
# Persist the merged dataset. Only the bare file name is passed:
# save_checkpoint() itself prefixes PROJECT_CHECKPOINT_DIR (same convention as
# the load_checkpoint() calls). Joining the directory here as well only worked
# because the directory is an absolute path.
# overwrite=False: re-running this cell raises FileExistsError instead of
# silently replacing an existing checkpoint.
save_checkpoint(
    adata_obj=adata_concat,
    filename=f"{PROJECT_NAME}-merged.h5ad",
    overwrite=False,
)