In [None]:
# Project identifier; embedded in output file names (figures and checkpoints).
PROJECT_NAME = "CropSeq-23-1"
# Sample to process; used in output file names and as the sub-directory name
# under the raw data directory.
SAMPLE = "Th1"


import json
import os
import re

import pandas as pd
import numpy as np
from glob import glob

import seaborn as sns
import matplotlib.pyplot as plt

import scanpy as sc
from scipy import stats
import scvelo as scv
import scirpy as ir

# Project root is taken from the kernel's current working directory.
# NOTE(review): this resolves correctly only when the kernel is started from the
# project root — confirm, since NOTEBOOK_DIR suggests notebooks live in a sub-directory.
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, "data")

# AnnData checkpoints written by save_checkpoint() and read by load_checkpoint().
CHECKPOINT_DIR = os.path.join(DATA_DIR, "checkpoints")

# Derived outputs; figure paths are built inside PDF_DIR by sfile().
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")
PDF_DIR = os.path.join(PROCESSED_DIR, "pdf")
NOTEBOOK_DIR = os.path.join(BASE_DIR, "notebooks")

# Input data; each sample is read from RAW_DATA_DIR/<SAMPLE>.
RAW_DATA_DIR = os.path.join(DATA_DIR, "raw")


def sfile(filename):
    """Build the output path for a figure file and announce it.

    The file name is prefixed with the project and sample identifiers and
    placed inside ``PDF_DIR``.

    Parameters
    ----------
    filename : str
        Bare file name (including extension) of the figure.

    Returns
    -------
    str
        Path ``PDF_DIR/<PROJECT_NAME>_<SAMPLE>_<filename>``.
    """
    # Create the output directory on demand so that the savefig() call that
    # consumes this path does not fail on a fresh checkout.
    os.makedirs(PDF_DIR, exist_ok=True)
    _fname = os.path.join(PDF_DIR, f"{PROJECT_NAME}_{SAMPLE}_{filename}")
    print(f"File save at '{_fname}'")
    return _fname

# Checkpoint handling functions

def save_checkpoint(adata_obj, filename, overwrite=False):
    """Write an AnnData object to the checkpoint directory as ``.h5ad``.

    Parameters
    ----------
    adata_obj
        Object providing ``write_h5ad(path)`` (an AnnData instance).
    filename : str
        File name, resolved relative to ``CHECKPOINT_DIR``.
    overwrite : bool, default False
        Allow replacing an existing checkpoint file.

    Raises
    ------
    FileExistsError
        If the target file exists and ``overwrite`` is False.
    """
    filename = os.path.join(CHECKPOINT_DIR, filename)
    if os.path.isfile(filename) and not overwrite:
        raise FileExistsError(f"File '{filename}' already exists")
    # The checkpoint directory is not created anywhere else; make sure it
    # exists so the first checkpoint of a fresh checkout can be written.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    adata_obj.write_h5ad(filename)

def load_checkpoint(filename):
    """Read a previously saved ``.h5ad`` checkpoint.

    Parameters
    ----------
    filename : str
        File name, resolved relative to ``CHECKPOINT_DIR``.

    Returns
    -------
    The object returned by ``sc.read_h5ad`` for that file.

    Raises
    ------
    FileNotFoundError
        If no regular file exists at the resolved path.
    """
    checkpoint_path = os.path.join(CHECKPOINT_DIR, filename)
    if os.path.isfile(checkpoint_path):
        return sc.read_h5ad(checkpoint_path)
    raise FileNotFoundError(f"Cant find file '{checkpoint_path}'")

def list_checkpoints():
    """Return the names of all entries found in ``CHECKPOINT_DIR``.

    Prints how many entries were found. The returned names are bare file
    names (no directory part), in the order produced by ``glob``.
    """
    pattern = os.path.join(CHECKPOINT_DIR, "*")
    names = [os.path.basename(path) for path in glob(pattern)]
    print(f"Found {len(names)} checkpoint files in dir '{CHECKPOINT_DIR}'")
    return names


### Load raw data

In [None]:
# Read the 10x matrix directory of the selected sample (RAW_DATA_DIR/<SAMPLE>).
guide_adata = sc.read_10x_mtx(os.path.join(RAW_DATA_DIR, SAMPLE))

### Preprocess single cell data

In [None]:
# Flag genes by name prefix so QC percentages can be computed per cell:
# 'mt-' for mitochondrial, 'Rpl'/'Rps' for ribosomal genes.
# NOTE(review): these prefixes are case-sensitive and look like mouse gene symbols — confirm for other species.
guide_adata.var['mt'] = guide_adata.var_names.str.startswith('mt-')
guide_adata.var['ribo'] = guide_adata.var_names.str.startswith('Rpl') | guide_adata.var_names.str.startswith('Rps')
# Writes the QC columns in place (inplace=True); later cells read
# total_counts, n_genes_by_counts and pct_counts_mt from them.
sc.pp.calculate_qc_metrics(guide_adata, qc_vars=['mt', "ribo"], percent_top=None, log1p=False, inplace=True)

In [None]:
# Two stacked QC scatter plots against total counts:
# top = number of detected genes, bottom = mitochondrial percentage.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize = (4, 8))
# show=False draws into the provided axes instead of rendering immediately.
p1 = sc.pl.scatter(guide_adata, x='total_counts', y='n_genes_by_counts', show=False, ax=ax1)
p2 = sc.pl.scatter(guide_adata, x='total_counts', y='pct_counts_mt', show=False, ax=ax2)

In [None]:
# Remove cells whose mitochondrial read percentage reaches the cutoff (%).
MITO_CUTOFF = 5

total_cell_count = len(guide_adata)
# .copy() turns the boolean-indexed view into an independent AnnData object,
# because the following cells modify guide_adata in place.
guide_adata = guide_adata[guide_adata.obs.pct_counts_mt < MITO_CUTOFF, :].copy()

kept_cell_count = len(guide_adata)
removed_cell_count = total_cell_count - kept_cell_count
# Guard against an empty input so the report itself cannot raise.
removed_pct = round(removed_cell_count / total_cell_count * 100, 2) if total_cell_count else 0.0
kept_pct = round(kept_cell_count / total_cell_count * 100, 2) if total_cell_count else 0.0

# The first message reports what was removed (count and share of removed cells),
# the second what remains.
print(f"Filter by cutoff {MITO_CUTOFF}% out "
      f"{removed_cell_count}/{total_cell_count} cells by parameter "
      f"'pct_counts_mt' ({removed_pct}%)")


print(f"Got a final count of {kept_cell_count} cells in "
      f"dataset ({kept_pct}%)")

In [None]:
# Per-cell normalisation followed by dispersion-based gene selection (scvelo).
scv.pp.normalize_per_cell(guide_adata)
# subset=False is passed so the selection does not drop genes from the object.
# NOTE(review): presumably the selection is only annotated on the genes — confirm against the scvelo docs.
scv.pp.filter_genes_dispersion(
    guide_adata,
    min_mean=0.0125,
    max_mean=3,
    min_disp=0.5,
    subset=False
)

In [None]:
# Subsample dataset to at most 7000 cells.
# The request is capped at the number of available cells: asking for more cells
# than remain after QC filtering would make the subsampling call fail.
sc.pp.subsample(guide_adata, n_obs=min(7000, guide_adata.n_obs))

In [None]:
# Keep the current state in .raw before regress_out and scale are applied
# in the following cells.
guide_adata.raw = guide_adata

In [None]:
# Regress out the per-cell total count; runs with 6 parallel jobs.
sc.pp.regress_out(guide_adata, ['total_counts'], n_jobs=6)

In [None]:
# Scale the expression matrix; max_value=10 caps the scaled values.
sc.pp.scale(guide_adata, max_value=10)

In [None]:
# First-pass embedding and clustering with default parameters; the resulting
# 'leiden' column is used by the TCR QC plots below.
sc.pp.pca(guide_adata)
sc.pp.neighbors(guide_adata)
sc.tl.leiden(guide_adata, resolution=0.5)
sc.tl.umap(guide_adata)

In [None]:
# UMAP coloured by Leiden cluster.
# (The original comment mentioned a batch effect, but only the clustering is plotted here.)
plot = sc.pl.umap(
    guide_adata,
    color=["leiden"],
    show = False,
    frameon = False,
    title=["UMAP with leiden clustering"]
)

### Load TCR data

In [None]:
# Contig annotation CSV, expected in the same per-sample directory as the expression matrix.
filename = os.path.join(RAW_DATA_DIR, SAMPLE, "filtered_contig_annotations.csv")

In [None]:
# Load the TCR contig annotations with scirpy.
tcr = ir.io.read_10x_vdj(path=filename)

# Attach the TCR columns to the expression object by cell barcode (index);
# how="left" keeps every cell of guide_adata, cells without TCR data get NaN.
# NOTE(review): this assumes the receptor information lives in tcr.obs, which
# depends on the installed scirpy version — confirm.
guide_adata.obs = pd.DataFrame.merge(guide_adata.obs, tcr.obs, left_index=True, right_index=True, how="left")

# Chain QC, then receptor subtype abundance per Leiden cluster.
ir.tl.chain_qc(guide_adata)
ax = ir.pl.group_abundance(guide_adata, groupby="receptor_subtype", target_col="leiden")

In [None]:
# Chain pairing categories broken down by Leiden cluster.
ax = ir.pl.group_abundance(guide_adata, groupby="chain_pairing", target_col="leiden")

In [None]:
# UMAP highlighting only the cells whose chain_pairing is "single pair".
ax = sc.pl.umap(guide_adata, color="chain_pairing", groups="single pair")

In [None]:
# Compute receptor distances, then define clonotypes from them
# (the clone_id column is used by make_unique_clone_id below).
ir.pp.ir_dist(guide_adata)
ir.tl.define_clonotypes(guide_adata, receptor_arms="all", dual_ir="primary_only")

# Annotate clonal expansion and show it on the UMAP.
ir.tl.clonal_expansion(guide_adata)
sc.pl.umap(guide_adata, color="clonal_expansion")

In [None]:
def make_unique_clone_id(adata_obj, prefix):
    """Prefix every assigned clone id with ``prefix`` so ids are unique across samples.

    The ``clone_id`` column of ``adata_obj.obs`` is replaced in place: cells
    with a clone id get ``"<prefix>-<clone_id>"``, cells without one stay
    missing (NaN).

    Parameters
    ----------
    adata_obj
        Object with an ``obs`` DataFrame that contains a ``clone_id`` column.
    prefix : str
        Text put in front of each clone id, separated by ``-``.

    Returns
    -------
    The same ``adata_obj`` that was passed in (modified in place).
    """
    clone_ids = adata_obj.obs["clone_id"]
    # Decide which cells have a clone id on the original values, before any
    # string conversion: converting first would turn missing entries into the
    # literal text "nan"/"None", which could then be prefixed like a real id.
    assigned = clone_ids.notna()

    unique_ids = pd.Series(np.nan, index=clone_ids.index, dtype=object)
    unique_ids.loc[assigned] = prefix + "-" + clone_ids.loc[assigned].astype(str)

    adata_obj.obs["clone_id"] = unique_ids
    return adata_obj

In [None]:
# make_unique_clone_id modifies guide_adata in place and returns the same
# object, so filtered_rna is a second name for guide_adata, not a copy.
filtered_rna = make_unique_clone_id(guide_adata, SAMPLE)

In [None]:
# Set default OBS values for the guide-related columns on every cell.
# NOTE(review): presumably these are overwritten later for cells with a guide call — not visible in this section.
guide_adata.obs["guide_target"] = "no_guide"
guide_adata.obs["guide_name"] = "no_guide"
guide_adata.obs["feature_call"] = "no_guide"
guide_adata.obs["num_features"] = 0.0
# num_umis is stored as a string while the other counters are floats.
# NOTE(review): looks intentional (matching a text column of the guide call table) — confirm.
guide_adata.obs["num_umis"] = "0"
guide_adata.obs["guide_num_umis"] = 0.0

### Cluster analysis

In [None]:
# Recompute the neighbourhood graph with explicit parameters, then re-run
# Leiden and UMAP; this replaces the 'leiden' column and embedding computed
# in the first-pass clustering cell above.
sc.pp.neighbors(guide_adata, n_neighbors=40, n_pcs=40)
sc.tl.leiden(guide_adata, resolution = 0.5)
sc.tl.umap(guide_adata)

In [None]:
# Show the new clustering; the cluster order seen here is what the labels
# in the renaming cell below are matched against.
sc.pl.umap(guide_adata, color="leiden")

In [None]:
# Rename clusters: labels are applied to the 'leiden' categories by position.
# NOTE(review): the list is hard-coded to seven clusters in a fixed order; if
# the clustering above yields a different number or order of clusters, the
# labels will fail to apply or end up on the wrong clusters — re-check after
# any change to the preprocessing or clustering parameters.
cluster_names = [
    "CD4+ Bcl6-high",
    "CD4+ Prf1-high",
    "Th17",
    "CD4+ undefined",
    "Th1",
    "Treg",
    "CD4+ Ifit1-high",
]

guide_adata.rename_categories("leiden", cluster_names)

In [None]:
# Keep the named clusters in a separate column, which the marker plot groups by.
guide_adata.obs["no_guide_cluster"] = guide_adata.obs["leiden"]

In [None]:

# Marker genes shown in the matrix plot, in plotting order.
marker_gene_list = [
    "Bcl6",
    "Prf1",
    "Il17a",
    "Rorc",
    "Ccr6",
    "Il23r",
    "Il22",
    "Gzma",
    "Cxcr3",
    "Ifng",
    "Gzmb",
    "Foxp3",
    "Il10",
    "Irf7",
    "Ifit1",
]

# Matrix plot of the marker genes per named cluster. standard_scale="var"
# rescales each gene, matching the 0..1 colour range set via vmin/vmax.
# return_fig=True returns the plot object instead of drawing it, so it can be saved below.
fig = sc.pl.matrixplot(
    guide_adata,
    marker_gene_list,
    groupby='no_guide_cluster',
    cmap='viridis',
    swap_axes = False,
    standard_scale = "var",
    vmax=1,
    vmin=0,
    show = False,
    title = "Differential Expression",
    return_fig=True,
    figsize=(7, 7),
)

# Save as PDF; sfile() builds the path inside PDF_DIR with the project/sample prefix.
fig.savefig(sfile("th1-diff-expression-matrix-plot.pdf"), transparent=True)

### Save checkpoint

In [None]:
# save_checkpoint() resolves the name against CHECKPOINT_DIR itself, so only
# the bare file name is passed. Joining CHECKPOINT_DIR here as well only
# worked while that directory was an absolute path.
save_checkpoint(
    adata_obj=guide_adata,
    filename=f"{PROJECT_NAME}-{SAMPLE}-preprocessed.h5ad",
    overwrite=True
)