In [None]:
import os

import pandas as pd
import numpy as np
from glob import glob

import scanpy as sc


# Project-wide settings. Every path is derived from the working directory of
# the kernel at the moment this cell is executed.
PROJECT_NAME = "CropSeq-19"

BASE_DIR = os.getcwd()
NOTEBOOK_DIR = os.path.join(BASE_DIR, "notebooks")

# <BASE_DIR>/data and its sub-folders
DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DATA_DIR = os.path.join(DATA_DIR, "raw")
CHECKPOINT_DIR = os.path.join(DATA_DIR, "checkpoints")  # used by the checkpoint helpers

# <BASE_DIR>/data/processed and the folder sfile() builds output paths in
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")
PDF_DIR = os.path.join(PROCESSED_DIR, "pdf")

# Checkpoint handling functions

def save_checkpoint(adata_obj, filename, overwrite=False):
    """Write ``adata_obj`` to ``CHECKPOINT_DIR/filename`` via its ``write_h5ad`` method.

    Parameters
    ----------
    adata_obj
        Object exposing ``write_h5ad(path)`` (presumably an AnnData object).
    filename
        File name, joined onto ``CHECKPOINT_DIR``.
    overwrite
        When false (the default) an already existing file is never replaced.

    Raises
    ------
    FileExistsError
        If the target file exists and ``overwrite`` is false.
    """
    target_path = os.path.join(CHECKPOINT_DIR, filename)
    target_exists = os.path.isfile(target_path)
    if target_exists and not overwrite:
        raise FileExistsError(f"File '{target_path}' already exists")
    adata_obj.write_h5ad(target_path)

def load_checkpoint(filename):
    """Read ``CHECKPOINT_DIR/filename`` with ``sc.read_h5ad`` and return the result.

    Raises
    ------
    FileNotFoundError
        If no regular file exists at that path.
    """
    checkpoint_path = os.path.join(CHECKPOINT_DIR, filename)
    if os.path.isfile(checkpoint_path):
        return sc.read_h5ad(checkpoint_path)
    raise FileNotFoundError(f"Cant find file '{checkpoint_path}'")

def list_checkpoints():
    """Return the names of all entries directly inside ``CHECKPOINT_DIR``.

    Only the last path component of each match is returned. A one-line summary
    with the number of matches is printed as a side effect. The order is
    whatever ``glob`` yields.
    """
    pattern = os.path.join(CHECKPOINT_DIR, "*")
    checkpoint_names = []
    for matched_path in glob(pattern):
        _parent, name = os.path.split(matched_path)
        checkpoint_names.append(name)
    print(f"Found {len(checkpoint_names)} checkpoint files in dir '{CHECKPOINT_DIR}'")
    return checkpoint_names

def sfile(filename):
    """Build the output path for ``filename`` inside ``PDF_DIR`` and return it.

    The file name is prefixed with ``"<PROJECT_NAME>_merged_"``. The resulting
    path is also printed; nothing is created on disk by this function.
    """
    prefixed_name = f"{PROJECT_NAME}_merged_{filename}"
    output_path = os.path.join(PDF_DIR, prefixed_name)
    print(f"File save at '{output_path}'")
    return output_path

### Load checkpoint

In [None]:
# Restore the object every later cell works on from its checkpoint file.
_integrated_checkpoint = "Cropseq_all_integrated_murine__gRNA_integrated_seuratObject.h5ad"
adata_concat = load_checkpoint(_integrated_checkpoint)

### Count control subset cells

In [None]:
"Th17 cells", len(adata_concat.obs[adata_concat.obs["seurat_clusters"].isin([1, 5]) & (adata_concat.obs["gRNA_group"] == "control")])

In [None]:
"Th1 cells", len(adata_concat.obs[adata_concat.obs["seurat_clusters"].isin([3]) & (adata_concat.obs["gRNA_group"] == "control")])

In [None]:
"Treg cells", len(adata_concat.obs[adata_concat.obs["seurat_clusters"].isin([0, 6]) & (adata_concat.obs["gRNA_group"] == "control")])

In [None]:
"All control", len(adata_concat.obs[(adata_concat.obs["gRNA_group"] == "control")])

### Calculate distances to scrambled control subclusters

In [None]:
def get_distance(barcode_list_1, barcode_list_2):
    return np.sqrt(
        np.sum(
            np.power(
                (
                    np.mean(adata_concat[barcode_list_1,:].raw.X, axis=0) - \
                    np.mean(adata_concat[barcode_list_2,:].raw.X, axis=0)
                ),
                2
            )
        )
    )

In [None]:
# Barcodes (obs index) of the control cells: all of them, plus the subsets that
# fall into the Seurat clusters used as Th17 / Th1 / Treg references.
_obs = adata_concat.obs
_is_control = _obs["gRNA_group"] == "control"
_cluster_of_cell = _obs["seurat_clusters"]

control_barcodes = _obs.loc[_is_control].index
th17_barcodes = _obs.loc[_cluster_of_cell.isin([1, 5]) & _is_control].index
th1_barcodes = _obs.loc[_cluster_of_cell.isin([3]) & _is_control].index
treg_barcodes = _obs.loc[_cluster_of_cell.isin([0, 6]) & _is_control].index

In [None]:
# One row per gRNA group: distance (see get_distance) from the cells of that
# group to each control reference population and to all control cells.
_reference_populations = {
    'Th17': th17_barcodes,
    'Th1': th1_barcodes,
    'Treg': treg_barcodes,
}

row_list = []
for target in adata_concat.obs["gRNA_group"].unique():
    _in_target_group = adata_concat.obs["gRNA_group"] == target
    all_grna_barcodes = adata_concat.obs.loc[_in_target_group].index

    _row = {'group': target}
    for _label, _reference_barcodes in _reference_populations.items():
        # Cells that belong to the reference itself are left out of the
        # compared set (this only matters for the control group).
        _outside_reference = all_grna_barcodes[~all_grna_barcodes.isin(_reference_barcodes)]
        _row[_label] = get_distance(_reference_barcodes, _outside_reference)
    _row["Control"] = get_distance(control_barcodes, all_grna_barcodes)

    row_list.append(_row)

In [None]:
# Tabulate the per-group distances: one row per gRNA group, one column per dict key.
df = pd.DataFrame(data=row_list)

In [None]:
# Maps the labels applied to the "group" column to the names used in the output
# table (presumably official mouse gene symbols; verify). Keys are looked up
# verbatim, so their spelling (e.g. 'tgfrb2') is kept exactly as in the data.
grna_convertion_dict = {
    'SOCS1': 'Socs1',
    'Ahr': 'Ahr',
    'IL-23R': 'Il23r',
    'tgfrb2': 'Tgfbr2',
    'IRF4': 'Irf4',
    'JAK2': 'Jak2',
    'TLR4': 'Tlr4',
    'IL-6RA': 'Il6ra',
    'CCR6': 'Ccr6',
    'Sykb': 'Syk',
    'tgfbr1': 'Tgfbr1',
    'TRAF6': 'Traf6',
    'IL-17A': 'Il17a',
    'CD40L': 'Cd40lg',
    'SOCS3': 'Socs3',
    'Rorc': 'Rorc',
    'IL1r1': 'Il1r1',
    'STAT6': 'Stat6',
    'NFkb1': 'Nfkb1',
    'IL-7R': 'Il7r',
    'CEBPB': 'Cebpb',
    'JAK1': 'Jak1',
    'Runx1': 'Runx1',
    'NFATC2': 'Nfatc2',
    'Tbx21': 'Tbx21',
    'STAT3': 'Stat3',
    'MyD88': 'Myd88',
    'IL-12RB': 'Il12rb1',
    "control": "control",
}

In [None]:
# Translate the raw gRNA group labels into the names from grna_convertion_dict.
# The frame is rebuilt from row_list first so this cell can be re-run safely:
# converting an already converted column would raise KeyError, because the
# converted names are not keys of the dictionary. Unknown labels still raise
# KeyError instead of being passed through silently.
df = pd.DataFrame(row_list)
df["group"] = df["group"].apply(lambda raw_label: grna_convertion_dict[raw_label])

In [None]:
# Export the table with the converted group names. Writing a fresh
# pd.DataFrame(row_list) here would discard the label conversion applied to df.
df.to_csv(sfile("euclidian-distances-to-scrambled-subclusters.csv"))