# Setup

In [1]:
import scanpy as sc
import scvi
import matplotlib.pyplot as plt
import warnings
import os
import pandas as pd
import anndata as ad
import seaborn as sns

Global seed set to 0


In [2]:
# Plot defaults: 4x4 figures on a white background, 150 dpi on screen and 300 dpi when saved.
sc.set_figure_params(
    figsize=(4, 4),
    dpi=150,
    dpi_save=300,
    facecolor="white",
    frameon=False,
)
# Grid lines off; set after set_figure_params so that call cannot overwrite it.
plt.rcParams.update({"axes.grid": False})
# Silence FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Show the kernel's working directory; the relative "../../analysis/..." output
# paths used later in this notebook resolve against it.
os.getcwd()

'/notebook/Joe/M-cells/code/epithelial_signature_processing'

## Set up data paths

### demeter

In [None]:
# Data roots on the "demeter" machine. Keep the trailing slash: the import cell
# builds sub-paths by plain string interpolation (f"{rawDataPath}Tabula_Muris/...").
# NOTE(review): this cell carries no execution count, yet rawDataPath is required
# by the data-import cell — make sure it is run on a fresh kernel.
rawDataPath = "/mnt/iacchus/joe/raw_data/"
processedDataPath = "/mnt/iacchus/joe/processed_data/"  # not referenced in the cells visible here

### gardner-lab-computer

In [None]:
# Alternative data roots for the "gardner-lab-computer" machine. Left disabled;
# uncomment these two lines (instead of running the demeter cell) when working there.
# rawDataPath = "/mnt/e/Archive/Joe/raw_data/"
# processedDataPath = "/mnt/e/Archive/Joe/processed_data/"

# Data import

In [4]:
# Load every 10x sample directory under the Tabula Muris droplet folder into
# adataDict, keyed by directory name (e.g. "Bladder-10X_P4_3").
path = f"{rawDataPath}Tabula_Muris/droplet/data/"
adataDict = {}
# os.listdir returns entries in arbitrary order, so sort for a reproducible
# batch order, and skip stray entries that are not sample directories.
# The loop variable is named `sample` so the builtin `dir` is not shadowed.
for sample in sorted(os.listdir(path)):
    sample_dir = os.path.join(path, sample)
    if not os.path.isdir(sample_dir):
        continue
    print(f"Importing sample: {sample}")
    data = sc.read_10x_mtx(sample_dir, cache=True)
    # Prefix each barcode with the channel id (the piece after the first "-",
    # e.g. "10X_P4_3") so cell names stay distinct across samples.
    data.obs.index = sample.split("-")[1] + "_" + data.obs.index
    adataDict[sample] = data

Importing sample: Bladder-10X_P4_3
Importing sample: Bladder-10X_P4_4
Importing sample: Bladder-10X_P7_7
Importing sample: Heart-10X_P7_4
Importing sample: Kidney-10X_P4_5
Importing sample: Kidney-10X_P4_6
Importing sample: Kidney-10X_P7_5
Importing sample: Liver-10X_P4_2
Importing sample: Liver-10X_P7_0
Importing sample: Liver-10X_P7_1
Importing sample: Lung-10X_P7_8
Importing sample: Lung-10X_P7_9
Importing sample: Lung-10X_P8_12
Importing sample: Lung-10X_P8_13
Importing sample: Mammary-10X_P7_12
Importing sample: Mammary-10X_P7_13
Importing sample: Marrow-10X_P7_2
Importing sample: Marrow-10X_P7_3
Importing sample: Muscle-10X_P7_14
Importing sample: Muscle-10X_P7_15
Importing sample: Spleen-10X_P4_7
Importing sample: Spleen-10X_P7_6
Importing sample: Thymus-10X_P7_11
Importing sample: Tongue-10X_P4_0
Importing sample: Tongue-10X_P4_1
Importing sample: Tongue-10X_P7_10
Importing sample: Trachea-10X_P8_14
Importing sample: Trachea-10X_P8_15


In [5]:
# Stack all samples into a single AnnData; label="batch" records each cell's
# adataDict key in adata.obs["batch"].
adata = ad.concat(adataDict, label="batch")
# Keep only the text before the first "-" of every cell name.
adata.obs.index = [cell_name.split("-")[0] for cell_name in adata.obs.index]

## Add annotations

In [6]:
# Per-cell annotations, indexed by the "cell" column so they align with adata.obs.
metadata = pd.read_csv(
    f"{rawDataPath}Tabula_Muris/droplet/annotations_droplet.csv",
    index_col="cell",
)
# Keep only annotated cells, in annotation order. Subsetting returns a view, so
# take an explicit copy before assigning .obs instead of relying on AnnData's
# implicit (and warned-about) view-to-copy conversion.
adata = adata[metadata.index].copy()
adata.obs = adata.obs.join(metadata)

# Process data

In [9]:
# Scale every cell to a total of 10,000 counts, then apply log1p.
# Both calls modify adata in place, so re-running this cell would transform
# the already-transformed data a second time.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Create pseudobulk signatures

In [11]:
def create_signature_matrix(adata, obs_key, save_path):
    """Build a genes x groups matrix of per-group mean expression and save it as CSV.

    Cells are grouped by the values of ``adata.obs[obs_key]``; for each group the
    mean of ``adata.X`` over its cells is computed for every gene.

    Parameters
    ----------
    adata
        AnnData-like object; ``X`` may be a scipy sparse matrix or a dense array.
    obs_key
        Column of ``adata.obs`` holding the group label of each cell.
    save_path
        Destination of the CSV file (passed to ``DataFrame.to_csv``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``adata.var_names`` with one float64 column per group label,
        in order of first appearance. Cells with a missing label are left out.
    """
    import numpy as np  # local import: the notebook's import cell does not load numpy

    labels = adata.obs[obs_key]
    group_means = {}
    # dropna(): a NaN label never compares equal to itself, so it would select
    # zero cells and only contribute an all-NaN column.
    for cell_type in labels.dropna().unique():
        group_X = adata[labels == cell_type].X
        # .mean(axis=0) exists on sparse and dense matrices alike, so the group
        # is never densified; ravel() flattens the (1, n_genes) result that
        # sparse matrices return. Accumulate in float64 for a stable mean.
        group_means[cell_type] = np.asarray(
            group_X.mean(axis=0, dtype=np.float64)
        ).ravel()
    # Build the frame in one go rather than inserting columns one at a time.
    signatures = pd.DataFrame(group_means, index=adata.var_names)
    signatures.to_csv(save_path)
    return signatures

## Full atlas

In [16]:
# Mean-expression signature for every cell_ontology_class label across the full atlas.
signatures = create_signature_matrix(
    adata,
    obs_key="cell_ontology_class",
    save_path="../../analysis/cell_type_signatures/tabula_muris_signatures.csv",
)

In [22]:
# Remove the "unknown" label column. errors="ignore" keeps this cell re-runnable:
# a second run no longer raises KeyError once the column is already gone.
signatures = signatures.drop(columns="unknown", errors="ignore")
signatures

Unnamed: 0,mesenchymal cell,bladder cell,endothelial cell,basal cell of urothelium,leukocyte,fibroblast,cardiac muscle cell,endocardial cell,smooth muscle cell,erythrocyte,...,Fraction A pre-pro B cell,mesenchymal stem cell,chondroblast,skeletal muscle satellite cell,basal cell of epidermis,keratinocyte,epithelial cell,neuroendocrine cell,basal cell of epithelium of trachea,ciliated epithelial cell
Xkr4,0.000830,0.000959,0.000000,0.000000,0.000000,0.001931,0.000000,0.000000,0.011020,0.000000,...,0.000000,0.000725,0.000000,0.000000,0.001087,0.000614,0.003728,0.003207,0.000000,0.000000
Rp1,0.000000,0.000000,0.000000,0.000000,0.000825,0.000000,0.049764,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.002082,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.433208
Sox17,0.000475,0.001839,0.982717,0.000000,0.052291,0.020524,0.000000,0.011501,0.078793,0.007174,...,0.016530,0.032705,0.029102,0.026074,0.000000,0.000000,0.002680,0.010339,0.000000,0.000000
Mrpl15,0.197573,0.185654,0.193273,0.190909,0.146530,0.311251,0.338974,0.427502,0.350385,0.111927,...,0.933553,0.136700,0.104757,0.100069,0.431923,0.450696,0.169519,0.155978,0.268676,0.068708
Lypla1,0.170388,0.298757,0.245419,0.327175,0.163552,0.285693,0.355911,0.268577,0.270886,0.082993,...,0.568551,0.171137,0.205019,0.112644,0.448954,0.327226,0.182173,0.142375,0.089863,0.182171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERCC-00171,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Gfp_transgene,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Cre_transgene,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Tdtom_transgene,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [23]:
# Re-save to the same path that create_signature_matrix wrote earlier, so the file
# on disk reflects the table after the "unknown" column was dropped.
signatures.to_csv("../../analysis/cell_type_signatures/tabula_muris_signatures.csv")