# Imports and settings

In [1]:
%load_ext autoreload
%autoreload 2

import os, re, gc, joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.colors import ListedColormap
import seaborn as sn
import plotnine as pln
import anndata
import scanpy as sc
import sctk as sk

# Font type 42 embeds TrueType fonts in PDF/PS output, keeping text editable in vector
# graphics editors (see the matplotlib rcParams documentation).
rcParams.update({"pdf.fonttype": 42, "ps.fonttype": 42})
# Colormap supplied by sctk, kept for later expression plots.
expr_cmap = sk.expression_colormap()
# Wider console lines so printed numpy arrays wrap less often.
np.set_printoptions(linewidth=150)

import numpy_groupies as npg
from sklearn.preprocessing import minmax_scale
import plotnine as pln
from plotnine import ggplot, aes

In [2]:
import scipy.sparse as sp

# Load data

In [3]:
# Gene list used below as the `gene_list` for `sc.tl.score_genes` (stored as `cc_score`).
cc_genes = sk.read_list("../data/misc/JP_cycle_genes.list")

In [4]:
# Load the processed pooled endothelium AnnData produced by an earlier analysis directory.
pooled_ad = sc.read("../20200626_make_figure_for_Muzz/pooled_endothelium.processed.h5ad")

In [5]:
# Score every cell against the `cc_genes` list using the values in `.raw`; the result is
# written to `pooled_ad.obs["cc_score"]`. The control gene set is sized to match the list.
sc.tl.score_genes(
    pooled_ad,
    gene_list=cc_genes,
    ctrl_size=len(cc_genes),
    score_name="cc_score",
    use_raw=True,
)



In [6]:
# Show the AnnData summary; `cc_score` should now be listed among the obs columns.
pooled_ad

AnnData object with n_obs × n_vars = 2048 × 14908
    obs: 'batch', 'chemistry_sorting', 'dataset', 'day', 'donor', 'gender', 'n_counts', 'n_genes', 'nh3_annot1', 'percent_hb', 'percent_mito', 'percent_ribo', 'percent_top50', 'rachel_annot3', 'sample_id', 'strain', 'week', 'annot', 'leiden_hm_r0_1', 'leiden_hm_r0_3', 'leiden_hm_r0_5', 'leiden_hm_r0_7', 'leiden_hm_r0_9', 'leiden_hm_split1', 'joint_annot', '__is_in_cluster__', 'leiden_r0_1', 'leiden_r0_3', 'leiden_r0_5', 'leiden_r0_7', 'leiden_r0_9', 'cc_score'
    var: 'gene_ids', 'hb', 'cc', 'mito', 'ribo', 'n_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'hvg_full'
    uns: 'dendrogram_joint_annot', 'leiden', 'neighbors', 'neighbors_hm', 'pca', 'rank_genes_groups_de.Capillary (arterial tip)-Capillary (venular tip)', 'rank_genes_groups_de.Capillary (arterial tip)-Capillary (venular tip)_filtered', 'rank_genes_groups_de.Capillary (venular tip)-Capillary (arterial tip)', 'rank_genes_groups_de.Capillar

In [7]:
# List the annotation labels; the export step later filters on "LE" and "Early LE".
pooled_ad.obs.joint_annot.cat.categories

Index(['Early endothelial cell', 'Early LE', 'LE', 'Arterial',
       'Tip cell (arterial)', 'Capillary (venular tip)',
       'Capillary/postcapillary venule', 'Postcapillary venule'],
      dtype='object')

In [8]:
# List the batch identifiers present in the pooled object.
pooled_ad.obs.batch.cat.categories

Index(['FCAImmP7241241', 'FCAImmP7316886', 'FCAImmP7316887', 'FCAImmP7316888',
       'FCAImmP7316896', 'FCAImmP7316897', 'FCAImmP7352190', 'FCAImmP7352191',
       'FCAImmP7462241', 'FCAImmP7528290', 'FCAImmP7528291', 'FCAImmP7555848',
       'FCAImmP7579213', 'FCAImmP7803024', 'FCAImmP7803025', 'FCAImmP7803026',
       'FCAImmP7803027', 'FCAImmP7803034', 'FCAImmP7803035', 'FCAImmP7803042',
       'FCAImmP7803043', 'FCAImmP7862095', 'FCAImmP7862096', 'FCAImmP7964503',
       'FCAImmP7964504', 'FCAImmP7964505', 'FCAImmP7964506', 'FCAImmP7964507',
       'FCAImmP7964508', 'FCAImmP7964509', 'FCAImmP7964510', 'v2_DSP',
       'v2_WA25', 'v3_DSP', 'v3_WA25'],
      dtype='object')

In [9]:
# List the sample IDs.
# NOTE(review): the output contains a literal 'nan' category — presumably missing IDs that
# were stringified upstream; confirm before relying on this column.
pooled_ad.obs.sample_id.cat.categories

Index(['0', '4', '5', '7', '8', '9', '10', '11', 'nan'], dtype='object')

In [10]:
# List the dataset labels before they are shortened in the next cell.
pooled_ad.obs.dataset.cat.categories

Index(['fetal_skin', 'organoid'], dtype='object')

In [11]:
# Shorten the dataset labels ("fetal_skin" -> "fsk", "organoid" -> "org").
# The `inplace=True` argument of `rename_categories` was deprecated in pandas 1.3 and
# removed in pandas 2.0, so the renamed column is assigned back instead.
# Re-running is safe: mapping keys that are no longer categories are ignored.
pooled_ad.obs["dataset"] = pooled_ad.obs["dataset"].cat.rename_categories(
    {"fetal_skin": "fsk", "organoid": "org"}
)

In [12]:
# Derive a new AnnData from `pooled_ad` with sctk's `restore_adata`, carrying over the
# `X_pca_hm` and `X_umap_hm` embeddings.
# NOTE(review): presumably this recovers count-level data from `.raw` using `n_counts`
# (per `use_raw=True, use_n_counts=True`) — confirm against the sctk implementation.
pooled_count_ad = sk.restore_adata(
    pooled_ad, use_raw=True, use_n_counts=True, obsm_keys=["X_pca_hm", "X_umap_hm"]
)

In [13]:
# Reduce `.var` to the fetal-skin flavoured columns, drop the "-fetal_skin" tag from their
# names, and keep only the gene-level annotations needed downstream.
# NOTE: this cell cannot be re-run on its own. Once it has run, no column ends with
# "fetal_skin", so the first selection is empty and the final selection raises KeyError.
pooled_count_ad.var = (
    pooled_count_ad.var.loc[
        :, [col for col in pooled_count_ad.var.columns if col.endswith("fetal_skin")]
    ]
    .rename(columns=lambda col: col.replace("-fetal_skin", ""))
    .loc[:, ["gene_ids", "cc", "mito", "ribo", "hb"]]
)

In [16]:
# Trim `.obs` to the metadata columns that are exported; clustering columns and other
# intermediate annotations are dropped.
pooled_count_ad.obs = pooled_count_ad.obs.loc[
    :,
    [
        "batch", "chemistry_sorting", "dataset", "day", "donor", "gender",
        "n_counts", "n_genes",
        "percent_hb", "percent_mito", "percent_ribo", "percent_top50",
        "cc_score", "strain", "week",
        "annot", "joint_annot",
    ],
]

In [14]:
# Move the `*_hm` embeddings to the plain `X_pca` / `X_umap` keys.
# NOTE(review): this relies on a private sctk helper (`_obj_utils._rename_obsm_key`).
for old_key, new_key in (("X_pca_hm", "X_pca"), ("X_umap_hm", "X_umap")):
    sk._obj_utils._rename_obsm_key(pooled_count_ad, old_key, new_key)

In [17]:
# Check the trimmed object: expected obs/var columns and the `X_pca` / `X_umap` obsm keys.
pooled_count_ad

AnnData object with n_obs × n_vars = 2048 × 15445
    obs: 'batch', 'chemistry_sorting', 'dataset', 'day', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_hb', 'percent_mito', 'percent_ribo', 'percent_top50', 'cc_score', 'strain', 'week', 'annot', 'joint_annot'
    var: 'gene_ids', 'cc', 'mito', 'ribo', 'hb'
    obsm: 'X_pca', 'X_umap'

In [18]:
# Split the pooled object by the `dataset` column into a dict of AnnData objects
# (keyed by the dataset labels, as shown when `count_ads` is displayed below).
count_ads = sk.split_by_group(pooled_count_ad, groupby="dataset")

In [19]:
# Also keep the unsplit object so it is exported alongside the per-dataset ones.
count_ads["pooled"] = pooled_count_ad

In [20]:
# Write one h5ad file per entry of `count_ads` (the file name indicates use with monocle).
for name, ad in count_ads.items():
    # Drop cells annotated "LE" / "Early LE". `ad` is rebound to a filtered copy, so the
    # objects stored in `count_ads` themselves are left unfiltered.
    ad = ad[~ad.obs.joint_annot.isin(["LE", "Early LE"])].copy()
    # NOTE(review): the date stamp in the file name is hard-coded; update it if the export
    # is regenerated. Files are written to the current working directory.
    ad.write(f"{name}.vascular_endothelium.count_with_PCA_UMAP_for_monocle.20220531.h5ad", compression="lzf")

In [21]:
# Display the dict of AnnData objects; these are the unfiltered objects, because the
# export loop only filtered local copies.
count_ads

{'fsk': AnnData object with n_obs × n_vars = 1944 × 15445
     obs: 'batch', 'chemistry_sorting', 'dataset', 'day', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_hb', 'percent_mito', 'percent_ribo', 'percent_top50', 'cc_score', 'strain', 'week', 'annot', 'joint_annot'
     var: 'gene_ids', 'cc', 'mito', 'ribo', 'hb'
     obsm: 'X_pca', 'X_umap',
 'org': AnnData object with n_obs × n_vars = 104 × 15445
     obs: 'batch', 'chemistry_sorting', 'dataset', 'day', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_hb', 'percent_mito', 'percent_ribo', 'percent_top50', 'cc_score', 'strain', 'week', 'annot', 'joint_annot'
     var: 'gene_ids', 'cc', 'mito', 'ribo', 'hb'
     obsm: 'X_pca', 'X_umap',
 'pooled': AnnData object with n_obs × n_vars = 2048 × 15445
     obs: 'batch', 'chemistry_sorting', 'dataset', 'day', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_hb', 'percent_mito', 'percent_ribo', 'percent_top50', 'cc_score', 'strain', 'week', 'annot', 'joint_annot'
     var: 'gene_id