# Import and settings

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import anndata
import scanpy as sc
import scanpy_scripts as ss
import bbknn
import phate

# Widen NumPy's printed line width so long arrays wrap less in cell output.
np.set_printoptions(linewidth=180)
# Scanpy logging level 1 (warnings per the scanpy settings docs — confirm for the installed version).
sc.settings.verbosity = 1
# Colormap object later passed as `color_map` to the per-gene scatter plots.
expr_cmap = ss.lib.expression_colormap()
# Default figure size for subsequent plots — presumably (width, height) in inches; confirm in scanpy_scripts.
ss.lib.set_figsize((4, 4))

# Processing

In [2]:
# Load the organoid dataset from its .h5ad file (path is relative to the notebook's working directory).
organoid_ad = sc.read('organoid.cellxgene.h5ad')

In [6]:
# Show the AnnData summary: dimensions plus the available obs / var / obsm keys.
organoid_ad

AnnData object with n_obs × n_vars = 130811 × 27320 
    obs: 'sample_id', 'version', 'week', 'day', 'strain', 'batch', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'leiden_bk_split5', 'nh3_annot1', 'nh3_broad_annot1'
    var: 'gene_ids', 'gene_symbols', 'highly_variable'
    obsm: 'X_umap_hm', 'X_umap_bk'

In [None]:
# Plot the 'umap_bk' embedding (presumably obsm['X_umap_bk']) with cells grouped by broad annotation.
ss.lib.plot_embedding(organoid_ad, basis='umap_bk', groupby='nh3_broad_annot1')

In [171]:
# Cross-tabulate cell counts: fine annotation (rows) against cell-line strain (columns).
ss.lib.cross_table(organoid_ad, 'nh3_annot1', 'strain')

strain,DSP,WA25
nh3_annot1,Unnamed: 1_level_1,Unnamed: 2_level_1
Endothelium,63,67
Merkel,56,83
Melanocytes,605,785
CNCC-like / Schwann,810,1612
CNCC-like,1415,685
Neuron progenitors,562,931
Immature neurons,761,754
Peridermal KC,465,689
Basal stem-like KC,1183,1526
Basal KC,5351,7076


In [173]:
# Number of organoid cells per `week` label.
organoid_ad.obs['week'].value_counts()

17-20_fetal_wks    72164
7-10_fetal_wks     21803
14-16_fetal_wks    18440
4-7_fetal_wks      18404
Name: week, dtype: int64

In [172]:
# Number of organoid cells per culture `day` label.
organoid_ad.obs['day'].value_counts()

day-133    72164
day-48     21803
day-85     18440
day-29     18404
Name: day, dtype: int64

In [128]:
# Number of organoid cells per broad annotation; these categories are what the split below keys on.
organoid_ad.obs['nh3_broad_annot1'].value_counts()

Stroma           98067
Keratinocytes    23555
Neuronal          7530
Melanocytes       1390
Merkel             139
Endothelium        130
Name: nh3_broad_annot1, dtype: int64

In [3]:
# Split the organoid object by broad annotation; the result is a dict mapping
# each category name (e.g. 'Keratinocytes') to its own AnnData.
ads = ss.lib.split_by_group(organoid_ad, groupby='nh3_broad_annot1')

In [8]:
# Show the per-annotation AnnData objects to check their sizes.
ads

{'Endothelium': AnnData object with n_obs × n_vars = 130 × 27320 
     obs: 'sample_id', 'version', 'week', 'day', 'strain', 'batch', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'leiden_bk_split5', 'nh3_annot1', 'nh3_broad_annot1'
     var: 'gene_ids', 'gene_symbols', 'highly_variable'
     obsm: 'X_umap_hm', 'X_umap_bk',
 'Keratinocytes': AnnData object with n_obs × n_vars = 23555 × 27320 
     obs: 'sample_id', 'version', 'week', 'day', 'strain', 'batch', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'leiden_bk_split5', 'nh3_annot1', 'nh3_broad_annot1'
     var: 'gene_ids', 'gene_symbols', 'highly_variable'
     obsm: 'X_umap_hm', 'X_umap_bk',
 'Melanocytes': AnnData object with n_obs × n_vars = 1390 × 27320 
     obs: 'sample_id', 'version', 'week', 'day', 'strain', 'batch', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'leiden_bk_split5', 'nh3_annot1', 'nh3_broa

### Keratinocytes

In [9]:
# Load the processed fetal-skin keratinocyte object.
# read_h5ad is used instead of sc.read: sc.read infers the format from the file
# extensions and, for this multi-dot file name, emits the
# "Only considering the two last: ['.20200403', '.h5ad']" warning.
fKC_ad = sc.read_h5ad('../20200403_post_annot3_cleanup/fetal_skin.keratinocytes.doublet_removed_processed.20200403.h5ad')

Only considering the two last: ['.20200403', '.h5ad'].
Only considering the two last: ['.20200403', '.h5ad'].


In [55]:
# Replace X with the matrix stored in .raw — presumably to get back to values
# comparable with the organoid data before pooling; TODO confirm what .raw holds.
# NOTE(review): this assignment only works if .raw has the same shape as X — confirm.
fKC_ad.X = fKC_ad.raw.X

In [62]:
# Show the fetal keratinocyte AnnData summary (dimensions and available annotation keys).
fKC_ad

AnnData object with n_obs × n_vars = 1469 × 17905 
    obs: 'bh_doublet_pval', 'cell_caller', 'cluster_scrublet_score', 'doublet_pval', 'mt_prop', 'n_counts', 'n_genes', 'sanger_id', 'scrublet_score', 'chemistry', 'donor', 'gender', 'pcw', 'sorting', 'sample', 'chemistry_sorting', 'annot', 'hierarchy1', 'rachel_annot1', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'leiden_hm_r0_1', 'leiden_hm_r0_2', 'leiden_hm_r0_3', 'leiden_hm_r0_4', 'leiden_hm_r0_5', 'leiden_bk_r0_1', 'leiden_bk_r0_2', 'leiden_bk_r0_3', 'leiden_bk_r0_4', 'leiden_bk_r0_5', 'rachel_annot2', 'rachel_annot3'
    var: 'gene_ids', 'cc', 'mito', 'ribo', 'hb', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'hvg_full'
    uns: 'leiden', 'neighbors', 'neighbors_bk', 'neighbors_hm', 'pca'
    obsm: 'X_pca', 'X_pca_hm', 'X_umap_hm', 'X_umap_bk'
    varm: 'PCs'

In [66]:
# Keep only the fetal-skin metadata needed for pooling, and rename `pcw` to
# `week` so it shares a column name with the organoid metadata.
fetal_obs_columns = [
    'sanger_id', 'chemistry_sorting', 'donor', 'gender', 'pcw',
    'n_counts', 'n_genes',
    'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50',
    'rachel_annot3',
]
fKC_ad.obs = fKC_ad.obs.loc[:, fetal_obs_columns].rename(columns={'pcw': 'week'})

In [70]:
# Keep only the organoid keratinocyte metadata needed for pooling.
organoid_obs_columns = [
    'sample_id', 'batch', 'day', 'week', 'strain',
    'n_counts', 'n_genes',
    'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50',
    'nh3_annot1',
]
organoid_kc = ads['Keratinocytes']
organoid_kc.obs = organoid_kc.obs.loc[:, organoid_obs_columns]

In [85]:
# Subsample organoid keratinocytes within each fine annotation (fraction=0.1, min_n=200) —
# presumably so they do not swamp the much smaller fetal-skin set; confirm the
# exact meaning of `fraction` / `min_n` in scanpy_scripts.
# NOTE(review): if the subsampling is random, no seed is set in this notebook — confirm reproducibility.
oKC_ad = ss.lib.subsample(ads['Keratinocytes'], groupby='nh3_annot1', fraction=0.1, min_n=200)

In [86]:
# Stack fetal-skin and subsampled organoid keratinocytes into a single object.
# The new `dataset` obs column records the origin of each cell
# ('fetal_skin' or 'organoid').
kc_pooled = fKC_ad.concatenate(
    oKC_ad,
    batch_key='dataset',
    batch_categories=['fetal_skin', 'organoid'],
)

In [91]:
# Boolean mask over pooled cells: True for cells that came from the organoid dataset.
k_org = kc_pooled.obs['dataset'] == 'organoid'

In [92]:
# Build one unified annotation column: fetal-skin cells get 'fsk_<rachel_annot3>',
# organoid cells get 'org_<nh3_annot1>'.
fetal_labels = 'fsk_' + kc_pooled.obs['rachel_annot3'].astype(str)
organoid_labels = 'org_' + kc_pooled.obs['nh3_annot1'].astype(str)
# Where the organoid mask is True, take the organoid label; otherwise keep the fetal one.
kc_pooled.obs['annot'] = fetal_labels.mask(k_org, organoid_labels).astype('category')

In [93]:
# Fetal-skin cells carry no `batch` value after pooling, so use their `sanger_id`
# as the batch label.
# The fill is done on an object-dtype copy: if `batch` comes out of the
# concatenation as a categorical, writing unseen sanger_id values through .loc
# would raise ("Cannot setitem on a Categorical with a new category").
batch_labels = kc_pooled.obs['batch'].astype(object)
batch_labels.loc[~k_org] = kc_pooled.obs.loc[~k_org, 'sanger_id'].astype(object).values
kc_pooled.obs['batch'] = batch_labels.astype('category')

In [94]:
# Cells per batch after the fill: organoid batches (v2/v3 x strain) and fetal sanger_id batches.
kc_pooled.obs.batch.value_counts()

v3_WA25           1000
v3_DSP             914
v2_WA25            413
FCAImmP7964510     298
v2_DSP             240
FCAImmP7803042     231
FCAImmP7803043     180
FCAImmP7803026     148
FCAImmP7803027     119
FCAImmP7316897      94
FCAImmP7964508      93
FCAImmP7964509      89
FCAImmP7555848      52
FCAImmP7352191      43
FCAImmP7316888      36
FCAImmP7352190      28
FCAImmP7803034      14
FCAImmP7803024      13
FCAImmP7964505      11
FCAImmP7862096      11
FCAImmP7862095       9
Name: batch, dtype: int64

In [74]:
# Peek at the pooled metadata; columns that exist in only one dataset are empty for cells of the other.
kc_pooled.obs.head()

Unnamed: 0,batch,chemistry_sorting,dataset,day,donor,gender,n_counts,n_genes,nh3_annot1,percent_hb,percent_mito,percent_ribo,percent_top50,rachel_annot3,sample_id,sanger_id,strain,week,annot
AAAGTAGAGTATCGAA-1-FCAImmP7316888-fetal_skin,,SC3Pv2_CD45N,fetal_skin,,F33,female,15891.0,2870,,0.018879,2.045183,54.080929,44.509471,Periderm,,FCAImmP7316888,,9,fsk_Periderm
AACCATGAGATTACCC-1-FCAImmP7316888-fetal_skin,,SC3Pv2_CD45N,fetal_skin,,F33,female,9631.0,2614,,0.020766,2.959194,35.01194,34.575849,Periderm,,FCAImmP7316888,,9,fsk_Periderm
ACCTTTAGTTCCACAA-1-FCAImmP7316888-fetal_skin,,SC3Pv2_CD45N,fetal_skin,,F33,female,5422.0,1885,,0.018443,3.430469,30.616009,34.267798,Periderm,,FCAImmP7316888,,9,fsk_Periderm
AGAGTGGTCGAACTGT-1-FCAImmP7316888-fetal_skin,,SC3Pv2_CD45N,fetal_skin,,F33,female,3173.0,1198,,0.0,1.922471,38.985188,33.217775,Periderm,,FCAImmP7316888,,9,fsk_Periderm
AGCGTCGTCAGAGCTT-1-FCAImmP7316888-fetal_skin,,SC3Pv2_CD45N,fetal_skin,,F33,female,5515.0,1528,,0.0,1.831369,47.978241,41.233001,Periderm,,FCAImmP7316888,,9,fsk_Periderm


In [95]:
# Run the scanpy_scripts default pipeline on the pooled object with `batch` as the batch variable.
# NOTE(review): post_norm_only=True presumably starts from the steps after normalisation — confirm.
# The summary printed by this cell shows X_pca, X_pca_hm, X_umap_hm and leiden_hm_r0_* present afterwards.
ss.lib.simple_default_pipeline(kc_pooled, post_norm_only=True, batch='batch')

AnnData object with n_obs × n_vars = 4036 × 15277 
    obs: 'batch', 'chemistry_sorting', 'dataset', 'day', 'donor', 'gender', 'n_counts', 'n_genes', 'nh3_annot1', 'percent_hb', 'percent_mito', 'percent_ribo', 'percent_top50', 'rachel_annot3', 'sample_id', 'sanger_id', 'strain', 'week', 'annot', 'leiden_hm_r0_1', 'leiden_hm_r0_3', 'leiden_hm_r0_5', 'leiden_hm_r0_7', 'leiden_hm_r0_9'
    var: 'gene_ids-fetal_skin', 'cc-fetal_skin', 'mito-fetal_skin', 'ribo-fetal_skin', 'hb-fetal_skin', 'n_cells-fetal_skin', 'highly_variable-fetal_skin', 'means-fetal_skin', 'dispersions-fetal_skin', 'dispersions_norm-fetal_skin', 'highly_variable_nbatches-fetal_skin', 'highly_variable_intersection-fetal_skin', 'hvg_full-fetal_skin', 'gene_ids-organoid', 'gene_symbols-organoid', 'highly_variable-organoid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'pca', 'neighbors_hm', 'neighbors', 'leiden'
    obsm: 'X_pca', 'X_pca_hm', 'X_umap_hm'
    varm: 'PCs'

In [103]:
# Re-run the pipeline, this time passing both `dataset` and `batch` as batch variables.
# NOTE(review): post_pca_only=True presumably reuses the existing PCA and redoes only the
# downstream steps (integration / neighbours / UMAP / clustering) — confirm in scanpy_scripts.
ss.lib.simple_default_pipeline(kc_pooled, post_pca_only=True, batch=['dataset', 'batch'])

AnnData object with n_obs × n_vars = 4036 × 15277 
    obs: 'batch', 'chemistry_sorting', 'dataset', 'day', 'donor', 'gender', 'n_counts', 'n_genes', 'nh3_annot1', 'percent_hb', 'percent_mito', 'percent_ribo', 'percent_top50', 'rachel_annot3', 'sample_id', 'sanger_id', 'strain', 'week', 'annot', 'leiden_hm_r0_1', 'leiden_hm_r0_3', 'leiden_hm_r0_5', 'leiden_hm_r0_7', 'leiden_hm_r0_9'
    var: 'gene_ids-fetal_skin', 'cc-fetal_skin', 'mito-fetal_skin', 'ribo-fetal_skin', 'hb-fetal_skin', 'n_cells-fetal_skin', 'highly_variable-fetal_skin', 'means-fetal_skin', 'dispersions-fetal_skin', 'dispersions_norm-fetal_skin', 'highly_variable_nbatches-fetal_skin', 'highly_variable_intersection-fetal_skin', 'hvg_full-fetal_skin', 'gene_ids-organoid', 'gene_symbols-organoid', 'highly_variable-organoid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'pca', 'neighbors_hm', 'leiden', 'annot_colors', 'neighbors'
    obsm: 'X_pca', 'X_pca_hm', 'X_umap_hm'
    varm: 'PCs'

In [None]:
# Plot the 'umap_hm' embedding of the pooled keratinocytes, grouped by the unified `annot` labels.
ss.lib.plot_embedding(kc_pooled, basis='umap_hm', groupby='annot', figsize=(6,6))

In [118]:
# List the distinct unified annotation labels; these exact strings are used in the highlight groups.
kc_pooled.obs['annot'].unique()

[fsk_Periderm, fsk_Early KC (stem cell?), fsk_Suprabasal, fsk_Hair follicle?, fsk_Basal KC, org_HF Basal KC, org_Suprabasal KC, org_Basal KC, org_Peridermal KC, org_Basal stem-like KC]
Categories (10, object): [fsk_Periderm, fsk_Early KC (stem cell?), fsk_Suprabasal, fsk_Hair follicle?, ..., org_Suprabasal KC, org_Basal KC, org_Peridermal KC, org_Basal stem-like KC]

In [None]:
# Highlight the annotation labels on the 'umap_hm' embedding, grouped by dataset of origin.
annot_by_dataset = {
    'fetal skin': [
        'fsk_Periderm', 'fsk_Early KC (stem cell?)', 'fsk_Basal KC',
        'fsk_Hair follicle?', 'fsk_Suprabasal',
    ],
    'organoid': [
        'org_Peridermal KC', 'org_Basal stem-like KC', 'org_Basal KC',
        'org_HF Basal KC', 'org_Suprabasal KC',
    ],
}
ss.lib.highlight(
    kc_pooled,
    basis='umap_hm',
    groupby='annot',
    groups=annot_by_dataset,
    wspace=0.6,
    figsize=(4, 4),
)

In [None]:
# Reset the default panel size, then colour the 'umap_hm' embedding by dataset and by batch
# to see how the two sources and their batches are distributed after the pipeline.
ss.lib.set_figsize((4,4))
ss.lib.plot_scatter(kc_pooled, basis='umap_hm', color=['dataset', 'batch'], ncols=6, wspace=0.5)

In [None]:
# Colour the 'umap_hm' embedding by `week` using the viridis palette.
# NOTE(review): `week` mixes fetal `pcw` values (e.g. 9) with organoid string labels
# (e.g. '17-20_fetal_wks') — confirm the plot treats this mixed column as intended.
ss.lib.set_figsize((4,4))
ss.lib.plot_scatter(kc_pooled, basis='umap_hm', color=['week'], ncols=6, wspace=0.5, palette='viridis')

In [None]:
# One small panel per gene on the 'umap_hm' embedding, all in a single row,
# coloured with the colormap created in the settings cell.
ss.lib.set_figsize((3.5, 3.5))
genes_to_plot = ['KRT4', 'SOX6', 'KRT14', 'KRT85', 'KRT1', 'CDK1', 'PRRX1']
ss.lib.plot_scatter(
    kc_pooled,
    basis='umap_hm',
    color=genes_to_plot,
    ncols=len(genes_to_plot),
    color_map=expr_cmap,
)

In [169]:
# Reload the pooled keratinocyte object from disk, replacing the in-memory `kc_pooled`.
# NOTE(review): this file is only written by the next cell, and the execution counts are
# out of order, so a top-to-bottom run on a fresh checkout fails here (file missing) or
# silently discards the freshly computed object in favour of an older saved copy.
# Consider moving this cell after the write or guarding it with an existence check.
kc_pooled = sc.read('pooled_keratinocytes.processed.h5ad')

In [176]:
# Remove stored colour palettes before saving — presumably the '*_colors' entries in .uns
# (e.g. 'annot_colors'); confirm in scanpy_scripts.
ss.lib.clear_colors(kc_pooled)
# Persist the processed pooled object as .h5ad with LZF compression.
kc_pooled.write('pooled_keratinocytes.processed.h5ad', compression='lzf')