# Import packages and data 

In [1]:
import warnings

import anndata
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats
import seaborn as sns
from matplotlib.axes._axes import _log as matplotlib_axes_logger
from scipy import sparse

# numba relocated its warning classes to `numba.core.errors`; the legacy
# `numba.errors` path is not importable on newer releases. Try the current
# location first and fall back to the legacy one for older installations.
try:
    from numba.core.errors import NumbaPerformanceWarning
except ImportError:
    from numba.errors import NumbaPerformanceWarning

# Only let ERROR-level messages through from matplotlib's axes logger.
matplotlib_axes_logger.setLevel('ERROR')
# silence NumbaPerformanceWarning
warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)

  from pandas.core.index import RangeIndex


In [2]:
# Global scanpy configuration for this notebook.
# Level 1 on the scale listed in the trailing comment (errors + warnings).
sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Set up the plot config for viewing the annotation clearly.
# NOTE(review): dpi / dpi_save are presumably on-screen vs saved-figure resolution;
# confirm against the scanpy documentation.
sc.settings.set_figure_params(dpi=120, dpi_save=1000)
# Print the versions of scanpy and its main dependencies (recorded in the cell output).
sc.logging.print_versions()

scanpy==1.4.4 anndata==0.7.1 umap==0.3.10 numpy==1.17.1 scipy==1.4.1 pandas==1.0.5 scikit-learn==0.22.2.post1 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1


# Import the fetal yolk sac (YS) progenitor compartment

In [3]:
# Load the annotated YS object from disk into the working variable `adata`.
# NOTE(review): hard-coded absolute user path; the cell only runs where this file exists.
adata = sc.read('/Users/b8058304/Documents/PhD_work/Coding/liver/data/ys_total_annotation_20191127.h5ad')

In [4]:
# Bare expression: show the AnnData summary (dimensions and annotation keys) as the cell output.
adata

AnnData object with n_obs × n_vars = 10071 × 32084 
    obs: 'nGene', 'nUMI', 'orig.ident', 'percent.mito', 'cell.labels', 'fetal.ids', 'sort.ids', 'tissue', 'lanes', 'stages', 'sample.type', 'gender', 'AnnatomicalPart', 'doublets', 'LouvainClustering', 'predicted.cell.labels', 'full.ids', 'batch', 'bh_pval', 'sample', 'scrublet_cluster_score', 'scrublet_score', 'n_counts', 'louvain', 'is_doublet', 'is_doublet_poptrim', 'res.30', 'backlabeled_1', 'cell.labels_progen'

In [5]:
# Count the rows of `adata.obs` (cells) in each `cell.labels` group.
cell_numbers = adata.obs.groupby(["cell.labels"]).apply(len)
# Bare expression so the resulting Series is displayed as the cell output.
cell_numbers

cell.labels
DC progenitor                78
Early mast cell             133
Endothelium                 160
Erythroid AB                892
Erythroid EZ               1167
Erythroid M                 275
ILC precursor                25
Lymphoid progenitor         184
MEMP                        121
Monocyte                     38
Myeloid progenitor           61
NK early                     50
yolk sac Macrophage I      3063
yolk sac Macrophage II     3619
yolk sac progenitor/MPP     205
dtype: int64

In [6]:
# Same tally, but on the progenitor-level annotation column `cell.labels_progen`.
cell_numbers = adata.obs.groupby(["cell.labels_progen"]).apply(len)
# Display the counts.
cell_numbers

cell.labels_progen
CMP                    55
ELP                   174
GMP                    40
HSC                   105
MEMP                   55
MEP                    56
Monocyte precursor     26
macrophage             42
neut myeloid           18
dtype: int64

In [7]:
# Register the current object as `adata.raw` with `X` temporarily held as a
# CSR sparse matrix, then put the original `X` object back so `.X` keeps its
# original format. This runs before subsetting, so `.raw` reflects the full object.
# `sparse` comes from the notebook's import cell; it is not re-imported here.
array_vals = adata.X
adata.X = sparse.csr_matrix(array_vals)
adata.raw = adata
adata.X = array_vals

In [8]:
# Progenitor-level labels to retain from the YS data.
ys_progenitor_labels = ['HSC', 'MEMP', 'GMP', 'CMP', 'ELP', 'MEP']
# Boolean mask over cells, then materialise the selected cells with `.copy()`.
is_ys_progenitor = adata.obs['cell.labels_progen'].isin(ys_progenitor_labels)
adata = adata[is_ys_progenitor].copy()

In [9]:
# Re-count per `cell.labels_progen` after subsetting, to check which groups remain.
cell_numbers = adata.obs.groupby(["cell.labels_progen"]).apply(len)
# Display the counts.
cell_numbers

cell.labels_progen
CMP      55
ELP     174
GMP      40
HSC     105
MEMP     55
MEP      56
dtype: int64

In [10]:
# Tag every cell in this subset with a constant dataset identifier.
adata.obs["dataset"] = "ys"
# Overwrite `cell.labels` with the progenitor-level labels (the previous
# `cell.labels` values are replaced in place on this object).
adata.obs['cell.labels'] = adata.obs['cell.labels_progen']

In [11]:
# Number of cells for each (fetal.ids, cell.labels) pair.
cell_numbers = adata.obs.groupby(["fetal.ids", "cell.labels"]).apply(len)

# Write the tally out as a one-column table.
# NOTE(review): hard-coded absolute output path.
ys_counts_csv = "/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/resources_for_pipelines/ys_progen_nrs_by_sample_20200722.csv"
cell_numbers.to_frame().to_csv(ys_counts_csv)

# Display the tally as the cell output.
cell_numbers

fetal.ids          cell.labels
F32_female_7+6PCW  CMP              3
                   ELP             12
                   GMP              1
                   HSC              5
                   MEMP             8
                   MEP              8
F35_female_7+6PCW  CMP             11
                   ELP             19
                   GMP              2
                   HSC             19
                   MEMP            23
                   MEP             38
F37_female_4PCW    CMP             41
                   ELP            143
                   GMP             37
                   HSC             81
                   MEMP            24
                   MEP             10
dtype: int64

In [12]:
# Keep the YS progenitor subset under its own name; `adata` is reassigned when the next dataset is read.
ys = adata

# Import the fetal liver progenitor compartment

In [13]:
# Load the fetal liver object from disk, replacing the previous contents of `adata`.
# NOTE(review): hard-coded absolute user path; the cell only runs where this file exists.
adata = sc.read('/Users/b8058304/Documents/PhD_work/Coding/liver/data/fetal_liver_alladata_Copy1.h5ad')

In [14]:
# Bare expression: show the AnnData summary for the fetal liver object.
adata

AnnData object with n_obs × n_vars = 113063 × 27080 
    obs: 'nGene', 'nUMI', 'orig.ident', 'percent.mito', 'fetal.ids', 'sort.ids', 'tissue', 'lanes', 'stages', 'sample.type', 'gender', 'AnnatomicalPart', 'doublets', 'cell.labels', 'combined.labels', 'batch', 'sample', 'n_counts', 'cell.labels_progen'
    obsm: 'X_fdg', 'X_tsne', 'X_umap'

In [15]:
# Count the rows of `adata.obs` (cells) in each `cell.labels` group of the liver data.
cell_numbers = adata.obs.groupby(["cell.labels"]).apply(len)
# Display the counts.
cell_numbers

cell.labels
B cell                            1079
DC1                                336
DC2                               3954
DC precursor                       330
Early Erythroid                  11985
Early lymphoid_T lymphocyte        767
Endothelial cell                  3348
Fibroblast                        1713
HSC_MPP                           3439
Hepatocyte                        2479
ILC precursor                     1726
Kupffer Cell                     24841
Late Erythroid                    3180
MEMP                              1342
Mast cell                         1308
Megakaryocyte                     3983
Mid Erythroid                    27000
Mono-Mac                          6590
Monocyte                          2586
Monocyte precursor                 350
NK                                6706
Neutrophil-myeloid progenitor      658
Pre pro B cell                     234
VCAM1+ EI macrophage               161
pDC precursor                      253
pre-B cell   

In [16]:
# Register the current liver object as `adata.raw` with `X` temporarily held
# as a CSR sparse matrix, then restore the original `X` object.
# `sparse` comes from the notebook's import cell; it is not re-imported here.
array_vals = adata.X
adata.X = sparse.csr_matrix(array_vals)
adata.raw = adata
adata.X = array_vals

In [17]:
# Tally cells per progenitor-level label (`cell.labels_progen`) in the liver data.
cell_numbers = adata.obs.groupby(["cell.labels_progen"]).apply(len)
# Display the counts.
cell_numbers

cell.labels_progen
ELP                        525
GMP                        362
HSC                        242
MEMP                       505
MEP                        639
MPP                       2012
early MK                    81
early erythroid            333
eo/baso/mast precursor      86
erythroid-MPP hybrid        14
macrophage                  81
myeloid DC progenitor       94
pDC progenitor              41
dtype: int64

In [18]:
# Progenitor-level labels to retain from the fetal liver data.
liver_progenitor_labels = [
    'HSC', 'MEMP', 'GMP', 'ELP', 'MPP', 'MEP',
    'eo/baso/mast precursor',
    'myeloid DC progenitor',
    'pDC progenitor',
]
# Boolean mask over cells, then materialise the selected cells with `.copy()`.
is_liver_progenitor = adata.obs['cell.labels_progen'].isin(liver_progenitor_labels)
adata = adata[is_liver_progenitor].copy()

In [19]:
# Re-count per `cell.labels_progen` after subsetting, to check which groups remain.
cell_numbers = adata.obs.groupby(["cell.labels_progen"]).apply(len)
# Display the counts.
cell_numbers

cell.labels_progen
ELP                        525
GMP                        362
HSC                        242
MEMP                       505
MEP                        639
MPP                       2012
eo/baso/mast precursor      86
myeloid DC progenitor       94
pDC progenitor              41
dtype: int64

In [20]:
# Tag every cell in this subset with a constant dataset identifier.
adata.obs["dataset"] = "liver"
# Overwrite `cell.labels` with the progenitor-level labels (the previous
# `cell.labels` values are replaced in place on this object).
adata.obs['cell.labels'] = adata.obs['cell.labels_progen']

In [21]:
# Number of cells for each (fetal.ids, cell.labels) pair.
cell_numbers = adata.obs.groupby(["fetal.ids", "cell.labels"]).apply(len)

# Write the tally out as a one-column table.
# NOTE(review): hard-coded absolute output path.
liver_counts_csv = "/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/resources_for_pipelines/liver_progen_nrs_by_sample_20200722.csv"
cell_numbers.to_frame().to_csv(liver_counts_csv)

# Display the tally as the cell output.
cell_numbers

fetal.ids           cell.labels           
F16_male_8+1PCW     ELP                         4
                    GMP                         3
                    HSC                         1
                    MEMP                        4
                    MEP                         9
                                             ... 
F45_female_13+6PCW  MEP                        61
                    MPP                       268
                    eo/baso/mast precursor      8
                    myeloid DC progenitor      11
                    pDC progenitor              4
Length: 125, dtype: int64

In [22]:
# Keep the liver progenitor subset under its own name; `adata` is reassigned when the next dataset is read.
liver = adata

# Import the fetal bone marrow (FBM) progenitor compartment

In [23]:
# Load the FBM object from disk, replacing the previous contents of `adata`.
# NOTE(review): hard-coded absolute user path; the cell only runs where this file exists.
adata = sc.read('/Users/b8058304/Documents/PhD_work/Coding/bm_plus_19pcw/data/bm_plus_19pcw_raw_dr_20200718.h5ad')

In [24]:
# Bare expression: show the AnnData summary for the FBM object.
adata

AnnData object with n_obs × n_vars = 104562 × 33712 
    obs: 'cell.labels', 'doublets', 'fetal.ids', 'gender', 'is_doublet', 'is_doublet_poptrim', 'is_doublet_wolock', 'lanes', 'nGene', 'nUMI', 'orig.ident', 'percent.mito', 'processing.type', 'scrublet_cluster_score', 'scrublet_score', 'sequencing.type', 'sort.ids', 'april_cell.labels', 'cell.labels_20200708', 'cell.labels_20200713', 'cell.labels_20200718', 'nk_meta'
    var: 'gene_ids-1', 'feature_types-1'
    obsm: 'X_orig_pca', 'X_pca', 'X_umap'

In [25]:
# Count the rows of `adata.obs` (cells) in each `cell.labels` group of the FBM data.
cell_numbers = adata.obs.groupby(["cell.labels"]).apply(len)
# Display the counts.
cell_numbers

cell.labels
CD4 T cell             327
CD8 T cell             171
CD14 monocyte         8787
CD56 bright NK         450
CMP                    425
                      ... 
schwann cells            9
sinusoidal EC          550
stromal macrophage    1493
tDC                    193
tip EC                 363
Length: 64, dtype: int64

In [26]:
# Register the current FBM object as `adata.raw` with `X` temporarily held
# as a CSR sparse matrix, then restore the original `X` object.
# `sparse` comes from the notebook's import cell; it is not re-imported here.
array_vals = adata.X
adata.X = sparse.csr_matrix(array_vals)
adata.raw = adata
adata.X = array_vals

In [27]:
# Labels to retain from the FBM data; this dataset is filtered on
# `cell.labels` directly rather than on a separate progenitor column.
fbm_progenitor_labels = [
    'HSC', 'LMPP', 'MPP myeloid', 'MEMP', 'MEP', 'CMP', 'ELP',
    'eo/baso/mast precursor', 'GMP', 'myeloid DC progenitor',
    'pDC progenitor',
]
# Boolean mask over cells, then materialise the selected cells with `.copy()`.
is_fbm_progenitor = adata.obs['cell.labels'].isin(fbm_progenitor_labels)
adata = adata[is_fbm_progenitor].copy()

In [28]:
# Re-count per `cell.labels` after subsetting, to check which groups remain.
cell_numbers = adata.obs.groupby(["cell.labels"]).apply(len)
# Display the counts.
cell_numbers

cell.labels
CMP                        425
ELP                       1358
GMP                       1285
HSC                         92
LMPP                        34
MEMP                        16
MEP                        269
MPP myeloid                 92
eo/baso/mast precursor     175
myeloid DC progenitor       31
pDC progenitor              23
dtype: int64

In [29]:
# Tag every cell in this subset with a constant dataset identifier.
adata.obs["dataset"] = "FBM"

In [30]:
# Number of cells for each (fetal.ids, cell.labels) pair.
cell_numbers = adata.obs.groupby(["fetal.ids", "cell.labels"]).apply(len)

# Write the tally out as a one-column table.
# NOTE(review): hard-coded absolute output path.
fbm_counts_csv = "/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/resources_for_pipelines/fbm_progen_nrs_by_sample_20200722.csv"
cell_numbers.to_frame().to_csv(fbm_counts_csv)

# Display the tally as the cell output.
cell_numbers

fetal.ids         cell.labels           
F21_male_16+2PCW  CMP                       19
                  ELP                       74
                  GMP                       58
                  HSC                       10
                  LMPP                       2
                                            ..
F51_female_15PCW  myeloid DC progenitor      5
                  pDC progenitor             2
SB19PCW           ELP                       63
                  GMP                       21
                  eo/baso/mast precursor    16
Length: 90, dtype: int64

In [31]:
# Keep the FBM progenitor subset under its own name; `adata` is reassigned when the next dataset is read.
fbm = adata

# Import the adult bone marrow (ABM) progenitor compartment

In [32]:
# Load the ABM object from disk, replacing the previous contents of `adata`.
# NOTE(review): hard-coded absolute user path; the cell only runs where this file exists.
adata = sc.read('/Users/b8058304/Documents/PhD_work/Coding/adult_bm/data/abm_raw_dr_20200717.h5ad')

In [33]:
# Bare expression: show the AnnData summary for the ABM object.
adata

AnnData object with n_obs × n_vars = 142026 × 33694 
    obs: 'sample', 'lanes', 'sex', 'age', 'ethnicity', 'mad_prd', 'auto_prd', 'cell.labels', 'cell.labels2', 'leiden', 'percent.mito', 'nGene', 'nUMI', 'cell.labels_july', 'cell.labels_20200717'
    var: 'gene_ids'
    obsm: 'X_orig_pca', 'X_pca', 'X_umap'

In [34]:
# Count the rows of `adata.obs` (cells) in each `cell.labels` group of the ABM data.
cell_numbers = adata.obs.groupby(["cell.labels"]).apply(len)
# Display the counts.
cell_numbers

cell.labels
CD14 monocyte             3670
CD16 monocyte             1938
CD56 bright NK            1228
CLP                        882
CMP                        288
DC1                        135
DC2                        481
DC3                        550
DC precursor               462
HSC                        497
LMPP                        80
MEMP                       785
MK                         577
MOP                       1440
MPP                        365
Treg                      6327
early MK                   136
early erythroid           5441
erythroid macrophage        77
immature B cell           2728
late erythroid            1150
mature CD8 T cell        15725
mature NK                 6074
memory B cell             4106
memory CD4 T cell        22197
mid erythroid             2192
monocyte-DC                515
myelocyte                 6675
myeloid DC progenitor      110
naive B cell             19265
naive CD4 T cell          5873
naive CD8 T cell          8

In [35]:
# Register the current ABM object as `adata.raw` with `X` temporarily held
# as a CSR sparse matrix, then restore the original `X` object.
# `sparse` comes from the notebook's import cell; it is not re-imported here.
array_vals = adata.X
adata.X = sparse.csr_matrix(array_vals)
adata.raw = adata
adata.X = array_vals

In [36]:
# Labels to retain from the ABM data (filtered on `cell.labels`).
abm_progenitor_labels = [
    'HSC', 'LMPP', 'MPP', 'MEMP', 'CMP', 'CLP',
    'myeloid DC progenitor',
]
# Boolean mask over cells, then materialise the selected cells with `.copy()`.
is_abm_progenitor = adata.obs['cell.labels'].isin(abm_progenitor_labels)
adata = adata[is_abm_progenitor].copy()

In [37]:
# Re-count per `cell.labels` after subsetting, to check which groups remain.
cell_numbers = adata.obs.groupby(["cell.labels"]).apply(len)
# Display the counts.
cell_numbers

cell.labels
CLP                      882
CMP                      288
HSC                      497
LMPP                      80
MEMP                     785
MPP                      365
myeloid DC progenitor    110
dtype: int64

In [38]:
# Tag every cell in this subset with a constant dataset identifier.
adata.obs["dataset"] = "ABM"

In [39]:
# Number of cells for each (sample, cell.labels) pair; this object is grouped
# on `sample` rather than `fetal.ids`.
cell_numbers = adata.obs.groupby(["sample", "cell.labels"]).apply(len)

# Write the tally out as a one-column table.
# NOTE(review): hard-coded absolute output path.
abm_counts_csv = "/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/resources_for_pipelines/abm_progen_nrs_by_sample_20200722.csv"
cell_numbers.to_frame().to_csv(abm_counts_csv)

# Display the tally as the cell output.
cell_numbers

sample     cell.labels          
MantonBM1  CLP                      195
           CMP                       79
           HSC                      116
           LMPP                      22
           MEMP                     156
           MPP                       77
           myeloid DC progenitor     30
MantonBM2  CLP                       48
           CMP                       18
           HSC                       21
           LMPP                       6
           MEMP                      38
           MPP                       15
           myeloid DC progenitor      9
MantonBM5  CLP                      377
           CMP                      106
           HSC                      171
           LMPP                      22
           MEMP                     389
           MPP                      125
           myeloid DC progenitor     44
MantonBM6  CLP                      262
           CMP                       85
           HSC                      189
       

In [40]:
# Keep the ABM progenitor subset under its own name; `adata` is reassigned when the next dataset is read.
abm = adata

# Import the cord blood (CB) progenitor compartment

In [41]:
# Load the CB object from disk, replacing the previous contents of `adata`.
# NOTE(review): hard-coded absolute user path; the cell only runs where this file exists.
adata = sc.read('/Users/b8058304/Documents/PhD_work/Coding/cord_blood/data/cord_blood_raw_dr_20200717.h5ad')

In [42]:
# Bare expression: show the AnnData summary for the CB object.
adata

AnnData object with n_obs × n_vars = 148442 × 33694 
    obs: 'sample', 'lanes', 'mad_prd', 'auto_prd', 'cell.labels', 'cell.labels2', 'leiden', 'percent.mito', 'nGene', 'nUMI', 'cell.labels_20200717'
    var: 'gene_ids'
    obsm: 'X_orig_pca', 'X_pca', 'X_umap'

In [43]:
# Count the rows of `adata.obs` (cells) in each `cell.labels` group of the CB data.
cell_numbers = adata.obs.groupby(["cell.labels"]).apply(len)
# Display the counts.
cell_numbers

cell.labels
CD8 T cell          16345
CD14 monocyte       13324
CD16 monocyte         888
CD56 bright NK       4066
CMP                   272
DC1                    67
DC2                   155
DC precursor          169
GMP                   203
HSC                   194
ILC precursor        1519
MEMP                  338
MK                   1262
early MK              496
early erythroid       532
late erythroid        878
mature NK            7860
mid erythroid        2627
myelocyte            3726
naive B cell        19516
naive CD4 T cell    69338
neutrophil           3458
pDC                   242
preDC                 269
promonocyte           607
tDC                    91
dtype: int64

In [44]:
# Register the current CB object as `adata.raw` with `X` temporarily held
# as a CSR sparse matrix, then restore the original `X` object.
# `sparse` comes from the notebook's import cell; it is not re-imported here.
array_vals = adata.X
adata.X = sparse.csr_matrix(array_vals)
adata.raw = adata
adata.X = array_vals

In [45]:
# Labels to retain from the CB data (filtered on `cell.labels`).
cb_progenitor_labels = ['HSC', 'MEMP', 'GMP', 'CMP']
# Boolean mask over cells, then materialise the selected cells with `.copy()`.
is_cb_progenitor = adata.obs['cell.labels'].isin(cb_progenitor_labels)
adata = adata[is_cb_progenitor].copy()

In [46]:
# Re-count per `cell.labels` after subsetting, to check which groups remain.
cell_numbers = adata.obs.groupby(["cell.labels"]).apply(len)
# Display the counts.
cell_numbers

cell.labels
CMP     272
GMP     203
HSC     194
MEMP    338
dtype: int64

In [47]:
# Tag every cell in this subset with a constant dataset identifier.
adata.obs["dataset"] = "CB"

In [48]:
# Number of cells for each (sample, cell.labels) pair.
cell_numbers = adata.obs.groupby(["sample", "cell.labels"]).apply(len)

# Write the tally out as a one-column table.
# NOTE(review): hard-coded absolute output path.
cb_counts_csv = "/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/resources_for_pipelines/cb_progen_nrs_by_sample_20200722.csv"
cell_numbers.to_frame().to_csv(cb_counts_csv)

# Display the tally as the cell output.
cell_numbers

sample     cell.labels
MantonCB1  CMP             50
           GMP             45
           HSC             50
           MEMP            67
MantonCB2  CMP             80
           GMP             63
           HSC             53
           MEMP           185
MantonCB5  CMP             68
           GMP             46
           HSC             51
           MEMP            64
MantonCB6  CMP             74
           GMP             49
           HSC             40
           MEMP            22
dtype: int64

In [49]:
# Keep the CB progenitor subset under its own name, alongside `ys`, `liver`, `fbm` and `abm`.
cb = adata