# Import packages and data 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scipy.stats
import anndata
import matplotlib.pyplot as plt
import matplotlib as mpl
import rpy2

In [2]:
# Figure defaults: 120 dpi for on-screen rendering and 1000 dpi for saved figures,
# so that annotations stay legible.
sc.settings.set_figure_params(dpi_save=1000, dpi=120)
# scanpy log level: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 1
# Print the versions of scanpy and its key dependencies into the notebook output.
sc.logging.print_versions()

scanpy==1.4.4 anndata==0.7.1 umap==0.3.10 numpy==1.17.1 scipy==1.4.1 pandas==0.23.4 scikit-learn==0.22.2.post1 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1


In [3]:
# Widen the pandas display limits so large tables are not cut off when inspected.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Save the force-directed graph (FDG) embedding for Monocle3 from the processed dataset

In [4]:
# Read the processed dataset (.h5ad) into `adata`.
# NOTE(review): absolute, machine-specific path - the notebook will not run elsewhere
# without editing it.
adata = sc.read("/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/data/fig5e_5f_fbm_eryth_dr_20200625.h5ad")

In [5]:
# Keep hold of the per-gene table (adata.var) of the processed object: it carries the
# variable-gene information computed in scanpy, which is transferred to monocle later on.
# NOTE(review): this is a reference to adata.var, not a copy.
gene_df = adata.var

In [6]:
# Write the coordinates stored under obsm["X_draw_graph_fa"] to CSV.
# The frame is built from the bare array, so rows are numbered rather than named by cell.
fdg_coords = pd.DataFrame(adata.obsm["X_draw_graph_fa"])
fdg_coords.to_csv("/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/resources_for_pipelines/fbm_eryth_fdg_20200625.csv")

# Load the fetal BM cells from the raw dataset

In [7]:
# Read the raw dataset (.h5ad); this rebinds `adata`, replacing the processed object
# loaded earlier.
# NOTE(review): absolute, machine-specific path.
adata = sc.read('/Users/b8058304/Documents/PhD_work/Coding/bm_plus_19pcw/data/bm_plus_19pcw_raw_dr_20200414.h5ad')

In [8]:
# Store the object's current (unfiltered) state in its own .raw slot before any subsetting.
adata.raw = adata

In [9]:
# Number of cells per annotated cell type in the full raw dataset.
label_groups = adata.obs.groupby(["cell.labels"])
cell_numbers = label_groups.apply(len)
cell_numbers

cell.labels
CD4 T                        327
CD8 T                        171
CD14 monocyte               8787
CD56 bright NK               540
CMP                          425
DC1                           50
DC2                          598
DC3                          705
DC precursor                 201
EI macrophage                 92
ELP                         1358
GMP                         1285
HSC                           92
ILC precursor                 67
Immature B cell             1998
MEMP                          16
MEP                          269
MK                          1036
MPP lymphoid                  34
MPP myeloid 1                 46
MPP myeloid 2                 46
NKT                          137
Naive B cell                1423
Pre B progenitor           14234
Pre pro B progenitor        5428
Pro B progenitor            5530
Treg                          62
adipo-CAR                    359
arteriolar fibroblast         84
basophil                     13

In [10]:
# Cell-type labels to retain for this analysis.
celltypes = [
    'HSC',
    'MEMP',
    'MEP',
    'early erythroid',
]

In [11]:
# Keep only the cells whose label is one of the selected cell types.
# .copy() turns the subset into an independent object.
in_selected_celltypes = adata.obs['cell.labels'].isin(celltypes)
adata = adata[in_selected_celltypes].copy()

In [12]:
# Number of cells per cell type after restricting to the selected labels.
label_groups = adata.obs.groupby(["cell.labels"])
cell_numbers = label_groups.apply(len)
cell_numbers

cell.labels
HSC                  92
MEMP                 16
MEP                 269
early erythroid    7534
dtype: int64

In [13]:
# Number of cells contributed by each fetal sample.
donor_groups = adata.obs.groupby(["fetal.ids"])
cell_numbers = donor_groups.apply(len)
cell_numbers

fetal.ids
F21_male_16+2PCW       517
F29_female_17+0PCW     530
F30_male_14+3PCW       616
F38_male_12PCW         202
F41_female_16PCW      1326
F45_female_13+6PCW     292
F50_female_15PCW      1811
F51_female_15PCW      1981
SB19PCW                636
dtype: int64

In [14]:
# Restrict the object to the two chosen fetal samples.
selected_donors = ["F38_male_12PCW", "F45_female_13+6PCW"]
from_selected_donor = adata.obs['fetal.ids'].isin(selected_donors)
adata = adata[from_selected_donor].copy()

In [15]:
# Collapse the MEP and MEMP labels into one combined "MEP_MEMP" label.
current_labels = adata.obs['cell.labels']
adata.obs['cell.labels'] = current_labels.replace(["MEP", "MEMP"], ["MEP_MEMP", "MEP_MEMP"])

In [16]:
# Number of cells per cell type after donor selection and label merging.
label_groups = adata.obs.groupby(["cell.labels"])
cell_numbers = label_groups.apply(len)
cell_numbers

cell.labels
HSC                 15
MEP_MEMP            27
early erythroid    452
dtype: int64

In [17]:
# Number of cells per retained fetal sample.
donor_groups = adata.obs.groupby(["fetal.ids"])
cell_numbers = donor_groups.apply(len)
cell_numbers

fetal.ids
F38_male_12PCW        202
F45_female_13+6PCW    292
dtype: int64

In [18]:
# Transfer the per-gene table captured from the processed object (including its
# highly_variable flags) onto the raw subset.
# gene_df comes from a different AnnData object, and the assigned frame's index is
# adopted as the gene names. Confirm both objects list the same genes in the same
# order first; otherwise the annotation would silently attach to the wrong genes.
if not adata.var_names.equals(gene_df.index):
    raise ValueError(
        "gene_df and adata do not list the same genes in the same order; "
        "refusing to overwrite adata.var"
    )
adata.var = gene_df

In [19]:
# Sanity check of the object's dimensions, (cells, genes), after subsetting.
adata.shape

(494, 33712)

# Convert to CDS

# Load rpy2 link

In [20]:
# Enable anndata2ri's conversion so that AnnData objects passed into R arrive as
# SingleCellExperiment objects.
import anndata2ri
anndata2ri.activate()
# Load the rpy2 IPython extension, which supplies the %R / %%R magics used in later cells.
%load_ext rpy2.ipython

In [21]:
# Record the R version in use: build a data frame from R.Version(), then keep only
# its 'version.string' column for display.
%R d = as.data.frame( R.Version() )
%R d = d['version.string']

Unnamed: 0,version.string
1,R version 3.6.2 (2019-12-12)


In [22]:
%%R 

# Attach the R packages for this session.
# NOTE(review): the R cells visible in this notebook appear to need only monocle
# (newCellDataSet, fData) - confirm whether the other packages are still required.
library(scran)
library(RColorBrewer)
library(slingshot)
library(monocle)
library(gam)
library(clusterExperiment)
library(ggplot2)
library(plyr)

R[write to console]: Loading required package: SingleCellExperiment

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find

# Convert the Scanpy (AnnData) object to a CellDataSet (CDS) object and prepare the object

In [23]:
%%R -i adata
# -i passes the Python `adata` object into R; printing it shows what the conversion produced.
adata

class: SingleCellExperiment 
dim: 33712 494 
metadata(0):
assays(1): X
rownames(33712): RP11-34P13.3 RP11-34P13.7 ... CTB-58E17.2
  RP5-1077H22.1
rowData names(6): gene_ids.1 feature_types.1 ... dispersions
  dispersions_norm
colnames(494):
  F38_boneMarrow_CD45-_FCAImmP7528282_3prime_AAACCTGAGATTACCC
  F38_boneMarrow_CD45-_FCAImmP7528282_3prime_AAAGATGTCATCTGTT ...
  F45_boneMarrow_CD45-_FCAImmP7579221_3prime_TGTTCCGTCCATGAAC
  F45_boneMarrow_CD45-_FCAImmP7579221_3prime_TTCTCCTCAGGATCGA
colData names(18): cell.labels doublets ... sort.ids april_cell.labels
reducedDimNames(3): X_orig_pca PCA UMAP
spikeNames(0):
altExpNames(0):


In [24]:
# Pull the three inputs needed for monocle out of the AnnData object.
obs_mon = adata.obs.copy()          # per-cell metadata
var_mon = adata.var.copy()          # per-gene metadata
data_mat_mon = adata.X.transpose()  # expression matrix flipped to genes x cells

In [25]:
# Inspect the per-gene table that is handed to R as the feature annotation.
var_mon

Unnamed: 0,gene_ids-1,feature_types-1,highly_variable,means,dispersions,dispersions_norm
RP11-34P13.3,ENSG00000243485,Gene Expression,False,3.258677e-03,0.477731,0.451974
RP11-34P13.7,ENSG00000238009,Gene Expression,False,6.345411e-03,0.226394,-0.110246
RP11-34P13.8,ENSG00000239945,Gene Expression,False,1.000000e-12,,
RP11-34P13.9,ENSG00000241599,Gene Expression,False,1.000000e-12,,
FO538757.3,ENSG00000279928,Gene Expression,False,1.762106e-02,0.076405,-0.445760
FO538757.2,ENSG00000279457,Gene Expression,False,8.746856e-01,0.272516,-0.666872
AP006222.2,ENSG00000228463,Gene Expression,False,3.842548e-01,0.258154,-0.562262
RP4-669L17.10,ENSG00000237094,Gene Expression,False,2.258327e-02,0.197047,-0.175893
RP5-857K21.4,ENSG00000230021,Gene Expression,False,1.499506e-02,-0.131434,-0.910679
RP11-206L10.9,ENSG00000237491,Gene Expression,True,8.010783e-02,1.339261,2.379149


In [26]:
%%R -i data_mat_mon -i obs_mon -i var_mon

# Assemble a CellDataSet from the expression matrix and the two metadata
# tables passed in from Python.
print("setting up cell dataset structure")

# Per-cell annotation
print("printing pd - cells")
pd <- AnnotatedDataFrame(data = obs_mon)
print(pd)

# Per-gene annotation
print("printing fd - genes")
fd <- AnnotatedDataFrame(data = var_mon)
print(fd)

# Label the expression matrix: rows take their names from fd (genes),
# columns take theirs from pd (cells)
rownames(data_mat_mon) <- rownames(fd)
colnames(data_mat_mon) <- rownames(pd)

print("create newCellDataSet using the matrix, cells and genes")
ie_regions_cds <- newCellDataSet(
    cellData = data_mat_mon,
    featureData = fd,
    phenoData = pd,
    expressionFamily = negbinomial.size()
)

# Show the assembled object
print("printing ie_regions_cds")
print(ie_regions_cds)

[1] "setting up cell dataset structure"
[1] "printing pd - cells"
An object of class 'AnnotatedDataFrame'
  rowNames: F38_boneMarrow_CD45-_FCAImmP7528282_3prime_AAACCTGAGATTACCC
    F38_boneMarrow_CD45-_FCAImmP7528282_3prime_AAAGATGTCATCTGTT ...
    F45_boneMarrow_CD45-_FCAImmP7579221_3prime_TTCTCCTCAGGATCGA (494
    total)
  varLabels: cell.labels doublets ... april_cell.labels (18 total)
  varMetadata: labelDescription
[1] "printing fd - genes"
An object of class 'AnnotatedDataFrame'
  rowNames: RP11-34P13.3 RP11-34P13.7 ... RP5-1077H22.1 (33712 total)
  varLabels: gene_ids.1 feature_types.1 ... dispersions_norm (6 total)
  varMetadata: labelDescription
[1] "create newCellDataSet using the matrix, cells and genes"
[1] "printing ie_regions_cds"
CellDataSet (storageMode: environment)
assayData: 33712 features, 494 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames:
    F38_boneMarrow_CD45-_FCAImmP7528282_3prime_AAACCTGAGATTACCC
    F38_boneMarrow_CD45-_FCAImmP7

# Within CDS, select HVGs (previously defined in scanpy)

In [27]:
%%R

# Keep only the genes flagged as highly variable in the feature table
# (normalisation by log and size factor will be performed in monocle3).
print("filter hvgs")
hvg_mask <- fData(ie_regions_cds)[["highly_variable"]]
ie_regions_cds <- ie_regions_cds[hvg_mask, ]

# Show the reduced object
print("printing cds again")
print(ie_regions_cds)

[1] "filter hvgs"
[1] "printing cds again"
CellDataSet (storageMode: environment)
assayData: 3442 features, 494 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames:
    F38_boneMarrow_CD45-_FCAImmP7528282_3prime_AAACCTGAGATTACCC
    F38_boneMarrow_CD45-_FCAImmP7528282_3prime_AAAGATGTCATCTGTT ...
    F45_boneMarrow_CD45-_FCAImmP7579221_3prime_TTCTCCTCAGGATCGA (494
    total)
  varLabels: cell.labels doublets ... Size_Factor (19 total)
  varMetadata: labelDescription
featureData
  featureNames: RP11-206L10.9 SAMD11 ... AC240274.1 (3442 total)
  fvarLabels: gene_ids.1 feature_types.1 ... dispersions_norm (6 total)
  fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:  


# Save object for input into Monocle3

In [28]:
%%R

# Serialise the HVG-restricted CellDataSet to disk as an RDS file.
saveRDS(
    object = ie_regions_cds,
    file = "/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/data/fig5e_5f_fbm_eryth_cds_20200625.RDS"
)