# Import packages and data 

In [1]:
import anndata
import anndata2ri
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rpy2
import scanpy as sc
import scipy.stats
import seaborn as sns

  from pandas.core.index import RangeIndex


In [2]:
# Scanpy logging level: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 1
# Figure resolution for on-screen display (dpi) and for saved files (dpi_save),
# set high so the annotation can be read clearly.
sc.settings.set_figure_params(dpi_save=1000, dpi=120)
# Print the versions of the core packages used in this run.
sc.logging.print_versions()

scanpy==1.4.4 anndata==0.7.1 umap==0.3.10 numpy==1.17.1 scipy==1.4.1 pandas==1.0.5 scikit-learn==0.22.2.post1 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1


In [3]:
# Widen the pandas display limits so large tables are not truncated too early.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Save the force-directed graph (FDG) embedding for Monocle3 from the processed dataset

In [4]:
# Processed dataset: its embedding and gene annotation are exported in the next cells.
# NOTE(review): absolute local path — consider a configurable data directory.
processed_h5ad = "/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs_mk2/data/figs5d_downs_eryth_dr_20210125.h5ad"
adata = sc.read(processed_h5ad)

In [5]:
# Keep the gene-level annotation (adata.var) of the processed object, which carries the
# variable-gene information computed in scanpy, so it can be transferred to monocle.
# Note: this is a reference to adata.var, not a copy.
gene_df = adata.var

In [6]:
# Export the force-directed graph layout stored in obsm["X_draw_graph_fa"] as CSV.
# NOTE(review): the rows are written with a positional integer index, not cell barcodes —
# confirm the downstream consumer matches cells by position.
fdg_coords = pd.DataFrame(adata.obsm["X_draw_graph_fa"])
fdg_coords.to_csv("/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs_mk2/resources_for_pipelines/downs_eryth_fdg_20210125.csv")

# Load the fetal BM cells from the raw dataset

In [7]:
# Raw dataset; this rebinds `adata`, replacing the processed object loaded earlier.
# NOTE(review): absolute local path — consider a configurable data directory.
raw_h5ad = '/Users/b8058304/Documents/PhD_work/Coding/fbm_ds_cite_seq/data/fbm_ds_citeseq_plus_scrna_forMS_20210119.h5ad'
adata = sc.read(raw_h5ad)

In [8]:
# Store the object as loaded under .raw before it is subset and its .var is replaced
# in the following cells.
adata.raw = adata

In [9]:
# Number of cells per annotated population in the full dataset.
# GroupBy.size() is the built-in per-group row count (replaces apply(len));
# observed=True reports only labels that actually occur when the column is categorical.
cell_numbers = adata.obs.groupby("cell.labels", observed=True).size()
cell_numbers

cell.labels
CAR                          4
CD8 T cell                 181
CD14 monocyte              320
CD56 bright NK              79
CMP                         50
DC1                         45
DC2                        228
DC3                        108
HSC                        105
ILC precursor               13
MEMP                       130
MK                          83
MOP                        422
MSC                         53
Treg                         8
chondrocyte                  4
early B cell                42
early MK                    34
early erythroid           1348
endothelium                111
eo/baso/mast precursor      53
eosinophil                  63
late erythroid            6336
macrophage                 113
mast cell                   39
mast cell                   27
mature B cell               31
mature NK                  147
mid erythroid             5230
myelocyte                  243
neutrophil                 273
osteoblast                 

In [10]:
# Cell populations to retain when subsetting the dataset in the next cell.
celltypes = ['HSC', 'MEMP', 'early erythroid']

In [11]:
# Boolean mask over cells whose label is one of `celltypes`; .copy() turns the
# resulting view into an independent object.
keep_cells = adata.obs['cell.labels'].isin(celltypes)
adata = adata[keep_cells].copy()

In [12]:
# Number of cells per population after subsetting, to confirm only the selected labels remain.
# GroupBy.size() is the built-in per-group row count (replaces apply(len));
# observed=True reports only labels that actually occur when the column is categorical.
cell_numbers = adata.obs.groupby("cell.labels", observed=True).size()
cell_numbers

cell.labels
HSC                 105
MEMP                130
early erythroid    1348
dtype: int64

In [13]:
# gene_df was taken from a different AnnData object, so confirm that it describes the same
# genes in the same order before installing it; a mismatch would attach the annotation
# (including the highly-variable flags) to the wrong genes.
if not adata.var_names.equals(gene_df.index):
    raise ValueError(
        "gene_df index does not match adata.var_names (same genes in the same order required)"
    )
# Assign a copy so this object does not share the annotation frame with its source.
adata.var = gene_df.copy()

In [14]:
# Sanity check of the subset dimensions: (number of cells, number of genes).
adata.shape

(1583, 33694)

# Convert to CDS

# Load rpy2 link

In [15]:
# convert anndata to sce
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython

In [16]:
# Query the R session for its version information and keep only the version string,
# recording which R installation the notebook ran against.
%R d = as.data.frame( R.Version() )
%R d = d['version.string']

Unnamed: 0,version.string
1,R version 3.6.2 (2019-12-12)


In [17]:
%%R 

# Attach the R packages needed by the R cells. They are attached in list order,
# so same-named functions from later packages mask those from earlier ones.
r_packages <- c("scran", "RColorBrewer", "slingshot", "monocle", "gam",
                "clusterExperiment", "ggplot2", "plyr")
for (pkg in r_packages) {
    library(pkg, character.only = TRUE)
}

R[write to console]: Loading required package: SingleCellExperiment

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find

# Convert the Scanpy object to a CellDataSet object and prepare it for Monocle

In [18]:
%%R -i adata
# -i pushes the Python `adata` into the R session; printing it shows the converted object
# and confirms that the dimensions and annotations came across.
adata

class: SingleCellExperiment 
dim: 33694 1583 
metadata(0):
assays(1): X
rownames(33694): RP11-34P13.3 FAM138A ... AC213203.1 FAM231B
rowData names(10): gene_ids.0 gene_ids.1 ... dispersions
  dispersions_norm
colnames(1583): AAACCTGAGTACGTTC-1-DSOX4 AAAGATGTCTTATCTG-1-DSOX4 ...
  TTTGGTTAGCCACTAT-1-DSOX19 TTTGGTTTCCCTCTTT-1-DSOX19
colData names(20): X__is_in_cluster__ age ... souporcell_assignment
  souporcell_doublet
reducedDimNames(3): X_orig_pca PCA UMAP
spikeNames(0):
altExpNames(0):


In [19]:
# Inputs for the monocle CellDataSet: independent copies of the cell (obs) and gene (var)
# annotation, plus the expression matrix transposed so that genes are rows and cells
# are columns.
obs_mon = adata.obs.copy()
var_mon = adata.var.copy()
data_mat_mon = adata.X.T

In [20]:
# Inspect the gene annotation that will be passed to R as the feature data.
var_mon

Unnamed: 0,gene_ids-0,gene_ids-1,highly_variable-1,means-1,dispersions-1,dispersions_norm-1,highly_variable,means,dispersions,dispersions_norm
RP11-34P13.3,ENSG00000243485,ENSG00000243485,False,2.620183e-04,-0.696002,-1.658968,True,6.173132e-02,3.201796,0.926771
FAM138A,ENSG00000237613,ENSG00000237613,False,1.000000e-12,,,False,1.000000e-12,,
OR4F5,ENSG00000186092,ENSG00000186092,False,1.000000e-12,,,False,1.000000e-12,,
RP11-34P13.7,ENSG00000238009,ENSG00000238009,False,4.279811e-04,0.157936,-0.109613,True,1.892436e-02,3.300606,0.985676
RP11-34P13.8,ENSG00000239945,ENSG00000239945,False,1.000000e-12,,,False,1.000000e-12,,
...,...,...,...,...,...,...,...,...,...,...
AC233755.2,ENSG00000277856,ENSG00000277856,False,9.785641e-05,-0.229752,-0.813021,False,9.900274e-03,2.720622,0.639918
AC233755.1,ENSG00000275063,ENSG00000275063,False,1.000000e-12,,,False,1.620117e-04,-1.360684,-1.793160
AC240274.1,ENSG00000271254,ENSG00000271254,False,3.771025e-03,0.424717,0.374424,True,3.945190e-02,2.678236,0.614650
AC213203.1,ENSG00000277475,ENSG00000277475,False,1.000000e-12,,,False,1.000000e-12,,


In [21]:
%%R -i data_mat_mon -i obs_mon -i var_mon

# Build a monocle CellDataSet from the three objects pushed in from Python:
#   data_mat_mon - expression matrix (genes x cells)
#   obs_mon      - per-cell annotation
#   var_mon      - per-gene annotation
print("setting up cell dataset structure")
print("printing pd - cells")
# Wrap the per-cell annotation as the phenotype data
pd <- AnnotatedDataFrame(data = obs_mon)
print(pd)
print("printing fd - genes")
# Wrap the per-gene annotation as the feature data
fd <- AnnotatedDataFrame(data = var_mon)
print(fd)
# Label the expression matrix: cell names on the columns, gene names on the rows
colnames(data_mat_mon) <- rownames(pd)
rownames(data_mat_mon) <- rownames(fd)
print("create newCellDataSet using the matrix, cells and genes")
# NOTE(review): negbinomial.size() presumably expects raw counts in data_mat_mon - confirm
ie_regions_cds <- newCellDataSet(cellData=data_mat_mon, phenoData=pd, featureData=fd, expressionFamily=negbinomial.size())

# Print a summary of the new CellDataSet
print("printing ie_regions_cds")
print(ie_regions_cds)

[1] "setting up cell dataset structure"
[1] "printing pd - cells"
An object of class 'AnnotatedDataFrame'
  rowNames: AAACCTGAGTACGTTC-1-DSOX4 AAAGATGTCTTATCTG-1-DSOX4 ...
    TTTGGTTTCCCTCTTT-1-DSOX19 (1583 total)
  varLabels: X__is_in_cluster__ age ... souporcell_doublet (20 total)
  varMetadata: labelDescription
[1] "printing fd - genes"
An object of class 'AnnotatedDataFrame'
  rowNames: RP11-34P13.3 FAM138A ... FAM231B (33694 total)
  varLabels: gene_ids.0 gene_ids.1 ... dispersions_norm (10 total)
  varMetadata: labelDescription
[1] "create newCellDataSet using the matrix, cells and genes"
[1] "printing ie_regions_cds"
CellDataSet (storageMode: environment)
assayData: 33694 features, 1583 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: AAACCTGAGTACGTTC-1-DSOX4 AAAGATGTCTTATCTG-1-DSOX4 ...
    TTTGGTTTCCCTCTTT-1-DSOX19 (1583 total)
  varLabels: X__is_in_cluster__ age ... Size_Factor (21 total)
  varMetadata: labelDescription
featureData
  featureNames:

# Within CDS, select HVGs (previously defined in scanpy)

In [22]:
%%R 

# Keep only the genes flagged as highly variable (rows whose highly_variable entry in the
# feature data is TRUE). Normalisation by log and size factor is left to monocle3.
print("filter hvgs")
hvg_mask <- fData(ie_regions_cds)[["highly_variable"]]
ie_regions_cds <- ie_regions_cds[hvg_mask, ]

# Show the CellDataSet after subsetting
print("printing cds again")
print(ie_regions_cds)

[1] "filter hvgs"
[1] "printing cds again"
CellDataSet (storageMode: environment)
assayData: 7289 features, 1583 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: AAACCTGAGTACGTTC-1-DSOX4 AAAGATGTCTTATCTG-1-DSOX4 ...
    TTTGGTTTCCCTCTTT-1-DSOX19 (1583 total)
  varLabels: X__is_in_cluster__ age ... Size_Factor (21 total)
  varMetadata: labelDescription
featureData
  featureNames: RP11-34P13.3 RP11-34P13.7 ... AC240274.1 (7289 total)
  fvarLabels: gene_ids.0 gene_ids.1 ... dispersions_norm (10 total)
  fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:  


# Save object for input into Monocle3

In [23]:
%%R 

# Serialise the HVG-restricted CellDataSet to disk as the input for Monocle3.
# NOTE(review): absolute local path - consider a configurable output directory.
cds_rds_path <- "/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs_mk2/data/figs5d_downs_eryth_cds_20210125.RDS"
saveRDS(ie_regions_cds, file = cds_rds_path)