# Import packages and data 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scipy.stats
import anndata
import matplotlib.pyplot as plt
import matplotlib as mpl
import rpy2

  from pandas.core.index import RangeIndex


In [2]:
# Log warnings and errors only; scanpy levels are errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 1
# Figure defaults chosen so plot annotations stay legible:
# 120 dpi for on-screen figures, 1000 dpi for figures written to disk.
sc.settings.set_figure_params(
    dpi=120,
    dpi_save=1000,
)
# Print the package versions this session is running with.
sc.logging.print_versions()

scanpy==1.4.4 anndata==0.7.1 umap==0.3.10 numpy==1.17.1 scipy==1.4.1 pandas==1.0.5 scikit-learn==0.22.2.post1 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1


In [3]:
# Raise pandas' display limits so wide/long tables are shown more fully in the notebook.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

# Save the force-directed graph (FDG) embedding for Monocle3 from the processed dataset

In [4]:
# Load the processed B-cell AnnData file (the one holding the computed embedding).
adata = sc.read(
    filename="/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/data/fig3a_bcell_dr_20200715.h5ad"
)

In [5]:
# Keep the per-gene annotation table from the processed object (it holds the
# variable-gene information computed in scanpy) so it can be transferred to Monocle.
# Note: this is a reference to `adata.var`, not a copy.
gene_df = adata.var

In [6]:
# Export the force-directed graph coordinates stored under obsm["X_draw_graph_fa"].
# The frame is built from the bare coordinate array, so the CSV carries pandas'
# default index and column labels rather than cell names
# (assumes the obsm entry is a plain array; verify if downstream code needs barcodes).
pd.DataFrame(adata.obsm["X_draw_graph_fa"]).to_csv(
    path_or_buf="/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/resources_for_pipelines/bcell_fdg_20200715.csv"
)

# Load the fetal BM B cells from the raw dataset

In [7]:
# Re-bind `adata` to the raw B-cell AnnData file; the processed object loaded
# earlier is no longer reachable through this name.
adata = sc.read(
    filename="/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/data/fig3a_bcell_raw_20200715.h5ad"
)

In [8]:
# Attach the gene annotation saved from the processed object to the raw object.
# Assigning `.var` replaces the whole table, index included, and does not align
# rows on gene names. Check that both objects list the same genes in the same
# order first; otherwise the annotations would be attached to the wrong genes.
if not gene_df.index.equals(adata.var_names):
    raise ValueError(
        "gene_df and the raw AnnData do not list the same genes in the same order; "
        "refusing to overwrite adata.var because genes would be mislabelled"
    )
adata.var = gene_df

In [9]:
# Number of cells per annotated cell type.
# `.size()` is the built-in group-size aggregation: it returns the same per-label
# counts as applying len() to each group, without going through `groupby.apply`.
cell_numbers = adata.obs.groupby(["cell.labels"]).size()
cell_numbers

cell.labels
HSC                        92
LMPP                       34
ELP                      1358
pre pro B progenitor     5428
pro B progenitor         5530
pre B progenitor        14234
immature B cell          1998
naive B cell             1423
dtype: int64

In [10]:
# Dimensions of the raw object as (number of cells, number of genes).
adata.shape

(30097, 33712)

# Convert to CDS (Monocle CellDataSet)

# Load rpy2 link

In [11]:
# Enable anndata2ri's conversion so AnnData objects passed into R arrive as
# SingleCellExperiment objects, then load the rpy2 extension that provides the
# %R / %%R magics used in the cells below.
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython

In [12]:
# Query the R installation behind rpy2 and keep only its version string,
# so the R version is recorded alongside the notebook output.
%R d = as.data.frame( R.Version() )
%R d = d['version.string']

Unnamed: 0,version.string
1,R version 3.6.2 (2019-12-12)


In [13]:
%%R

# Attach the R packages for the R side of the notebook, in a fixed order
# (monocle supplies the CellDataSet class built further down).
for (pkg in c("scran", "RColorBrewer", "slingshot", "monocle", "gam",
              "clusterExperiment", "ggplot2", "plyr")) {
  library(pkg, character.only = TRUE)
}

R[write to console]: Loading required package: SingleCellExperiment

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find

# Convert the Scanpy (AnnData) object to a Monocle CellDataSet object and prepare its inputs

In [14]:
%%R -i adata
# `-i adata` passes the Python object into R; evaluating it prints a summary of
# the object it was converted to, as a check that the transfer worked.
adata

class: SingleCellExperiment 
dim: 33712 30097 
metadata(1): cell.labels_colors
assays(1): X
rownames(33712): RP11-34P13.3 RP11-34P13.7 ... CTB-58E17.2
  RP5-1077H22.1
rowData names(6): gene_ids.1 feature_types.1 ... dispersions
  dispersions_norm
colnames(30097):
  F21_boneMarrow_CD45+_FCAImmP7179367_3prime_ACATACGGTAAATGAC
  F21_boneMarrow_CD45+_FCAImmP7179367_3prime_ACCCACTCAGCCTTGG ...
  TTTGTTGTCAATGTCG-1-WSSS_F_BON8710643
  TTTGTTGTCACCCTCA-1-WSSS_F_BON8710643
colData names(20): cell.labels doublets ... cell.labels_20200708
  cell.labels_20200713
reducedDimNames(3): X_orig_pca PCA UMAP
spikeNames(0):
altExpNames(0):


In [15]:
# Prepare the three inputs Monocle needs.
# Independent copies of the cell-level and gene-level annotation tables:
obs_mon = adata.obs.copy()
var_mon = adata.var.copy()
# Expression matrix transposed from AnnData's cells x genes layout to genes x cells.
data_mat_mon = adata.X.transpose()

In [16]:
# Inspect the gene annotation table that will be passed to Monocle.
var_mon

Unnamed: 0,gene_ids-1,feature_types-1,highly_variable,means,dispersions,dispersions_norm
RP11-34P13.3,ENSG00000243485,Gene Expression,False,1.766774e-05,-0.631581,-1.628701
RP11-34P13.7,ENSG00000238009,Gene Expression,False,1.150199e-03,0.601993,0.345859
RP11-34P13.8,ENSG00000239945,Gene Expression,False,6.797688e-05,-0.269254,-1.048732
RP11-34P13.9,ENSG00000241599,Gene Expression,False,2.904247e-05,-0.134556,-0.833123
FO538757.3,ENSG00000279928,Gene Expression,False,7.022216e-04,-0.556981,-1.509291
...,...,...,...,...,...,...
TRBJ2-2,ENSG00000211765,Gene Expression,False,1.000000e-12,,
MAGEA2,ENSG00000268606,Gene Expression,False,1.000000e-12,,
CTD-2308B18.4,ENSG00000248600,Gene Expression,False,1.000000e-12,,
CTB-58E17.2,ENSG00000274996,Gene Expression,False,1.000000e-12,,


In [17]:
%%R -i data_mat_mon -i obs_mon -i var_mon

# Assemble a Monocle CellDataSet from the expression matrix plus the cell and gene annotations.
print("setting up cell dataset structure")

# Cell-level annotation.
print("printing pd - cells")
pd <- AnnotatedDataFrame(data = obs_mon)
print(pd)

# Gene-level annotation.
print("printing fd - genes")
fd <- AnnotatedDataFrame(data = var_mon)
print(fd)

# Label the expression matrix: gene names on the rows, cell names on the columns.
dimnames(data_mat_mon) <- list(rownames(fd), rownames(pd))

print("create newCellDataSet using the matrix, cells and genes")
ie_regions_cds <- newCellDataSet(
    cellData = data_mat_mon,
    phenoData = pd,
    featureData = fd,
    expressionFamily = negbinomial.size()
)

# Show the assembled object.
print("printing ie_regions_cds")
print(ie_regions_cds)

[1] "setting up cell dataset structure"
[1] "printing pd - cells"
An object of class 'AnnotatedDataFrame'
  rowNames: F21_boneMarrow_CD45+_FCAImmP7179367_3prime_ACATACGGTAAATGAC
    F21_boneMarrow_CD45+_FCAImmP7179367_3prime_ACCCACTCAGCCTTGG ...
    TTTGTTGTCACCCTCA-1-WSSS_F_BON8710643 (30097 total)
  varLabels: cell.labels doublets ... cell.labels_20200713 (20 total)
  varMetadata: labelDescription
[1] "printing fd - genes"
An object of class 'AnnotatedDataFrame'
  rowNames: RP11-34P13.3 RP11-34P13.7 ... RP5-1077H22.1 (33712 total)
  varLabels: gene_ids.1 feature_types.1 ... dispersions_norm (6 total)
  varMetadata: labelDescription
[1] "create newCellDataSet using the matrix, cells and genes"
[1] "printing ie_regions_cds"
CellDataSet (storageMode: environment)
assayData: 33712 features, 30097 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames:
    F21_boneMarrow_CD45+_FCAImmP7179367_3prime_ACATACGGTAAATGAC
    F21_boneMarrow_CD45+_FCAImmP7179367_3prime_ACCCAC

# Within CDS, select HVGs (previously defined in scanpy)

In [18]:
%%R

# Keep only the genes flagged as highly variable in scanpy
# (log / size-factor normalisation is left to monocle3).
print("filter hvgs")
hvg_mask <- fData(ie_regions_cds)[["highly_variable"]]
ie_regions_cds <- ie_regions_cds[hvg_mask, ]

# Show the subsetted object.
print("printing cds again")
print(ie_regions_cds)

[1] "filter hvgs"
[1] "printing cds again"
CellDataSet (storageMode: environment)
assayData: 2753 features, 30097 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames:
    F21_boneMarrow_CD45+_FCAImmP7179367_3prime_ACATACGGTAAATGAC
    F21_boneMarrow_CD45+_FCAImmP7179367_3prime_ACCCACTCAGCCTTGG ...
    TTTGTTGTCACCCTCA-1-WSSS_F_BON8710643 (30097 total)
  varLabels: cell.labels doublets ... Size_Factor (21 total)
  varMetadata: labelDescription
featureData
  featureNames: RP11-206L10.9 TNFRSF18 ... MATR3-1 (2753 total)
  fvarLabels: gene_ids.1 feature_types.1 ... dispersions_norm (6 total)
  fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:  


# Save object for input into Monocle3

In [19]:
%%R

# Write the CellDataSet to disk as the input for the Monocle3 step.
saveRDS(
    object = ie_regions_cds,
    file = "/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs/data/fig3e_figs3e_bcell_cds_20200715.RDS"
)