# Import packages and data 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scipy.stats
import anndata
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.axes._axes import _log as matplotlib_axes_logger
from scipy import sparse
matplotlib_axes_logger.setLevel('ERROR')

In [2]:
# Scanpy log level: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 1
# Figure resolution: 120 dpi when rendered, 1000 dpi when saved, so that
# annotation text stays clearly readable.
sc.settings.set_figure_params(dpi_save=1000, dpi=120)
# Print the versions of the core packages used in this run.
sc.logging.print_versions()

scanpy==1.4.4 anndata==0.7.1 umap==0.3.10 numpy==1.17.1 scipy==1.4.1 pandas==0.23.0 scikit-learn==0.22.2.post1 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1


# Import the DSB-normalised fetal BM MNC CITE-seq data

In [3]:
# NOTE(review): absolute, machine-specific path; this cell only runs where
# the file exists at exactly this location.
mnc_citeseq_h5ad = '/Users/b8058304/Documents/PhD_work/Coding/fbm_mnc_cite_seq/data/080421_mq224_postQC_postDSB_postUMAP_celllabels_MNCs_ADTonmRNA_xSimone.h5ad'
adata = sc.read(mnc_citeseq_h5ad)

In [4]:
# Show the AnnData summary: dimensions and the keys stored in obs/var/uns/obsm.
adata

AnnData object with n_obs × n_vars = 8978 × 198 
    obs: 'leiden', 'louvain', 'assignment', 'cell.labels', 'cell.labels.sorted'
    var: 'bg_mean'
    uns: 'assignment_colors', 'draw_graph', 'leiden', 'leiden_colors', 'louvain', 'neighbors', 'pca', 'umap'
    obsm: 'X_draw_graph_fa', 'X_pca', 'X_umap', 'umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [5]:
# Count how many cells carry each annotation in "cell.labels".
cells_by_label = adata.obs.groupby(["cell.labels"])
cell_numbers = cells_by_label.apply(len)
cell_numbers

cell.labels
CD4 T cell                39
CD14 monocyte           1384
CD56 bright NK            66
CMP                       78
DC1                       13
DC2                       87
DC3                       20
ELP                      177
GMP                      108
HSC                       36
MK                        31
MOP                      280
basophil                  15
early MK                  91
early erythroid          517
eosinophil                22
immature B cell          403
late erythroid           670
mast cell                 57
mid erythroid            466
naive B cell             249
neutrophil               294
osteoclast                58
pDC                      139
pre B progenitor        2241
pre pro B progenitor     248
pro B progenitor         366
promonocyte              620
promyelocyte             103
sinusoidal EC             42
stromal macrophage        47
tip EC                    11
dtype: int64

In [6]:
# Merge the four progenitor annotations into two groups:
# HSC becomes "CD38- pro."; CMP, GMP and ELP become "CD38+ pro.".
progenitor_groups = {
    "HSC": "CD38- pro.",
    "CMP": "CD38+ pro.",
    "GMP": "CD38+ pro.",
    "ELP": "CD38+ pro.",
}
adata.obs["cell.labels"] = adata.obs["cell.labels"].replace(progenitor_groups)

In [7]:
# Recount cells per label to check the result of the progenitor relabelling.
cells_by_label = adata.obs.groupby(["cell.labels"])
cell_numbers = cells_by_label.apply(len)
cell_numbers

cell.labels
CD14 monocyte           1384
CD38+ pro.               363
CD38- pro.                36
CD4 T cell                39
CD56 bright NK            66
DC1                       13
DC2                       87
DC3                       20
MK                        31
MOP                      280
basophil                  15
early MK                  91
early erythroid          517
eosinophil                22
immature B cell          403
late erythroid           670
mast cell                 57
mid erythroid            466
naive B cell             249
neutrophil               294
osteoclast                58
pDC                      139
pre B progenitor        2241
pre pro B progenitor     248
pro B progenitor         366
promonocyte              620
promyelocyte             103
sinusoidal EC             42
stromal macrophage        47
tip EC                    11
dtype: int64

In [8]:
# Peek at the values stored in adata.X (presumably the DSB-normalised matrix
# named in the section heading; confirm).
adata.X

array([[-0.48057145, -0.26291963, -0.41175377, ..., -0.572452  ,
         1.6221988 ,  0.9438633 ],
       [-0.48057145, -0.26291963, -0.41175377, ..., -0.572452  ,
        -0.36134648, -0.66506106],
       [-0.48057145, -0.26291963, -0.41175377, ..., -0.572452  ,
         0.22465831, -0.66506106],
       ...,
       [ 1.6224257 ,  3.4519799 , -0.41175377, ..., -0.572452  ,
         0.7415516 ,  0.9438633 ],
       [-0.48057145,  3.4519799 ,  1.6438955 , ..., -0.572452  ,
         0.7415516 , -0.66506106],
       [ 6.299284  ,  3.4519799 ,  1.6438955 , ..., -0.572452  ,
         6.7017026 ,  7.0606465 ]], dtype=float32)

In [9]:
# Keep a handle on the full dataset: `adata` is rebound to the training
# subsample below and restored from `ref` before the test set is drawn.
# This is an alias, not a copy, so in-place changes through either name
# affect the same object.
ref = adata

We will now save the data as input for the R `rpart` package (used for building decision trees).

# Save train data (k=11 per cell type)

In [10]:
# Build the training set: draw the same number of cells from every cell type
# so that each label is equally represented.
# NOTE(review): the test set further down is drawn independently from the same
# cells (only the seed differs), so a cell can end up in both sets; a label
# with exactly 11 cells (e.g. "tip EC" in the counts above) contributes the
# same cells to both. Confirm that this overlap is acceptable.
n_cells_per_label = 11
train_seed = 1

sampled_subsets = []
for label in adata.obs["cell.labels"].unique().tolist():
    subset = adata[adata.obs["cell.labels"].isin([label])].copy()
    # Subsamples `subset` in place; assumes every label has at least
    # `n_cells_per_label` cells.
    sc.pp.subsample(subset, n_obs=n_cells_per_label, random_state=train_seed)
    sampled_subsets.append(subset)

# Concatenate all samples in one call on an actual AnnData instance, instead
# of accumulating onto the AnnData class object one subset at a time.
# index_unique=None keeps the original cell barcodes as the obs index.
tot_adata = sampled_subsets[0].concatenate(*sampled_subsets[1:], join='outer', index_unique=None)

In [11]:
# Write the training table: one row per sampled cell, with the cell-type label
# in the first column followed by one column per variable in adata.var.
# `adata` now points at the subsample; the full dataset stays reachable via `ref`.
adata = tot_adata
metadata = adata.obs["cell.labels"]
df = pd.DataFrame(adata.X, index=adata.obs.index, columns=adata.var.index)
df.insert(loc=0, column="cell.labels", value=metadata)
train_csv_path = '/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs_mk3/resources_for_pipelines/figs3a_fbm_mnc_citeseq_dsb_counts_train_20210421.csv'
# Comma separator, header row and index column are the to_csv defaults.
df.to_csv(train_csv_path, index_label='X')

# Save test data (k=11 per cell type)

In [12]:
# Point `adata` back at the full dataset (kept in `ref`) so that the test
# sample is drawn from all cells rather than from the training subsample.
adata = ref

In [13]:
# Build the test set: draw the same number of cells from every cell type,
# using a different seed from the training sample.
# NOTE(review): the training set above was drawn from the same cells with
# random_state=1, so the two sets are not guaranteed to be disjoint; a label
# with exactly 11 cells (e.g. "tip EC" in the counts above) yields the same
# cells in both. Confirm that this overlap is acceptable.
n_cells_per_label = 11
test_seed = 2

sampled_subsets = []
for label in adata.obs["cell.labels"].unique().tolist():
    subset = adata[adata.obs["cell.labels"].isin([label])].copy()
    # Subsamples `subset` in place; assumes every label has at least
    # `n_cells_per_label` cells.
    sc.pp.subsample(subset, n_obs=n_cells_per_label, random_state=test_seed)
    sampled_subsets.append(subset)

# Concatenate all samples in one call on an actual AnnData instance, instead
# of accumulating onto the AnnData class object one subset at a time.
# index_unique=None keeps the original cell barcodes as the obs index.
tot_adata = sampled_subsets[0].concatenate(*sampled_subsets[1:], join='outer', index_unique=None)

In [14]:
# Write the test table: one row per sampled cell, with the cell-type label
# in the first column followed by one column per variable in adata.var.
# `adata` now points at the subsample; the full dataset stays reachable via `ref`.
adata = tot_adata
metadata = adata.obs["cell.labels"]
df = pd.DataFrame(adata.X, index=adata.obs.index, columns=adata.var.index)
df.insert(loc=0, column="cell.labels", value=metadata)
test_csv_path = '/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs_mk3/resources_for_pipelines/figs3a_fbm_mnc_citeseq_dsb_counts_test_20210421.csv'
# Comma separator, header row and index column are the to_csv defaults.
df.to_csv(test_csv_path, index_label='X')