# Import packages and data 

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scipy.stats
import anndata
import matplotlib.pyplot as plt
import matplotlib as mpl
import rpy2

  from pandas.core.index import RangeIndex


In [2]:
# Logging level for scanpy: 1 reports warnings and errors only.
sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Set up the plot config for viewing the annotation clearly:
# dpi=120 for figures rendered in the notebook, dpi_save=1000 for figures written to disk.
sc.settings.set_figure_params(dpi=120, dpi_save=1000)
# Print the versions of scanpy and its key dependencies so the run can be reproduced.
sc.logging.print_versions()

scanpy==1.4.4 anndata==0.7.1 umap==0.3.10 numpy==1.17.1 scipy==1.4.1 pandas==1.0.5 scikit-learn==0.22.2.post1 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1


# Load the resident and progenitor subsets

In [3]:
# Input .h5ad file. NOTE: this is an absolute, machine-specific path and must be
# adjusted before the notebook can be run elsewhere.
input_h5ad = '/Users/b8058304/Documents/PhD_work/Coding/bm_plus_19pcw/data/bm_plus_19pcw_raw_dr_20201007.h5ad'
adata = sc.read(input_h5ad)

In [4]:
# Store the freshly loaded, still unmodified data in `.raw` before the subsetting
# and normalisation cells below change `adata`.
adata.raw = adata

In [5]:
# Number of cells per annotated cell type in the full dataset.
# `size()` is pandas' built-in group counter, so no Python function is called per group.
# `observed=True` lists only labels that actually occur when the column is categorical.
cell_numbers = adata.obs.groupby(["cell.labels"], observed=True).size()
cell_numbers

cell.labels
CD4 T cell             327
CD8 T cell             171
CD14 monocyte         8763
CD56 bright NK         449
CMP                    425
                      ... 
schwann cells            9
sinusoidal EC          550
stromal macrophage    1464
tDC                    193
tip EC                 362
Length: 64, dtype: int64

In [6]:
# Cell-type labels kept for this analysis, 27 in total:
#   * 18 non-progenitor labels, described in the original note as the stromal cell
#     types minus the muscle lineage (presumably 20 stromal types before that
#     exclusion -- TODO confirm), followed by
#   * the nine HSC/MPP progenitor labels.
celltypes = [
    # stromal labels
    'adipo-CAR',
    'arteriolar fibroblast',
    'tip EC',
    'stromal macrophage',
    'sinusoidal EC',
    'schwann cells',
    'proliferating EC',
    'osteoclast',
    'osteochondral precursor',
    'osteoblast precursor',
    'osteoblast',
    'myofibroblast',
    'monocytoid macrophage',
    'erythroid macrophage',
    'immature EC',
    'endosteal fibroblast',
    'early osteoblast',
    'chondrocyte',
    # HSC/MPP progenitor labels
    'HSC',
    'CMP',
    'ELP',
    'GMP',
    'MEMP',
    'MEP',
    'LMPP',
    'MPP myeloid',
    'eo/baso/mast precursor',
]

In [7]:
# Restrict the dataset to the selected cell types.
# `isin` silently ignores labels that do not occur in the data, so a misspelt or
# renamed entry in `celltypes` would drop a whole population from the export
# without any warning. Fail loudly instead.
missing_labels = sorted(set(celltypes) - set(adata.obs['cell.labels'].unique()))
if missing_labels:
    raise ValueError(f"cell.labels not found in adata.obs: {missing_labels}")
# .copy() turns the selection into an independent AnnData object.
adata = adata[adata.obs['cell.labels'].isin(celltypes)].copy()

In [8]:
# Number of cells per cell type after subsetting, as a check on the selection.
# `size()` is pandas' built-in group counter, so no Python function is called per group.
# `observed=True` lists only labels that actually occur when the column is categorical.
cell_numbers = adata.obs.groupby(["cell.labels"], observed=True).size()
cell_numbers

cell.labels
CMP                         425
ELP                        1357
GMP                        1281
HSC                          92
LMPP                         34
MEMP                         16
MEP                         269
MPP myeloid                  92
adipo-CAR                   353
arteriolar fibroblast        83
chondrocyte                  80
early osteoblast            280
endosteal fibroblast         54
eo/baso/mast precursor      175
erythroid macrophage         92
immature EC                  42
monocytoid macrophage       290
myofibroblast                78
osteoblast                  363
osteoblast precursor        456
osteochondral precursor     191
osteoclast                 1221
proliferating EC             26
schwann cells                 9
sinusoidal EC               550
stromal macrophage         1464
tip EC                      362
dtype: int64

In [9]:
# Dimensions of the subset as (cells, genes).
adata.shape

(9735, 33712)

# Pre-process the dataset

In [10]:
# Normalise the count data per cell, modifying adata in place.
# No target_sum is passed, so scanpy's default normalisation target is used.
sc.pp.normalize_total(adata)

In [11]:
# Log-transform the normalised data in place (log1p, i.e. log(1 + x)).
sc.pp.log1p(adata)

In [12]:
# Find variable genes using the mean and dispersion cut-offs given below.
# NOTE(review): no later cell visible in this notebook filters on the result, and
# the export below writes every gene in adata.var, so this step appears to be
# informational only -- confirm whether a gene subset was intended.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) 

In [13]:
# Scale the data in place; values above max_value=10 are capped at 10 after scaling.
sc.pp.scale(adata, max_value=10)

In [14]:
# Set every negative entry of adata.X (already normalised, logged and scaled) to zero.
# Non-negative entries are left unchanged: this is a clip at zero, not a rescaling.
adata.X = np.where(adata.X < 0, 0, adata.X)

# Save the metadata

In [15]:
# Per-cell annotation: the "cell.labels" column of adata.obs, keyed by the obs index.
metadata = adata.obs["cell.labels"]

In [16]:
# Write the cell -> label table as a tab-separated text file.
# NOTE: absolute, machine-specific output path.
meta_out_path = '/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs_mk2/resources_for_pipelines/fig6c_cpdb_stroma_vs_progen_meta_20210128.txt'
metadata.to_csv(meta_out_path, sep='\t')

# Save the expression matrix (normalised, log-transformed, scaled and clipped at zero)

In [17]:
# Wrap the processed matrix in a labelled table: one row per cell, one column per gene.
df = pd.DataFrame(adata.X, index=adata.obs_names, columns=adata.var_names)

In [18]:
# Flip the table to genes (rows) x cells (columns).
# Caution: re-running only this cell flips the table back; rebuild `df` first.
df = df.transpose()

In [19]:
# Write the genes x cells matrix as tab-separated text; the gene index column is
# labelled 'Gene'. NOTE: absolute, machine-specific output path.
counts_out_path = '/Users/b8058304/Documents/PhD_work/Coding/manuscript_figs_mk2/resources_for_pipelines/fig6c_cpdb_stroma_vs_progen_counts_20210128.txt'
df.to_csv(
    counts_out_path,
    sep='\t',
    header=True,
    index=True,
    index_label='Gene',
)