# Extended figure 2d part 1

# Import packages and data 

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scipy.stats
import anndata
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.axes._axes import _log as matplotlib_axes_logger
from scipy import sparse
matplotlib_axes_logger.setLevel('ERROR')

In [None]:
# Logging level for scanpy: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 1
# Figure resolution: 120 dpi for inline display, 1000 dpi for saved files,
# so the annotation can be read clearly.
figure_dpi = {"dpi": 120, "dpi_save": 1000}
sc.settings.set_figure_params(**figure_dpi)
# Record the package versions used for this run.
sc.logging.print_versions()

# Import the DSB-normalised fetal YS CITE-seq data

In [None]:
# Processed CITE-seq protein object; the absolute path is specific to the
# original analysis environment.
protein_h5ad_path = '/home/jovyan/YS_project/YS_CiteSeq_final_script_templates/protein_pipeline_19112021/Cite_seq_protein_processed_20220331.h5ad'
adata = sc.read(protein_h5ad_path)

In [None]:
# Display the loaded object as the cell output.
adata

In [None]:
# Rename the annotation column to "cell.labels", the name used by every
# later cell. `inplace` must be a real boolean: the string "True" only worked
# through truthiness (as would "False") and is rejected by pandas versions
# that validate the argument.
adata.obs.rename(columns={"individual_anno": "cell.labels"}, inplace=True)

In [None]:
# Count how many cells carry each annotation label and show the table.
cells_by_label = adata.obs.groupby(["cell.labels"])
cell_numbers = cells_by_label.apply(len)
cell_numbers

In [None]:
# Display the data matrix that is later exported to CSV.
adata.X

In [None]:
# Keep a handle on the full dataset: `adata` is rebound to the subsampled
# data further down and restored from `ref` before the test set is built.
# This is an alias, not a copy, so in-place edits to either name affect both.
ref = adata

We will now save the data as input for the R `rpart` package (used for decision tree building).

# Save train data (k=6 per cell type)
- k reduced from 11 to 6

In [None]:
# Build the training set: up to `n_per_label` randomly chosen cells for each
# annotation label, stacked into a single AnnData object (`tot_adata`).
n_per_label = 6
# Start from None rather than the `anndata.AnnData` class itself. The class
# only worked as an accumulator because calling `concatenate` on it made the
# first subset act as `self`.
tot_adata = None
for label in adata.obs["cell.labels"].unique().tolist():
    subset = adata[adata.obs["cell.labels"].isin([label])].copy()
    # Subsampling cannot draw more cells than the label has, so cap the
    # request; labels with fewer than `n_per_label` cells contribute all of
    # their cells instead of raising.
    sc.pp.subsample(subset, n_obs=min(n_per_label, subset.n_obs), random_state=1)
    if tot_adata is None:
        tot_adata = subset
    else:
        tot_adata = tot_adata.concatenate(subset, join='outer', index_unique=None)

In [None]:
# Export the subsampled training cells to CSV: one row per cell, the
# annotation label in the first column, then one column per entry of
# adata.var.index.
adata = tot_adata
metadata = adata.obs["cell.labels"]
# to_df() builds the same obs-by-var table as pd.DataFrame(adata.X, ...) and
# additionally densifies X when it is stored as a scipy sparse matrix.
df = adata.to_df()
df.insert(0, "cell.labels", metadata)
df.to_csv('./protein_train_test_new_anno_20220331.csv', sep=',', header=True, index=True, index_label='X')

# Save test data (k=6 per cell type)

In [None]:
# Point `adata` back at the full dataset (it was rebound to the training
# subsample when the training CSV was written).
adata = ref

In [None]:
# Build the test set: up to `n_per_label` randomly chosen cells for each
# annotation label, stacked into a single AnnData object (`tot_adata`).
# NOTE(review): this draws from the same full dataset as the training set,
# only with a different seed, so a cell can appear in both sets - confirm
# that this overlap is intended.
n_per_label = 6
# Start from None rather than the `anndata.AnnData` class itself. The class
# only worked as an accumulator because calling `concatenate` on it made the
# first subset act as `self`.
tot_adata = None
for label in adata.obs["cell.labels"].unique().tolist():
    subset = adata[adata.obs["cell.labels"].isin([label])].copy()
    # Subsampling cannot draw more cells than the label has, so cap the
    # request; labels with fewer than `n_per_label` cells contribute all of
    # their cells instead of raising.
    sc.pp.subsample(subset, n_obs=min(n_per_label, subset.n_obs), random_state=2)
    if tot_adata is None:
        tot_adata = subset
    else:
        tot_adata = tot_adata.concatenate(subset, join='outer', index_unique=None)

In [None]:
# Export the subsampled test cells to CSV: one row per cell, the annotation
# label in the first column, then one column per entry of adata.var.index.
adata = tot_adata
metadata = adata.obs["cell.labels"]
# to_df() builds the same obs-by-var table as pd.DataFrame(adata.X, ...) and
# additionally densifies X when it is stored as a scipy sparse matrix.
df = adata.to_df()
df.insert(0, "cell.labels", metadata)
df.to_csv('./protein_test_test_new_anno_20220331.csv', sep=',', header=True, index=True, index_label='X')

In [None]:
# Display the full (non-subsampled) dataset as the cell output.
ref

In [None]:
# Show every distinct annotation label present in the full dataset.
all_labels = ref.obs['cell.labels'].unique()
list(all_labels)