# 0. Import libraries

In [None]:
import pandas as pd
import numpy as np
import polars as pl
import anndata as ad

import scanpy as sc

import matplotlib.pyplot as plt

import scib

import scrublet as scr

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams["savefig.dpi"] = 300
plt.rcParams["figure.figsize"] = [6, 4.5]

# 1. Import data

## 1.1 Load metadata

In [None]:
# Metadata table (tab-separated), including the lineage barcodes.
df_metaData_with_lineage = pd.read_csv(
    '/data/benchmarks/scRNAseq_persisters/GSE150949_metaData_with_lineage.txt',
    sep="\t",
)

# Metadata exported from the Seurat object (R data file); it provides the majority fate.
df_metadata_seurat = pd.read_csv("/data/benchmarks/scRNAseq_persisters/metadata_seuratobject.csv")

# Copy the majority fate over as a plain list, i.e. by row position rather than by index.
# NOTE(review): this assumes both tables list the cells in the same order - confirm.
majority_fate_values = df_metadata_seurat['majority_fate'].tolist()
df_metaData_with_lineage['majority_fate'] = majority_fate_values

### 1.1.1 Analyzing metadata

In [None]:
# Count the cells that fall outside each QC threshold:
# mitochondrial fraction > 0.1, fewer than 1000 genes, more than 4200 genes.
qc_checks = [
    ('The number of cells with >0.1 mitochondrial fraction is =', df_metaData_with_lineage['percent.mito']>0.1),
    ('The number of cells with <1000 genes is =', df_metaData_with_lineage['nGene']<1000),
    ('The number of cells with >4200 genes is =', df_metaData_with_lineage['nGene']>4200),
]
for message, outside_threshold in qc_checks:
    print(message, len(df_metaData_with_lineage[outside_threshold]))

Since there are no cells with a mitochondrial fraction >0.1, or with <1000 or >4200 genes, it appears that the data were already preprocessed by Oren et al. (2021).

### 1.1.2 Preprocessing metadata

In [None]:
# Work on a copy so that the loaded metadata table keeps its original labels.
copy_df = df_metaData_with_lineage.copy()

# Rename the day-14 sample labels (14_high -> Non-cycling, etc.) to avoid confusion.
# replace() is applied to the whole frame, so these values are relabelled in every column.
sample_type_labels = {
    '14_high': 'Non-cycling',
    '14_med': 'Moderate_cyclers',
    '14_low': 'Cycling',
}
for old_label, new_label in sample_type_labels.items():
    copy_df = copy_df.replace(old_label, new_label)

## 1.2 Load count matrix data & convert to AnnData object

In [None]:
# Read the count matrix CSV with polars (more efficient than pandas for this file).
df_pc9_count_matrix = pl.read_csv(
    '/data/benchmarks/scRNAseq_persisters/GSE150949_pc9_count_matrix.csv'
)

In [None]:
# Preview the first 10 rows of the count matrix.
df_pc9_count_matrix.head(10)

In [None]:
# The first column holds the gene names; every remaining column is one cell.
gene_column, *cell_names = df_pc9_count_matrix.columns
gene_names = df_pc9_count_matrix.get_column(gene_column).to_list()

# Count values only (gene-name column dropped), as a numpy matrix of genes x cells.
df_pc9_count_matrix_without_genenames = df_pc9_count_matrix.select(cell_names)
numpy_count_matrix = df_pc9_count_matrix_without_genenames.to_numpy()

# AnnData stores observations (cells) as rows, hence the transpose.
adata = ad.AnnData(
    X=numpy_count_matrix.T,
    obs=pd.DataFrame(index=cell_names),
    var=pd.DataFrame(index=gene_names),
)

In [None]:
# Display the AnnData summary (dimensions and annotation fields).
adata

So, the number of cells = 56419 and the number of genes = 22166

### 1.2.1 Add relevant metadata to the AnnData object

In [None]:
# Add the relevant metadata to the AnnData object.
#
# Assigning a pandas Series to a column of adata.obs aligns on the index (the
# cell names), not on row position. If the metadata index shares no labels with
# adata.obs_names, every assigned value would silently become NaN, so check the
# overlap first and fail loudly instead.
n_cells_with_metadata = int(adata.obs_names.isin(df_metaData_with_lineage.index).sum())
if adata.n_obs > 0 and n_cells_with_metadata == 0:
    raise ValueError(
        "The metadata index shares no cell names with adata.obs_names; "
        "assigning the metadata columns would produce only NaN values."
    )
if n_cells_with_metadata < adata.n_obs:
    print("Warning: {} of {} cells have no metadata entry and will get NaN values".format(
        adata.n_obs - n_cells_with_metadata, adata.n_obs))

# Lineage barcodes from the GEO metadata
adata.obs['lineage_barcode'] = df_metaData_with_lineage['lineage_barcode']

# Time points, converted to a categorical (for plotting later on)
time_points_cat = df_metaData_with_lineage.time_point.astype('category')
adata.obs['time_point'] = time_points_cat

# Sample types as a categorical (= time points for cells from day 0 - 7 and
# cell fate categories for day-14 cells), taken from the relabelled copy
sample_type_cat = copy_df.sample_type.astype('category')
adata.obs['sample_type'] = sample_type_cat

# Majority fate of the lineages, converted to a categorical (for plotting later on)
majority_fate_cat = df_metaData_with_lineage.majority_fate.astype('category')
adata.obs['majority_fate'] = majority_fate_cat

adata

In [None]:
# Number of cells in each sample type.
sample_type_counts = adata.obs['sample_type'].value_counts()
sample_type_counts

# 2. Calculate QC

In [None]:
# Flag gene groups by gene symbol; the boolean columns are used as QC variables.
# mitochondrial genes
adata.var['mt'] = adata.var_names.str.startswith('MT-')
# ribosomal genes
adata.var['ribo'] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes: symbols starting with "HB" whose third character is not P, E or S.
# Inside a character class no grouping or alternation is needed; the previous
# pattern "[^(P|E|S)]" additionally excluded the literal characters "(", "|" and ")".
adata.var['hb'] = adata.var_names.str.contains("^HB[^PES]")

adata.var

In [None]:
# Compute the QC metrics in place, including the metrics for the flagged
# mitochondrial, ribosomal and hemoglobin gene groups.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt', 'ribo', 'hb'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Violin plots of selected QC variables, one violin per sample type.
qc_violin_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb']
sc.pl.violin(
    adata,
    qc_violin_keys,
    groupby='sample_type',
    jitter=0.4,
    rotation=45,
)

In [None]:
# Total counts versus mitochondrial percentage per cell, coloured by sample type.
sc.pl.scatter(
    adata,
    x='total_counts',
    y='pct_counts_mt',
    color="sample_type",
)

# 3. Filtering

## 3.1 Filter on cells having enough genes and genes present in enough cells

In [None]:
# Store the original number of cells and genes to report what the filters removed.
number_cells_before_filtering = adata.n_obs
number_genes_before_filtering = adata.n_vars

# No filter on the minimum number of detected genes per cell is applied for now.

# Keep only genes with at least 1 count in total (min_counts=1 drops all-zero genes).
sc.pp.filter_genes(adata, min_counts=1)

# Report the filtering results.
print('Filtered out {} cells (no filter on the minimum number of detected genes per cell is applied)'.format(
    number_cells_before_filtering - adata.n_obs))
print('Filtered out {} genes that have fewer than 1 count in total'.format(
    number_genes_before_filtering - adata.n_vars))

Apparently there were no zero-count genes

## 3.2 Filter cells on mitochondrial and ribosomal count percentage

In [None]:
print("Number of cells before mito and ribo percent filtering %d"%adata.n_obs)

# Both thresholds are percentages (pct_counts_* columns): keep cells with
# less than 20 % mitochondrial counts and more than 5 % ribosomal counts.
passes_mito = adata.obs['pct_counts_mt'] < 20
passes_ribo = adata.obs['pct_counts_ribo'] > 5

print("Remaining cells after mito percent filtering %d"%passes_mito.sum())

# Apply both filters in a single slice and materialise the result with .copy(),
# so that later in-place steps operate on a real AnnData object instead of a
# view of a view.
adata = adata[(passes_mito & passes_ribo).to_numpy(), :].copy()

print("Remaining cells after mito and ribo percent filtering %d"%adata.n_obs)

In [None]:
# Same QC violin plots as before, now on the filtered cells.
qc_violin_keys_filtered = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt','pct_counts_ribo', 'pct_counts_hb']
sc.pl.violin(
    adata,
    qc_violin_keys_filtered,
    groupby='sample_type',
    jitter=0.4,
    rotation=45,
)

# 4. Normalization and log-transformation

In [None]:
# Keep the unnormalized counts in the raw slot: this snapshot is taken before
# any normalization, and the doublet detection further down reads it via .raw.X.
adata.raw = adata

# Convert the main data matrix to float64, because normalization was not
# possible with int64 values.
adata.X = adata.X.astype('float64')

# Normalize every cell to the median total count over all cells
# (target_sum=None). normalize_total replaces the deprecated
# normalize_per_cell; key_added keeps the pre-normalization totals per cell
# available under the same obs key as before ('n_counts_all').
sc.pp.normalize_total(adata, target_sum=None, key_added='n_counts_all')

# Log-transform: log(1 + x)
sc.pp.log1p(adata)

# Scale each gene to unit variance and zero mean
sc.pp.scale(adata)

# 5. Check cell cycle state

In [None]:
# Score the cell cycle state of every cell using the human gene lists of scib.
# The plots in the following cells read 'S_score', 'G2M_score' and 'phase' from adata.obs.
scib.preprocessing.score_cell_cycle(
    adata,
    organism='human',
)

In [None]:
# Distribution of the S and G2M scores per sample type.
cell_cycle_score_keys = ['S_score', 'G2M_score']
sc.pl.violin(
    adata,
    cell_cycle_score_keys,
    groupby='sample_type',
    jitter=0.4,
    rotation=45,
)

In [None]:
# S score versus G2M score per cell, coloured by the assigned phase.
sc.pl.scatter(
    adata,
    x='S_score',
    y='G2M_score',
    color="phase",
)

# 6. Check for doublets

In [None]:
# Predict doublets with Scrublet, separately for every sample type (batch).
# Scrublet is run on the counts stored in the raw slot.
batches = adata.obs['sample_type'].cat.categories.tolist()
alldata = {}
for batch in batches:
    batch_adata = adata[adata.obs['sample_type'] == batch,]
    print(batch, ":", batch_adata.shape[0], " cells")
    if batch_adata.n_obs == 0:
        # A category without cells has nothing to score.
        continue
    scrub = scr.Scrublet(batch_adata.raw.X)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False, n_prin_comps = 20)
    if predicted_doublets is None:
        # Scrublet returns None for the calls when it cannot determine a score
        # threshold automatically (silent because verbose=False). A column of
        # None would later turn into the string 'None' and the whole batch would
        # be dropped by the doublet filter, so make the outcome explicit instead.
        print("Warning: no doublet score threshold could be determined for", batch,
              "- no cells of this batch are flagged as doublets")
        predicted_doublets = np.zeros(batch_adata.n_obs, dtype=bool)
    alldata[batch] = pd.DataFrame(
        {'doublet_score': doublet_scores, 'predicted_doublets': predicted_doublets},
        index=batch_adata.obs.index,
    )
    print(alldata[batch].predicted_doublets.sum(), " predicted_doublets")

In [None]:
# Stack the per-batch predictions into one table indexed by cell name and
# attach both columns to adata.obs (assignment aligns on the cell names).
scrub_pred = pd.concat(alldata.values())
for obs_key, prediction_key in (
    ('doublet_scores', 'doublet_score'),
    ('predicted_doublets', 'predicted_doublets'),
):
    adata.obs[obs_key] = scrub_pred[prediction_key]

# Total number of predicted doublets
sum(adata.obs['predicted_doublets'])

In [None]:
# add in column with singlet/doublet instead of True/Fals
%matplotlib inline

adata.obs['doublet_info'] = adata.obs["predicted_doublets"].astype(str)
sc.pl.violin(adata, 'n_genes_by_counts', jitter=0.4, groupby = 'doublet_info', rotation=45)

In [None]:
# Embed the cells to inspect where the predicted doublets fall.

# Select highly variable genes.
# NOTE(review): adata.X has already been scaled in the normalization section,
# whereas this selection is presumably meant to run on log-normalized
# (unscaled) data - confirm that the selected genes are sensible.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

# Restrict to the highly variable genes; .copy() turns the subset view into a
# real AnnData object before the in-place steps below modify it.
adata = adata[:, adata.var.highly_variable].copy()

# Regress out total counts and mitochondrial percentage, then rescale with clipping at 10.
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)

# PCA -> neighbourhood graph -> UMAP
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
sc.pl.umap(adata, color=['doublet_scores','doublet_info','sample_type'])

In [None]:
# Shape before removing the predicted doublets
print(adata.shape)

# Keep only the cells whose doublet call is 'False'; .copy() materialises the
# subset so that later steps work on a real AnnData object instead of a view.
is_singlet = adata.obs['doublet_info'] == 'False'
adata = adata[is_singlet, :].copy()

# Shape after removing the predicted doublets
print(adata.shape)