In [None]:
import pegasus as pg
import numpy as np
import pandas as pd

This tutorial illustrates basic Pegasus functionality using the 3k PBMC dataset from a healthy donor, provided by 10x Genomics. 
The dataset is available [here](https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/pbmc3k).

Read in Cell Ranger output

In [None]:
# Download the Cell Ranger output first if it is not already present (uncomment to run):
#!wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz
# NOTE(review): the archive presumably has to be extracted so that the directory below exists.

# Use the `pg` alias imported at the top of the notebook; the former `scc` alias
# is never imported and raised a NameError on a fresh kernel.
adata = pg.read_input('filtered_gene_bc_matrices/hg19/')
# Name passed to pg.write_output in the last cell of the notebook.
output_file = 'scc_tutorial_output'
adata

Generate QC metrics

In [None]:
# Flag cells with 200-2500 detected genes, and genes expressed in at least
# `min_cells` cells.
# pg.qc_metrics takes `percent_cells` as a percentage (0-100) of cells, so the
# cell-count threshold is converted with a factor of 100; the previous
# `3 / n_cells` was a fraction and therefore 100x too permissive.
# NOTE(review): confirm the units of `percent_cells` for the installed pegasus version.
min_cells = 3
n_cells = adata.shape[0]
pg.qc_metrics(
    adata,
    min_genes=200,
    max_genes=2500,
    percent_cells=min_cells / n_cells * 100,
)

In [None]:
# List the annotation keys stored on adata.var (presumably including the
# columns added by pg.qc_metrics above).
adata.var_keys()

In [None]:
# List the annotation keys stored on adata.obs (presumably including the
# columns added by pg.qc_metrics above).
adata.obs_keys()

Plot QC stats

In [None]:
# Violin plots of the 'n_genes', 'n_counts' and 'percent_mito' fields,
# split by the 'passed_qc' flag.
pg.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], by='passed_qc')

In [None]:
# Scatter plot of 'n_genes' and 'n_counts', colored by the 'passed_qc' flag.
pg.scatter(adata, 'n_genes', 'n_counts', color='passed_qc')

In [None]:
# Violin plot of the 'n_cells' field (presumably the number of cells
# expressing each gene — TODO confirm).
pg.violin(adata, ['n_cells'])

Filter cells and genes based on the computed QC metrics

In [None]:
# Apply the filter. No thresholds are passed here, so this presumably relies
# on the flags computed by pg.qc_metrics earlier — TODO confirm.
pg.filter_data(adata)
# Echo adata so the notebook displays its summary after filtering.
adata

Normalize counts and then transform to log space

In [None]:
# Normalize and log-transform adata. The second positional argument (1e4) is
# presumably the per-cell normalization target — TODO confirm parameter name.
pg.log_norm(adata, 1e4)

Select highly variable genes

In [None]:
# Select highly variable features. consider_batch=False: batch handling is
# explicitly turned off (this tutorial loads a single dataset).
pg.highly_variable_features(adata, consider_batch=False)

Plot variable genes

Compute PCA in variable gene space

In [None]:
# Run PCA with the library defaults. The return value is not captured, so the
# result is presumably stored on adata.
pg.pca(adata)

Generate nearest neighbor graph

In [None]:
# Build the nearest-neighbor graph with the library defaults. The return value
# is not captured, so the result is presumably stored on adata.
pg.neighbors(adata)

Run diffusion map

In [None]:
# Compute the diffusion map with the library defaults. The return value is not
# captured, so the result is presumably stored on adata.
pg.diffmap(adata)

Cluster cells using the Louvain method (Leiden clustering is also available in Pegasus but is not run in this tutorial)

In [None]:
# Louvain clustering at resolution 1.0; later cells read the labels from the
# 'louvain_labels' key.
pg.louvain(adata, resolution=1.0)

See the composition of each Louvain cluster

Generate embeddings using FIt-SNE and UMAP

In [None]:
# Compute the FIt-SNE embedding with the library defaults (plotted below with
# basis='fitsne').
pg.fitsne(adata)

In [None]:
# Show the FIt-SNE embedding colored by Louvain cluster label.
pg.embedding(adata, basis='fitsne', keys=['louvain_labels'])

In [None]:
# Compute the UMAP embedding with the library defaults (plotted below with
# basis='umap').
pg.umap(adata)

Plot the cluster assignments

In [None]:
# Show the UMAP embedding colored by Louvain cluster label.
pg.embedding(adata, basis='umap', keys=['louvain_labels'])

In [None]:
# Compute the FLE embedding (plotted below with basis='fle').
# file_name is presumably a name/prefix for files pg.fle writes — TODO confirm.
pg.fle(adata, file_name="temp")

In [None]:
# Show the FLE embedding colored by Louvain cluster label.
pg.embedding(adata, basis='fle', keys=['louvain_labels'])

In [None]:
# UMAP colored by IL7R. Keep a handle on the plot object: the next cell reads
# its `.df` attribute and indexes into it.
embedding_plot = pg.embedding(adata, basis='umap', keys=['IL7R'])
embedding_plot

In [None]:
# Re-run PCA / neighbors / UMAP on only the cells that fall inside the bounds
# reported for the plot created in the previous cell.
# NOTE(review): the bounds presumably come from an interactive box selection on
# `embedding_plot`, so this cell's output is not reproducible from code alone.
from IPython.display import display, clear_output
import scplot as sp

clear_output()
x = embedding_plot.df['X_umap1']
y = embedding_plot.df['X_umap2']
bounds = sp.get_bounds(embedding_plot[0,0])

# Reset on every run so a stale selection from a previous execution is never reused.
selected_adata = None
if bounds is not None:
    # bounds is indexed as (x_min, y_min, x_max, y_max).
    x_min, y_min, x_max, y_max = bounds[0], bounds[1], bounds[2], bounds[3]
    inside_box = (x >= x_min) & (x <= x_max) & (y >= y_min) & (y <= y_max)
    selected_adata = adata[inside_box]

# Report "No cells selected" both when there are no bounds and when the bounds
# contain zero cells (previously the latter case printed nothing at all).
if selected_adata is not None and selected_adata.shape[0] > 0:
    print('{} cells'.format(selected_adata.shape[0]))
    pg.pca(selected_adata)
    pg.neighbors(selected_adata)
    pg.umap(selected_adata)
    display(sp.embedding(selected_adata, basis='umap', keys=['IL7R']))
else:
    print('No cells selected')

Find differentially expressed genes

In [None]:
# Differential expression grouped by the 'louvain_labels' key; the next cell
# reads the results back from adata.varm['de_res'].
pg.de_analysis(adata, cluster='louvain_labels')

In [None]:
# Fetch the DE results from adata.varm and list the names of its fields in
# sorted order.
de_results = adata.varm['de_res']
de_field_names = de_results.dtype.names
sorted(de_field_names)

In [None]:
# Find marker genes for each 'louvain_labels' cluster.
# TODO: store result in adata
markers = pg.find_markers(adata, label_attr='louvain_labels')

In [None]:
# Custom marker definition handed to pg.infer_cell_types: one entry per cell
# type, each with a list of marker groups ("genes" plus a "weight").
# Stored under its own name so it does not overwrite `markers`, the result of
# pg.find_markers from the previous cell.
cell_type_markers = {
    "title": "Cell markers",
    "cell_types": [
        {
            "name": "CD4 T cells",
            "markers": [
                {
                    "genes": ["IL7R+"],
                    "weight": 1.0,
                }
            ],
        },
        {
            "name": "B cells",
            "markers": [
                {
                    "genes": ["MS4A1+"],
                    "weight": 1.0,
                }
            ],
        },
    ],
}

pg.infer_cell_types(adata, cell_type_markers, de_test='t')
# Alternative: pass a named marker set instead of a custom dict, e.g.
#   pg.infer_cell_types(adata, markers='human_immune', de_test='t')

Plot marker genes

In [None]:
# Genes to show in the dot plot, one column per gene, grouped by Louvain cluster.
dotplot_genes = [
    'IL7R', 'CCR7', 'S100A4', 'CD14', 'LYZ', 'MS4A1', 'CD8A',
    'FCGR3A', 'MS4A7', 'GNLY', 'NKG7', 'FCER1A', 'CST3', 'PPBP',
]
pg.dotplot(adata, by='louvain_labels', keys=dotplot_genes)

In [None]:
# Write adata out under the `output_file` name defined in the data-loading cell.
pg.write_output(adata, output_file)