# Tutorial 3 - Working with scanpy

## Load required packages

In [None]:
# Load necessary packages as we did in week 1 using the import and from commands
#
# Jupyter notebooks have so-called "magic" functions that let us interact
# with packages in a command line like format. 
# %matplotlib inline directs the output of plotting commands to appear directly
# below the cell that produced it.
#
# As academic software in particular is constantly evolving, some of the tools
# that packages rely on can update at different speeds. If certain commands are
# scheduled to be phased out, they often warn users with a "FutureWarning"
# The last line silences these warnings, to avoid distraction.

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn as sk
import scipy as sp
import csv
import scanpy as sc
import copy
import re
from collections import Counter
from igraph import *
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Set the present working directory with the getcwd() function from the os library
# Every data file read or written below is located relative to this directory
pwd = os.getcwd()

In [None]:
# Display the working directory and check that it is set correctly:
#   '/home/<SUNetID>/BIOC281/Classes/3' on FarmShare
#   '/home/groups/<GroupName>/BIOC281/Classes/3' on Sherlock
pwd

## Data and metadata ingest

In [None]:
# See https://scanpy.readthedocs.io/en/stable/api/index.html#reading
# for examples on how to read other file types
# The path.join() command from the os library works similarly to here() in R
# sc.read_csv() treats rows as cells, so .T transposes the object that was read
# (presumably the CSV stores genes as rows -- confirm against the file)
adata = sc.read_csv(filename=os.path.join(pwd, 'krasnow_hlca_10x_UMIs.csv')).T
# Reorder the cells alphabetically by name
adata = adata[adata.obs_names.sort_values()]

In [None]:
# Read in the meta.data using read_csv() from pandas, like in the first tutorial
# adata is an AnnData object (https://anndata.readthedocs.io/en/latest/)
# It is similar to a Seurat object in that it stores important information in different shelves
# You can access them with a "." symbol followed by their name.
# In this case, the AnnData expects the "obs" shelf to contain the cellular metadata
tmp = pd.read_csv(filepath_or_buffer=os.path.join(pwd, 'krasnow_hlca_10x_metadata.csv'), index_col=0).sort_index()

# Select the metadata rows by cell name, in the order of adata's cells, rather than
# trusting that two independently sorted tables line up by position.
# A cell missing from the metadata raises a KeyError instead of silently
# attaching another cell's metadata.
adata.obs = tmp.loc[adata.obs_names].copy()

In [None]:
# Subset the dataset to include only data from patient 1
# Recall from the first tutorial that columns from pandas dataframes (called pandas series)
# can be accessed using a "." symbol followed by their column name
# The comparison produces one True/False value per cell, which selects the rows of adata to keep
adata = adata[adata.obs.patient == 1]

In [None]:
# Write the adata object to an h5ad file to ease import, also seems to halve the memory usage
# h5ad is a binary file format built on HDF5 (see https://www.h5py.org for the python interface)
adata.write(os.path.join(pwd, 'krasnow_hlca_10x_raw.h5ad'))

In [None]:
# Read the object back in from the h5ad file written above
# Re-run this cell whenever you need to read in the object again
# Useful as sc.read_csv is slow
adata = sc.read_h5ad(filename = os.path.join(pwd, 'krasnow_hlca_10x_raw.h5ad'))

In [None]:
# Keep a copy for subclustering later
# There is an important difference between R and Python in how they save equivalent variables
# In Python, if you set a = [1, 2], then set b = a, and then set b[1] = 10, a[1] will also equal 10
# In R, if you set a = c(1,2), then set b = a, then set b[1] = 10, a[1] will NOT equal 10
# Copying variables as below allows us to copy the adata object and break the linkage between them
# We use deepcopy from the copy library as adata is a multilevel, complex object, as we will explore
adata_raw = copy.deepcopy(adata)

In [None]:
# Uncomment and execute if you need to reset adata to the stashed copy (adata_raw)
# Useful as sc.read_csv is slow
#adata = copy.deepcopy(adata_raw)

## Explore the scanpy object

In [None]:
# Show the basic contents of adata (the summary printed when the object is displayed)
adata

In [None]:
# As noted above, the obs shelf in the AnnData object holds the cellular metadata
# The first five rows and columns can be accessed using the iloc indexer from pandas
# In python, the "0" preceding the ":" symbol is assumed and can be omitted
adata.obs.iloc[:5,:5]

In [None]:
# Specific columns from metadata can be accessed by name using a "." symbol
# We then select the first five cells of the nGene column
adata.obs.nGene[:5]

In [None]:
# Specific cells can also be accessed by subsetting the whole adata object by cell name
# The obs shelf of the subset then holds the metadata of only those cells
adata[['P1_1_AAACCTGAGCGATAGC', 'P1_1_AACTCCCAGGGCATGT']].obs

In [None]:
# The UMI/counts matrix starts in the "X" shelf of the adata object
# As you proceed through data normalization and scaling, scanpy automatically changes X
# Each row is a cell with genes as columns
adata.X

In [None]:
# Specific genes can be referenced from the "X" shelf, the first 5 cells are shown
# adata objects can be subset by [rows, columns] much like R data frames,
# here with cells as the rows and gene names as the columns
adata[:5, ['ACTB', 'UBB']].X

In [None]:
# scanpy updates the X shelf as we normalize and scale the data, so we need to save the
# original UMIs somewhere else. The standard place to store additional expression matrices
# with the same dimensions as X is the "layers" shelf. As noted above, when we equate variables
# in python they remain linked unless we copy them, hence the copy() call.
adata.layers['UMIs'] = adata.X.copy()

## Basic quality control

In [None]:
# Set the dimensions for the figures plotted by the code below
plt.rcParams['figure.figsize'] = [12, 6]

# The violin() function in scanpy's plotting library (pl) allows us to plot continuous
# variables similar to Seurat. Many of the functions in scanpy's preprocessing (pp) and
# tools (tl) libraries have corresponding plotting functions as we will see below
sc.pl.violin(adata, keys=['nGene', 'nUMI', 'percent.ribo'], multi_panel=True)

In [None]:
plt.rcParams['figure.figsize'] = [12, 6]

# scanpy's scatter() plots one continuous variable against another.
# Each QC metric is plotted in turn against the number of UMIs per cell.
for qc_metric in ('percent.ribo', 'nGene'):
    sc.pl.scatter(adata, x='nUMI', y=qc_metric)

In [None]:
# Keep only the cells with more than 500 genes and more than 1000 UMIs detected
# Each comparison gives one True/False value per cell; "&" requires both to hold
# A lone ":" in the column position selects every gene
enough_genes = adata.obs.nGene > 500
enough_umis = adata.obs.nUMI > 1000
adata = adata[enough_genes & enough_umis, :]

In [None]:
# Copy the adata object for complex normalizations with scVI and MAGIC later on
# Both assume adata.X holds unnormalized UMIs, so the copies are taken before normalization
adata_scvi = copy.deepcopy(adata)
adata_magic = copy.deepcopy(adata)

## Basic data normalization

In [None]:
# Convert UMIs to UMIs per 10K (target_sum=1e4) and then take the natural log of all values plus 1
# This updates the X shelf in the AnnData object, which is why we stashed the UMIs
# in adata.layers['UMIs']. Function is similar to Seurat's NormalizeData() function
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Many of scanpy's internal functions assume adata.raw.X contains the normalized expression values,
# while adata.X contains the centered and scaled z-scores (set after sc.pp.scale is run below)
#
# Assign adata to adata.raw to preserve the normalized expression values
# before adata.X is scaled
adata.raw = adata

In [None]:
# Copy the adata object for complex normalizations with SAM later
# SAM assumes that adata.X holds normalized expression values, so the copy is
# taken after normalization but before scaling
adata_sam = copy.deepcopy(adata)

## Feature selection

In [None]:
plt.rcParams['figure.figsize'] = [12, 6]

# Select and plot genes that have high dispersion (>0.5) and mean expression
# between 0.0125 and 6
# This selection is similar to "mean.var.plot" feature selection called with
# the FindVariableFeatures function in Seurat
# Note how the highly_variable_genes() function in scanpy's preprocessing (pp) library
# has a matching function within its plotting (pl) library
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=6, min_disp=0.5)
sc.pl.highly_variable_genes(adata)

## Linear dimensionality reduction

In [None]:
# Center and scale the expression values in adata.X, making them z-scores
# (values are capped at max_value=10)
sc.pp.scale(adata, max_value=10)
# Run principal component analysis, keeping 100 components
sc.tl.pca(adata, svd_solver='arpack', n_comps=100)

In [None]:
plt.rcParams['figure.figsize'] = [12, 6]

# Plot the variance explained by each of the 100 principal components
# Similar to ElbowPlot() function in Seurat
sc.pl.pca_variance_ratio(adata, n_pcs=100)

In [None]:
plt.rcParams['figure.figsize'] = [12, 50]

# For each of the first 50 principal components, plot the expression of the 15 genes
# with the lowest and the 15 genes with the highest loadings, across the 250 cells with
# the lowest and the 250 cells with the highest scores in that component (500 cells total)
# Similar to Seurat's DimHeatmap() command
for pc in range(1,51):
    # Gene positions ordered by their loading on this component
    loading_order = np.argsort(adata.varm['PCs'][:,pc-1])
    sel = np.concatenate((loading_order[:15], loading_order[-15:]))
    # Cell positions ordered by their score on this component (sorted only once)
    score_order = np.argsort(adata.obsm['X_pca'][:,pc-1])
    cells = np.append(score_order[:250], score_order[-250:])
    tempdata = adata[cells,]
    print('PC = ' + str(pc))
    # Look the gene names up by position directly on var_names; indexing a
    # name-indexed pandas Series with integers relies on a deprecated fallback
    sc.pl.heatmap(tempdata, var_names=adata.var_names[sel].tolist(), swap_axes=True, groupby=None)

## First round of clustering on the whole dataset

In [None]:
# Calculate the nearest neighbor map from the first 50 principal components
# with k=10 nearest neighbors. These settings are identical to what we
# found was "optimal" in Seurat.
#
# scanpy does not perform pruning on its nearest neighbor network like Seurat
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)

# Cluster the cells using the nearest neighbor graph with the leiden and louvain
# algorithms and a resolution of 0.5 (again identical to the "optimal" parameters from Seurat)
# The output of both are stored in "obs" shelf of adata, under the columns "leiden" and "louvain"
sc.tl.leiden(adata, resolution=0.5)
sc.tl.louvain(adata, resolution=0.5)

## Calculate two dimensional coordinates for all cells with UMAP

In [None]:
# Calculate UMAP coordinates using the nearest neighbor map calculated above
# scanpy uses the same dimensions as the neighbors() function from scanpy's
# preprocessing (pp) library, so no extra parameters are passed here
sc.tl.umap(adata)

In [None]:
plt.rcParams['figure.figsize'] = [10, 10]

# Plot the UMAP coordinates for each cell then color by leiden cluster
# legend_loc="on data" requests the cluster labels be drawn on the plot itself
sc.pl.umap(adata, color=['leiden'], legend_loc="on data", legend_fontsize=14)

## Comparison of leiden versus louvain clustering

In [None]:
# Score each clustering against the ground truth annotations
# The metric is the adjusted mutual information (AMI) we used in the Seurat notebook,
# provided by adjusted_mutual_info_score() in the metrics section of the sk-learn library
ground_truth = adata.obs.free_annotation
for label, column in (('Leiden', 'leiden'), ('Louvain', 'louvain')):
    ami = sk.metrics.adjusted_mutual_info_score(ground_truth, adata.obs[column])
    print(label + ' vs Truth AMI: ' + str(round(ami, 2)))

In [None]:
# Score the leiden clusters against the ground truth within each tissue compartment
print('Leiden vs Truth')
for compartment in adata.obs.compartment.cat.categories:
    # Metadata of only the cells that belong to this compartment
    compartment_obs = adata[adata.obs.compartment == compartment].obs
    ami = sk.metrics.adjusted_mutual_info_score(compartment_obs.free_annotation, compartment_obs.leiden)
    print(compartment + ' AMI: ' + str(round(ami, 2)))

In [None]:
# Score the louvain clusters against the ground truth within each tissue compartment
print('Louvain vs Truth')
for compartment in adata.obs.compartment.cat.categories:
    # Metadata of only the cells that belong to this compartment
    compartment_obs = adata[adata.obs.compartment == compartment].obs
    ami = sk.metrics.adjusted_mutual_info_score(compartment_obs.free_annotation, compartment_obs.louvain)
    print(compartment + ' AMI: ' + str(round(ami, 2)))

In [None]:
# Compare Seurat louvain versus scanpy
# The two sets of labels are matched by cell name, so both cover the same cells in
# the same order even if Seurat and scanpy did not keep exactly the same cells
seurat_clusters = pd.read_csv(os.path.join(pwd, 'seurat_clusters.csv'), index_col=0).sort_index()
shared_cells = adata.obs_names.intersection(seurat_clusters.index)
if len(shared_cells) == 0:
    raise ValueError('No cell names are shared between seurat_clusters.csv and adata')
seurat_labels = seurat_clusters.loc[shared_cells, 'seurat_clusters']
print('Leiden vs Seurat AMI: ' + str(round(sk.metrics.adjusted_mutual_info_score(seurat_labels, adata.obs.loc[shared_cells, 'leiden']), 2)))
print('Louvain vs Seurat AMI: ' + str(round(sk.metrics.adjusted_mutual_info_score(seurat_labels, adata.obs.loc[shared_cells, 'louvain']), 2)))

**Question:** How does the performance of Leiden and Louvain compare?

**Answer:**

## Continue exploring the scanpy object with clustering completed

In [None]:
# New location of UMIs: the "UMIs" entry of the layers shelf, stashed before normalization
adata.layers['UMIs']

In [None]:
# Subset to the first 5 cells and specific genes, then show their UMIs
adata[:5,['ACTB', 'UBB']].layers['UMIs']

In [None]:
# Location of normalized expression data (stashed in adata.raw before scaling)
adata.raw.X

In [None]:
# Subset to the first 5 cells and specific genes, then show their normalized expression
adata.raw[:5,['ACTB', 'UBB']].X

In [None]:
# Location of expression z-scores (adata.X after sc.pp.scale was run)
adata.X

In [None]:
# Subset to the first 5 cells and specific genes, then show their z-scores
adata[:5,['ACTB', 'UBB']].X

In [None]:
# Location of dimensionality reductions (one entry per reduction, e.g. 'X_pca')
adata.obsm

In [None]:
# Subset the first 5 PC coordinates of the first 5 cells
# (adata[:5] selects the cells, [:,:5] selects the components)
adata[:5].obsm['X_pca'][:,:5]

In [None]:
# Subset the first 10 gene loadings from the first 5 PCs
# The loadings live in the varm shelf, with one row per gene
adata.varm['PCs'][:10,:5]

In [None]:
# Gene metadata is held in the var shelf, with one row per gene
adata.var

In [None]:
# Access specific gene metadata columns by name, here the first 5 values of "dispersions"
adata.var.dispersions[:5]

In [None]:
# Nearest neighbor distances from first 5 cells
# obsp holds cell-by-cell matrices; todense() converts the sparse matrix for display
adata[:5].obsp['distances'].todense()

In [None]:
# Nearest neighbor graph from first 5 cells
# 0 = not connected, non-zero = connected
# NOTE(review): the non-zero entries are presumably connection weights rather than
# strictly 1 -- confirm against the output
adata[:5].obsp['connectivities'].todense()

## Alternative: Self assembling manifolds

SAM iteratively rescales the input gene expression matrix to emphasize genes that are spatially variable along the intrinsic manifold of the data. It outputs the gene weights, nearest neighbor matrix, and a 2D projection (see Tarashansky et al (2019) _Elife_)

The adata input should contain unstandardized, non-negative values. Preferably, the data should be log-normalized and no genes should be filtered out. See https://scanpy.readthedocs.io/en/stable/external/scanpy.external.tl.sam.html#scanpy.external.tl.sam for more usage information

In [None]:
# SAM can be accessed using the "external" library from scanpy which provides a common
# interface to seamlessly use academic software by other labs with adata objects
#
# By default SAM outputs a UMAP embedding based on its nearest neighbor matrix. We skip
# its calculation (projection=None) and ask SAM to use k=10 nearest neighbors, identical
# to what we used for PCA
# SAM dynamically chooses the number of principal components to use in each iteration
#
# SAM expects adata.X to be a sparse matrix of log-normalized expression values
# we can use the csr_matrix() function in the sparse section of the scipy library
# to convert it from a numpy matrix.
adata_sam.X = sp.sparse.csr_matrix(adata_sam.X)
sc.external.tl.sam(adata_sam, projection=None, k=10)

In [None]:
# Copy the reduced-dimension embedding stored in obsm['X_pca'] of the sam object
# to the original adata object, under the name 'X_sam'
adata.obsm['X_sam'] = adata_sam.obsm['X_pca'].copy()

# Remove the adata_sam object to preserve memory
del adata_sam

## Alternative: Markov affinity-based graph imputation of cells (MAGIC)

Markov Affinity-based Graph Imputation of Cells (MAGIC) is an algorithm for denoising and transcript recovery of single cells applied to single-cell sequencing data. MAGIC builds a graph from the data and uses diffusion to smooth out noise and recover the data manifold (see van Dijk et al (2018) _Cell_).

The magic() function expects an AnnData object where the raw.X shelf holds unnormalized counts or UMIs. See https://scanpy.readthedocs.io/en/stable/external/scanpy.external.pp.magic.html#scanpy.external.pp.magic for more usage information

In [None]:
# MAGIC expects genes with no expression across the entire dataset to be removed
# The sum() method allows us to sum the UMIs for each gene
# across all cells (summing down the rows with axis=0; axis=1 would sum the UMIs for each cell)
# We can then create a boolean to select genes with any number of UMIs with greater than (>)
adata_magic = adata_magic[:,adata_magic.X.sum(axis=0) > 0]

# Set the raw.X shelf to the UMI table with undetected genes removed
adata_magic.raw = adata_magic

In [None]:
# Run MAGIC with the same parameters we used for PCA (10 nearest neighbors, 50 components)
sc.external.pp.magic(adata_magic, knn=10, n_pca=50, name_list="pca_only")

In [None]:
# Copy the reduced-dimension embedding stored in obsm['X_magic'] of the MAGIC object
# to the original adata object
adata.obsm['X_magic'] = adata_magic.obsm['X_magic'].copy()

# Remove the adata_magic object to preserve memory
del adata_magic

## Alternative: Deep neural network with scVI

scVI uses stochastic optimization and deep neural networks to aggregate information across similar cells and genes and to approximate the distributions that underlie observed expression values, while accounting for batch effects and limited sensitivity (see Lopez et al (2018) _Nature Methods_)

scVI expects an AnnData object with X containing unnormalized count data. For more usage information see https://scanpy.readthedocs.io/en/stable/external/scanpy.external.pp.scvi.html

In [None]:
# Copy the highly variable genes detected by scanpy's highly_variable_genes()
# into the gene metadata (var shelf) of the scVI object
# scVI can be run on only these genes rather than the whole expression matrix
# to reduce computational cost and save time
adata_scvi.var["highly_variable"] = adata.var.highly_variable.copy()

In [None]:
# Calling scVI seems to cause a DeprecationWarning in jupyter that we silence
# The filter is installed before the call so that warnings raised while scVI
# is running are silenced as well as any raised afterwards
warnings.simplefilter(action='ignore', category=DeprecationWarning)

# Run scVI on the expression matrix of highly variable genes and reduce it to 50 dimensions
# n_epochs is the number of iterations to train the neural network, the default being 400
# We reduce this to 100 to save time and allow the model to train during class, but this
# will lower the model's accuracy.
sc.external.pp.scvi(adata_scvi, use_highly_variable_genes=True, n_latent=50, n_epochs=100)

In [None]:
# Copy the reduced-dimension embedding stored in obsm['X_scvi'] of the scVI object
# to the original adata object
adata.obsm['X_scvi'] = adata_scvi.obsm['X_scvi'].copy()

# Remove the adata_scvi object to preserve memory
del adata_scvi

## Cluster based off alternative approaches


In [None]:
# Build a nearest neighbor map and leiden clustering from each alternative reduced
# space (SAM, MAGIC and scVI) in turn
# use_rep names the particular reduction in the obsm shelf and key_added names the
# neighbor map that is saved. That map is handed to leiden with the neighbors_key
# parameter, and leiden's own key_added names the metadata column in the obs shelf
# of the AnnData object that receives the clusters
for method in ("sam", "magic", "scvi"):
    sc.pp.neighbors(adata, n_neighbors=10, use_rep="X_" + method, key_added=method)
    sc.tl.leiden(adata, resolution=0.5, neighbors_key=method, key_added=method + "_leiden")

### SAM

In [None]:
# Compare the leiden clusters computed from the SAM reduction versus truth
# (seurat_clusters.csv is not needed for this comparison, so it is not re-read here)
print('Leiden vs Truth AMI: ' + str(round(sk.metrics.adjusted_mutual_info_score(adata.obs.free_annotation, adata.obs.sam_leiden), 2)))

In [None]:
# Score the SAM-based clusters against the ground truth within each tissue compartment
for compartment in adata.obs.compartment.cat.categories:
    # Metadata of only the cells that belong to this compartment
    compartment_obs = adata[adata.obs.compartment == compartment].obs
    ami = sk.metrics.adjusted_mutual_info_score(compartment_obs.free_annotation, compartment_obs.sam_leiden)
    print(compartment + ' AMI: ' + str(round(ami, 2)))

### MAGIC

In [None]:
# Compare the leiden clusters computed from the MAGIC reduction versus truth
# (seurat_clusters.csv is not needed for this comparison, so it is not re-read here)
print('Leiden vs Truth AMI: ' + str(round(sk.metrics.adjusted_mutual_info_score(adata.obs.free_annotation, adata.obs.magic_leiden), 2)))

In [None]:
# Score the MAGIC-based clusters against the ground truth within each tissue compartment
for compartment in adata.obs.compartment.cat.categories:
    # Metadata of only the cells that belong to this compartment
    compartment_obs = adata[adata.obs.compartment == compartment].obs
    ami = sk.metrics.adjusted_mutual_info_score(compartment_obs.free_annotation, compartment_obs.magic_leiden)
    print(compartment + ' AMI: ' + str(round(ami, 2)))

### scVI

In [None]:
# Compare the leiden clusters computed from the scVI reduction versus truth
# (seurat_clusters.csv is not needed for this comparison, so it is not re-read here)
print('Leiden vs Truth AMI: ' + str(round(sk.metrics.adjusted_mutual_info_score(adata.obs.free_annotation, adata.obs.scvi_leiden), 2)))

In [None]:
# Score the scVI-based clusters against the ground truth within each tissue compartment
for compartment in adata.obs.compartment.cat.categories:
    # Metadata of only the cells that belong to this compartment
    compartment_obs = adata[adata.obs.compartment == compartment].obs
    ami = sk.metrics.adjusted_mutual_info_score(compartment_obs.free_annotation, compartment_obs.scvi_leiden)
    print(compartment + ' AMI: ' + str(round(ami, 2)))

**Question:** How did the more complex normalization procedures influence the clustering accuracy?

**Answer:**

## Subset stromal cells to refine their clustering

In [None]:
# As we did with Seurat, we can subset stromal cells specifically to improve the clustering results
# As the adata object is continuously updated during clustering, we subset off the raw adata object
# we stashed early on.
#
# adata has been through quality-control filtering while adata_raw has not, so the two
# may hold different numbers of cells. We therefore look the stromal cells up in adata
# and select them from adata_raw by name, rather than with a True/False mask whose
# length may not match adata_raw.
#
# If metadata were not available to subset stromal cells, you could do it by cluster with
# adata_raw[adata.obs[adata.obs.leiden.isin(['1', '2', '3'])].index]
stromal_cells = adata.obs[adata.obs.compartment == "Stromal"].index

# copy() makes adata_stromal an independent object rather than a view of adata_raw,
# so it can safely be modified in the cells below
adata_stromal = adata_raw[stromal_cells].copy()
adata_stromal

In [None]:
# Copy UMIs to the layers shelf before adata_stromal.X is overwritten by normalization
adata_stromal.layers['UMIs'] = adata_stromal.X.copy()

# Log normalize UMIs per 10K
sc.pp.normalize_total(adata_stromal, target_sum=1e4)
sc.pp.log1p(adata_stromal)

# Stash the log normalized UMIs per 10K in adata_stromal.raw.X
adata_stromal.raw = adata_stromal

In [None]:
plt.rcParams['figure.figsize'] = [12, 6]

# Identify and plot highly variable genes, with the same thresholds used for the whole dataset
sc.pp.highly_variable_genes(adata_stromal, min_mean=0.0125, max_mean=6, min_disp=0.5)
sc.pl.highly_variable_genes(adata_stromal)

In [None]:
# Center and scale expression values (capped at max_value=10) and run PCA
# No n_comps is given here, so scanpy's default number of components is used
sc.pp.scale(adata_stromal, max_value=10)
sc.tl.pca(adata_stromal, svd_solver='arpack')

In [None]:
plt.rcParams['figure.figsize'] = [12, 6]

# Plot the variance explained by each of the first 50 principal components
sc.pl.pca_variance_ratio(adata_stromal, n_pcs=50)

In [None]:
plt.rcParams['figure.figsize'] = [12, 25]

# For each of the first 25 principal components, plot the expression of the 15 genes
# with the lowest and the 15 genes with the highest loadings, across the 125 cells with
# the lowest and the 125 cells with the highest scores in that component (250 cells total)
# Similar to Seurat's DimHeatmap() command
for pc in range(1,26):
    # Gene positions ordered by their loading on this component
    loading_order = np.argsort(adata_stromal.varm['PCs'][:,pc-1])
    sel = np.concatenate((loading_order[:15], loading_order[-15:]))
    # Cell positions ordered by their score on this component (sorted only once)
    score_order = np.argsort(adata_stromal.obsm['X_pca'][:,pc-1])
    cells = np.append(score_order[:125], score_order[-125:])
    tempdata = adata_stromal[cells,]
    print('PC = ' + str(pc))
    # Look the gene names up by position directly on var_names; indexing a
    # name-indexed pandas Series with integers relies on a deprecated fallback
    sc.pl.heatmap(tempdata, var_names=adata_stromal.var_names[sel].tolist(), swap_axes=True, groupby=None)

In [None]:
# Calculate nearest neighbor map using same parameters from Seurat
# (20 nearest neighbors from the first 18 principal components)
sc.pp.neighbors(adata_stromal, n_neighbors=20, n_pcs=18)

# Cluster the data using leiden algorithm with a resolution of 0.45
sc.tl.leiden(adata_stromal, resolution=0.45)

In [None]:
# Project stromal cells into 2 dimensions with UMAP, using the neighbor map calculated above
sc.tl.umap(adata_stromal)

In [None]:
# Print the AMI for the stromal leiden clustering compared to the ground truth annotations
print('Leiden vs Truth AMI: ' + str(round(sk.metrics.adjusted_mutual_info_score(adata_stromal.obs.free_annotation, adata_stromal.obs.leiden), 2)))

In [None]:
plt.rcParams['figure.figsize'] = [12, 12]

# Plot the subsetted stromal cells in their new UMAP coordinates and color by
# the leiden clusters and the ground truth annotations (one panel each)
sc.pl.umap(adata_stromal, color=['leiden', 'free_annotation'], legend_loc='on data', legend_fontsize=20)

In [None]:
plt.rcParams['figure.figsize'] = [12, 12]
# NOTE(review): sc.set_figure_params() may reset matplotlib's settings, including the
# figure size set on the line above -- confirm the intended size is still in effect
sc.set_figure_params(fontsize=12)

# Plot some basic meta data and marker genes, one UMAP panel per entry in color
sc.pl.umap(adata_stromal,
           size=100,
           color=["nGene", "percent.ribo", # Basic QC
                  "COL1A2", "BGN", # Stromal markers
                  "ACTA2", "MYH11", # Contractile markers
                  "KCNA5", "DES", # Airway smooth muscle
                  "HIGD1B", "FAM105A", # Pericyte
                  "ASPN", 'TGFBI', # Myofibroblast
                  "SFRP2", "FBLN2", # Adventitial fibroblast
                  "GPC3", "FGFR4", # Alveolar fibroblast
                  "APOE", "MUM1L1", # Lipofibroblast
                  "UPK3B", "MSLN"]) # Mesothelial cell
           

In [None]:
# Identify differentially expressed genes in each cluster compared to all other clusters
# p-values come from a t-test with overestimated variance with a bonferroni correction
# NOTE(review): no method or correction is passed below, so scanpy's defaults for the
# installed version apply -- confirm they match the description above
# See https://scanpy.readthedocs.io/en/stable/api/scanpy.tl.rank_genes_groups.html
# for information on other tests
sc.tl.rank_genes_groups(adata_stromal, groupby='leiden')

In [None]:
plt.rcParams['figure.figsize'] = [12, 12]
# NOTE(review): sc.set_figure_params() may reset matplotlib's settings, including the
# figure size set on the line above -- confirm the intended size is still in effect
sc.set_figure_params(fontsize=12)

# Plot expression of 4 markers per cluster
# The ranked gene names are put in a table with one column per cluster; head(4) keeps
# the top 4 rows, and .T followed by np.ravel() flattens them into a single list
# grouped by cluster
sc.pl.umap(adata_stromal, color=np.ravel(np.array(pd.DataFrame(adata_stromal.uns['rank_genes_groups']['names']).head(4).T)), size=100)

In [None]:
plt.rcParams['figure.figsize'] = [10, 10]

# Map clusters to cell identities
# Clustering has some inherent randomness and you may need to change the order
# Every leiden cluster label must appear as a key below; a cluster that is
# missing from the dictionary raises a KeyError in the lookup that follows
new_cluster_names = {
    '0': 'Adventitial Fibroblast',
    '1': 'Airway Smooth Muscle',
    '2': 'Pericyte',
    '3': 'Alveolar Fibroblast',
    '4': 'Lipofibroblast',
    '5': 'Mesothelial',
    '6': 'Myofibroblast'}
# Look up the new name for each cell's cluster and store it as a metadata column
adata_stromal.obs['new_annotation'] = [new_cluster_names[x] for x in adata_stromal.obs.leiden]
sc.pl.umap(adata_stromal, color=['new_annotation'], legend_loc='on data', legend_fontsize=12)

In [None]:
# Print the percentage of correctly annotated cells
# The annotations may be stored as categorical series, and pandas refuses to compare
# categoricals whose sets of groups differ, so both are compared as plain strings
# (the obs columns themselves are left untouched)
truth = adata_stromal.obs.free_annotation.astype(str)
assigned = adata_stromal.obs.new_annotation.astype(str)
tmp = truth == assigned
print('% correct: ' + str(round(100 * tmp.sum() / len(tmp), 2)))

## Integrate 10x and SS2 data using scanpy

In [None]:
# Read in the SmartSeq2 data and sort the cells alphabetically
adata_facs = sc.read_csv(filename=os.path.join(pwd, 'krasnow_hlca_facs_counts.csv')).T
adata_facs = adata_facs[adata_facs.obs_names.sort_values()]

# Read in the SmartSeq2 metadata and sort the cells alphabetically
tmp = pd.read_csv(filepath_or_buffer=os.path.join(pwd, 'krasnow_hlca_facs_metadata.csv'), index_col=0).sort_index()

# Select the metadata rows by cell name, in the order of adata_facs's cells, rather
# than trusting that two independently sorted tables line up by position.
# A cell missing from the metadata raises a KeyError instead of silently
# attaching another cell's metadata.
adata_facs.obs = tmp.loc[adata_facs.obs_names].copy()

# Subset the AnnData object to only data from patient 1
adata_facs = adata_facs[adata_facs.obs.patient == 1]

In [None]:
# Write the SmartSeq2 object to an h5ad file in the working directory
adata_facs.write(os.path.join(pwd, 'krasnow_hlca_facs_raw.h5ad'))

In [None]:
# Uncomment and execute to read in the SmartSeq2 h5ad file (if needed),
# much faster than sc.read_csv()
#adata_facs = sc.read_h5ad(os.path.join(pwd, 'krasnow_hlca_facs_raw.h5ad'))

In [None]:
# Reset the main 10x adata object
adata = copy.deepcopy(adata_raw)

# Subset both the SmartSeq2 and 10x adata objects to include only genes they have in common
# copy() turns each subset into an independent object rather than a view of its parent,
# as both are modified in place by the cells below
var_names = adata.var_names.intersection(adata_facs.var_names)
adata_ref = adata[:, var_names].copy()
adata_facs = adata_facs[:, var_names].copy()

In [None]:
# Re-run scanpy pipeline on new 10x adata object

# Normalize UMIs to UMIs per 10K, log them, and store the ln(UMIs per 10K) in adata_ref.raw.X
sc.pp.normalize_total(adata_ref, target_sum=1e4)
sc.pp.log1p(adata_ref)
adata_ref.raw = adata_ref

# Select highly variable genes
sc.pp.highly_variable_genes(adata_ref, min_mean=0.0125, max_mean=6, min_disp=0.5)

# Center and scale log normalized expression values and run PCA
sc.pp.scale(adata_ref, max_value=10)
sc.pp.pca(adata_ref, svd_solver='arpack')

# Build the nearest neighbor map (10 neighbors, 50 principal components)
# and compute UMAP coordinates
sc.pp.neighbors(adata_ref, n_neighbors=10, n_pcs=50)
sc.tl.umap(adata_ref)

In [None]:
plt.rcParams['figure.figsize'] = [12, 12]

# Plot the 10x cells by UMAP coordinates, colored by their ground truth annotations
# with the annotation labels drawn on the plot itself
sc.pl.umap(adata_ref, color=['free_annotation'], legend_loc='on data', legend_fontsize=10)

In [None]:
# Stash the ground truth annotations in a new metadata column in the obs shelf of the adata object,
# as the free_annotation column is overwritten by the command below
adata_facs.obs['orig_annotation'] = adata_facs.obs.free_annotation.copy()

# Run the scanpy data ingest() command on the SmartSeq2 dataset, using the new 10x object as a reference
# Request scanpy assign the SmartSeq2 cells a free_annotation based on its nearest neighbors in the 10x dataset
sc.tl.ingest(adata_facs, adata_ref, obs='free_annotation')

In [None]:
plt.rcParams['figure.figsize'] = [12, 12]

# Plot the SmartSeq2 cells embedded in the 10x object's coordinates
# Color by the ground truth annotation (orig_annotation) and scanpy's guess (free_annotation)
sc.pl.umap(adata_facs, color=['orig_annotation', 'free_annotation'], legend_loc='on data', legend_fontsize=10)

In [None]:
# scanpy stores both orig_annotation and free_annotation as categorical series
# Their sets of groups do not overlap perfectly, so each column is converted to
# plain strings with the astype() function from the pandas library before comparing
for column in ('orig_annotation', 'free_annotation'):
    adata_facs.obs[column] = adata_facs.obs[column].astype(str)

# One True/False value per cell: does scanpy's guess match the ground truth?
matches = adata_facs.obs.free_annotation == adata_facs.obs.orig_annotation
pct_correct = round(100 * matches.sum() / len(matches), 2)
print('Total % correct: ' + str(pct_correct))

In [None]:
# The with command can be used to run another command with certain global parameters set
# In this case, we are preventing pandas from restricting the number of rows and columns shown
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    
    # Use the groupby() command from the pandas library to count the SmartSeq2 cells
    # in each combination of ground truth and predicted annotation
    print(adata_facs.obs.groupby(["orig_annotation", "free_annotation"]).size().reset_index(name="Number"))

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    
    # Use the groupby() command from the pandas library to show
    # the number of SmartSeq2 cells in each ground truth type
    print(adata_facs.obs.groupby(["orig_annotation"]).size().reset_index(name="Number"))

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    
    # Use the groupby() command from the pandas library to show
    # the number of cells of each annotated type in the 10x reference object (adata_ref)
    print(adata_ref.obs.groupby(["free_annotation"]).size().reset_index(name="Number"))

**Question:** Looking carefully at scanpy's accuracy and what cell types exist in both datasets, why do you think it struggled and succeeded where it did?

**Answer:**