# Tutorial 5 - Biologically motivated analyses

## Load required packages

In [None]:
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)
import os
import os
import copy
import re
import random
import mygene
import pandas as pd
import numpy as np
import scanpy as sc
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sc.logging.print_header()
sc.settings.n_jobs = 4

In [None]:
# Record the notebook's current working directory; every data path below is built from it
pwd = os.getcwd()

In [None]:
# Check this is set correctly to '/home/<SUNetID>/BIOC281/Classes/5' on FarmShare
# On Sherlock it should be /home/groups/<Group_Name>/BIOC281/Classes/5/
pwd

## Data ingest

In [None]:
# Read in the SmartSeq2 counts and sort the cells alphanumerically
# (the table is transposed on load; presumably the CSV stores genes as rows - confirm against the file)
adata = sc.read_csv(filename=os.path.join(pwd, 'krasnow_hlca_facs_counts.csv')).T
# .copy() turns the sorted view into a real object so the assignments below act on it directly
adata = adata[adata.obs_names.sort_values()].copy()

# Read in the SmartSeq2 metadata and sort the cells alphanumerically
tmp = pd.read_csv(filepath_or_buffer=os.path.join(pwd, 'krasnow_hlca_facs_metadata.csv'), index_col=0).sort_index()

# Attach the metadata by cell name rather than by position
# .loc raises a KeyError if a cell in adata has no metadata row, instead of silently mislabelling cells
adata.obs = tmp.loc[adata.obs_names.to_list()].copy()

# Stash counts in a new layer
adata.layers['counts'] = adata.X.copy()

# Normalize counts to counts per million, log them, and keep the ln(counts per million + 1) values in adata.raw
sc.pp.normalize_total(adata, target_sum=1e6)
sc.pp.log1p(adata)
adata.raw = adata

# Select highly variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=6, min_disp=0.5)

# Center and scale the log normalized expression values, clipping at 10
# (no PCA is run in this cell)
sc.pp.scale(adata, max_value=10)

# Write the SmartSeq2 object to an h5ad file so later sections can reload it
adata.write(os.path.join(pwd, 'krasnow_hlca_facs_normalized.h5ad'))

If you get a memory error running any of the code chunks below, 
click on "Kernel" in the menu bar above and then click "Restart Kernel and Clear All Outputs...". 
After the kernel has restarted, run the three chunks of code at the very top, within the section "Load required packages". 
After that, skip the current section (Data ingest) and move directly to the next section below, "Load saved h5ad". 
This ensures that memory requirements stay low.

## Load saved h5ad

In [None]:
# Read in the h5ad object written at the end of the "Data ingest" section
# The original author notes that reloading from disk seems to halve the memory usage
adata = sc.read_h5ad(os.path.join(pwd, 'krasnow_hlca_facs_normalized.h5ad'))

## Identify differentially expressed genes

In [None]:
# Count the cells carrying each free_annotation label with pandas' groupby()
# and turn the result into a two-column table (label, "Number")
cells_per_type = adata.obs.groupby(["free_annotation"]).size()
cells_per_type = cells_per_type.reset_index(name="Number")

# Lift pandas' display limits only while printing, so every row is shown
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(cells_per_type)

In [None]:
# The number of cells in each type varies in this dataset over 3 orders of magnitude
# If you performed differential gene expression analysis as-is, scanpy would compare each group of cells
# versus all other cells, which are currently dominated by NK, Club, and Capillary cells, among others.
#
# To more accurately identify genes specific to each group, it is important to downsample groups
# with large numbers of cells. It is also important to think about what comparison would yield
# cell type specific markers. Cell types are most similar to those in their compartments, so if 
# you compare them against all other cells, you tend to find their compartment markers. 
# If, instead, you compare them to cells within their compartment, you often find specific marker genes.
#
# Both decisions carry risks, for example, the downsampled cells not being representative or finding markers that are
# also found in other compartments. But the benefits usually outweigh the downsides.
# Generally, there is not a single "correct" way to calculate differential gene expression and, as always,
# your research question should guide you.

# Create a copy of the adata object to downsample groups with large numbers of cells
adata_ds = copy.deepcopy(adata)

In [None]:
# Python lists are collections of objects (usually strings and numbers)
# that can be referenced by their position and appended to
# such as my_list = [1, 'two', 3], where my_list[0] would return 1
# and my_list[:2] would return [1, 'two']
# Lists can be combined with the "+" operator like this:
# my_list + ['four', 5, 6] would return [1, 'two', 3, 'four', 5, 6]
#
# Create an empty list of cells to use in the downsampled adata object
cells = []

# Largest number of cells kept for any one cell type
max_cells_per_type = 100

# Use a private, seeded random number generator so the same cells are chosen on every run
# (the downstream marker lists are then reproducible) without touching the global random state
rng = random.Random(0)

# Loop through each cell type, listed by the cat.categories attribute in pandas
for cell_type in adata.obs.free_annotation.cat.categories:

    # Get a list of cells whose free_annotation matches the current cell type
    # Only the cell names are needed, so index the names directly instead of slicing the whole object
    in_type = (adata.obs.free_annotation == cell_type).to_numpy()
    members = adata.obs_names[in_type].to_list()

    # If it has more than the allowed number of cells, randomly sample that many with sample()
    if len(members) > max_cells_per_type:
        cells.extend(rng.sample(members, k=max_cells_per_type))

    # If not, take all of them
    else:
        cells.extend(members)

# Subset adata_ds to the chosen cells
# .copy() makes the result a real object rather than a view of the full-size copy
adata_ds = adata_ds[cells].copy()
adata_ds

In [None]:
# Python dictionaries store information under keys, for example:
# my_dict = {'key': value, 'science': 'is', 'really': 'fun'}
# my_dict['science'] returns "is" and my_dict['really'] returns "fun"
# Here each key is a cell type and each value is a data frame of
# differentially expressed genes for that cell type
marker_genes = {}

# The three result fields copied out of scanpy's output for every cell type:
# gene symbols, adjusted p-values, and log fold changes
de_fields = ['names', 'pvals_adj', 'logfoldchanges']

# Work through one tissue compartment at a time
for compartment in adata.obs.compartment.cat.categories:

    # Restrict the downsampled object to cells from this compartment
    adata_tmp = adata_ds[adata_ds.obs.compartment == compartment]

    # Rank genes for each cell type against the other cells in the same compartment
    # Uses the same test as Seurat (wilcoxon rank-sum) for p-values, but a different
    # false discovery correction (Benjamini-Hochberg)
    sc.tl.rank_genes_groups(adata_tmp,
                            groupby="free_annotation",
                            method="wilcoxon",
                            tie_correct=True)

    # scanpy leaves the results in adata_tmp.uns['rank_genes_groups']; each field is a
    # structured array whose dtype names are the cell types that were compared
    de_results = adata_tmp.uns['rank_genes_groups']

    # Python "comprehensions" are one-line loops: {key: f(key) for key in keys} builds a dict
    # in the same way an explicit for loop would. The inner dict comprehension below gathers
    # the three fields for one cell type into a data frame; the outer for loop stores that
    # data frame in marker_genes under the cell type's name
    for cell_type in de_results['names'].dtype.names:
        marker_genes[cell_type] = pd.DataFrame(
            {field: de_results[field][cell_type] for field in de_fields}
        )

In [None]:
# You can explore how scanpy stores differentially expressed genes in the "uns" attribute
# to better understand how the nested for loops above work
# adata_tmp is the object left over from the last compartment processed by the loop above
display(adata_tmp.uns)

## Read in gene information

We can aid interpretation of the differentially expressed genes by gathering important information about each gene, such as its name and whether it's a transcription factor, receptor, ligand, or enzyme, or whether mutations in it cause a human disease with Mendelian inheritance (and are therefore causal). Choosing good sources is important: many groups have created lists like these, and not all are created equal. This tutorial highlights a few we have found quite useful, but we encourage you to explore!

Most people get stumped staring at gene lists and start using gene set enrichment analysis (GSEA), GO term analysis, or other similar approaches like DAVID (https://david.ncifcrf.gov). You _can_ do this, and we can discuss offline how, but today we are going to explore alternative ways to interpret gene lists.

In [None]:
# Create a client for the MyGene.info service using the mygene library (see https://mygene.info for more uses)
mg = mygene.MyGeneInfo()

# Create a table of gene symbols and their corresponding names with the querymany() function from mygene
# querymany() is given a plain list, so the gene symbols in adata.var_names are converted with tolist()
symbolToName = mg.querymany(adata.var_names.tolist(),
             scopes='symbol,name',
             species='human',
             fields="name",
             as_dataframe=True)

# Remove rows where a gene name was not found
# The column is looked up defensively so the cell does not fail if the result has no 'notfound' column
if 'notfound' in symbolToName.columns:
    symbolToName = symbolToName[symbolToName['notfound'].isna()]

# Work on an independent copy so the column added below is not written into a slice of the query result
symbolToName = symbolToName.copy()

# Save the data frame index (the queried symbol) to a column
symbolToName['query'] = symbolToName.index.to_series()

# Remove rows with duplicates in the query column
# Keeps the first instance by default
symbolToName = symbolToName.drop_duplicates(subset="query")

# Remove extra columns from the table
symbolToName = symbolToName[['name', 'query']]

symbolToName

In [None]:
# Sanity check: show any rows whose index (the queried symbol) is still duplicated
symbolToName[symbolToName.index.duplicated()]

In [None]:
# Load the AnimalTFDB table of transcription factors (it also lists each factor's family)
tf_path = os.path.join(pwd, 'dbs', 'AnimalTFDB.tsv')
tfs = pd.read_csv(tf_path, sep="\t")

# Index the table by gene symbol, keeping the Symbol column as well
tfs.index = tfs['Symbol']
tfs

In [None]:
# Protein ligands come from the Guide to Pharmacology (GtP) database
# The information is spread across two tab separated tables; each is indexed
# by the column that holds the GtP IDs (first column of one, fourth of the other)
ligands_path = os.path.join(pwd, 'dbs', 'GtP_ligands.tsv')
mapping_path = os.path.join(pwd, 'dbs', 'GtP_to_HGNC_mapping.tsv')
ligands = pd.read_csv(ligands_path, sep="\t", index_col=0)
GtPToHGNC = pd.read_csv(mapping_path, sep="\t", index_col=3)

# Join the two tables on their indices with pandas' merge(), keeping
# only the IDs present in both
ligands = ligands.merge(GtPToHGNC, how="inner", left_index=True, right_index=True, copy=False)

# The mapping table also covers receptors; keep only rows whose GtP URL contains "ligandId"
# The list comprehension yields one True/False per URL, which is then used as a row mask
is_ligand_url = ['ligandId' in url for url in ligands.gtp_url]
ligands = ligands[is_ligand_url]

# Index the table by gene symbol
ligands.index = ligands.hgnc_symbol

ligands

In [None]:
# Load the Guide to Pharmacology (GtP) receptor table
receptors_path = os.path.join(pwd, 'dbs', 'GtP_receptors.tsv')
receptors = pd.read_csv(receptors_path, sep="\t")

# Index the table by gene symbol
receptors.index = receptors['HGNC symbol']

# Drop rows typed as enzymes; a separate enzyme list is built next from a different source
not_enzyme = receptors.Type != 'enzyme'
receptors = receptors[not_enzyme]

receptors

In [None]:
# ExPASy does not make reading their database easy, run less dbs/ExPASy.txt in terminal to see why

# Create empty lists to hold the enzyme uniprotIDs and classes
enzyme_uniprots = []
enzyme_classes = []

# Compile regular expressions to match patterns in the ExPASy file

# ExPASy stores enzyme classes on lines that start with DE and look like:
# DE   Enzyme class.
# The "^" matches the start of the line, followed by DE, followed by 3 spaces
# The ([^\\.]+) matches any characters that are not a literal period.
# In regex, the "." matches any character so "\\." is used to match literal periods
# The parentheses save the characters before the first period to retrieve later as the class
re_class = re.compile('^DE   ([^\\.]+)\\.')

# The uniprotIDs are stored on lines that start with DR and look like:
# DR   UNIPROT, SYMBOL_SPECIES1;  UNIPROT, SYMBOL_SPECIES2;  UNIPROT, SYMBOL_SPECIES3
# Below we split these lines by ";  " and then use this regex to capture
# the uniprotIDs from "DR   UNIPROT, SYMBOL_SPECIES1", "UNIPROT, SYMBOL_SPECIES2",
# and "UNIPROT, SYMBOL_SPECIES3" separately.
#
# The (?:DR   )? matches the first part of the line, but the "?:" keeps regex from
# saving the characters for later. The "?" outside the parentheses allows the regex to
# match even if it's not there, since the second and third strings do not have it
# ([^,]+) matches characters that are not a comma and saves them, while [^_]+
# matches any characters that are not an underscore. Only entries ending in _HUMAN match.
re_uniprot = re.compile('(?:^DR   )?([^,]+), [^_]+_HUMAN')

# Open the enzymes file using base python's open() command
# The with statement binds the open file to fp and closes it when the block ends
with open(os.path.join(pwd, 'dbs', 'ExPASy.txt')) as fp:

    # Loop through each line of the file
    for line in fp:

        # Lines read from a file keep their trailing newline, so a blank line is never
        # zero-length; skip lines that contain only whitespace instead
        if not line.strip():
            continue

        # Test if the line is an enzyme class (DE) line
        enzyme_match = re_class.match(line)

        # If it is, remember the class for the DR lines that follow
        if enzyme_match is not None:
            enzyme_class = enzyme_match[1]

        # Test if _HUMAN is in the line
        if "_HUMAN" in line:

            # If it is, split the line by ";  " and look at each piece
            for x in line.split(';  '):

                # Test if the current piece is a human entry
                uniprot = re_uniprot.match(x)

                if uniprot is not None:

                    # Append the uniprotID to enzyme_uniprots
                    enzyme_uniprots.append(uniprot[1])

                    # Append the current enzyme_class to enzyme_classes
                    # enzyme_class won't change until another DE line,
                    # so if multiple enzyme entries are under a class
                    # they will all get properly assigned
                    enzyme_classes.append(enzyme_class)

# Create a pandas data frame from the enzyme_uniprots and enzyme_classes lists
enzymes = pd.DataFrame(data={'uniprot': enzyme_uniprots, 'class': enzyme_classes})

# Set the index of the data frame to the uniprotIDs
enzymes.index = enzymes.uniprot

# Create a client for the MyGene.info service using the mygene library (see https://mygene.info for more uses)
mg = mygene.MyGeneInfo()

# Create a table of uniprotIDs and corresponding gene symbols with the querymany() function from mygene
# The unique uniprotIDs are obtained with pandas' unique() function
# and then converted to a list with tolist()
uniProtToGene = mg.querymany(enzymes.uniprot.unique().tolist(),
             scopes='uniprot,symbol',
             species='human',
             as_dataframe=True)

# Remove rows that could not be matched to a gene symbol
# The column is looked up defensively so the cell does not fail if the result has no 'notfound' column
if 'notfound' in uniProtToGene.columns:
    uniProtToGene = uniProtToGene[uniProtToGene['notfound'].isna()]

# Remove all columns but the gene symbols
uniProtToGene = uniProtToGene['symbol']

# Merge the gene symbols into the enzymes data frame by their indices (which are the uniprotIDs in both)
enzymes = enzymes.merge(uniProtToGene, how="inner", left_index=True, right_index=True)

# Set the enzymes data frame index to the gene symbols
enzymes.index = enzymes.symbol

enzymes

In [None]:
# Short column names to use in place of the very long ones in the original OMIM file
omim_columns = ['chromosome',
                'genomic_start',
                'genomic_end',
                'location',
                'comp_location',
                'mim',
                'mim_symbols',
                'name',
                'symbol',
                'entrezID',
                'ensemblID',
                'comments',
                'phenotypes',
                'mgi']

# Load the Online Mendelian Inheritance in Man (OMIM) table
# comment='#' makes read_csv skip the commented lines at the top and bottom of the file
omim = pd.read_csv(os.path.join(pwd, 'dbs', 'OMIM.tsv'),
                   sep="\t",
                   comment='#',
                   index_col=False,
                   names=omim_columns)

# Keep only entries that have both a gene symbol and a disease (phenotype)
has_symbol_and_disease = omim.symbol.notna() & omim.phenotypes.notna()
omim = omim[has_symbol_and_disease]

# Index the table by gene symbol
omim.index = omim.symbol

omim

In [None]:
# Gene symbol lists behind each annotation column, in the order the columns are added:
# transcription factors, ligands, receptors, enzymes, and OMIM disease genes
annotation_sources = {
    'TF': tfs.Symbol.values,
    'Lig': ligands.hgnc_symbol.values,
    'Rec': receptors['HGNC symbol'].values.astype(str),
    'Enz': enzymes.symbol.values,
    'OMIM': omim.symbol.values,
}

# Annotate every cell type's differential expression table
for cell_type, table in marker_genes.items():

    # Index the table by gene symbol and drop the now-redundant symbol column
    table.index = table.names
    table.drop(columns=['names'], inplace=True)

    # Start each annotation column with every row/gene set to ""
    for flag in annotation_sources:
        table[flag] = ''

    # Mark a gene True in a column when it also appears on that column's list
    for flag, symbols in annotation_sources.items():
        table.loc[np.intersect1d(table.index.values, symbols), flag] = True

    # Add the gene names obtained earlier from mygene, then drop the
    # extra symbol column the merge brings along
    table = table.merge(symbolToName, how='left', left_index=True, right_index=True)
    table.drop(columns=['query'], inplace=True)

    # Store an independent copy of the annotated table back under the cell type
    marker_genes[cell_type] = table.copy()

## Looking at differential gene expression lists with added gene information

In [None]:
# Show the first 100 rows of one cell type's differential expression table
# Pericytes are used here as the example
pericyte_markers = marker_genes['Pericyte']
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(pericyte_markers.head(100))

In [None]:
# Look up the transcription factor records for the genes flagged TF
# among the first 100 rows of the pericyte table
pericyte_top = marker_genes['Pericyte'].head(100)
tfs.loc[pericyte_top.index[(pericyte_top.TF == True).to_numpy()]]

In [None]:
# Look up the receptor records for the genes flagged Rec
# among the first 100 rows of the pericyte table
pericyte_top = marker_genes['Pericyte'].head(100)
receptors.loc[pericyte_top.index[(pericyte_top.Rec == True).to_numpy()]]

In [None]:
# Look up the ligand records for the genes flagged Lig
# among the first 100 rows of the pericyte table
pericyte_top = marker_genes['Pericyte'].head(100)
ligands.loc[pericyte_top.index[(pericyte_top.Lig == True).to_numpy()]]

In [None]:
# Look up the enzyme records for the genes flagged Enz
# among the first 100 rows of the pericyte table
pericyte_top = marker_genes['Pericyte'].head(100)
enzymes.loc[pericyte_top.index[(pericyte_top.Enz == True).to_numpy()]]

In [None]:
# Look up the OMIM records for the genes flagged OMIM
# among the first 100 rows of the pericyte table
pericyte_top = marker_genes['Pericyte'].head(100)
omim.loc[pericyte_top.index[(pericyte_top.OMIM == True).to_numpy()]]

## Testing biologically motivated hypotheses

### Example 1

Pericytes are contractile stromal cells associated with microvascular capillaries in many organs that have long been suspected of regulating blood flow, but no one has demonstrated that in lung.

We can borrow the molecular logic for contractility from Vascular Smooth Muscle (VSM) cells, which are on larger vessels and are known to regulate blood flow. In VSM, myosins are phosphorylated and dephosphorylated during their duty cycle by kinases and phosphatases, which are regulated by the second messengers cyclic AMP, cyclic GMP, and inositol 1,4,5-trisphosphate. These second messengers are produced and degraded by enzymes that are under the regulation of hormone receptors and can be shared with nearby cells through intercellular gap junctions. Finally, we know contractility requires depolarization of the membrane through calcium influx, which is mediated by voltage-gated ion channels.

As you run through this section of code, consider: What does relevant pericyte-specific gene expression look like? Given that, how plausible is the idea that pericytes may be involved in regulating blood flow in lung capillaries? And if it is plausible, can we learn anything about molecular mechanisms that may be involved in regulating this process in pericytes?

In [None]:
def _genes_matching(pattern):
    """Return the sorted gene symbols in adata.var_names that match the compiled regex."""
    return adata.var_names[[pattern.match(x) is not None for x in adata.var_names]].sort_values()


def _enzymes_with_class(fragment):
    """Return the sorted gene symbols of enzymes whose class description contains fragment."""
    return enzymes[[fragment in x for x in enzymes['class']]].index.sort_values()


# Build a list of myosin heavy and light chain genes
# In this regex ^ matches the start of a string
# [OHL] matches O or H or L, [0-9]{1,2} matches any one or two digits
# [A-Z]? matches 0 or 1 capital letters, and $ matches the end of the string
re_myosin = re.compile('^MY[OHL][0-9]{1,2}[A-Z]?$')
myosin_genes = _genes_matching(re_myosin)
print(myosin_genes)

# Build a list of the kinases that phosphorylate myosin
re_myosin_kin = re.compile('^MYLK[0-9]?$')
myosin_kin_genes = _genes_matching(re_myosin_kin)
print(myosin_kin_genes)

# Build a list of the phosphatases that dephosphorylate myosin
re_myosin_phos = re.compile('^PPP1[RC][0-9]{1,2}[A-Z]?$')
myosin_phos_genes = _genes_matching(re_myosin_phos)
print(myosin_phos_genes)

# Use the enzymes list to find enzymes that make and degrade second messenger cAMP
cAMP_syn_genes = _enzymes_with_class('Adenylate cyclase')
cAMP_deg_genes = _enzymes_with_class("3',5'-cyclic-AMP phosphodiesterase")
print(cAMP_syn_genes)
print(cAMP_deg_genes)

# Use the enzymes list to find enzymes that make and degrade second messenger cGMP
cGMP_syn_genes = _enzymes_with_class('Guanylate cyclase')
cGMP_deg_genes = _enzymes_with_class("3',5'-cyclic-GMP phosphodiesterase")
print(cGMP_syn_genes)
print(cGMP_deg_genes)

# Use the enzymes list to find enzymes that make and degrade second messenger IP3
IP3_syn_genes = _enzymes_with_class('phospholipase C')
IP3_deg_genes = _enzymes_with_class("Phosphoinositide 5-phosphatase")
print(IP3_syn_genes)
print(IP3_deg_genes)

# Read in the list of hormone receptors that can trigger second messenger synthases
# A bare expression in the middle of a cell shows nothing, so print the index like the other lists
hormones = pd.read_csv(os.path.join(pwd, 'dbs', 'Hormones.csv'), index_col=1)
print(hormones.index)

# Use receptor list to find voltage gated ion channels needed for contractility
# (left in table order, unlike the sorted lists above)
vgic_genes = receptors[receptors.Type == "vgic"].index
print(vgic_genes)

# Use receptor list to find gap junctions which can spread second messengers
gap_genes = receptors[receptors['Family name'] == "Connexins and Pannexins"].index.sort_values()
print(gap_genes)

In [None]:
# Reordering the cells by tissue compartment can make it easier to see patterns of gene expression
cell_order = ['Club', # Epithelial
              'Ciliated',
              'Basal',
              'Differentiating Basal',
              'Goblet',
              'Ionocyte',
              'Neuroendocrine',
              'Alveolar Epithelial Type 1',
              'Alveolar Epithelial Type 2',
              'Signaling Alveolar Epithelial Type 2',
              'Artery', # Endothelial
              'Vein',
              'Capillary Aerocyte',
              'Capillary',
              'Capillary Intermediate 1',
              'Bronchial Vessel 1',
              'Lymphatic',
              'Airway Smooth Muscle', # Stromal
              'Vascular Smooth Muscle',
              'Myofibroblast',
              'Fibromyocyte',
              'Adventitial Fibroblast',
              'Alveolar Fibroblast',
              'Lipofibroblast',
              'Pericyte',
              'B', # Immune
              'Plasma',
              'CD8+ Memory/Effector T',
              'CD8+ Naive T',
              'CD4+ Memory/Effector T',
              'CD4+ Naive T',
              'Natural Killer T',
              'Natural Killer',
              'Proliferating NK/T', 
              'Neutrophil',
              'Basophil/Mast 1',
              'Macrophage',
              'Plasmacytoid Dendritic',
              'Dendritic',
              'Myeloid Dendritic Type 2',
              'IGSF21+ Dendritic',
              'Classical Monocyte',
              'Nonclassical Monocyte',
              'Intermediate Monocyte']

# Use the set_categories() method from the cat accessor in pandas to apply the new order
# The inplace=True argument was removed in pandas 2.0, so the reordered column
# is assigned back to adata.obs instead; this form works on older pandas as well
adata.obs['free_annotation'] = adata.obs['free_annotation'].cat.set_categories(cell_order)

In [None]:
# This is scanpy's dotplot function, for more usage information see:
# https://scanpy.readthedocs.io/en/stable/api/scanpy.pl.dotplot.html#scanpy.pl.dotplot
# Plot the myosin genes found above, with cells grouped by free_annotation
sc.pl.dotplot(adata, var_names = myosin_genes, groupby="free_annotation", mean_only_expressed=True, cmap="Greys", dot_min=0.05)

In [None]:
# Plot combined myosin kinase (MYLK) and phosphatase genes (PP1)
# union() merges the two gene lists into one
sc.pl.dotplot(adata, var_names = myosin_kin_genes.union(myosin_phos_genes), groupby="free_annotation", mean_only_expressed=True, cmap="Greys", dot_min=0.05)

In [None]:
# Plot combined cAMP synthesis (ADCY) and degradation genes (PDE)
# NOTE(review): assumes every enzyme symbol is present in adata.var_names - confirm
sc.pl.dotplot(adata, var_names = cAMP_syn_genes.union(cAMP_deg_genes), groupby="free_annotation", mean_only_expressed=True, cmap="Greys", dot_min=0.05)

In [None]:
# Plot combined cGMP synthesis (GUCY) and degradation genes (PDE)
# NOTE(review): assumes every enzyme symbol is present in adata.var_names - confirm
sc.pl.dotplot(adata, var_names = cGMP_syn_genes.union(cGMP_deg_genes), groupby="free_annotation", mean_only_expressed=True, cmap="Greys", dot_min=0.05)

In [None]:
# Plot combined IP3 synthesis and degradation genes
# NOTE(review): assumes every enzyme symbol is present in adata.var_names - confirm
sc.pl.dotplot(adata, var_names = IP3_syn_genes.union(IP3_deg_genes), groupby="free_annotation", mean_only_expressed=True, cmap="Greys", dot_min=0.05)

In [None]:
# Plot the hormone receptor genes
# intersect1d() keeps only the symbols that are also present in adata.var_names
sc.pl.dotplot(adata, var_names = np.intersect1d(hormones.index.to_list(),adata.var_names), groupby="free_annotation", mean_only_expressed=True, cmap="Greys", dot_min=0.05, swap_axes=True)

In [None]:
# Plot the voltage gated ion channel genes
# NOTE(review): unlike the hormone plot, the symbols are not intersected with adata.var_names;
# this assumes every symbol in vgic_genes is present in the dataset - confirm
sc.pl.dotplot(adata, var_names = vgic_genes, groupby="free_annotation", mean_only_expressed=True, cmap="Greys", dot_min=0.05, swap_axes=True)

In [None]:
# Plot the gap junction (connexin and pannexin) genes
# NOTE(review): assumes every symbol in gap_genes is present in adata.var_names - confirm
sc.pl.dotplot(adata, var_names = gap_genes, groupby="free_annotation", mean_only_expressed=True, cmap="Greys", dot_min=0.05)

**Question:** Does the expression of the various components of contractility support the hypothesis? Did we learn about specific myosins, second messenger enzymes, hormone receptors, ion channels, and gap junctions that are expressed in pericytes? Are some or all of the pericyte-specific factors we detected the same as those expressed by Vascular Smooth Muscle cells? If yes, is it possible that pericytes and vascular smooth muscle cells co-opt the same factors to perform similar functions? 

**Answer:**

### Example 2

Many viruses enter our bodies through inhalation, after which they travel down the respiratory tract and then attach to and enter cells to start productive infections. The topic has come to the fore with the global pandemic, with many groups looking at single cell RNA sequencing datasets to identify where the putative SARS-CoV-2 receptor _ACE2_ and accessory protease _TMPRSS2_ are expressed. These genes can be plotted on Violin plots with the code block below

In [None]:
# Set matplotlib's default figure size to 12 x 6 (width x height, in inches)
plt.rcParams['figure.figsize'] = [12, 6]

# We can use violin plots to show the distribution of expression across cell types 
# Here ACE2 and TMPRSS2 are shown for each free_annotation group, with the group labels rotated 90 degrees
sc.pl.violin(adata, keys=['ACE2', 'TMPRSS2'], groupby="free_annotation", rotation=90, size=3, linewidth=0.5)

**Question**: Many groups have implicated Alveolar Epithelial Type 2 cells as a susceptible cell type, does the expression of _ACE2_ and _TMPRSS2_ support this? Do the data suggest other cell types could also be vulnerable to infection?

**Answer:**

In addition to zeroing in on a specific virus, we could take a wider view and ask: What does expression of all protein receptors associated with viral entry for humans look like?

To do this, we leverage Gene Ontology terms, not for enrichment analysis, but as a starting place for a gene list. After restricting entries associated with the term ["viral entry into host cell"](http://amigo.geneontology.org/amigo/term/GO:0046718) to human genes, without "NOT" qualifiers, and with UniProt as their source, we are left with roughly 100 putative receptors. We then search the contributing database (UniProt) for each gene product and identify the paper(s) that support the gene ontology annotation. From these, we filter out genes with weak evidence and identify the specific virus each receptor facilitates entry for (in some cases more than one). After assembling the list of all viruses with protein receptors (this does not include viruses that use non-protein receptors like sialic acid), we then curate metadata on each virus, including its family, genus, genome type, size, pathology, and whether the lung is its primary entry site (another common one being fecal-oral).

Below, we are reading in two CSVs created by the process described above.

In [None]:
# Load the two curated viral entry tables from the dbs folder,
# each indexed by its first column, and show them one after the other
dbs_dir = os.path.join(pwd, 'dbs')
viral_entry = pd.read_csv(os.path.join(dbs_dir, 'viral_entry.csv'), index_col=0)
virus_metadata = pd.read_csv(os.path.join(dbs_dir, 'virus_metadata.csv'), index_col=0)
for table in (viral_entry, virus_metadata):
    display(table)

In [None]:
# Rules for cleaning up and shortening the viral names for display purposes
# Each key is a regular expression and each value is the text that replaces it
# replace_all() (defined below) applies the rules in the order listed here, ignoring case,
# so the removals at the top run before the abbreviations at the bottom
vir_reg = {" ?virus": "",
           " ?human ?": "",
           " ?subgroup": "", 
           " ?type": "",
           " +$": "",
           "simplex ?": "",
           "Immunodeficiency": "HIV",
           "Mammalian": "Mam",
           "Hepatitis": "Hep",
           "Japanese": "Jp",
           "Venezuelan": "Vz",
           "Adeno-associated": "AAV",
           "Respiratory syncytial": "RSV",
           "Rift valley fever": "RVF",
           "Influenza": "Flu"}

# In python we can create custom functions that perform tasks on inputs
# and then return the outputs. The functions we have been using from pandas
# scanpy, velocyto, and more are all custom functions created by users
#
# You can define functions with "def" like so:
# def function_name(input_variable1, input_variable2, ...):
#    return task to perform with input_variable1 and input_variable2
#
# One example:
# def add_a_comma(word1, word2):
#    return ",".join([word1, word2])
#
# where add_a_comma('Travaglini', 'Kyle') would return 'Travaglini,Kyle'
#
# Function to loop through virus and apply virus renaming rules using the
# sub() function from the re library. re.I is a special flag that tells
# sub() to ignore case when searching for words to replace.
def replace_all(dict, text):
    """Apply every regex replacement rule in *dict* to *text* and return the result.

    The rules are applied one after another in the mapping's iteration order,
    each to the output of the previous one, and matching ignores case.

    Parameters:
        dict: mapping of regular expression pattern -> replacement string.
        text: the string to clean up.

    Returns:
        The text after all rules have been applied.
    """
    for pattern, replacement in dict.items():
        text = re.compile(pattern, re.I).sub(replacement, text)
    return text

In [None]:
# Split the viruses into those whose primary route of infection is through the lung
# and those that enter through other avenues
lung_viruses = virus_metadata[virus_metadata['Lung entry'] == True].index.tolist()
other_viruses = virus_metadata[virus_metadata['Lung entry'] != True].index.tolist()

# Receptor genes and their cleaned virus labels, kept separately for each route
lung_entry_genes, lung_entry_viruses = [], []
other_entry_genes, other_entry_viruses = [], []

# Each route pairs its virus list with the two output lists it fills
# The respiratory route is handled first, then the non-respiratory one
routes = ((lung_viruses, lung_entry_genes, lung_entry_viruses),
          (other_viruses, other_entry_genes, other_entry_viruses))

# Loop through each viral receptor
for receptor in viral_entry.index:

    # Subset the viral_entry table to the current viral receptor
    row = viral_entry[viral_entry.index == receptor]

    for route_viruses, gene_list, label_list in routes:

        # Pick out the viruses in this receptor's row that belong to the current route
        keep = [name in route_viruses for name in row.iloc[0, :]]

        # Clean each name up with replace_all and raise its first letter to uppercase
        cleaned = []
        for raw_name in row.loc[receptor, :][keep].tolist():
            short = replace_all(vir_reg, raw_name)
            cleaned.append(short[:1].upper() + short[1:])

        # Sort the cleaned virus names alphabetically
        cleaned.sort()

        # If the receptor facilitates entry for any virus on this route,
        # record the receptor and its viruses (joined with three spaces)
        if len(cleaned) > 0:
            gene_list.append(receptor)
            label_list.append("   ".join(cleaned))

# Build one data frame per route, indexed by receptor gene and
# sorted by the cleaned virus names
lung_entry = pd.DataFrame(data={'viruses': lung_entry_viruses}, index=lung_entry_genes).sort_values(by="viruses", axis=0)
other_entry = pd.DataFrame(data={'viruses': other_entry_viruses}, index=other_entry_genes).sort_values(by="viruses", axis=0)

In [None]:
# Print the receptors and viruses for respiratory viruses
# option_context lifts pandas' row and column display limits only for this print
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(lung_entry)

In [None]:
# Set matplotlib's default figure size to 12 x 6 (width x height, in inches)
plt.rcParams['figure.figsize'] = [12, 6]

# Plot the receptors for respiratory viruses
# NOTE(review): the receptor symbols are passed as-is; this assumes each one is present in adata.var_names - confirm
sc.pl.dotplot(adata, var_names = lung_entry.index, groupby="free_annotation", mean_only_expressed=True, cmap="Greys", dot_min=0.05, swap_axes=True)

In [None]:
# Print the receptors and viruses for non-respiratory viruses
# option_context lifts pandas' row and column display limits only for this print
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(other_entry)

In [None]:
# Set matplotlib's default figure size to 12 x 14 (width x height, in inches);
# taller than the respiratory plot above, which used 12 x 6
plt.rcParams['figure.figsize'] = [12, 14]

# Plot the receptors for non-respiratory viruses
# NOTE(review): the receptor symbols are passed as-is; this assumes each one is present in adata.var_names - confirm
sc.pl.dotplot(adata, var_names = other_entry.index, groupby="free_annotation", mean_only_expressed=True, cmap="Greys", dot_min=0.05, swap_axes=True)

**Questions:** Do you notice any interesting expression patterns for respiratory and non-respiratory viruses? Why might we have included the non-respiratory viruses here?

**Answer:**

### Example 3

This one is up to you! The HLCA SmartSeq2 dataset contains cell types across all the major tissue compartments for the lung. Come up with a question, either directed like Example 1 or more open like Example 2, then practice making a gene list to address it. Once you have the gene list, visualize the genes using whatever plotting functions you feel are best suited to answer your question.

**Question:** What was your question? What was the answer?

**Answer:**