# Example analysis of the PBMC3K single-cell RNA-seq dataset

Dependencies

- scanpy==1.7.2
- anndata==0.7.6
- umap-learn==0.5.1
- numpy==1.19.4
- scipy==1.5.3
- pandas==1.1.4
- scikit-learn==0.23.2
- statsmodels==0.12.1
- python-igraph==0.9.1
- louvain==0.7.0


In [None]:
from typing import Optional
import csv
import scanpy as sc
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math

# Input directory: "rawdata" inside the current working directory.
rawdata_dir = os.path.join(os.getcwd(), "rawdata")
# Output directory for saved figures. This is an absolute, environment-specific
# path; it is not created here, so it must already exist before figures are saved.
figure_dir = "/home/jovyan/work/figures"

# Global settings
sc.settings.verbosity = 0
sc.logging.print_header()
sc.settings.set_figure_params(dpi=300, facecolor='white')

In [None]:
# Read the 10x matrix from rawdata_dir, using gene symbols as variable names
# and without writing a cache file.
adata = sc.read_10x_mtx(rawdata_dir, var_names="gene_symbols", cache=False)
# De-duplicate the variable names.
adata.var_names_make_unique()

In [None]:
# Basic filtering: require at least 200 genes per cell and at least 3 cells per gene.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Flag genes whose name starts with "MT-" (presumably mitochondrial genes) ...
adata.var['mt'] = adata.var_names.str.startswith('MT-')
# ... and compute QC metrics in place, with the 'mt' flag as an extra QC variable.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Violin plots of the three QC metrics, one panel per metric.
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True)

In [None]:
# Keep only cells with n_genes_by_counts below 2500 and pct_counts_mt below 5.
adata = adata[adata.obs.n_genes_by_counts < 2500, :]
adata = adata[adata.obs.pct_counts_mt < 5, :]

In [None]:
# Normalise every cell to a total of 1e4 counts, then apply log1p.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Annotate highly variable genes using the given mean/dispersion thresholds.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
# Store the current (normalised, log-transformed) object as .raw before the
# gene subset below is taken.
adata.raw = adata
# Restrict the working object to the genes flagged as highly variable.
adata = adata[:, adata.var.highly_variable]

In [None]:
# Regress out the total_counts and pct_counts_mt covariates,
# then scale the data with max_value=10.
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(adata, max_value=10)

In [None]:
# Dimensionality reduction and clustering. Each step uses the result of the
# previous one, so the order matters.
sc.tl.pca(adata, svd_solver='arpack')
# Neighbourhood graph with 10 neighbours on the first 40 principal components.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
# Leiden clustering with default settings; the result is read back below
# under the 'leiden' key.
sc.tl.leiden(adata)
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Violin plots of three genes (CST3, NKG7, PPBP), grouped by Leiden cluster.
sc.pl.violin(adata, ['CST3', 'NKG7', 'PPBP'], groupby='leiden')

### Rename leiden clusters

In [None]:
# Human-readable labels for the Leiden clusters, given in category order.
# NOTE(review): the list has exactly 8 entries, so this assumes the clustering
# above produced 8 clusters in this order — confirm after re-running Leiden.
new_cluster_names = [
    'CD4+ T-cells',
    'CD14+ Monocytes',
    'B-cells',
    'CD8+ T-cells',
    'NK cells',
    'FCGR3A+ Monocytes',
    'Dendritic cells',
    'Megakaryocytes'
]
adata.rename_categories('leiden', new_cluster_names)

### Generate final figure versions for paper

* marker genes violin plot
* UMAP plot with cluster annotation

In [None]:
# Draw the annotated UMAP on an explicitly created axes so that figure size and
# aspect ratio can be controlled; show=False keeps the figure open for saving.
fig, ax = plt.subplots(1, 1, figsize=(10,8))
ax.set_aspect('equal', adjustable='box')
sc.pl.umap(adata, color='leiden', legend_loc='right margin', title='', frameon=True, ax=ax, show=False)
plt.tight_layout(pad=3.0)

# Save the figure as figure2.png inside figure_dir.
plt.savefig(os.path.join(figure_dir, "figure2.png"))

### Differential analysis results to pandas dataframe

In [None]:
# Rank genes per 'leiden' group with the Wilcoxon method, keeping 100 genes per group.
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', n_genes=100)
# Preview the first 5 rows of the ranked gene names (one column per group).
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)

In [None]:
def _columns_by_cluster(table):
    """Split a per-cluster result table into one 1-D array per cluster."""
    values = np.asarray(table)
    if values.dtype.names is not None:
        # Structured/record array: one named field per cluster.
        return [values[field] for field in values.dtype.names]
    # Plain 2-D array: one column per cluster.
    return [values[:, idx] for idx in range(values.shape[1])]


def export_uns_data(adata_obj,
                    descriptor: Optional[str] = "rank_genes_groups",
                    filename: Optional[str] = None,
                    basedir: Optional[str] = None,
                    extract_cols=None,
                    overwrite=False):
    """Flatten per-cluster results stored in ``adata_obj.uns`` into one long table.

    ``adata_obj.uns[descriptor]`` must contain a ``"params"`` mapping (printed
    for reference), a ``"names"`` table and one table for every entry of
    ``extract_cols``. Each table holds one column (or record field) per
    cluster. The tables are stacked cluster by cluster into a single
    DataFrame with a leading integer ``cluster`` column (0-based position of
    the cluster in the table) followed by ``extract_cols``.

    Args:
        adata_obj: Object exposing the results through its ``uns`` mapping.
        descriptor: Key of the result entry inside ``adata_obj.uns``.
        filename: If given, the table is also written as CSV to
            ``basedir/filename``.
        basedir: Directory for the CSV file; defaults to the current working
            directory.
        extract_cols: Names of the tables to export. Defaults to
            ``["names", "pvals", "pvals_adj", "logfoldchanges"]``.
        overwrite: Allow replacing an existing CSV file.

    Returns:
        The combined DataFrame. The row index restarts at 0 for every cluster.

    Raises:
        FileExistsError: If the target file exists and ``overwrite`` is false.
    """
    # Resolve the default here rather than in the signature so that no list
    # object is shared between calls.
    if extract_cols is None:
        extract_cols = ["names", "pvals", "pvals_adj", "logfoldchanges"]

    if not basedir:
        basedir = os.getcwd()

    uns_entry = adata_obj.uns[descriptor]

    params_listing = "\n".join(f"\t{key}: {value}" for key, value in uns_entry["params"].items())
    print(f"Exporting differential expression data for '{descriptor}'\n" + params_listing)

    # Split every table once up front instead of re-converting it per cluster.
    n_clusters = len(_columns_by_cluster(uns_entry["names"]))
    per_column = {col: _columns_by_cluster(uns_entry[col]) for col in extract_cols}

    frames = []
    for cluster in range(n_clusters):
        frame = pd.DataFrame({col: per_column[col][cluster] for col in extract_cols})
        frame.insert(0, "cluster", pd.array([cluster] * len(frame), dtype="Int32"))
        frames.append(frame)

    if frames:
        # pd.concat replaces DataFrame.append, which newer pandas no longer
        # provides. Row labels are intentionally left as they are per cluster.
        results_df = pd.concat(frames)
    else:
        results_df = pd.DataFrame(columns=["cluster", *extract_cols])

    if filename:
        full_filename = os.path.join(basedir, filename)
        if os.path.isfile(full_filename) and not overwrite:
            raise FileExistsError(f"File {full_filename} already exists")

        results_df.to_csv(full_filename, index=False)

    return results_df

In [None]:
# Export the ranking results to export_uns.csv (replacing any existing file)
# and keep the combined table for the steps below.
diffexp_df = export_uns_data(adata, filename="export_uns.csv", overwrite=True)
diffexp_df.head()

### Perform KEGGTOOLS enrichment analysis

In [None]:
# keggtool imports
from keggtools import (
    Pathway,
    Enrichment,
    EnrichmentResult,
    Resolver,
    Storage,
    Renderer,
    IMMUNE_SYSTEM_PATHWAYS,
)

In [None]:
# Organism identifier handed to keggtools ("hsa"; presumably the KEGG code for human).
organism_id = "hsa"
# Analysed Cluster 4 (NK cells)
cluster = 4

In [None]:
# Keep only the rows of the selected cluster. The explicit copy makes the
# result an independent frame, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
diffexp_df = diffexp_df[diffexp_df["cluster"] == cluster].copy()
diffexp_df.head()

In [None]:
# Set up the enrichment analysis for organism_id, restricted to IMMUNE_SYSTEM_PATHWAYS.
analysis = Enrichment(org=organism_id, pathways=IMMUNE_SYSTEM_PATHWAYS)

### Convert gene symbols to a list of Entrez IDs using mygene

In [None]:
# mygene client used below to look up the gene symbols.
import mygene
mg = mygene.MyGeneInfo()

In [None]:
# Gene symbols of the selected cluster, looked up in mygene as human symbols.
gene_list = diffexp_df["names"].tolist()
query_result = mg.querymany(gene_list, scopes="symbol", species="human")

In [None]:
# Translate the mygene hits into (a) the list of Entrez IDs used as input for
# the enrichment analysis and (b) an "entrez" column on the result table.
entrz_gene_list = []
symbol_to_entrez = {}
for item in query_result:
    if "entrezgene" in item:
        entrez_id = int(item["entrezgene"])
        # Append entrez id to list
        entrz_gene_list.append(entrez_id)
    else:
        print("Not found entrez id in result:", item)
        # Symbols without an Entrez ID are marked with 0 in the table.
        entrez_id = 0
    # When a symbol is reported more than once, the last hit wins.
    symbol_to_entrez[item["query"]] = entrez_id

# Add entrez id to dataframe in one pass instead of scanning the whole frame
# once per query hit.
diffexp_df["entrez"] = diffexp_df["names"].map(symbol_to_entrez)

### Run enrichment analysis

In [None]:
# Run the enrichment on the collected Entrez IDs and fetch the result as a DataFrame.
analysis.run_analysis(gene_list=entrz_gene_list)
result = analysis.to_dataframe()

In [None]:
# Export KEGG Analysis result
# The file name is relative, so the CSV is written to the current working directory.
result.to_csv("kegg_enrichment.csv",index=False)

### Plot results of enrichment analysis

In [None]:
import matplotlib.pyplot as plt
import math
import sys

# Colour value per pathway: -log10(p). A p-value of exactly 0 would make
# math.log10 raise ValueError, so p-values are clamped to the smallest
# positive normal float first.
neg_log10_pvalues = [-math.log10(max(p, sys.float_info.min)) for p in result["pvalue"]]

plt.figure(figsize=(8, 5), dpi=300)
scatter = plt.scatter(x=result["study_count"],
                      y=result["pathway_name"],
                      c=neg_log10_pvalues,
                      cmap="coolwarm")

# Tie the colorbar explicitly to the scatter plot it describes.
cbar = plt.colorbar(scatter)
cbar.set_label("- log10(p value)")

# Switch the grid off explicitly. The former ``b=None`` keyword is not accepted
# by newer matplotlib releases and only toggled the current grid state.
# NOTE(review): this assumes the grid was on before (e.g. through the figure
# settings applied earlier), so the toggle used to hide it — confirm.
plt.grid(False)
plt.tight_layout()
plt.savefig(os.path.join(figure_dir, "figure4.png"), bbox_inches='tight')
plt.show()

### Plot pathway

* "Natural killer cell mediated cytotoxicity" (hsa:04650) shows a significant p-value

In [None]:
# Fetch pathway 04650 for organism "hsa" through the keggtools resolver.
pathway = Resolver(org="hsa").get_pathway("04650")

In [None]:
# Renderer for the pathway resolved above.
renderer = Renderer(kegg_pathway=pathway)

In [None]:
# Cast the Entrez IDs to integers. This fails if any row has no value in the
# "entrez" column.
diffexp_df[["entrez"]] = diffexp_df[["entrez"]].astype(int)
diffexp_df.head()

In [None]:
# Overlay for the pathway rendering: Entrez ID -> log fold change. If an ID
# occurs more than once, the last row wins.
entrez_ids = diffexp_df["entrez"]
fold_changes = diffexp_df["logfoldchanges"]
overlay = dict(zip(entrez_ids, fold_changes))
renderer.set_overlay(overlay)
dot_string = renderer.raw_render()

In [None]:
# Write the rendered graph as figure5.dot inside figure_dir. The encoding is
# fixed so that the file content does not depend on the platform's locale.
with open(os.path.join(figure_dir, "figure5.dot"), "w", encoding="utf-8") as f:
    f.write(dot_string)

In [None]:
# Display the rendered pathway image inline.
# NOTE(review): no cell in this notebook writes /tmp/figure5.png; only
# figure5.dot is written (to figure_dir). Presumably the PNG is rendered from
# the dot file in a separate step outside the notebook — confirm, and align
# the path with figure_dir.
from IPython.display import Image, display
img = Image(os.path.join("/tmp", "figure5.png"))
display(img)