In [2]:
import pandas as pd
import numpy as np

import altair as alt
from altair_saver import save as alt_save

import seaborn as sns
from matplotlib import pyplot as plt
from adjustText import adjust_text
from pyensembl import EnsemblRelease, find_species_by_name, genome_for_reference_name

import json
import requests
import gseapy

In [3]:
# Tissue label passed in through the Snakemake rule's params; interpolated into the plot titles below.
metmap_tissue = snakemake.params['metmap_tissue']

In [4]:
# Load the DESeq results table; the first CSV column becomes the index.
# Later cells treat the index values as (versioned) Ensembl gene IDs.
df = pd.read_csv(snakemake.input['deseq'], index_col=0)

In [5]:
# Preview the first rows of the loaded table.
df.head()

In [6]:
# Derived columns for the volcano plot and for gene-set selection, computed with
# vectorized operations instead of row-wise `apply` (same values, no Python-level loop).
padj_below_cutoff = df["padj"] < 0.05  # a NaN padj compares as False
df["minuslog10p"] = -np.log10(df["padj"])
# "significant": adjusted p < 0.05 AND |log2 fold change| > 2.
df["significant"] = padj_below_cutoff & (df["log2FoldChange"].abs() > 2)
# Hue column for the scatter plot: adjusted p < 0.05 only.
df["Significant at p 0.05"] = padj_below_cutoff

In [7]:
# Only the most extreme genes get a text label in the plot:
# adjusted p < 5e-8 (expressed on the -log10 scale) and |log2 fold change| > 3.5.
# Vectorized comparison replaces the row-wise `apply`; the resulting flags are the same.
df["should_label"] = (df["minuslog10p"] > -np.log10(0.00000005)) & (df["log2FoldChange"].abs() > 3.5)

In [8]:
# Ensembl release 100 annotation for human, used to translate gene IDs into gene names.
human_species = find_species_by_name("homo_sapiens")
er = EnsemblRelease(100, species=human_species)


def get_gene_name(ens_vid):
    """Return the gene name for a (possibly versioned) Ensembl gene ID.

    Only the text before the first "." is looked up in the Ensembl release.
    When the lookup raises ValueError, the input is returned unchanged.
    """
    unversioned_id = ens_vid.partition(".")[0]
    try:
        return er.gene_name_of_gene_id(unversioned_id)
    except ValueError:
        return ens_vid


# Map every index entry of the results table to its gene name.
versioned_ids = df.index.to_series()
df["gene_name"] = versioned_ids.apply(get_gene_name)

In [9]:
# Show the index entries (gene IDs) of a fixed list of genes selected by name.
df.loc[df["gene_name"].isin(["MLANA", "SLC45A2", "DCT", "TYR", "TRPM1", "SLC24A5"])].index.values.tolist()

In [10]:
# Rows flagged by "should_label"; these are the genes annotated with text in the plot below.
label_df = df.loc[df["should_label"]]

In [11]:
# Volcano plot: log2 fold change against -log10(adjusted p), colored by the 0.05 flag.
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=df, x="log2FoldChange", y="minuslog10p", hue="Significant at p 0.05", ax=ax)

# One centered text annotation per labelled gene; adjust_text then repositions them.
texts = [
    ax.text(fold_change, neg_log_p, gene, ha='center', va='center')
    for fold_change, neg_log_p, gene in zip(
        label_df["log2FoldChange"], label_df["minuslog10p"], label_df["gene_name"]
    )
]
adjust_text(texts)

# Dashed horizontal line at p = 0.05, drawn between x = -6 and x = 6.
ax.hlines(y=-np.log10(0.05), xmin=-6, xmax=6, colors='black', linestyles='--', lw=2)
ax.set_title(f"Differentially expressed genes for {metmap_tissue} metastasis")
ax.set_xlabel("log_2(fold change)")
ax.set_ylabel("-log_10(p)")
fig.savefig(snakemake.output["deseq_plot"])

## Use Enrichr to analyze the set of differentially expressed genes

Reference: https://maayanlab.cloud/Enrichr/help#api

In [12]:
# Re-index by gene name so the gene sets built below are lists of gene names.
# NOTE(review): this rebinds `df` and moves "gene_name" out of the columns, so running
# this cell a second time raises KeyError — consider binding the result to a new name.
df = df.set_index("gene_name")

In [13]:
# Split the gene names by the "significant" flag.
is_significant = df["significant"]
met_gene_set = df[is_significant].index.tolist()
nonmet_gene_set = df[~is_significant].index.tolist()

In [14]:
def get_enrichr_df(gene_set, gene_set_desc, gene_set_library='KEGG_2019_Human', timeout=60):
    """Run an Enrichr enrichment analysis for a gene list and return the result table.

    The gene list is uploaded to Enrichr's ``addList`` endpoint; the ``userListId``
    from that response is then used to query the ``enrich`` endpoint for one library.

    Parameters
    ----------
    gene_set : iterable of str
        Gene identifiers to analyze; sent as a newline-separated list.
    gene_set_desc : str
        Free-text description uploaded together with the list.
    gene_set_library : str, optional
        Name of the Enrichr library to query. Defaults to ``'KEGG_2019_Human'``,
        the value that was previously hard-coded.
    timeout : float, optional
        Seconds to wait on each HTTP request, so that an unresponsive server
        cannot block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per term, with columns ``rank``, ``term_name``, ``pval``, ``zscore``,
        ``combined_score``, ``overlapping_genes``, ``padj``, ``pold``, ``poldadj``.

    Raises
    ------
    Exception
        If either request comes back with a non-OK HTTP status.
    requests.exceptions.RequestException
        If a request fails at the transport level (including a timeout).
    """
    ADD_LIST_URL = 'http://maayanlab.cloud/Enrichr/addList'
    ENRICH_URL = 'http://maayanlab.cloud/Enrichr/enrich'

    # multipart/form-data fields; a (None, value) tuple sends a plain field without a filename.
    payload = {
        'list': (None, '\n'.join(gene_set)),
        'description': (None, gene_set_desc)
    }
    response = requests.post(ADD_LIST_URL, files=payload, timeout=timeout)
    if not response.ok:
        raise Exception('Error analyzing gene list')
    user_list_id = response.json()['userListId']

    # Let requests build and URL-encode the query string instead of formatting it by hand.
    response = requests.get(
        ENRICH_URL,
        params={'userListId': user_list_id, 'backgroundType': gene_set_library},
        timeout=timeout,
    )
    if not response.ok:
        raise Exception('Error fetching enrichment results')
    results = response.json()

    # The response is keyed by library name; each entry is one row of positional fields.
    enrichr_df = pd.DataFrame(
        data=results[gene_set_library],
        columns=["rank", "term_name", "pval", "zscore", "combined_score", "overlapping_genes", "padj", "pold", "poldadj"],
    )
    return enrichr_df

In [15]:
# Number of significant genes that will be submitted to Enrichr.
len(met_gene_set)

In [16]:
# Enrichment of the full significant gene set (both fold-change directions together).
met_enrichr_df = get_enrichr_df(met_gene_set, 'Differentially significant genes')

In [17]:
# Preview the first rows of the Enrichr result table.
met_enrichr_df.head()

In [18]:
# Write the complete (not yet filtered) Enrichr table as a tab-separated file.
met_enrichr_df.to_csv(snakemake.output["enrichr"], sep='\t')

In [19]:
# Keep only the terms whose adjusted p-value is below 0.999913 for plotting.
# NOTE(review): the origin of this cutoff is not documented in the notebook — confirm.
# `.copy()` makes the filtered frame independent of the unfiltered one, so the columns
# added to it in the following cell do not trigger pandas' SettingWithCopyWarning.
met_enrichr_df = met_enrichr_df.loc[met_enrichr_df["padj"] < 0.999913].copy()

In [20]:
# -log10 of the adjusted p-value (bar length) and a 0.05 significance flag (bar color).
met_adjusted_p = met_enrichr_df["padj"]
met_enrichr_df["minuslog10p"] = -np.log10(met_adjusted_p)
met_enrichr_df["significant"] = met_adjusted_p < 0.05

In [21]:
# Color-scale domain shared by every bar chart's "significant" encoding, so all charts
# use the same two categories in the same order.
# NOTE(review): the domain holds the strings "false"/"true" while the "significant"
# columns hold booleans — confirm the rendered colors and legend match as intended.
BINARY_SCALE = alt.Scale(domain=["false", "true"])

In [22]:
# The y axis follows the order in which the terms appear in the result table.
TERM_SORT = met_enrichr_df["term_name"].tolist()

x_encoding = alt.X("minuslog10p:Q", axis=alt.Axis(title="-log_10(p)"))
y_encoding = alt.Y("term_name:N", sort=TERM_SORT, axis=alt.Axis(title="KEGG 2019 term"))
color_encoding = alt.Color(
    "significant:N",
    legend=alt.Legend(title="Significant at 0.05"),
    scale=BINARY_SCALE,
)

plot = (
    alt.Chart(met_enrichr_df)
    .mark_bar()
    .encode(x=x_encoding, y=y_encoding, color=color_encoding)
    .properties(title=f"Pathways enriched in differentially expressed gene set for {metmap_tissue} metastasis")
)

plot

In [23]:
# Save the chart built in the previous cell to the Snakemake "enrichr_plot" output.
alt_save(plot, snakemake.output["enrichr_plot"])

## Use Enrichr on the positive and negative fold change sets separately

In [24]:
# Number of rows (terms) shown in each "Top N pathways" bar chart below.
NUM_TERMS = 30

In [25]:
# Significant genes split by the sign of their log2 fold change.
significant_mask = df["significant"]
fold_change = df["log2FoldChange"]
fc_pos_gene_set = df[significant_mask & (fold_change > 0)].index.tolist()
fc_neg_gene_set = df[significant_mask & (fold_change < 0)].index.tolist()

In [26]:
# Number of significant genes with a positive fold change.
len(fc_pos_gene_set)

In [27]:
# Number of significant genes with a negative fold change.
len(fc_neg_gene_set)

In [30]:
# One Enrichr query per fold-change direction (each call performs two HTTP requests).
fc_pos_enrichr_df = get_enrichr_df(fc_pos_gene_set, 'Differentially significant genes, positive fold change')
fc_neg_enrichr_df = get_enrichr_df(fc_neg_gene_set, 'Differentially significant genes, negative fold change')

In [31]:
# -log10 of the adjusted p-value (bar length) and a 0.05 significance flag (bar color).
pos_adjusted_p = fc_pos_enrichr_df["padj"]
fc_pos_enrichr_df["minuslog10p"] = -np.log10(pos_adjusted_p)
fc_pos_enrichr_df["significant"] = pos_adjusted_p < 0.05

In [32]:
# Sort list covers every term in the table; the chart itself shows only the first NUM_TERMS rows.
TERM_SORT = fc_pos_enrichr_df["term_name"].tolist()

x_encoding = alt.X("minuslog10p:Q", axis=alt.Axis(title="-log_10(p)"))
y_encoding = alt.Y("term_name:N", sort=TERM_SORT, axis=alt.Axis(title="KEGG 2019 term"))
color_encoding = alt.Color(
    "significant:N",
    legend=alt.Legend(title="Significant at p 0.05"),
    scale=BINARY_SCALE,
)
chart_title = {
    "text": f"Top {NUM_TERMS} pathways enriched in positive differentially expressed gene set for {metmap_tissue} metastasis",
    "subtitle": "Enrichr method"
}

plot = (
    alt.Chart(fc_pos_enrichr_df.head(NUM_TERMS))
    .mark_bar()
    .encode(x=x_encoding, y=y_encoding, color=color_encoding)
    .properties(title=chart_title)
)

plot

In [33]:
# Save the chart built in the previous cell to the Snakemake "enrichr_pos_plot" output.
alt_save(plot, snakemake.output["enrichr_pos_plot"])

In [34]:
# -log10 of the adjusted p-value (bar length) and a 0.05 significance flag (bar color).
neg_adjusted_p = fc_neg_enrichr_df["padj"]
fc_neg_enrichr_df["minuslog10p"] = -np.log10(neg_adjusted_p)
fc_neg_enrichr_df["significant"] = neg_adjusted_p < 0.05

In [35]:
# Sort list covers every term in the table; the chart itself shows only the first NUM_TERMS rows.
TERM_SORT = fc_neg_enrichr_df["term_name"].tolist()

x_encoding = alt.X("minuslog10p:Q", axis=alt.Axis(title="-log_10(p)"))
y_encoding = alt.Y("term_name:N", sort=TERM_SORT, axis=alt.Axis(title="KEGG 2019 term"))
color_encoding = alt.Color(
    "significant:N",
    legend=alt.Legend(title="Significant at p 0.05"),
    scale=BINARY_SCALE,
)
chart_title = {
    "text": f"Top {NUM_TERMS} pathways enriched in negative differentially expressed gene set for {metmap_tissue} metastasis",
    "subtitle": "Enrichr method"
}

plot = (
    alt.Chart(fc_neg_enrichr_df.head(NUM_TERMS))
    .mark_bar()
    .encode(x=x_encoding, y=y_encoding, color=color_encoding)
    .properties(title=chart_title)
)

plot

In [36]:
# Save the chart built in the previous cell to the Snakemake "enrichr_neg_plot" output.
alt_save(plot, snakemake.output["enrichr_neg_plot"])

## Use GSEA prerank method for pathway enrichment computations

In [37]:
# Genes with padj <= 0.05, split by fold-change sign. Each frame is ordered from the
# largest to the smallest magnitude of change (descending for positive values,
# ascending for negative values).
passes_padj = df["padj"] <= 0.05
is_up = df["log2FoldChange"] > 0
is_down = df["log2FoldChange"] < 0
fc_pos_df = df[is_up & passes_padj].sort_values(by="log2FoldChange", ascending=False)
fc_neg_df = df[is_down & passes_padj].sort_values(by="log2FoldChange", ascending=True)

In [40]:
# Replace the negative fold changes by their magnitudes so that the negative gene set
# is ranked in the same direction as the positive one. `assign` returns a new frame,
# leaving fc_neg_df untouched.
fc_neg_abs_df = fc_neg_df.assign(log2FoldChange=fc_neg_df["log2FoldChange"].abs())
fc_neg_abs_df.head()

In [41]:
# Run the GSEA prerank tool once per fold-change direction with identical settings.
# NOTE(review): both runs use the same `outdir`, and the seed is fixed (presumably so
# the permutation results are repeatable) — confirm neither run's files are needed separately.
prerank_options = dict(
    processes=4,
    gene_sets='KEGG_2019_Human',
    outdir='gsea',
    min_size=2,
    ascending=True,
    no_plot=True,
    permutation_num=1500,
    seed=2445,
)
prerank_neg_df = gseapy.prerank(rnk=fc_neg_abs_df[["log2FoldChange"]], **prerank_options).res2d
prerank_pos_df = gseapy.prerank(rnk=fc_pos_df[["log2FoldChange"]], **prerank_options).res2d

In [42]:
# Move the index into a regular column, name the "Term" column "term_name" to match
# the Enrichr tables, and order the terms by p-value (smallest first).
term_column_rename = {"Term": "term_name"}
prerank_pos_df = (
    prerank_pos_df
    .reset_index()
    .rename(columns=term_column_rename)
    .sort_values(by="pval")
)
prerank_neg_df = (
    prerank_neg_df
    .reset_index()
    .rename(columns=term_column_rename)
    .sort_values(by="pval")
)

In [43]:
# Write both GSEA result tables as tab-separated files.
prerank_neg_df.to_csv(snakemake.output["gsea_neg"], sep="\t")
prerank_pos_df.to_csv(snakemake.output["gsea_pos"], sep="\t")

In [44]:
# Floor the p-values at 1e-6 so that -log10(p) stays finite when p is 0.
# This runs after the TSV export, so the exported files keep the unclipped values.
prerank_neg_df["pval"] = prerank_neg_df["pval"].clip(lower=0.000001)
prerank_pos_df["pval"] = prerank_pos_df["pval"].clip(lower=0.000001)

In [45]:
# For each GSEA result table: add -log10(p) for the bar length and flag the terms
# whose `pval` and `fdr` are both below 0.05.
for prerank_df in (prerank_neg_df, prerank_pos_df):
    prerank_df["minuslog10p"] = prerank_df["pval"].apply(lambda p: -np.log10(p))
    passes_pval = prerank_df["pval"] < 0.05
    passes_fdr = prerank_df["fdr"] < 0.05
    prerank_df["significant"] = passes_pval & passes_fdr

In [46]:
# Order the bars by the term order of the p-value-sorted result table. The sort list
# has to hold term names because it is matched against the "term_name" field on the
# y axis; the frame's integer index labels (left over from reset_index) never match
# a term name, so sorting by them had no effect.
TERM_SORT = prerank_neg_df["term_name"].values.tolist()

plot = alt.Chart(prerank_neg_df.head(NUM_TERMS)).mark_bar().encode(
    x=alt.X("minuslog10p:Q", axis=alt.Axis(title="-log_10(p)")),
    y=alt.Y("term_name:N", sort=TERM_SORT, axis=alt.Axis(title="KEGG 2019 term")),
    color=alt.Color("significant:N", legend=alt.Legend(title="Significant at q 0.05"), scale=BINARY_SCALE)
).properties(
    title={
        "text": f"Top {NUM_TERMS} pathways enriched in negative differentially expressed gene set for {metmap_tissue} metastasis",
        "subtitle": "GSEA preranked method"
    }
)

plot

In [47]:
# Save the chart built in the previous cell to the Snakemake "gsea_neg_plot" output.
alt_save(plot, snakemake.output["gsea_neg_plot"])

In [48]:
# Order the bars by the term order of the p-value-sorted result table. The sort list
# has to hold term names because it is matched against the "term_name" field on the
# y axis; the frame's integer index labels (left over from reset_index) never match
# a term name, so sorting by them had no effect.
TERM_SORT = prerank_pos_df["term_name"].values.tolist()

plot = alt.Chart(prerank_pos_df.head(NUM_TERMS)).mark_bar().encode(
    x=alt.X("minuslog10p:Q", axis=alt.Axis(title="-log_10(p)")),
    y=alt.Y("term_name:N", sort=TERM_SORT, axis=alt.Axis(title="KEGG 2019 term")),
    color=alt.Color("significant:N", legend=alt.Legend(title="Significant at q 0.05"), scale=BINARY_SCALE)
).properties(
    title={
        "text": f"Top {NUM_TERMS} pathways enriched in positive differentially expressed gene set for {metmap_tissue} metastasis",
        "subtitle": "GSEA preranked method"
    }
)

plot

In [49]:
# Save the chart built in the previous cell to the Snakemake "gsea_pos_plot" output.
alt_save(plot, snakemake.output["gsea_pos_plot"])