In [91]:
import pandas as pd
import numpy as np

import altair as alt
from altair_saver import save as alt_save

import seaborn as sns
from matplotlib import pyplot as plt
from adjustText import adjust_text
from pyensembl import EnsemblRelease, find_species_by_name, genome_for_reference_name

In [7]:
# Read the input table handed over by the Snakemake rule (presumably DESeq2
# results -- confirm against the workflow); the first CSV column becomes the index.
deseq_csv = snakemake.input['deseq']
df = pd.read_csv(deseq_csv, index_col=0)

In [16]:
# Derive the plotting columns with vectorized operations instead of a
# row-wise apply (same values, far faster, and well-defined on an empty frame).
# Note: a padj of exactly 0 maps to +inf here, and NaN stays NaN.
df["minuslog10p"] = -np.log10(df["padj"])

# A row is "significant" when padj < 0.05 AND |log2FoldChange| > 2.
# Comparisons against NaN evaluate to False, so rows with a missing padj or
# fold change are never flagged.
df["significant"] = (df["padj"] < 0.05) & (df["log2FoldChange"].abs() > 2)

In [123]:
# Flag only the most extreme points for text labels: -log10(padj) above
# -log10(5e-8) and |log2FoldChange| > 3.5. Vectorized rather than a
# row-wise apply; NaN comparisons evaluate to False.
df["should_label"] = (df["minuslog10p"] > -np.log10(0.00000005)) & (df["log2FoldChange"].abs() > 3.5)

In [124]:
# Rows that will receive a text label. Take an explicit copy: a column is
# added to label_df later, and writing into a slice of df would raise
# pandas' SettingWithCopyWarning.
label_df = df.loc[df["should_label"]].copy()

In [125]:
# Ensembl release 100 annotation for human, used below to look up gene names.
# NOTE(review): this presumably requires the release data to be installed
# locally beforehand -- confirm for the execution environment.
human_species = find_species_by_name("homo_sapiens")
er = EnsemblRelease(100, species=human_species)

def get_gene_name(ens_vid):
    """Translate a (possibly versioned) Ensembl gene ID into a gene name.

    Everything from the first "." onward is dropped before the ID is looked
    up in the module-level ``er`` annotation object.

    Parameters
    ----------
    ens_vid : str
        Gene identifier, with or without a ".<suffix>" part.

    Returns
    -------
    The value returned by ``er.gene_name_of_gene_id`` for the stripped ID,
    or ``ens_vid`` unchanged when that lookup raises ``ValueError``.
    """
    gene_id, _, _ = ens_vid.partition(".")
    try:
        name = er.gene_name_of_gene_id(gene_id)
    except ValueError:
        # Lookup failed: fall back to the identifier exactly as given.
        name = ens_vid
    return name
# Attach a display name to every labelled row, looked up from its index value.
# assign() builds a new frame instead of writing a column into a frame that
# was sliced out of df, which avoids SettingWithCopyWarning and keeps this
# step safe to re-run.
label_df = label_df.assign(gene_name=label_df.index.to_series().apply(get_gene_name))

In [126]:
# Volcano plot: log2 fold change (x) against -log10(padj) (y), coloured by the
# "significant" flag.
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=df, x="log2FoldChange", y="minuslog10p", hue="significant", ax=ax)

# padj = 0.05 threshold. axhline spans the full axes width, so the line
# follows the data range instead of a hard-coded [-6, 6] interval.
ax.axhline(y=-np.log10(0.05), color='black', linestyle='--', lw=2)
ax.set(xlabel="log2 fold change", ylabel="-log10(padj)")

# One text label per row of label_df, placed at the point's coordinates.
texts = [
    ax.text(x_pos, y_pos, gene, ha='center', va='center')
    for x_pos, y_pos, gene in zip(
        label_df["log2FoldChange"], label_df["minuslog10p"], label_df["gene_name"]
    )
]

# De-overlap the labels as the very last step: adjust_text works from the
# current axes limits, so everything that can change them is drawn first.
adjust_text(texts, ax=ax);

In [127]:
# Save through the Axes' own Figure rather than pyplot's "current figure":
# with the inline notebook backend the plotted figure is closed once its cell
# finishes, so plt.savefig() in a later cell would write a new, empty figure.
ax.figure.savefig(snakemake.output["plot"])