In [2]:
from utils import *
import scanpy as sc
import matplotlib.pyplot as plt
from pyensembl import EnsemblRelease, find_species_by_name, genome_for_reference_name

In [3]:
# Tissue wildcard of the current Snakemake job; used further down in the heatmap title.
# NOTE(review): `snakemake` is not defined in this notebook — presumably injected by
# Snakemake's notebook execution; confirm before running the notebook standalone.
metmap_tissue = snakemake.wildcards["tissue"]

In [4]:
# Load the three job inputs in a fixed order; the first CSV column becomes each frame's index.
counts_df, conditions_df, deseq_df = (
    pd.read_csv(snakemake.input[input_key], index_col=0)
    for input_key in ("counts", "conditions", "deseq")
)

In [5]:
# Cut-offs for calling a gene differentially expressed.
PADJ_MAX = 0.01
ABS_LOG2FC_MIN = 3.0

# Boolean mask over the rows of deseq_df. Rows whose padj or log2FoldChange is NaN
# compare as False and are therefore excluded.
passes_padj = deseq_df["padj"].le(PADJ_MAX)
passes_effect_size = deseq_df["log2FoldChange"].abs().ge(ABS_LOG2FC_MIN)
significant = passes_padj & passes_effect_size

In [6]:
# Restrict both tables to the significant genes.
# Genes are the columns of counts_df and the rows of deseq_df, so the same boolean
# mask is applied along different axes. Selecting columns directly with .loc avoids
# transposing the whole count matrix twice (which copies it and coerces column dtypes).
counts_sig_df = counts_df.loc[:, significant]
deseq_sig_df = deseq_df.loc[significant]

In [7]:
# Order samples by metastatic status and then by the "mean" column; order genes by fold change.
conditions_df = conditions_df.sort_values(by=["metastatic", "mean"])
deseq_sig_df = deseq_sig_df.sort_values(by="log2FoldChange")

# Rearrange the count matrix so that rows follow the sample order and columns the gene order.
# A KeyError here means a sample or gene label is missing from the count matrix.
sample_order = conditions_df.index.values.tolist()
gene_order = deseq_sig_df.index.values.tolist()
counts_sig_df = counts_sig_df.loc[sample_order, gene_order]

In [10]:
# Ensembl annotation (release 100, human) used to convert Ensembl gene IDs to gene names.
# NOTE(review): pyensembl presumably needs this release downloaded locally beforehand — confirm
# the workflow environment provides it.
human_species = find_species_by_name("homo_sapiens")
er = EnsemblRelease(100, species=human_species)

def get_gene_name(ens_vid, genome=None):
    """Translate a (possibly versioned) Ensembl gene ID into a gene name.

    Parameters
    ----------
    ens_vid : str
        Ensembl gene ID, with or without a ``.<version>`` suffix
        (everything from the first "." onwards is ignored for the lookup).
    genome : object, optional
        Any object exposing ``gene_name_of_gene_id(gene_id)``. Defaults to the
        notebook-level ``er`` annotation, so existing one-argument calls
        (e.g. ``Series.apply(get_gene_name)``) behave as before.

    Returns
    -------
    str
        The gene name. ``ens_vid`` is returned unchanged when the lookup raises
        ``ValueError`` or yields an empty name, so the result is never a blank label.
    """
    if genome is None:
        genome = er
    ens_id = ens_vid.split(".", 1)[0]
    try:
        gene_name = genome.gene_name_of_gene_id(ens_id)
    except ValueError:
        # The lookup signals an unknown ID with ValueError; keep the original ID as the label.
        return ens_vid
    # An annotated gene may carry an empty name; an empty label is useless on a plot axis.
    return gene_name or ens_vid
# Re-index the DESeq table by gene name (row order is unchanged).
ensembl_ids = deseq_sig_df.index.to_series()
gene_names = ensembl_ids.apply(get_gene_name)

# Distinct Ensembl IDs can resolve to the same gene name. A non-unique index would make
# name-based selection ambiguous downstream, so colliding names get their Ensembl ID appended.
shares_name = gene_names.duplicated(keep=False)
gene_names = gene_names.mask(
    shares_name, gene_names.astype(str) + " (" + ensembl_ids.astype(str) + ")"
)

deseq_sig_df = deseq_sig_df.assign(gene_name=gene_names).set_index("gene_name")

In [11]:
# Bundle the count matrix with its annotations: rows are described by conditions_df (obs),
# columns by deseq_sig_df (var). X is passed as a bare array, so it is matched to obs/var
# purely by position — counts_sig_df must already be in the same row/column order.
adata = AnnData(
    X=counts_sig_df.to_numpy(),
    obs=conditions_df,
    var=deseq_sig_df,
)

In [12]:
# Preprocessing parameters.
TARGET_SUM = 1e4  # per-sample total after normalization
SCALE_CLIP = 3.5  # max_value passed to sc.pp.scale

# NOTE(review): adata only holds the significant genes, so the per-sample totals used by
# normalize_total are sums over that subset, not over all genes — confirm this is intended.
# The three steps modify adata in place and must run in this order.
sc.pp.normalize_total(adata, target_sum=TARGET_SUM)
sc.pp.log1p(adata)
sc.pp.scale(adata, max_value=SCALE_CLIP)

In [19]:
# Draw the heatmap of all genes in adata, with samples grouped by the "metastatic" column.
# With show=False scanpy returns a dict of matplotlib axes instead of displaying the figure.
ax = sc.pl.heatmap(
    adata,
    var_names=adata.var_names,
    groupby="metastatic",
    cmap="PRGn",
    num_categories=2,
    show=False,
    show_gene_labels=True,
)

heatmap_ax = ax["heatmap_ax"]
heatmap_ax.set_title(f"Expression of differentially expressed genes across cell lines, {metmap_tissue}")
heatmap_ax.set_xlabel("Gene")
ax["groupby_ax"].set_ylabel("Metastatic and Non-Metastatic Cell Lines")

# Save through the heatmap's own Figure rather than pyplot's implicit "current figure", and
# use a tight bounding box so the gene tick labels and the axis labels added above are not
# clipped at the figure edge.
heatmap_ax.figure.savefig(snakemake.output["heatmap_plot"], bbox_inches="tight")