# Recurrent non-synonymous mutations as determined by SUFAM

In [None]:
import pandas as pd
import numpy as np
import glob
import os
%matplotlib inline
import seaborn as sns

In [None]:
# Paths of the two tab-separated tables this notebook reads with pd.read_csv.
# NOTE(review): the upper-case string values look like template placeholders
# that get replaced with real paths before the notebook is executed - confirm.
INPUT_FILES = {
    # should be all samples run through sufam on set of all NS mutations in all samples
    # (the cells below use its columns: sample, chrom, pos, val_ref, val_alt, val_maf)
    "sufam":"ALL_SUFAM",
    # should be the vcf as supplied to sufam with annotations
    # (read as a tab-separated table; the cells below use its columns #CHROM, POS,
    #  ID, REF, ALT, ANN[*].GENE, ANN[*].IMPACT and ANN[*].HGVS_P)
    "ann":"SUFAM_ANNOTATIONS_VCF"
}

In [None]:
# Display settings used by the heatmap cells.
# NOTE(review): the upper-case strings look like template placeholders that are
# substituted before execution - confirm. If the first one is left as-is,
# int() raises ValueError and the notebook stops here.
PARAMETERS = {
    # should be a number, show only genes/mutations with x number of samples supporting it
    # (rows are kept when at least this many samples have a non-zero value)
    "min_nr_samples_with_mutation":int("MIN_NR_SAMPLES_WITH_MUTATION"),
    # should be a list of names
    # (whitespace-separated sample names; also fixes the column order of the heatmaps)
    "sample_order":"SAMPLE_ORDER".split()
}

In [None]:
# Load the SUFAM results (tab-separated). "chrom" is forced to str so that it
# has the same dtype as the "#CHROM" column of the annotation table it is
# joined with in the next cell.
df = pd.read_csv(
    INPUT_FILES["sufam"],
    sep="\t",
    dtype={"chrom": str},
)

In [None]:
# Attach the annotation columns to every SUFAM row. The annotation table is
# re-keyed to SUFAM's column names (chrom, pos, val_ref, val_alt) and then
# left-joined on those four columns; "." entries are read as missing values.
dfa = df.join(
    (
        pd.read_csv(
            INPUT_FILES["ann"],
            sep="\t",
            na_values=".",
            dtype={"#CHROM": str},
        )
        .drop("ID", axis=1)
        .rename(columns={"#CHROM": "chrom",
                         "POS": "pos",
                         "REF": "val_ref",
                         "ALT": "val_alt"})
        .set_index(["chrom", "pos", "val_ref", "val_alt"])
    ),
    on=["chrom", "pos", "val_ref", "val_alt"],
)

In [None]:
# Expand every row of dfa into one row per annotated (gene, impact) pair.
# "ANN[*].GENE" and "ANN[*].IMPACT" hold "|"-separated values; entry i of the
# gene list is paired with entry i of the impact list of the SAME row, so the
# two columns cannot drift out of step with each other.
gene_lists = dfa["ANN[*].GENE"].str.split("|")
impact_lists = dfa["ANN[*].IMPACT"].str.split("|")

row_labels, genes_flat, impacts_flat = [], [], []
for row_label, genes, impacts in zip(dfa.index, gene_lists, impact_lists):
    # .str.split leaves missing annotations as NaN instead of a list. Such rows
    # contribute no pairs and end up with NaN in GENE_SPLIT / IMPACT_SPLIT
    # after the left join below (the impact filters further down drop them).
    if not (isinstance(genes, list) and isinstance(impacts, list)):
        continue
    if len(genes) != len(impacts):
        raise ValueError(
            "row %r: %d gene(s) but %d impact(s) in the annotation columns"
            % (row_label, len(genes), len(impacts)))
    row_labels.extend([row_label] * len(genes))
    genes_flat.extend(genes)
    impacts_flat.extend(impacts)

# Index that repeats each original row label once per annotation.
split_index = pd.Index(row_labels, dtype=dfa.index.dtype)
# Kept under its original name: the flattened gene names, indexed by row label.
split = pd.Series(genes_flat, index=split_index, name="GENE_SPLIT", dtype=object)
dfa_split = dfa.join(pd.DataFrame({"GENE_SPLIT": genes_flat,
                                   "IMPACT_SPLIT": impacts_flat},
                                  index=split_index))
# The join duplicates row labels; renumber so that every row has a unique label.
dfa_split.index = list(range(len(dfa_split)))

## Do the samples have NS mutation(s) in the same gene?

In [None]:
# Per gene (rows) and sample (columns): number of MODERATE/HIGH impact
# annotation rows with val_maf > 0. Missing gene/sample pairs count as 0.
gene_count = (
    dfa_split[dfa_split["IMPACT_SPLIT"].isin(["MODERATE", "HIGH"]) & (dfa_split.val_maf > 0)]
    .groupby(["sample", "GENE_SPLIT"]).chrom.count()
    .unstack("sample")
    .fillna(0)
)
# Order the columns as configured. reindex(columns=...) replaces reindex_axis,
# which no longer exists in current pandas. fill_value=0 matters: a configured
# sample with no qualifying row would otherwise become an all-NaN column, and
# NaN turns into True when the table is cast to bool for plotting.
gene_count = gene_count.reindex(columns=PARAMETERS["sample_order"], fill_value=0)

In [None]:
# Presence/absence heatmap: one row per gene, one column per sample, keeping
# only genes hit in at least the configured number of samples.
# fillna(0) first, because astype(bool) would turn NaN into True.
gene_presence = gene_count.fillna(0).astype(bool)
gene_presence_grid = sns.clustermap(
    gene_presence[gene_presence.sum(axis=1) >= PARAMETERS["min_nr_samples_with_mutation"]],
    row_cluster=False, col_cluster=False, figsize=[12, 12])
# sns.plt is not available in current seaborn; title the heatmap axes of the
# returned grid directly instead of going through pyplot's "current axes".
gene_presence_grid.ax_heatmap.set_title("Mutation existence in gene");

## Do the samples have identical NS mutation(s)?

In [None]:
# Per mutation (rows: position, alleles, gene and protein change) and sample
# (columns): the highest val_maf among rows whose annotation includes a
# MODERATE or HIGH impact and whose val_maf is > 0. Missing pairs become 0.
# na=False makes rows without an impact annotation an explicit non-match.
# NOTE(review): groupby drops rows with NaN in any key column, so a mutation
# with a missing ANN[*].GENE or ANN[*].HGVS_P value will not appear - confirm
# that this is intended.
has_ns_impact = dfa["ANN[*].IMPACT"].str.contains("MODERATE|HIGH", na=False)
identical_mut_maf = (
    dfa[has_ns_impact & (dfa.val_maf > 0)]
    .groupby(["sample", "chrom", "pos", "val_ref", "val_alt", "ANN[*].GENE", "ANN[*].HGVS_P"])
    .val_maf.max()
    .unstack("sample")
    .fillna(0)
)
# Order the columns as configured. reindex(columns=...) replaces reindex_axis,
# which no longer exists in current pandas. fill_value=0 keeps a configured
# sample without any qualifying mutation at 0 instead of NaN (NaN would count
# as "mutated" once the table is cast to bool).
identical_mut_maf = identical_mut_maf.reindex(columns=PARAMETERS["sample_order"], fill_value=0)

In [None]:
# Presence/absence heatmap: one row per mutation, one column per sample,
# keeping only mutations seen in at least the configured number of samples.
# fillna(0) first, because astype(bool) would turn NaN into True.
mutation_presence = identical_mut_maf.fillna(0).astype(bool)
mutation_presence_grid = sns.clustermap(
    mutation_presence[mutation_presence.sum(axis=1) >= PARAMETERS["min_nr_samples_with_mutation"]],
    row_cluster=False, col_cluster=False, figsize=[12, 12])
# sns.plt is not available in current seaborn; title the heatmap axes of the
# returned grid directly instead of going through pyplot's "current axes".
mutation_presence_grid.ax_heatmap.set_title("Mutation existence");

## What is the MAF of the identical NS mutation(s)?

In [None]:
# Annotated heatmap of the MAF values themselves, for mutations that are
# non-zero in at least the configured number of samples.
# The sample count is taken on a NaN-free copy because astype(bool) would turn
# NaN into True; the plotted values are left untouched.
samples_per_mutation = identical_mut_maf.fillna(0).astype(bool).sum(axis=1)
mutation_maf_grid = sns.clustermap(
    identical_mut_maf[samples_per_mutation >= PARAMETERS["min_nr_samples_with_mutation"]],
    row_cluster=False, col_cluster=False, figsize=[12, 12], annot=True)
# sns.plt is not available in current seaborn; title the heatmap axes of the
# returned grid directly instead of going through pyplot's "current axes".
mutation_maf_grid.ax_heatmap.set_title("MAF per mutation");

## What is the max MAF of each gene with NS mutation(s)?

In [None]:
# Per gene (rows) and sample (columns): the highest val_maf among
# MODERATE/HIGH impact annotation rows with val_maf > 0. Missing pairs become 0.
gene_max_maf = (
    dfa_split[dfa_split["IMPACT_SPLIT"].isin(["MODERATE", "HIGH"]) & (dfa_split.val_maf > 0)]
    .groupby(["sample", "GENE_SPLIT"]).val_maf.max()
    .unstack("sample")
    .fillna(0)
)
# Order the columns as configured. reindex(columns=...) replaces reindex_axis,
# which no longer exists in current pandas. fill_value=0 keeps a configured
# sample without any qualifying mutation at 0 instead of NaN (NaN would count
# as "mutated" once the table is cast to bool).
gene_max_maf = gene_max_maf.reindex(columns=PARAMETERS["sample_order"], fill_value=0)

In [None]:
# Annotated heatmap of the per-gene maximum MAF, for genes that are non-zero
# in at least the configured number of samples.
# The sample count is taken on a NaN-free copy because astype(bool) would turn
# NaN into True; the plotted values are left untouched.
samples_per_gene = gene_max_maf.fillna(0).astype(bool).sum(axis=1)
gene_max_maf_grid = sns.clustermap(
    gene_max_maf[samples_per_gene >= PARAMETERS["min_nr_samples_with_mutation"]],
    row_cluster=False, col_cluster=False, figsize=[12, 12], annot=True)
# sns.plt is not available in current seaborn; title the heatmap axes of the
# returned grid directly instead of going through pyplot's "current axes".
gene_max_maf_grid.ax_heatmap.set_title("Max mutation MAF per gene");