## Plot variance by mutation

Plots for Results section on influence of context on mutation.

In [None]:
import pickle
import sys, os
import gzip
import numpy as np
import pandas as pd
from cogent3 import DNA
import seaborn as sns
import datetime
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

path = "..."              #insert path to data
# The cells below use relative paths ('data/...', 'Article_references/...'),
# so switch to the data directory unless we are already there.
if os.getcwd() != path:
    os.chdir(path)
sns.set_style("whitegrid")

Plot showing effect of context aggregated over mutation types.

In [None]:
# Bar plot of variance against k, with one bar colour per value of the
# 'Marginalise over central base?' column; the figure is written to a PDF
# together with provenance metadata.
filename = "..."           #insert name of file produced by aggregate_mutation_analysis.py
data = pd.read_csv(filename, sep=',')
fname = "Article_references/context-var.pdf"
with PdfPages(fname) as pdf:
    fig = plt.figure()
    ax = sns.barplot(x='kmer', y='variance', hue='Marginalise over central base?', data=data)
    ax.set_ylim(top=0.0006)
    ax.set_xlabel('$k$')
    # Raw string: '\h' and '\s' are not valid Python escape sequences, so the
    # non-raw form triggers an invalid-escape warning on newer Python versions.
    # The resulting label text is unchanged.
    ax.set_ylabel(r'$\hat{\sigma }_k^2$')
    # Record where the figure came from in the PDF metadata.
    d = pdf.infodict()
    d['Title'] = 'Plot of variance due to context aggregated over mutation directions.'
    d['Author'] = 'H. Simon'
    d['Subject'] = 'Datafile: ' + filename
    d['Keywords'] = 'Notebook: ' + 'plot_variance_by_mutation.ipynb'
    d['CreationDate'] = datetime.datetime.today()
    pdf.savefig(fig, bbox_inches='tight')
plt.show()

Print plot comparing effects of contexts for different mutation types.

In [None]:
job = '...'                #insert job number to identify input file
# Load the pickled samples for k = 1, 2, 3; they are plotted below under the
# labels '3-mer', '5-mer' and '7-mer' respectively.
tables = []
for k in ['1', '2', '3']:
    filename = 'data/bayes_var_samples_ba' + job + '_k=' + k + '.pklz'
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with gzip.open(filename, 'rb') as handle:
        table = pickle.load(handle)
    tables.append(table)
# The output name does not depend on k, so it is built once, after the loop.
fname = "Article_references/context-var-individual_" + job + ".pdf"
with PdfPages(fname) as pdf:
    # 4 x 4 grid of panels: row = 'from' allele, column = 'to' allele.
    fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(10,10))
    palette = sns.color_palette("coolwarm", 3)  # not passed to any plotting call below
    alleles = ['C', 'T', 'A', 'G']
    for i, from_allele in enumerate(alleles):
        for j, to_allele in enumerate(alleles):
            panel = axes[i, j]
            panel.spines['right'].set_visible(False)
            panel.spines['top'].set_visible(False)
            # Switch the grid off explicitly. grid(b=None) merely toggled the
            # current state, and the 'b' keyword was renamed to 'visible' in
            # newer Matplotlib releases; a positional False works with both.
            panel.grid(False)
            panel.set_yticklabels([])
            if i == j:
                # Diagonal panels (from == to) are not mutations: leave them blank.
                panel.spines['left'].set_visible(False)
                panel.spines['bottom'].set_visible(False)
                panel.set_xticklabels([])
            else:
                mut_type = from_allele + '->' + to_allele
                sns.kdeplot(tables[0][mut_type], ax=panel, label='3-mer')
                sns.kdeplot(tables[1][mut_type], ax=panel, label='5-mer')
                sns.kdeplot(tables[2][mut_type], ax=panel, label='7-mer')
                panel.spines['left'].set_color('black')
                panel.spines['bottom'].set_color('black')
                panel.ticklabel_format(axis='x', style='sci', scilimits=(0,0))
                # Remove a per-panel legend only if one was drawn; get_legend()
                # returns None when the axes has no legend.
                legend = panel.get_legend()
                if legend is not None:
                    legend.remove()
                if mut_type not in ['C->T', 'G->A']:
                    panel.set_xlim(left=0)
    # Column titles and row labels name the alleles.
    for i, nuc in enumerate(alleles):
        axes[0, i].set_title(nuc, fontsize=20)
        axes[i, 0].set_ylabel(nuc, fontsize=20)
    fig.subplots_adjust(wspace=0.6, hspace=0.4)
    fig.text(0.5,0.96, 'TO', ha='center', fontsize=30)
    fig.text(0.00, 0.5, 'FROM', va='center', rotation='vertical', fontsize=30)
    # A single shared legend goes in the empty bottom-right diagonal panel,
    # using the line handles from one of the populated panels.
    handles, labels = axes[0, 1].get_legend_handles_labels()
    axes[3, 3].legend(handles, labels, mode='expand', title='LEGEND')
    d = pdf.infodict()
    d['Title'] = 'Plot of variance due to context by mutation type.'
    d['Author'] = 'H. Simon'
    # filename still holds the last file loaded above (k=3, labelled 7-mer).
    d['Subject'] = 'Datafile (7-mer): ' + filename
    d['Keywords'] = 'Notebook: ' + 'plot_variance_by_mutation.ipynb'
    d['CreationDate'] = datetime.datetime.today()
    pdf.savefig(fig)
plt.show()

Plot strand-symmetric mutation types together, mimicking the 3 x 3 format above (fig_ssa.png). Note that the from and to heads are meaningless.

In [None]:
job = "200"
filename = 'data/bayes_var_samples_ba' + job + '_k=3.pklz'
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with gzip.open(filename, 'rb') as handle:
    table = pickle.load(handle)
fname = "Article_references/fig_ss_b.pdf"     # change to fig_ss_b.pdf for intergenic
# NOTE(review): the comment above names the file already assigned; confirm the
# intended alternative file name.

# Saving was disabled in the original cell (savefig commented out), but the
# PdfPages context still opened fname and left an empty PDF behind, replacing
# any existing figure. Saving is now an explicit switch, so the file is only
# touched when a page is actually written.
save_pdf = False

# 2 x 3 grid: each panel overlays a mutation type and its complement.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(30,10))
palette = sns.color_palette("coolwarm", 3)  # not passed to any plotting call below
mtypes = ['T\u2192C', 'C\u2192T', 'A\u2192C', 'A\u2192T', 'G\u2192C', 'G\u2192T']
for i in range(2):
    for j in range(3):
        panel = axes[i, j]
        panel.locator_params(axis='x', nbins=5)
        ix = 3 * i + j
        mut_type1 = mtypes[ix]
        # Characters 0 and 2 of the label are the from/to alleles; the arrow
        # sits between them. Table keys use '->' rather than the arrow glyph.
        mut_type2 = DNA.complement(mut_type1[0]) + '\u2192' + DNA.complement(mut_type1[2])
        sns.kdeplot(table[mut_type1[0] + '->' + mut_type1[2]], ax=panel, label=mut_type1, lw=4)
        sns.kdeplot(table[mut_type2[0] + '->' + mut_type2[2]], ax=panel, label=mut_type2, lw=4)
        panel.ticklabel_format(axis='x', style='sci', scilimits=(0,0))
        panel.xaxis.get_offset_text().set_fontsize(28)
        # tick_params replaces the per-tick loop over the deprecated
        # Tick.label attribute and also applies to ticks created later.
        panel.tick_params(axis='x', labelsize=28)
        panel.ticklabel_format(axis='y', style='sci', scilimits=(0,0))
        panel.set_yticklabels([])
        panel.set_xlim(left=0)
        panel.spines['right'].set_visible(False)
        panel.spines['top'].set_visible(False)
        panel.spines['left'].set_color('black')
        panel.spines['bottom'].set_color('black')
        panel.legend(loc=3, fontsize=28)
fig.subplots_adjust(wspace=0.2, hspace=0.4)
if save_pdf:
    with PdfPages(fname) as pdf:
        d = pdf.infodict()
        d['Title'] = 'Plot of variance due to context showing strand-asymmetry.'
        d['Author'] = 'H. Simon'
        d['Subject'] = 'Datafile: ' + filename
        d['Keywords'] = 'Notebook: ' + 'plot_variance_by_mutation.ipynb'
        d['CreationDate'] = datetime.datetime.today()
        pdf.savefig(fig)
plt.show()

The cell below can be used to see if upper and lower quantiles of the posterior distributions for complementary mutations overlap.

In [None]:
# Print the 97.5% quantile of the T->A samples, then the 2.5% quantile of the
# A->T samples, so the two can be compared by eye.
for mutation, prob in (('T->A', 0.975), ('A->T', 0.025)):
    print(np.quantile(table[mutation], prob))

Check minimum context count and how many variant count cells contain zeros.

In [None]:
# Show the smallest entry of the context table together with its row and
# column labels.
filename = '...'
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with gzip.open(filename, 'rb') as handle:
    context_data = pickle.load(handle)
# stack() yields a Series keyed by (row, column), so idxmin() returns the
# label pair of the minimum. Assumes a single-level index - TODO confirm.
a, b = context_data.stack().idxmin()
print(context_data.loc[[a], [b]])

# Count the cells of the variant table that are below 1.
filename = '...'
with gzip.open(filename, 'rb') as handle:
    var_data = pickle.load(handle)
# Vectorised comparison replaces the per-cell .loc double loop; NaN cells
# compare False in both forms, so they are not counted.
count = int((var_data < 1).sum().sum())
print('Number of zero variant counts = ', count, ' out of ', var_data.size)