## Generate tables for manuscript

Generate LaTeX code for the tables used in the manuscript and supplementary information.

Import modules.

In [1]:
import os
import numpy as np
import pandas as pd
from cogent3 import make_table
import gzip, pickle

Table showing variance due to recombination for each of the 12 point mutation directions (Results).

In [None]:
# Build the Results table (written to T1.tex): variance in SNV density due to
# recombination, one row per point mutation direction.
path = "..."          # Insert required path
if os.getcwd() != path:
    os.chdir(path)
rfilename = '...'     # Insert name of file produced by ARMA_pq_analysis.py
rtable = pd.read_csv(rfilename, sep=',', index_col=0)
rtable = rtable[['mrate', 'pval', 'variance', 'variance25', 'variance975']]
# Reindex so that strand-symmetric mutations are together.
newindex = ['C->T', 'G->A', 'T->C', 'A->G', 'C->G', 'G->C', 'T->G', 'A->C', 'T->A', 'A->T', 'C->A', 'G->T']
rtable = rtable.loc[newindex]
# Turn each 'X->Y' row label into LaTeX: first character, arrow macro, fourth character.
newix = [x[0] + r'\textrightarrow ' + x[3] for x in rtable.index.values]
rtable.index = newix
# Raw strings keep the LaTeX backslashes literal without relying on Python's
# handling of unrecognised escape sequences (\h, \s, \%).
sigma_rec = r'$\hat{\sigma }^2_{rec}$'
lower_cl = r'Lower CL 95\%'
upper_cl = r'Upper CL 95\%'
column_headings = ['SNV Density', 'Probability', sigma_rec, lower_cl, upper_cl]
rtable.columns = column_headings
rtable.insert(loc=0, column='Mutation', value=rtable.index)
# The caption is assembled by implicit concatenation so that it stays a single
# line of text with no embedded newlines.
title = (
    r"Analysis of the linear relationship between recombination rates and SNV densities for "
    r"chromosome 1 disaggregated by mutation direction. `SNV Density' is the SNV density for that mutation "
    r"direction (conditioned on ancestral allele); `Probability' is the posterior probability that the slope "
    r"parameter from the linear regression is less than zero; `$\hat{\sigma }^2_{rec}$' is the estimated variance "
    r"due to recombination and  `Lower CL 95\%' and  `Upper CL 95\%' are the limits of the 95\% credibility interval "
    r"for $\hat{\sigma }^2_{rec}$. Since the estimated variance in SNV density due to recombination is "
    r"calculated as the difference between the total variance in SNV density and the sum of squares of the residuals, "
    r"it will be negative if the model fit is worse than for a line with zero slope. This is likely to occur when the "
    r"`Probability' value is significantly greater than zero and we reject the model."
)
t = make_table(data_frame=rtable, title=title)
# The variance estimate and its credibility limits share one scientific-notation format.
for heading in (sigma_rec, lower_cl, upper_cl):
    t.format_column(heading, "%.1e")
t.write("T1.tex", label="tab:recombination_mutation_types", justify="lccccc")

Supplementary Table (recombination).

In [None]:
# Build supplementary table S1.tex: recombination analysis results per chromosome.
# NOTE(review): this is a machine-specific absolute path; replace it with the
# required path before running elsewhere.
path = "/Users/helmutsimon/Google Drive/Genetics/Neighbourhood Effects Project/Recombination_data"          # Insert required path
if os.getcwd() != path:
    os.chdir(path)
fname = '...'  # Insert name of file produced by ARMA_pq_analysis_all_by_chrom.py
chfile = pd.read_csv(fname, sep=',', index_col=0)
chrs = list(np.arange(1, 23))
# Raw string keeps the LaTeX backslashes literal (\h and \s are not valid Python escapes).
sigma_rec = r'$\hat{\sigma }^2_{rec}$'
columns = ['SNV density', 'p', 'q', 'Slope', 'Intercept', sigma_rec, 'Slope (M)', 'Percent']
recomb_chr_summary = pd.DataFrame(np.zeros((22, 8)), index=chrs, columns=columns)

# Columns of the input file, in the same order as `columns` above.
source_fields = ['snvdens', 'p', 'q', 'beta', 'alpha', 'variance', 'slopem', 'mutperco']
# Rows of chfile are looked up by the labels 0..21.
# NOTE(review): assumes label ix in chfile holds chromosome ix + 1 -- confirm against the input file.
for ix, ch in enumerate(chrs):
    recomb_chr_summary.loc[ch, columns] = chfile.loc[ix, source_fields].to_numpy()
AVERAGE_BIN_CM = 0.0116  # bins are normalised relative to an average of 0.0116 cM
recomb_chr_summary['Slope'] = recomb_chr_summary['Slope'] / AVERAGE_BIN_CM
# 'Percent' was filled from 'mutperco' above and is overwritten here with
# (SNV density - Intercept) as a percentage of SNV density.
recomb_chr_summary['Percent'] = (
    (recomb_chr_summary['SNV density'] - recomb_chr_summary['Intercept']) * 100
    / recomb_chr_summary['SNV density']
)
recomb_chr_summary.insert(loc=0, column='Chr', value=recomb_chr_summary.index)
# The caption is assembled by implicit concatenation so it contains no newlines.
title = (
    r"Results of analysis of variance due to recombination by chromosome. "
    r"`p' and `q' define the ARMA(p,q) distribution used; `Slope' and `Intercept' are the estimated parameters of"
    r" the linear model expressed in terms of change in SNV density per centimorgan and SNV density respectively; "
    r"`$\hat{\sigma }^2_{rec}$' is the estimated variance in SNV density due to recombination; "
    r"`Slope (M)' is the estimated slope parameter expressed as change in mutation rate per centimorgan; and "
    r"`Percent' is the estimated percentage of SNVs due to recombination."
)
t = make_table(data_frame=recomb_chr_summary, title=title)
t.format_column('p', "%1d")
t.format_column('q', "%1d")
t.format_column(sigma_rec, "%.2e")
t.format_column('Slope (M)', "%.2e")
# '%%' yields a literal percent sign; the preceding backslash escapes it for LaTeX.
t.format_column('Percent', r"%.3f\%%")
t.write("S1.tex", label="tab:supp-recomb", justify="ccccccccc")

Supplementary table showing posterior probability for influence of recombination on mutation by mutation type and chromosome.

In [None]:
# Build supplementary table S2.tex: the 'pval' column of each per-chromosome
# result file, arranged as one row per chromosome and one column per mutation direction.
path = "..."           # Insert required path
if os.getcwd() != path:
    os.chdir(path)
chroms = np.arange(1,23).astype(str)
muts = ['C->T', 'G->A', 'T->C', 'A->G', 'C->G', 'G->C', 'T->G', 'A->C', 'T->A', 'A->T', 'C->A', 'G->T']
result = pd.DataFrame(np.zeros((len(chroms), len(muts))), index=chroms, columns=muts)
for c in chroms:
    # Odd- and even-numbered chromosomes are read from differently named files.
    # NOTE(review): the 'ARARMApq' prefix for odd chromosomes looks unusual -- confirm
    # it matches the actual file names on disk.
    if int(c) % 2 == 1:
        fname = 'Recombination_data/ARMApq_results_sexav_ARARMApq_ch' + c + '.csv'
    else:
        fname = 'Recombination_data/ARMApq_results_sexav_ARMApq_ch' + c + '.csv'
    cfile = pd.read_csv(fname, sep=',', index_col=0)
    # presumably cfile is indexed by mutation direction so that the values line
    # up with the columns of `result` -- verify against the input files.
    result.loc[c] = cfile['pval']
# Turn each 'X->Y' column label into LaTeX: first character, arrow macro, fourth character.
newcols = [x[0] + r'\textrightarrow ' + x[3] for x in result.columns.values]
result.columns = newcols
result.insert(loc=0, column='Chr', value=result.index)
print(result)
# Implicit concatenation keeps source indentation out of the caption text.
title = (
    "Posterior probability that recombination does not have a positive effect on mutation "
    "by point mutation direction and chromosome."
)
t = make_table(data_frame=result, title=title, digits=2)
t.write("S2.tex", label="supp_recombination_chromosomes", justify="lcccccccccccc")

Supplementary table showing variance due to context for each of the 12 point mutation directions.

In [None]:
# Build supplementary table S3.tex: mean sampled variance per mutation direction
# for three result files (k = 1, 2, 3), plus the SNV density for reference.
path = "..."            # Insert required path
if os.getcwd() != path:
    os.chdir(path)
job = '...'           # Insert job identifier
# NOTE(security): pickle.load can execute arbitrary code; only load files you produced yourself.
filename = 'data/var_counts_1ba' + job + '.pklz'
with gzip.open(filename, 'rb') as handle:
    var_counts = pickle.load(handle)
filename = 'data/context_counts_1ba' + job + '.pklz'
with gzip.open(filename, 'rb') as handle:
    context_counts = pickle.load(handle)
tables = list()
for k in ['1', '2', '3']:
    filename = 'data/bayes_var_samples_ba' + job + '_k=' + k + '.pklz'
    with gzip.open(filename, 'rb') as handle:
        table = pickle.load(handle)
    tables.append(table)
# Raw strings keep the LaTeX backslashes literal (\h and \s are not valid Python escapes).
sigma_3 = r'$\hat{\sigma }^2_3$'
sigma_5 = r'$\hat{\sigma }^2_5$'
sigma_7 = r'$\hat{\sigma }^2_7$'
columns = ['Density', sigma_3, sigma_5, sigma_7]
index = tables[0].columns
table1 = pd.DataFrame(np.zeros((12,4)), index=index, columns=columns)
for ix in index:
    # The last three headings pair positionally with tables[0], tables[1], tables[2].
    for kmer, c in enumerate(columns[-3:]):
        table1.loc[ix, c] = tables[kmer][ix].mean()
for ix in index:
    # Density = total count for this mutation direction divided by the total
    # count for its first character (ix[0]).
    # NOTE(review): presumably ix[0] is the ancestral allele -- confirm.
    var_sums = float(var_counts[[ix]].sum())
    context_sums = float(context_counts[[ix[0]]].sum())
    table1.loc[ix, 'Density'] = var_sums / context_sums
# Turn each 'X->Y' row label into LaTeX: first character, arrow macro, fourth character.
table1.index = [i[0] + r'\textrightarrow ' + i[3] for i in tables[0].columns]
table1.insert(loc=0, column='Mutation', value=table1.index)
# The caption is assembled by implicit concatenation so it contains no newlines.
title = (
    r"Variance in probability of SNVs due to context. $\hat{\sigma }^2_k$ denote the estimated"
    r" variances for context size $k$. The size of context includes the central allele. Results are conditioned on"
    r" mutation direction (ancestral and derived state). The column `Density' shows the density for each"
    r" SNV direction (conditioned on the ancestral allele) for reference. See Methods and materials for data sources."
)
t = make_table(data_frame=table1, title=title)
for heading in (sigma_3, sigma_5, sigma_7):
    t.format_column(heading, "%.2e")
t.write("S3.tex", label="tab:supp_context", justify="ccccc")

Supplementary table showing results for recombination using ordinary least squares linear regression (OLSLR).

In [5]:
# Build supplementary table S4.tex: per-chromosome recombination results from
# ordinary least squares linear regression (OLSLR).
path = "..."        # Insert required path
if os.getcwd() != path:
    os.chdir(path)
fname = path + '...'  # Insert filename
OLSLRfile = pd.read_csv(fname, sep=',', index_col=0)
# Raw string keeps the LaTeX backslashes literal (\h and \s are not valid Python escapes).
sigma_rec = r'$\hat{\sigma }^2_{rec}$'
OLSLRfile.columns = ['SNV density', 'Slope', 'Intercept', sigma_rec, 'Mutations']
OLSLRfile.insert(loc=0, column='Chr', value=OLSLRfile.index)
# The caption is assembled by implicit concatenation so it contains no newlines.
title = (
    r"Results of analysis of variance due to recombination by chromosome using ordinary least squares linear "
    r"regression (OLSLR). `Slope' and `Intercept' are the estimated parameters"
    r" of the linear model expressed in terms of change in SNV density per centimorgan and SNV density respectively;"
    r" `$\hat{\sigma }^2_{rec}$' is the estimated variance in SNV density due to recombination;"
    r" and `Mutations' is the estimated average number of mutations resulting from a recombination event."
)
t = make_table(data_frame=OLSLRfile, title=title)
# One column spec per table column: 'Chr' plus the five renamed columns.
t.write("S4.tex", label="tab:supp-OLSLR", justify="cccccc")

Supplementary Table showing numbers of variants by chromosome.

In [43]:
# Build supplementary table S5.tex: counts of filtered variants per chromosome.
# NOTE(review): this is a machine-specific absolute path; replace it with the
# required path before running elsewhere.
path = "/Users/helmutsimon/Google Drive/Genetics/Neighbourhood Effects Project"        # Insert required path
if os.getcwd() != path:
    os.chdir(path)
f1name = os.path.join(path, 'data', 'merge_counts_mcd020.csv')  # Insert filename
ccounts = pd.read_csv(f1name, sep=',', index_col=0)
f2name = os.path.join(path, 'Recombination_data', 'recomb_var_countscrtv001.csv')
rcounts = pd.read_csv(f2name, sep=',', index_col=0)
# Both count series are taken from the second data column of their file.
results = pd.DataFrame()
results['Intronic variants'] = ccounts.iloc[:,1]
results['All variants'] = rcounts.iloc[:,1].astype(int)
results.insert(loc=0, column='Chromosome', value=np.arange(1,23))
# Implicit concatenation keeps source indentation out of the caption text.
title = (
    "Counts of filtered variants by chromosome. Intronic variants were used for context analysis and all "
    "variants were used for recombination analysis"
)
t = make_table(data_frame=results, title=title)
t.write("S5.tex", label="tab:supp-counts", justify="ccc")