The goal of this notebook is to analyze some specific trends among mutation rates. Most of the mutation rate plotting is performed natively by maple, but there are some auxiliary analyses I want to demonstrate that don't need to be part of the maple pipeline. Here I plot the mutation rate in each region against the mutation rates in the other regions as scatter plots, showing that even within an individual biological replicate, mutation rates in regions under selection are lower than mutation rates in regions without selection.

In [2]:
import pandas as pd
import hvplot.pandas
import holoviews as hv
import sys
import statsmodels.api as sm
hv.extension('bokeh')

# full file path to the folder containing the SequenceAnalyzer.py and common.py files from github.com/gordonrix/maple
# NOTE: this is an absolute, machine-specific path; edit it to point at your own maple checkout before running.
path_to_SA = '/home/redwood/Desktop/data_tools/maple/rules/utils'

# Extend the module search path so that `common` (located in the folder above) can be imported.
sys.path.append(path_to_SA)
from common import export_svg_plots

# Columns present in every per-region table; together they form the join key.
merge_keys = ['sample_label', 'replicate', 'mut_type', 'wt_nt', 'mut_nt']
# Generic rate column and fit statistics, dropped once the rows have been filtered.
fit_columns = ['rate', 'intercept', 'r_value', 'p_value', 'std_err']

# Load one filtered table per region, each carrying its rate in a region-specific column.
region_tables = []
for analysis in ['LEU2', 'URA3', 'mScarlettI']:
    region_rates = pd.read_csv(f'../mutation_data/DE5-{analysis}/DE5-{analysis}_mutation-rates.csv',
                               usecols=range(10))
    # Keep rows whose rate is not exactly 1e-10 (presumably a placeholder written
    # upstream - TODO confirm) and whose fit r_value exceeds 0.6.
    keep = (region_rates['rate'] != 1e-10) & (region_rates['r_value'] > 0.6)
    region_rates = region_rates[keep]
    region_rates = region_rates.assign(**{f'rate_{analysis}': region_rates['rate']})
    region_tables.append(region_rates.drop(columns=fit_columns))

# Merge the tables left to right on the shared key columns (default inner join),
# so only rows present for every region survive.
data = region_tables[0]
for region_rates in region_tables[1:]:
    data = data.merge(region_rates, on=merge_keys)

# Pairwise ratios of the per-region rates.
data = data.assign(
    URA_over_LEU=data['rate_URA3'] / data['rate_LEU2'],
    mScarlettI_over_LEU=data['rate_mScarlettI'] / data['rate_LEU2'],
    URA3_over_mScarlettI=data['rate_URA3'] / data['rate_mScarlettI'],
)

# Split the rows labelled 'all' in mut_type from the remaining rows.
is_aggregate = data['mut_type'] == 'all'
data_all = data[is_aggregate]
data_individual = data[~is_aggregate]
data_all

Unnamed: 0,sample_label,replicate,mut_type,wt_nt,mut_nt,rate_LEU2,rate_URA3,rate_mScarlettI,URA_over_LEU,mScarlettI_over_LEU,URA3_over_mScarlettI
0,I777K-L900S,1,all,all,all,0.000013,0.000014,0.000012,1.087606,0.919977,1.182210
10,I777K-L900S,4,all,all,all,0.000013,0.000018,0.000007,1.337489,0.545763,2.450676
13,TKS,1,all,all,all,0.000016,0.000015,0.000013,0.911457,0.799497,1.140039
18,TKS,2,all,all,all,0.000016,0.000015,0.000014,0.941482,0.826301,1.139394
23,TKS,3,all,all,all,0.000014,0.000015,0.000014,1.081413,0.997089,1.084571
...,...,...,...,...,...,...,...,...,...,...,...
493,BB-5k,2,all,all,all,0.000102,0.000239,0.000213,2.330415,2.081950,1.119342
504,BB-5k,3,all,all,all,0.000046,0.000154,0.000135,3.378942,2.960668,1.141277
511,BB-5k,4,all,all,all,0.000090,0.000214,0.000261,2.392292,2.910024,0.822087
519,BB-3E,3,all,all,all,0.000095,0.000205,0.000174,2.159838,1.829493,1.180567


In [3]:
def plot_rate_v_rate(data, rate1, rate2, min_value, max_value):
    """Scatter one region's mutation rates against another's, with a through-origin fit.

    Args:
        data: DataFrame containing the columns `rate_{rate1}` and `rate_{rate2}`.
            It supplies both the scatter points and the values used for the fit.
        rate1: name suffix of the rate column plotted on the x axis.
        rate2: name suffix of the rate column plotted on the y axis.
        min_value: lower limit applied to both axes.
        max_value: upper limit applied to both axes.

    Returns:
        The scatter, the dashed fit line and the two text annotations
        (R² and slope) combined with the HoloViews `*` operator.
    """
    x_col = f'rate_{rate1}'
    y_col = f'rate_{rate2}'

    # Fit on the frame that was passed in. The regression previously read the
    # global `data_all`, so the line and statistics ignored the `data` argument.
    y = data[y_col].to_numpy()
    x = data[x_col].to_numpy()
    # No constant term is added (wrap x in sm.add_constant to fit an intercept),
    # so the line is forced through the origin; statsmodels then reports the
    # uncentered R² for this model.
    fit = sm.OLS(y, x).fit()
    fit_r2 = fit.rsquared
    slope = fit.params[0]

    # Place the annotations relative to the axis range so they stay inside the
    # frame for any limits; for limits (0, 2.5e-4) this gives 2e-4 and 1.9e-4.
    span = max_value - min_value
    text_x = min_value + 0.8 * span
    r2_text_y = min_value + 0.8 * span
    slope_text_y = min_value + 0.76 * span

    scatter = (data.hvplot.scatter(x=x_col, y=y_col, c='black', cmap='viridis', colorbar=True, frame_width=500, frame_height=500)
               .opts(fontsize={'title':16,'labels':14,'xticks':10,'yticks':10}, xlabel=f'mutation rate, {rate1}', ylabel=f'mutation rate, {rate2}', xlim=(min_value,max_value), ylim=(min_value,max_value)))
    fit_line = hv.Curve(([min_value, max_value], [min_value * slope, max_value * slope])).opts(
        color="lightgrey", line_dash='dashed')
    r2_label = hv.Text(text_x, r2_text_y, f'R² = {fit_r2:.2f}', halign='left', valign='top').opts(color='black')
    slope_label = hv.Text(text_x, slope_text_y, f'slope = {slope:.2f}', halign='left', valign='top').opts(color='black')

    return scatter * fit_line * r2_label * slope_label

# Shared limits for both axes of every panel.
min_value, max_value = 0, 2.5e-4

# Each pair is (y-axis region, x-axis region); one panel is drawn per pair.
axis_pairs = [('URA3','mScarlettI'),('LEU2','mScarlettI'),('LEU2','URA3')]

plots = [
    plot_rate_v_rate(data_all, x_region, y_region, min_value, max_value)
    for y_region, x_region in axis_pairs
]

# Export the panels, then display them side by side in three columns.
export_svg_plots(plots, 'rate_comparisons.html', labels=['UvM','LvM', 'LvU'])
hv.Layout(plots).cols(3)



### code graveyard

In [2]:
# # use statsmodels OLS to calculate R² between the two replicates
# y = data_all['URA_over_LEU'].to_numpy()
# x = sm.add_constant(data_all['rate_URA3'].to_numpy())
# fit = sm.OLS(y,x).fit()
# fit_r2_URA = fit.rsquared
# intercept_URA = fit.params[0]
# slope_URA = fit.params[1]

# y = data_all['mScarlettI_over_LEU'].to_numpy()
# x = sm.add_constant(data_all['rate_mScarlettI'].to_numpy())
# fit = sm.OLS(y,x).fit()
# fit_r2_mScar = fit.rsquared
# intercept_mScar = fit.params[0]
# slope_mScar = fit.params[1]

# y = data_all['rate_mScarlettI'].to_numpy()
# x = sm.add_constant(data_all['rate_URA3'].to_numpy())
# fit = sm.OLS(y,x).fit()
# fit_r2_mScar_URA3 = fit.rsquared
# intercept_mScar_URA3 = fit.params[0]
# slope_mScar_URA3 = fit.params[1]

In [3]:
# min_value = 0
# max_value = 2.5e-4

# plots = []

# plots.append(data_all.hvplot.scatter(x='rate_URA3', y='URA_over_LEU', c='black', cmap='viridis', colorbar=True, frame_width=500, frame_height=500)
#              .opts(fontsize={'title':16,'labels':14,'xticks':10,'yticks':10}, xlabel='mutation rate, URA3', ylabel='mutation rate, URA3 / LEU2') *
#     hv.Curve(([min_value, max_value], [min_value * slope_URA + intercept_URA, max_value * slope_URA + intercept_URA])).opts(
#                     color="lightgrey", line_dash='dashed') *
#     hv.Text(2e-4, 3.2, f'R² = {fit_r2_URA:.2f}', halign='left', valign='top').opts(color='black'))

In [4]:
# plots.append(data_all.hvplot.scatter(x='rate_mScarlettI', y='mScarlettI_over_LEU', c='black', cmap='viridis', colorbar=True, frame_width=500, frame_height=500)
#              .opts(fontsize={'title':16,'labels':14,'xticks':10,'yticks':10}, xlabel='mutation rate, mScarlettI', ylabel='mutation rate, mScarlettI / LEU2') *
#     hv.Curve(([min_value, max_value], [min_value * slope_mScar + intercept_mScar, max_value * slope_mScar + intercept_mScar])).opts(
#                     color="lightgrey", line_dash='dashed') *
#     hv.Text(2e-4, 3.2, f'R² = {fit_r2_mScar:.2f}', halign='left', valign='top').opts(color='black'))

In [5]:
# plots.append(data_all.hvplot.scatter(x='rate_URA3', y='mScarlettI_over_LEU', c='black', cmap='viridis', colorbar=True, frame_width=500, frame_height=500)
#              .opts(fontsize={'title':16,'labels':14,'xticks':10,'yticks':10}, xlabel='mutation rate, mScarlettI', ylabel='mutation rate, mScarlettI / LEU2') *
#     hv.Curve(([min_value, max_value], [min_value * slope_mScar + intercept_mScar, max_value * slope_mScar + intercept_mScar])).opts(
#                     color="lightgrey", line_dash='dashed') *
#     hv.Text(2e-4, 3.2, f'R² = {fit_r2_mScar:.2f}', halign='left', valign='top').opts(color='black'))

# export_svg_plots(plots, 'selection_effect_on_mutation_rate.html', labels=['URA3', 'mScarlettI'])
# hv.Layout(plots).cols(3)

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  layout_plot = gridplot(
  layout_plot = gridplot(


  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type
