In [1]:
#!/usr/bin/python3
#
#   Written by Christian D. Powell, 10/14/2019
#   Copyright (c) 2019. Christian D. Powell and Hunter N.B. Moseley
#   All rights reserved.
#
#   Python Notebook used to generate image files for publication.

from copy import deepcopy
import jsonpickle
from math import log2
import numpy as np
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode
from ProtamineAnalysis import *
from scipy.stats import ttest_ind
# Enable Plotly's offline notebook rendering before any figure is shown.
# With connected=True, plotly.js is loaded from an online CDN rather than being
# embedded in the notebook, so rendering the figures needs network access.
init_notebook_mode(connected=True)

# One-letter residue codes that the variance-analysis cell combines with
# arginine, one at a time. 'R' itself is not listed. 'B', 'Z' and 'X' are the
# IUPAC ambiguity codes (Asx, Glx, unknown).
# The order is significant: that cell indexes the per-residue results by
# position (e.g. aa_ranges[4]) and labels traces with AMINOACIDS[i].
AMINOACIDS = tuple('AQLSEKTNGMWDHFYCIPVBZX')

In [2]:
#
#   Open Datasets
#

def _read_alignment(json_path):
    """Read the file at *json_path* and return the object jsonpickle decodes from it."""
    # NOTE(review): jsonpickle.decode rebuilds arbitrary Python objects, so the
    # dataset files must come from a trusted source.
    with open(json_path, 'r') as handle:
        return jsonpickle.decode(handle.read())


# One decoded alignment per protamine group; paths are relative to the
# notebook's working directory.
ep1_align = _read_alignment('Docs/Datasets/eutherian_p1.json')
ep2_align = _read_alignment('Docs/Datasets/eutherian_p2.json')
mp1_align = _read_alignment('Docs/Datasets/metatherian.json')
fish_align = _read_alignment('Docs/Datasets/fish.json')

# Fixed order: eutherian P1, eutherian P2, metatherian, fish.
alignments = [ep1_align, ep2_align, mp1_align, fish_align]

In [3]:
#
#   Figure 5. Arginine-Lysine Residues
#
#   Box plots of the arginine-lysine frequency for eutherian protamine P1, eutherian P2, truncated eutherian P2,
#   metatherian P1, and fish protamines. Individual arginine-lysine frequencies are plotted next to the
#   associated box plot.
#
#   Note: Plotly uses 1.5 x IQR to determine the lower and upper fences (https://help.plot.ly/what-is-a-box-plot/)

def _rk_per_protein(alignment):
    """Return analysis.frequency(protein, ['R', 'K']) for each protein, in protein_list order."""
    return [analysis.frequency(protein, ['R', 'K']) for protein in alignment.protein_list]


# Whole-sequence R+K frequencies for every group.
ep1_RK_freqs = _rk_per_protein(ep1_align)
ep2_RK_freqs = _rk_per_protein(ep2_align)
# The trailing 48 is forwarded to analysis.region_frequencies unchanged;
# presumably it is the alignment position bounding the truncated P2 region —
# TODO confirm against ProtamineAnalysis.
ep2T_RK_freqs = analysis.region_frequencies(ep2_align, ['R', 'K'], 48)
mp1_RK_freqs = _rk_per_protein(mp1_align)
fish_RK_freqs = _rk_per_protein(fish_align)

# (x-axis label, values) for each box, in left-to-right plotting order.
rk_groups = (
    ('Eutherian P1', ep1_RK_freqs),
    ('Eutherian P2', ep2_RK_freqs),
    ('Truncated Eutherian P2', ep2T_RK_freqs),
    ('Metatherian', mp1_RK_freqs),
    ('Fish', fish_RK_freqs),
)
# boxpoints='all' draws every individual value beside its box.
data = [go.Box(y=freqs, name=label, boxpoints='all') for label, freqs in rk_groups]

# NOTE(review): the axis title says "(%)", but the values printed by the
# variance-analysis cell for analysis.frequency lie between 0 and 1 — confirm
# the unit before reusing this figure.
charge_freq_layout = go.Layout(
    xaxis={'title': 'Protamine Type'},
    yaxis={'title': 'Arginine-Lysine Frequency (%)'},
    showlegend=False,
)
fig = go.Figure(data=data, layout=charge_freq_layout)
fig.show()

In [4]:
#
#   Arginine-Lysine Residues (DNA Binding)
#
#   Box plots of the arginine-lysine frequency for eutherian P1, the eutherian P1 DNA binding region,
#   metatherian P1, the metatherian P1 DNA binding region, and fish protamines (for comparison only).
#   Individual arginine-lysine frequencies are plotted next to the associated box plot.
#
#   Note: Plotly uses 1.5 x IQR to determine the lower and upper fences (https://help.plot.ly/what-is-a-box-plot/)

# R+K frequency restricted to part of each alignment. The trailing integers are
# forwarded to analysis.region_frequencies unchanged; presumably they are the
# start/end alignment positions of the DNA binding region — TODO confirm.
ep1DNA_RK_freqs = analysis.region_frequencies(ep1_align, ['R', 'K'], 17, 48)
mp1DNA_RK_freqs = analysis.region_frequencies(mp1_align, ['R', 'K'], 17, 56)

# Whole-sequence series (ep1/mp1/fish) come from the Figure 5 cell; each is
# placed next to its DNA-binding counterpart, with fish last for comparison.
dna_comparison_groups = (
    ('Eutherian P1', ep1_RK_freqs),
    ('Eutherian P1 DNA', ep1DNA_RK_freqs),
    ('Metatherian', mp1_RK_freqs),
    ('Metatherian DNA', mp1DNA_RK_freqs),
    ('Fish', fish_RK_freqs),
)
data = [go.Box(y=freqs, name=label, boxpoints='all') for label, freqs in dna_comparison_groups]

charge_freq_layout = go.Layout(
    xaxis={'title': 'Protamine Type'},
    yaxis={'title': 'Arginine-Lysine Frequency (%)'},
    showlegend=False,
)
fig = go.Figure(data=data, layout=charge_freq_layout)
fig.show()

In [5]:
#
#   Figure 6. Arginine-Lysine Residues (DNA Binding ONLY)
#
#   Box plots of the arginine-lysine frequency for the DNA binding regions of eutherian protamine P1, metatherian
#   sperm protamine, and the whole sequences of fish protamines. Individual arginine-lysine frequencies are
#   plotted next to the associated box plot.
#
#   Note: Plotly uses 1.5 x IQR to determine the lower and upper fences (https://help.plot.ly/what-is-a-box-plot/)

# Region-restricted series come from the preceding DNA-binding cell; the fish
# series is the whole-sequence one from the Figure 5 cell.
dna_only_groups = (
    ('Eutherian P1 DNA', ep1DNA_RK_freqs),
    ('Metatherian DNA', mp1DNA_RK_freqs),
    ('Fish', fish_RK_freqs),
)
# One box per group, with every individual value drawn beside it.
data = [go.Box(y=freqs, name=label, boxpoints='all') for label, freqs in dna_only_groups]

charge_freq_layout = go.Layout(
    xaxis={'title': 'Protamine Type'},
    yaxis={'title': 'Arginine-Lysine Frequency (%)'},
    showlegend=False,
)
fig = go.Figure(data=data, layout=charge_freq_layout)
fig.show()

In [6]:
#
#   Figures below not used in publication
#

In [7]:
#
#   Arginine Residue Frequency
#
#   Box plots of the arginine frequency for each protamine group: Eutherian P1, Eutherian P2, Truncated
#   Eutherian P2, Metatherian, and fish. Individual arginine frequencies are plotted next to the associated
#   box plot.
#
#   Note: Plotly uses 1.5 x IQR to determine the lower and upper fences (https://help.plot.ly/what-is-a-box-plot/)

def _r_per_protein(alignment):
    """Return analysis.frequency(protein, 'R') for each protein, in protein_list order."""
    return [analysis.frequency(protein, 'R') for protein in alignment.protein_list]


# Whole-sequence arginine frequencies for every group.
ep1_R_freqs = _r_per_protein(ep1_align)
ep2_R_freqs = _r_per_protein(ep2_align)
# Same trailing 48 as the R+K truncated-P2 series; presumably the alignment
# position bounding the truncated region — TODO confirm.
ep2T_R_freqs = analysis.region_frequencies(ep2_align, ['R'], 48)
mp1_R_freqs = _r_per_protein(mp1_align)
fish_R_freqs = _r_per_protein(fish_align)

# (x-axis label, values) for each box, in left-to-right plotting order.
arginine_groups = (
    ('Eutherian P1', ep1_R_freqs),
    ('Eutherian P2', ep2_R_freqs),
    ('Truncated Eutherian P2', ep2T_R_freqs),
    ('Metatherian', mp1_R_freqs),
    ('Fish', fish_R_freqs),
)
data = [go.Box(y=freqs, name=label, boxpoints='all') for label, freqs in arginine_groups]

charge_freq_layout = go.Layout(
    xaxis={'title': 'Protamine Type'},
    yaxis={'title': 'Arginine Frequency (%)'},
    showlegend=False,
)
fig = go.Figure(data=data, layout=charge_freq_layout)
fig.show()

In [8]:
#
#   Arginine Residue Frequency (Eutherian P1 DNA Binding Region)
#
#   Box plots of the arginine frequency for each protamine group: Eutherian P1, the Eutherian P1 DNA binding
#   region, Metatherian, and fish. Individual arginine frequencies are plotted next to the associated box plot.
#
#   Note: Plotly uses 1.5 x IQR to determine the lower and upper fences (https://help.plot.ly/what-is-a-box-plot/)

# Arginine frequency over part of the eutherian P1 alignment. 17 and 48 are
# forwarded to analysis.region_frequencies unchanged; presumably the start/end
# alignment positions of the DNA binding region — TODO confirm.
ep1DNA_R_freq = analysis.region_frequencies(ep1_align, ['R'], 17, 48)

# Whole-sequence series come from the arginine-frequency cell above.
p1_dna_groups = (
    ('Eutherian P1', ep1_R_freqs),
    ('Eutherian P1 DNA', ep1DNA_R_freq),
    ('Metatherian', mp1_R_freqs),
    ('Fish', fish_R_freqs),
)
data = [go.Box(y=freqs, name=label, boxpoints='all') for label, freqs in p1_dna_groups]

charge_freq_layout = go.Layout(
    xaxis={'title': 'Protamine Type'},
    yaxis={'title': 'Arginine Frequency (%)'},
    showlegend=False,
)
fig = go.Figure(data=data, layout=charge_freq_layout)
fig.show()

In [9]:
#
#   Variance Analysis (arginine + each other residue, whole sequences, every protamine group)
#
#   Note: Plotly uses 1.5 x IQR to determine the lower and upper fences (https://help.plot.ly/what-is-a-box-plot/)

def _spread(values):
    """Return the distance between the largest and smallest entry of *values*."""
    return max(values) - min(values)


# Percentile ranks reported for each series: min, quartiles, max.
QUARTILE_POINTS = [0, 25, 50, 75, 100]

for alignment in alignments:
    # aa_ranges[i] holds the per-protein frequencies of 'R' plus AMINOACIDS[i].
    aa_ranges = []

    for aa in AMINOACIDS:
        new_freq = [analysis.frequency(protein, ['R', aa]) for protein in alignment.protein_list]
        aa_ranges.append(new_freq)
        print(aa, _spread(new_freq), np.percentile(new_freq, QUARTILE_POINTS))

    # Series with the narrowest max-min spread; min() keeps the first one on a
    # tie, exactly as a left-to-right scan with a strict "<" would.
    low = min(aa_ranges, key=_spread)

    # NOTE(review): index 4 is AMINOACIDS[4] == 'E'. If the intended reference
    # series is lysine ('K', index 5), this is off by one — confirm.
    print(aa_ranges.index(low), np.percentile(low, QUARTILE_POINTS), '\n',
          np.percentile(aa_ranges[4], QUARTILE_POINTS))

    # One box per residue, labelled with its one-letter code.
    data = [go.Box(y=freqs, name=code, boxpoints='all') for code, freqs in zip(AMINOACIDS, aa_ranges)]
    charge_freq_layout = go.Layout(
        xaxis={'title': 'R + ?'},
        yaxis={'title': 'Charge Frequency (%)'},
    )
    fig = go.Figure(data=data, layout=charge_freq_layout)
    fig.show()

A 0.24667258207630877 [0.44897959 0.5106383  0.54347826 0.58823529 0.69565217]
Q 0.18158567774936063 [0.47058824 0.52941176 0.54901961 0.56862745 0.65217391]
L 0.26441881100266196 [0.3877551  0.48076923 0.52941176 0.56       0.65217391]
S 0.24215070643642073 [0.46938776 0.55102041 0.57142857 0.62745098 0.71153846]
E 0.26441881100266196 [0.3877551  0.46938776 0.50980392 0.55769231 0.65217391]
K 0.22360248447204972 [0.42857143 0.47058824 0.52       0.58       0.65217391]
T 0.22360248447204972 [0.42857143 0.5106383  0.55319149 0.57446809 0.65217391]
N 0.2440106477373558 [0.40816327 0.4893617  0.52173913 0.55769231 0.65217391]
G 0.26441881100266196 [0.3877551  0.47058824 0.52       0.55769231 0.65217391]
M 0.2657497781721384 [0.40816327 0.49019608 0.54       0.57692308 0.67391304]
W 0.26441881100266196 [0.3877551  0.46938776 0.50980392 0.55769231 0.65217391]
D 0.26441881100266196 [0.3877551  0.46938776 0.50980392 0.55769231 0.65217391]
H 0.28615794143744455 [0.3877551  0.47058824 0.52     

A 0.11972096530920057 [0.2745098  0.31372549 0.34142857 0.35092097 0.39423077]
Q 0.06975867269984914 [0.37254902 0.39173669 0.40093458 0.41747573 0.44230769]
L 0.09012066365007543 [0.31372549 0.3504902  0.36190476 0.37503403 0.40384615]
S 0.12782805429864252 [0.35294118 0.38970588 0.4086454  0.41747573 0.48076923]
E 0.1085972850678733 [0.35294118 0.39215686 0.4086454  0.41561965 0.46153846]
K 0.0904977375565611 [0.29411765 0.32838235 0.34147154 0.36227462 0.38461538]
T 0.09030920060331826 [0.30392157 0.34068627 0.35619048 0.36893204 0.39423077]
N 0.1293363499245852 [0.2745098  0.31372549 0.34975728 0.36010375 0.40384615]
G 0.12820512820512825 [0.33333333 0.37254902 0.41064426 0.42718447 0.46153846]
M 0.10991704374057315 [0.28431373 0.32352941 0.35595238 0.37015697 0.39423077]
W 0.11972096530920057 [0.2745098  0.31372549 0.34133148 0.35092097 0.39423077]
D 0.10030165912518857 [0.28431373 0.31372549 0.34951456 0.36366158 0.38461538]
H 0.11485436893203882 [0.39       0.43872549 0.45364554

A 0.2231384307846077 [0.44927536 0.6031746  0.61538462 0.63998463 0.67241379]
Q 0.23596014492753625 [0.42028986 0.58730159 0.6031746  0.62903226 0.65625   ]
L 0.2504528985507246 [0.4057971  0.58730159 0.6        0.62701613 0.65625   ]
S 0.24123422159887797 [0.56521739 0.73015873 0.74193548 0.7704918  0.80645161]
E 0.235447849845569 [0.42028986 0.58730159 0.6        0.6240942  0.6557377 ]
K 0.17747683535281533 [0.47826087 0.58730159 0.6        0.6240942  0.6557377 ]
T 0.23782725715683872 [0.41791045 0.6031746  0.60655738 0.62903226 0.6557377 ]
N 0.17863805970149255 [0.47761194 0.6031746  0.60655738 0.62903226 0.65625   ]
G 0.20239880059970017 [0.52173913 0.63492063 0.64516129 0.67741935 0.72413793]
M 0.25212393803098454 [0.42028986 0.6031746  0.61538462 0.63998463 0.67241379]
W 0.24994060346875735 [0.4057971  0.58730159 0.6        0.6240942  0.6557377 ]
D 0.235447849845569 [0.42028986 0.58730159 0.6        0.6240942  0.6557377 ]
H 0.2693653173413294 [0.42028986 0.64516129 0.65079365 0.6

A 0.25567502986857826 [0.51851852 0.66666667 0.6969697  0.70967742 0.77419355]
Q 0.2962962962962963 [0.44444444 0.63636364 0.65165441 0.66666667 0.74074074]
L 0.22222222222222227 [0.48148148 0.63636364 0.65165441 0.66666667 0.7037037 ]
S 0.26936026936026936 [0.51851852 0.69304435 0.74193548 0.77688172 0.78787879]
E 0.2592592592592593 [0.44444444 0.63636364 0.65625    0.66666667 0.7037037 ]
K 0.14120370370370372 [0.5625     0.63636364 0.65165441 0.66666667 0.7037037 ]
T 0.2281959378733573 [0.48148148 0.64935662 0.66666667 0.67741935 0.70967742]
N 0.2592592592592593 [0.44444444 0.63636364 0.65165441 0.66666667 0.7037037 ]
G 0.29629629629629634 [0.48148148 0.64705882 0.69223485 0.71875    0.77777778]
M 0.2652329749103943 [0.44444444 0.64563567 0.66666667 0.67741935 0.70967742]
W 0.2592592592592593 [0.44444444 0.63636364 0.65165441 0.66666667 0.7037037 ]
D 0.2592592592592593 [0.44444444 0.63636364 0.65165441 0.66666667 0.7037037 ]
H 0.18518518518518512 [0.55555556 0.63636364 0.65625    0.6