In [None]:
# Parameters cell.
# NOTE(review): later cells index `upstream` like a nested mapping
# (e.g. upstream['01-R23-RMSD']['rmsd']), so this list of names is presumably
# replaced with real paths by the pipeline runner at execution time - confirm.
upstream = [
    '01-R23-RMSD', '02-R23-RMSF'
]
# This notebook declares no output products.
products = None
# Placeholder only: `code_palette` is reassigned to a code -> colour dict in a later cell.
code_palette = None

In [None]:
import pandas
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# RMSD

The following plots are obtained from the RMSD against the **initial structure**. 

Reading the data from the `01-R23-RMSD` notebook.

In [None]:
# Read the RMSD table from the CSV registered under 'rmsd' for the '01-R23-RMSD' upstream entry.
exp01_rmsd = pd.read_csv(upstream['01-R23-RMSD']['rmsd'])

RMSD plot against time

In [None]:
# Keep only the rows strictly between 25 ns and 75 ns.
rmsd_time_window = exp01_rmsd.query('time_ns < 75 and time_ns > 25')

fig, ax = plt.subplots(1, figsize=(8, 4))
sns.lineplot(data=rmsd_time_window, x='time_ns', y='CA', hue='code', legend=True, ax=ax)
# Move the legend outside the plotting area (its upper-left corner pinned to the axes' top-right).
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
fig.tight_layout()

Box plots by protein and subunit.

In [None]:
# Filter once (rows strictly between 25 ns and 75 ns) and reuse for both panels.
rmsd_time_window = exp01_rmsd.query('time_ns < 75 and time_ns > 25')

fig, ax = plt.subplots(1, 2, sharex=True, sharey=True)
fig.set_size_inches(8, 3.5)
# One panel per subunit column: RMSD on x, code on y.
for axis, column, label in zip(ax, ('lsu', 'ssu'), ('RbcL', 'RbcS')):
    sns.boxplot(data=rmsd_time_window, x=column, y='code', ax=axis, palette=code_palette)
    # Raw f-string: '\A' is not a valid Python escape, so a plain literal triggers
    # an invalid-escape warning; the text handed to matplotlib is unchanged.
    axis.set_xlabel(rf'{label} RMSD ($\AA$)', fontname='Arial')
plt.tight_layout()

## Average RMSD

The following RMSDs were computed against a structure representing the simulation-average position of every atom.

Reading the data

In [None]:
# Read the average-structure RMSD table from the CSV registered under 'avg_rmsd'
# for the '01-R23-RMSD' upstream entry.
exp01_rmsd_avg = pd.read_csv(upstream['01-R23-RMSD']['avg_rmsd'])
# Names of the 'lsu_<letter>' and 'ssu_<letter>' columns.
# NOTE(review): presumably one column per chain - confirm against the CSV header.
lsu_cols = ['lsu_A', 'lsu_E', 'lsu_I', 'lsu_M', 'lsu_C', 'lsu_G', 'lsu_K', 'lsu_O']
ssu_cols = ['ssu_B', 'ssu_F', 'ssu_J', 'ssu_N', 'ssu_D', 'ssu_H', 'ssu_L', 'ssu_P']

RMSD against time, by code and subunits.

In [None]:
# `relplot` is a figure-level function: it builds its own figure and legend and
# does not accept `ax`. The previous version passed the axes left over from an
# earlier cell (ignored with a warning) and then added a legend to that stale
# figure, so both have been dropped here.
sns.relplot(
    data=exp01_rmsd_avg.query('time_ns < 75 and time_ns > 25'),
    x='time_ns', y='CA', hue='code',
)

We define the following custom palette.

In [None]:
# Eleven-colour palette: five colours from 'Set2' followed by six from 'Set1'.
set2_colours = sns.color_palette('Set2', 5)
set1_colours = sns.color_palette('Set1', 6)
set12 = sns.color_palette(set2_colours + set1_colours)

# Two flat colours: one shared by the PDB-style codes, one by the 'anc*' codes.
pdb_code_hex = "#C6E1AB"
anc_code_hex = "#D1BADA"

# Lower-case codes -> colour (insertion order kept as listed).
code_palette = {
    **dict.fromkeys(("6ftl", "3zxw", "8ruc", "1bwv", "6ura"), pdb_code_hex),
    **dict.fromkeys(
        ("anciip", "ancip", "anci", "anciab", "ancicd", "ancia", "ancib"),
        anc_code_hex,
    ),
}

# Same colours keyed by the capitalised / display spellings of the codes.
code_palette_cap = {
    **dict.fromkeys(("6FTL", "3ZXW", "8RUC", "1BWV", "6URA"), pdb_code_hex),
    **dict.fromkeys(
        ("Anc-I/I'", "Anc-I'", "Anc-I", "Anc-IAB", "Anc-ICD", "Anc-IA", "Anc-IB"),
        anc_code_hex,
    ),
}

Now we plot the RMSDs by code and subunit. Notice that each boxplot is built from the individual simulation frames (those with 250 < frame < 750), not from time-averaged values.

In [None]:
sns.set_style('darkgrid')

# Filter once (frames strictly between 250 and 750) and reuse for both panels.
avg_frame_window = exp01_rmsd_avg.query('frame < 750 & frame > 250')

fig, ax = plt.subplots(1, 2, sharex=True, sharey=True)
fig.set_size_inches(10, 4)
for axis, column, label in zip(ax, ('lsu', 'ssu'), ('RbcL', 'RbcS')):
    sns.boxplot(data=avg_frame_window, x='code', y=column, ax=axis, palette=code_palette, linewidth=0.5)
    axis.tick_params(axis='x', labelrotation=30)
    # The RMSD values are on the y axis here (x holds the codes), so the RMSD
    # label belongs on y; previously it was written onto the x axis.
    axis.set_xlabel('')
    # Raw f-string avoids the invalid '\A' escape warning; rendered text is unchanged.
    axis.set_ylabel(rf'{label} RMSD ($\AA$)', fontname='Arial')
plt.tight_layout()

We will repeat this plot, by considering only the subunit time-averages. This means that
we need to melt the dataset to place subunit RMSDs (now in columns) as rows,
and then groupby code + subunit. We will add a subunit variable (LSU/SSU) to
separate them in the plot.

In [None]:
# Columns to unpivot: 'lsu_<letter>' then 'ssu_<letter>', in the same order as before.
subunit_value_columns = (
    [f'lsu_{letter}' for letter in 'ACEGIKMO']
    + [f'ssu_{letter}' for letter in 'BDFHJLNP']
)

# Long format: one row per (time, frame, code, column) with its value; rows with missing data dropped.
exp01_rmsd_avg_melt = exp01_rmsd_avg.melt(
    id_vars=['time', 'frame', 'code'],
    value_vars=subunit_value_columns,
).dropna()

# 'subunit' is the part of the column name before the underscore ('lsu' or 'ssu').
exp01_rmsd_avg_melt['subunit'] = exp01_rmsd_avg_melt['variable'].str.split('_').str[0]

# Mean of the remaining columns for every (variable, subunit, code) combination.
exp01_rmsd_avg_melt_bysubunit = (
    exp01_rmsd_avg_melt
    .groupby(['variable', 'subunit', 'code'], as_index=False)
    .mean()
)

Box plot by code and subunit of the time-average RMSD.

In [None]:
sns.set_style('darkgrid')
fig, ax = plt.subplots(1, 2, sharex=True, sharey=True)
fig.set_size_inches(10, 4)
for axis, subunit, label in zip(ax, ('lsu', 'ssu'), ('RbcL', 'RbcS')):
    subunit_means = exp01_rmsd_avg_melt_bysubunit[exp01_rmsd_avg_melt_bysubunit['subunit'] == subunit]
    sns.boxplot(data=subunit_means, x='code', y='value', ax=axis, palette=code_palette, linewidth=0.5)
    axis.tick_params(axis='x', labelrotation=30)
    # RMSD values are plotted on y (x holds the codes), so label y rather than x.
    axis.set_xlabel('')
    # Raw f-string avoids the invalid '\A' escape warning; rendered text is unchanged.
    axis.set_ylabel(rf'{label} RMSD ($\AA$)', fontname='Arial')
plt.tight_layout()

Same plot, but showing the specific values of each point.

In [None]:
# One facet per subunit, one point per (column, code) time-average.
g = sns.catplot(
    data=exp01_rmsd_avg_melt_bysubunit,
    x='code', y='value', col='subunit', hue='code',
    palette=code_palette, linewidth=0.5
)
# NOTE(review): assumes the first facet is 'lsu' and the second 'ssu' - confirm facet order.
for facet_ax, label in ((g.axes[0, 0], 'RbcL'), (g.axes[0, 1], 'RbcS')):
    facet_ax.tick_params(axis='x', labelrotation=30)
    # RMSD values are on y (x holds the codes), so the RMSD label goes on y.
    facet_ax.set_xlabel('')
    # Raw f-string avoids the invalid '\A' escape warning; rendered text is unchanged.
    facet_ax.set_ylabel(rf'{label} RMSD ($\AA$)', fontname='Arial')
plt.tight_layout()

### Statistical analysis

We conduct a one-way ANOVA test. \
This helps us determine whether there is a statistically significant difference between the mean RMSD values of the different RuBisCOs.

The F-statistic is the ratio of the variance between the group means to the variance within the groups. \
The p-value (≪ 0.05) suggests that there is a significant difference among the group means. \
We then run Tukey's HSD test to identify which RuBisCO pairs show a significant difference in mean RMSD.

In [None]:
from scipy.stats import f_oneway
import statsmodels.stats.multicomp as mc

In [None]:
# Split the per-column time-averages by subunit label.
subunit_labels = exp01_rmsd_avg_melt_bysubunit['subunit']
exp01_lsu_only_avg_vals = exp01_rmsd_avg_melt_bysubunit[subunit_labels == "lsu"]
exp01_ssu_only_avg_vals = exp01_rmsd_avg_melt_bysubunit[subunit_labels == "ssu"]

In [None]:
# One-way ANOVA over the 'lsu' rows: each group passed to f_oneway is the 'value' column of one code.
# Left as a bare expression so the result is displayed as the cell output.
f_oneway(*[group['value'] for name, group in exp01_lsu_only_avg_vals.groupby('code')])

In [None]:
# One-way ANOVA across codes (kept in a variable for later reference).
lsu_anova = f_oneway(*[group['value'] for name, group in exp01_lsu_only_avg_vals.groupby('code')])

# Tukey HSD pairwise comparison of the 'value' column, grouped by code.
lsu_mc_model = mc.MultiComparison(exp01_lsu_only_avg_vals['value'], exp01_lsu_only_avg_vals['code'])
lsu_tukey = lsu_mc_model.tukeyhsd()

# Use the public summary() table rather than the private `_results_table`
# attribute; its first row is the header, the remaining rows are the comparisons.
lsu_header, *lsu_rows = lsu_tukey.summary().data
lsu_tukey_df = pd.DataFrame(data=lsu_rows, columns=lsu_header)

# Show only the pairs for which the null hypothesis is rejected.
lsu_tukey_df[lsu_tukey_df.reject.isin([True])]

Above are the RbcL pairs with a significant difference in mean RMSD.

In [None]:
# One-way ANOVA over the 'ssu' rows: each group passed to f_oneway is the 'value' column of one code.
# Left as a bare expression so the result is displayed as the cell output.
f_oneway(*[group['value'] for name, group in exp01_ssu_only_avg_vals.groupby('code')])

In [None]:
# One-way ANOVA across codes (kept in a variable for later reference).
ssu_anova = f_oneway(*[group['value'] for name, group in exp01_ssu_only_avg_vals.groupby('code')])

# Tukey HSD pairwise comparison of the 'value' column, grouped by code.
ssu_mc_model = mc.MultiComparison(exp01_ssu_only_avg_vals['value'], exp01_ssu_only_avg_vals['code'])
ssu_tukey = ssu_mc_model.tukeyhsd()

# Use the public summary() table rather than the private `_results_table`
# attribute; its first row is the header, the remaining rows are the comparisons.
ssu_header, *ssu_rows = ssu_tukey.summary().data
ssu_tukey_df = pd.DataFrame(data=ssu_rows, columns=ssu_header)

# Show only the pairs for which the null hypothesis is rejected.
ssu_tukey_df[ssu_tukey_df.reject.isin([True])]

In [None]:
# Rebuild the RbcL Tukey table from the public summary() table (header row first)
# instead of the private `_results_table` attribute, then show the rejected pairs.
lsu_header, *lsu_rows = lsu_tukey.summary().data
lsu_tukey_df = pd.DataFrame(data=lsu_rows, columns=lsu_header)
lsu_tukey_df[lsu_tukey_df.reject.isin([True])]

## Miscellaneous

### RMSD, ancestor versus extant

We wonder whether ancestral subunits have higher or lower RMSDs than extant ones.

We create groups. 

In [None]:
# Codes in each group. Every entry needs its own comma: adjacent string literals
# are concatenated by Python ('6ftl' '8ruc' -> '6ftl8ruc'), which previously
# dropped 6ftl, 8ruc, ancib and ancicd from the comparison without any error.
extant_codes = ['1bwv', '3zxw', '6ura', '6ftl', '8ruc']
# NOTE(review): 'ancip' appears in `code_palette` but not in this list - confirm that is intended.
ancestor_codes = ['anci', 'ancia', 'anciab', 'ancib', 'ancicd', 'anciip']

# .assign() returns a new frame, so the label is not written onto a filtered
# slice of `exp01_rmsd_avg` (which raises SettingWithCopyWarning).
ext_exp01_rmsd_avg = exp01_rmsd_avg[exp01_rmsd_avg.code.isin(extant_codes)].assign(age='Extant')
anc_exp01_rmsd_avg = exp01_rmsd_avg[exp01_rmsd_avg.code.isin(ancestor_codes)].assign(age='Ancestor')

# Display the combined table as the cell output.
pd.concat([ext_exp01_rmsd_avg, anc_exp01_rmsd_avg])

We plot them in box plots.

In [None]:
# Build the combined extant + ancestor table once (frames strictly between 250 and 750)
# instead of repeating the concat and query for every panel.
age_frame_window = (
    pd.concat([ext_exp01_rmsd_avg, anc_exp01_rmsd_avg])
    .query('frame < 750 & frame > 250')
)

fig, ax = plt.subplots(1, 3, sharex=True, sharey=True)
fig.set_size_inches(12, 3.5)
# (column plotted on x, label prefix) for each panel, left to right.
age_panels = (('CA', 'Overall'), ('lsu', 'RbcL'), ('ssu', 'RbcS'))
for axis, (column, label) in zip(ax, age_panels):
    sns.boxplot(data=age_frame_window, x=column, y='age', ax=axis, palette=set12)
    # Raw f-string avoids the invalid '\A' escape warning; rendered text is unchanged.
    axis.set_xlabel(rf'{label} RMSD ($\AA$)', fontname='Arial')
plt.tight_layout()

In [None]:
# Create a dedicated figure: the previous version resized `fig` left over from
# an earlier cell while the boxplot was drawn on whatever axes were current.
fig, ax = plt.subplots(1)
fig.set_size_inches(4, 3.5)
sns.boxplot(
    data=pd.concat([ext_exp01_rmsd_avg, anc_exp01_rmsd_avg]).query('frame < 750 & frame > 250'),
    x='CA', y='age', palette=set12, ax=ax,
)
# Raw string avoids the invalid '\A' escape warning; rendered text is unchanged.
ax.set_xlabel(r"Overall ($\AA$)", fontname='Arial')
fig.tight_layout()

### RMSD, With RbcS versus Without RbcS

We wonder whether Rubisco lacking RbcS has a higher or lower RMSD than Rubisco having the subunit.

In [None]:
# Codes in each group. Every entry needs its own comma: adjacent string literals
# are concatenated by Python ('6ftl' '8ruc' -> '6ftl8ruc'), which previously
# dropped 6ftl, 8ruc, ancib and ancicd from the "with RbcS" group without any error.
# NOTE(review): 'ancip' appears in `code_palette` but in neither list - confirm that is intended.
with_rbcs_codes = ['1bwv', '3zxw', '6ftl', '8ruc', 'anci', 'ancia', 'anciab', 'ancib', 'ancicd']
without_rbcs_codes = ['6ura', 'anciip']

# .assign() returns a new frame, so the label is not written onto a filtered
# slice of `exp01_rmsd_avg` (which raises SettingWithCopyWarning).
wssu_exp01_rmsd_avg = exp01_rmsd_avg[exp01_rmsd_avg.code.isin(with_rbcs_codes)].assign(RbcS_status='with RbcS')
wossu_exp01_rmsd_avg = exp01_rmsd_avg[exp01_rmsd_avg.code.isin(without_rbcs_codes)].assign(RbcS_status='Without RbcS')

# Display the combined table as the cell output.
pd.concat([wssu_exp01_rmsd_avg, wossu_exp01_rmsd_avg])

In [None]:
# Create a dedicated figure: the previous version resized `fig` left over from
# an earlier cell while the boxplot was drawn on whatever axes were current.
fig, ax = plt.subplots(1)
fig.set_size_inches(8, 3.5)
sns.boxplot(
    data=pd.concat([wssu_exp01_rmsd_avg, wossu_exp01_rmsd_avg]).query('frame < 750 & frame > 250'),
    x='CA', y='RbcS_status', ax=ax,
)
# Raw string avoids the invalid '\A' escape warning; rendered text is unchanged.
ax.set_xlabel(r"Overall ($\AA$)")
fig.tight_layout()

In [None]:
# Create a dedicated figure: the previous version resized `fig` left over from
# an earlier cell while the boxplot was drawn on whatever axes were current.
fig, ax = plt.subplots(1)
fig.set_size_inches(8, 3.5)
sns.boxplot(
    data=pd.concat([wssu_exp01_rmsd_avg, wossu_exp01_rmsd_avg]).query('frame < 750 & frame > 250'),
    x='lsu', y='RbcS_status', ax=ax,
)
# Raw string avoids the invalid '\A' escape warning; rendered text is unchanged.
ax.set_xlabel(r"RbcL_RMSD ($\AA$)")
fig.tight_layout()

## RMSF

RMSF stands for Root Mean Square Fluctuation, and it represents the average deviation over time of each residue. We use it to compare the different regions of the proteins. We have previously mapped the RMSFs to the alignments to enable RMSF comparisons.

We load the datasets.

In [None]:
# Read the RbcL RMSF table from the CSV registered under 'rbcl_rmsf' for the '02-R23-RMSF' upstream entry.
exp01_rbcl_rmsf = pd.read_csv(
    upstream['02-R23-RMSF']['rbcl_rmsf']
)
# Read the RbcS RMSF table from the CSV registered under 'rbcs_rmsf' for the same upstream entry.
exp01_rbcs_rmsf = pd.read_csv(
    upstream['02-R23-RMSF']['rbcs_rmsf']
)

RbcL RMSF lineplots over alignment sequence numbers, by protein

In [None]:
# Mean RMSF for every (code, resnum) pair.
rbcl_mean_rmsf = (
    exp01_rbcl_rmsf
    .groupby(['code', 'resnum'], as_index=False)['rmsf']
    .mean()
)

fig, ax = plt.subplots(1, figsize=(9, 3.0))
sns.lineplot(data=rbcl_mean_rmsf, x='resnum', y='rmsf', hue='code', ax=ax)
# The y-range is left automatic (a fixed 0.25-3 range was tried and disabled).
fig.tight_layout()

RbcS RMSF lineplots over alignment sequence numbers, by protein

In [None]:
# Mean RMSF for every (code, resnum) pair of the RbcS table. The previous
# version plotted `exp01_rbcl_rmsf` here (copy-paste from the RbcL cell), so the
# "RbcS" figure was a duplicate of the RbcL one.
rbcs_mean_rmsf = (
    exp01_rbcs_rmsf
    .groupby(['code', 'resnum'], as_index=False)['rmsf']
    .mean()
)

fig, ax = plt.subplots(1)
fig.set_size_inches(9, 3.0)
sns.lineplot(data=rbcs_mean_rmsf, x='resnum', y='rmsf', hue='code', ax=ax)
fig.tight_layout()

### Comparison of RMSF of RbcL for the RuBisCOs at the dawn of RbcS

In [None]:
def calculate_RMSF_mean(sys_1, sys_2, df_in):
    """Per-residue difference in mean RMSF between two systems (sys_1 - sys_2).

    Parameters
    ----------
    sys_1, sys_2 : str
        Values of the 'code' column identifying the two systems to compare.
    df_in : pandas.DataFrame
        Table with at least the columns 'code', 'resnum' and 'rmsf'. Other
        columns (numeric or not) are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns 'resnum' and 'diff', sorted by 'resnum', with one row for every
        residue number that is present for BOTH systems. 'diff' is the mean
        'rmsf' of sys_1 minus the mean 'rmsf' of sys_2 at that residue.
        Residues found in only one system are omitted; if either system is
        absent from df_in the result is empty.
    """
    def _mean_rmsf_by_residue(code):
        # Mean 'rmsf' per residue number for a single system.
        rows = df_in.loc[df_in['code'] == code]
        return rows.groupby('resnum')['rmsf'].mean()

    first = _mean_rmsf_by_residue(sys_1)
    second = _mean_rmsf_by_residue(sys_2)

    # Only residues present in both systems have a defined difference. The
    # previous loop appended on every iteration, so a residue missing from one
    # system silently reused the previous residue's difference.
    shared = first.index.intersection(second.index)
    diff = first.loc[shared] - second.loc[shared]

    df_out = diff.rename('diff').rename_axis('resnum').reset_index()
    return df_out.sort_values('resnum').reset_index(drop=True)

#### Anc-I/I' vs Anc-I'

In [None]:
# Mean RbcL RMSF per (code, resnum), restricted to the 'ancip' and 'anciip' codes.
pair_rows = exp01_rbcl_rmsf[exp01_rbcl_rmsf['code'].isin(['ancip', 'anciip'])]
pair_mean_rmsf = pair_rows.groupby(['code', 'resnum'], as_index=False)['rmsf'].mean()

fig, ax = plt.subplots(1, figsize=(9, 3.0))
sns.lineplot(data=pair_mean_rmsf, x='resnum', y='rmsf', hue='code', ax=ax)
ax.set_ylim(0.25, 2)
fig.tight_layout()

In [None]:
# Per-residue RMSF difference, 'anciip' minus 'ancip'.
ancip_anciip_rmsf_diff = calculate_RMSF_mean('anciip', 'ancip', exp01_rbcl_rmsf)

fig, ax = plt.subplots(1, figsize=(9, 3.0))
sns.lineplot(data=ancip_anciip_rmsf_diff, x='resnum', y='diff', ax=ax)
# Zero baseline plus dashed guide lines at +0.30 and -0.30.
ax.axhline(0, c='black')
for guide_level in (0.30, -0.30):
    ax.axhline(guide_level, ls='--', c='red')
ax.set_ylabel('RMSF difference')
ax.set_ylim(-0.5, 1)
fig.tight_layout()

#### Anc-I/I' vs Anc-I

In [None]:
# Mean RbcL RMSF per (code, resnum), restricted to the 'anci' and 'anciip' codes.
pair_rows = exp01_rbcl_rmsf[exp01_rbcl_rmsf['code'].isin(['anci', 'anciip'])]
pair_mean_rmsf = pair_rows.groupby(['code', 'resnum'], as_index=False)['rmsf'].mean()

fig, ax = plt.subplots(1, figsize=(9, 3.0))
sns.lineplot(data=pair_mean_rmsf, x='resnum', y='rmsf', hue='code', ax=ax)
ax.set_ylim(0.25, 2)
fig.tight_layout()

In [None]:
# Per-residue RMSF difference, 'anciip' minus 'anci'.
# (The variable name is reused from the previous comparison.)
ancip_anciip_rmsf_diff = calculate_RMSF_mean('anciip', 'anci', exp01_rbcl_rmsf)

fig, ax = plt.subplots(1, figsize=(9, 3.0))
sns.lineplot(data=ancip_anciip_rmsf_diff, x='resnum', y='diff', ax=ax)
# Zero baseline plus a dashed guide line at +0.30.
ax.axhline(0, c='black')
ax.axhline(0.30, ls='--', c='red')
ax.set_ylabel('RMSF difference')
ax.set_ylim(-0.2, 1)
fig.tight_layout()

#### Anc-I' vs 6URA

In [None]:
# Mean RbcL RMSF per (code, resnum), restricted to the '6ura' and 'ancip' codes.
pair_rows = exp01_rbcl_rmsf[exp01_rbcl_rmsf['code'].isin(['6ura', 'ancip'])]
pair_mean_rmsf = pair_rows.groupby(['code', 'resnum'], as_index=False)['rmsf'].mean()

fig, ax = plt.subplots(1, figsize=(9, 3.0))
sns.lineplot(data=pair_mean_rmsf, x='resnum', y='rmsf', hue='code', ax=ax)
ax.set_ylim(0.25, 2)
fig.tight_layout()

In [None]:
# Per-residue RMSF difference, 'ancip' minus '6ura'.
# (The variable name is reused from an earlier comparison.)
ancip_anciip_rmsf_diff = calculate_RMSF_mean('ancip', '6ura', exp01_rbcl_rmsf)

fig, ax = plt.subplots(1, figsize=(9, 3.0))
sns.lineplot(data=ancip_anciip_rmsf_diff, x='resnum', y='diff', ax=ax)
# Zero baseline plus a dashed guide line at +0.30.
ax.axhline(0, c='black')
ax.axhline(0.30, ls='--', c='red')
ax.set_ylabel('RMSF difference')
ax.set_ylim(-0.2, 1)
fig.tight_layout()

#### Anc-I/I' vs 6URA

In [None]:
# Mean RbcL RMSF per (code, resnum), restricted to the '6ura' and 'anciip' codes.
pair_rows = exp01_rbcl_rmsf[exp01_rbcl_rmsf['code'].isin(['6ura', 'anciip'])]
pair_mean_rmsf = pair_rows.groupby(['code', 'resnum'], as_index=False)['rmsf'].mean()

fig, ax = plt.subplots(1, figsize=(9, 3.0))
sns.lineplot(data=pair_mean_rmsf, x='resnum', y='rmsf', hue='code', ax=ax)
ax.set_ylim(0.25, 2)
fig.tight_layout()

In [None]:
# Per-residue RMSF difference, 'anciip' minus '6ura'.
# (The variable name is reused from an earlier comparison.)
ancip_anciip_rmsf_diff = calculate_RMSF_mean('anciip', '6ura', exp01_rbcl_rmsf)

fig, ax = plt.subplots(1, figsize=(9, 3.0))
sns.lineplot(data=ancip_anciip_rmsf_diff, x='resnum', y='diff', ax=ax)
# Zero baseline plus a dashed guide line at +0.30.
ax.axhline(0, c='black')
ax.axhline(0.30, ls='--', c='red')
ax.set_ylabel('RMSF difference')
ax.set_ylim(-0.2, 1)
fig.tight_layout()

## Ancient versus extant RbcS RMSF by time

In [None]:
fig, ax = plt.subplots(2, 1, sharex=True, sharey=True)
fig.set_size_inches(8, 3.5)

# Top panel: rows whose 'time' is "extant"; bottom panel: rows whose 'time' is "ancient".
# Each panel gets its own named palette.
rbcs_panels = (
    ('time == "extant"', 'Set2'),
    ('time == "ancient"', 'Set1'),
)
for axis, (row_filter, palette_name) in zip(ax, rbcs_panels):
    sns.lineplot(
        data=exp01_rbcs_rmsf.query(row_filter),
        x='resnum', y='rmsf', hue='code',
        marker='.', palette=palette_name, ax=axis,
    )

ax[0].set_ylim(0, 2.0)
# Single figure-level legend to the right of the panels ...
fig.legend(bbox_to_anchor=(1.05, 0.5), borderaxespad=0., loc='center left')
# ... and the per-axes legends replaced by empty ones.
for axis in ax:
    axis.legend('')
plt.tight_layout()
plt.show()