In [1]:
# Placeholder values for the notebook parameters. The "# Parameters" cell
# that follows rebinds md_reference_data and product to their real values;
# upstream is not rebound anywhere in the visible cells.
md_reference_data = None
upstream = None
product = None

In [2]:
# Parameters
# md_reference_data maps each system code to the files of its simulation:
#   "dry_dcd" - file name of the dried (water-removed) trajectory
#   "dry_pdb" - file name of the starting structure
#   "time"    - label for the system, either "extant" or "ancient"
# The file names are combined with a directory by load_trajectory.
md_reference_data = {
    "1bwv": {
        "dry_dcd": "1bwv.exp01.md01.dry.dcd",
        "dry_pdb": "1bwv.exp01.md01.dry.pdb",
        "time": "extant",
    },
    "3zxw": {
        "dry_dcd": "3zxw.exp01.md01.dry.dcd",
        "dry_pdb": "3zxw.exp01.md01.dry.pdb",
        "time": "extant",
    },
    "6ftl": {
        "dry_dcd": "6ftl.exp01.md01.dry.dcd",
        "dry_pdb": "6ftl.exp01.md01.dry.pdb",
        "time": "extant",
    },
    "6ura": {
        "dry_dcd": "6ura.exp01.md01.dry.dcd",
        "dry_pdb": "6ura.exp01.md01.dry.pdb",
        "time": "extant",
    },
    "8ruc": {
        "dry_dcd": "8ruc.exp01.md01.dry.dcd",
        "dry_pdb": "8ruc.exp01.md01.dry.pdb",
        "time": "extant",
    },
    "anci": {
        "dry_dcd": "anci.exp01.md01.dry.dcd",
        "dry_pdb": "anci.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "anciab": {
        "dry_dcd": "anciab.exp01.md01.dry.dcd",
        "dry_pdb": "anciab.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "ancicd": {
        "dry_dcd": "ancicd.exp01.md01.dry.dcd",
        "dry_pdb": "ancicd.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "ancia": {
        "dry_dcd": "ancia.exp01.md01.dry.dcd",
        "dry_pdb": "ancia.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "ancib": {
        "dry_dcd": "ancib.exp01.md01.dry.dcd",
        "dry_pdb": "ancib.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "anciip": {
        "dry_dcd": "anciip.exp01.md01.dry.dcd",
        "dry_pdb": "anciip.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "ancip": {
        "dry_dcd": "ancip.exp01.md01.dry.dcd",
        # NOTE(review): the only entry whose starting structure is a
        # ".realigned.pdb" file - confirm this is intentional.
        "dry_pdb": "ancip.exp01.md01.dry.realigned.pdb",
        "time": "ancient",
    },
}
# Output locations. "rmsd" and "avg_rmsd" are the CSV files written by the
# to_csv cells of this notebook; "nb" is not referenced in the visible cells.
# NOTE(review): these are absolute paths tied to one machine's mount point.
product = {
    "nb": "/mnt/researchdrive/bCuevas/r23/github_repo/RuBisCO_evolution/analysis/output/out-01-R23-RMSD.ipynb",
    "rmsd": "/mnt/researchdrive/bCuevas/r23/github_repo/RuBisCO_evolution/analysis/output/exp01-rmsd.csv",
    "avg_rmsd": "/mnt/researchdrive/bCuevas/r23/github_repo/RuBisCO_evolution/analysis/output/exp01-avg-rmsd.csv",
}


# R23 - MD alignment and RMSD extraction 

## Contents

1. Alignment
2. RMSD
3. RMSD against average structure
4. RMSD over subunit
5. RMSF

## Alignment and preparation

In [3]:
import json
import os

from mdtools.mapping import map_alignment_to_structure, align_structure_sequences
import MDAnalysis as mda
from MDAnalysis.analysis import rms, align
from pyfamsa import Aligner, Sequence
import pandas as pd
import prody as pdy
#matplotlib.rcParams['fontname'] = "Arial"



In [4]:
def load_trajectory(code, data, path):
    """
    Load the structure objects of one simulation.

    Parameters
    ----------
    code : identifier of the system; returned unchanged under ``code``.
    data : mapping providing the keys ``dry_pdb``, ``dry_dcd`` and ``time``.
    path : directory holding the files named in ``data``. A trailing
        separator is optional; the file names are joined with
        ``os.path.join`` rather than plain string concatenation.

    Returns
    -------
    dict with the following entries
    - ``code``: the given code
    - ``trajectory_dry``: an MDAnalysis Universe built from the starting PDB
      and the dried (water-removed) dcd, loaded in memory
    - ``reference_pdy``: a prody structure parsed from the starting PDB
      (pretty useful when MDAnalysis fails)
    - ``reference``: an MDAnalysis Universe with the starting PDB only (jic)
    - ``time``: the ``time`` value of ``data``
    """
    # Build each file path once. os.path.join inserts a separator only when
    # `path` does not already end with one, so both '../sims' and '../sims/'
    # resolve to the same files.
    pdb_file = os.path.join(path, data['dry_pdb'])
    dcd_file = os.path.join(path, data['dry_dcd'])

    return dict(
        code=code,
        trajectory_dry=mda.Universe(
            pdb_file,
            dcd_file,
            frames='all', in_memory=True
        ),
        reference_pdy=pdy.parsePDB(pdb_file),
        reference=mda.Universe(pdb_file),
        time=data['time']
    )


Loading all the simulations.

In [5]:
# Directory that holds the files named in md_reference_data.
path = '../../../simulations/'

# Load the systems one by one, echoing progress on a single line per system,
# then gather the resulting records in a DataFrame indexed by system code.
exp01_md = []
for system_code, file_info in md_reference_data.items():
    print(f"-- code {system_code}", end='')
    record = load_trajectory(code=system_code, data=file_info, path=path)
    exp01_md.append(record)
    print(" loaded")
exp01_md = pd.DataFrame.from_records(exp01_md).set_index('code')

-- code 1bwv



@> 77672 atoms and 1 coordinate set(s) were parsed in 2.27s.


 loaded
-- code 3zxw

@> 69984 atoms and 1 coordinate set(s) were parsed in 2.31s.


 loaded
-- code 6ftl

@> 77416 atoms and 1 coordinate set(s) were parsed in 1.09s.


 loaded
-- code 6ura

@> 54648 atoms and 1 coordinate set(s) were parsed in 1.97s.


 loaded
-- code 8ruc

@> 74224 atoms and 1 coordinate set(s) were parsed in 0.77s.


 loaded
-- code anci

@> 72128 atoms and 1 coordinate set(s) were parsed in 0.49s.


 loaded
-- code anciab

@> 72640 atoms and 1 coordinate set(s) were parsed in 0.91s.


 loaded
-- code ancicd

@> 76208 atoms and 1 coordinate set(s) were parsed in 2.46s.


 loaded
-- code ancia

@> 73536 atoms and 1 coordinate set(s) were parsed in 2.95s.


 loaded
-- code ancib

@> 72352 atoms and 1 coordinate set(s) were parsed in 2.99s.


 loaded
-- code anciip

@> 55120 atoms and 1 coordinate set(s) were parsed in 2.24s.


 loaded
-- code ancip

@> 54768 atoms and 1 coordinate set(s) were parsed in 1.63s.


 loaded


## RMSD calculations

We compute the RMSD of:
- All the alpha carbons.
- Large subunit alpha carbons
- Small subunit alpha carbons
- Upper and lower large subunit alpha carbons
- Upper and lower small subunit alpha carbons

In [6]:
# Alpha-carbon selections, grouped by segid, used as secondary RMSD groups.
upper_ssu = 'name CA and (segid B or segid F or segid J or segid N)'
lower_ssu = 'name CA and (segid D or segid H or segid L or segid P)'
ssu = 'name CA and (segid B or segid F or segid J or segid N or segid D or segid H or segid L or segid P)'
upper_lsu = 'name CA and (segid A or segid E or segid I or segid M)'
lower_lsu = 'name CA and (segid C or segid G or segid K or segid O)'
lsu = 'name CA and (segid A or segid E or segid I or segid M or segid C or segid G or segid K or segid O)'

# Output column label -> selection string. Keeping both in one mapping
# guarantees the column labels stay in the same order as the group selections.
reference_groups = {
    'upperSSU': upper_ssu,
    'lowerSSU': lower_ssu,
    'ssu': ssu,
    'upperLSU': upper_lsu,
    'lowerLSU': lower_lsu,
    'lsu': lsu,
}
reference_columns = ['frame', 'time', 'CA', *reference_groups]

# One RMSD table per system, each measured against that system's starting PDB.
per_system_rmsd = []
for key, item in exp01_md.iterrows():
    print(f"-- code {key}", end='')
    # NOTE(review): confirm that rms.RMSD honours `superposition`; it does
    # not appear among the documented RMSD keyword arguments.
    analysis = rms.RMSD(
        item['trajectory_dry'],
        item['reference'],
        select='name CA',
        groupselections=list(reference_groups.values()),
        superposition=False,
    ).run()
    system_table = pd.DataFrame(analysis.rmsd, columns=reference_columns)
    system_table['code'] = key
    per_system_rmsd.append(system_table)
    print(" aligned")

exp01_rmsd = pd.concat(per_system_rmsd)
# The (code, frame) index is built before the integer cast below, so its
# frame level keeps the values as produced by the analysis.
exp01_rmsd.index = pd.MultiIndex.from_arrays([exp01_rmsd['code'], exp01_rmsd['frame']])
exp01_rmsd = exp01_rmsd.astype({'frame': int, 'time': float})
# NOTE(review): the factor 10 presumably converts the reported time to
# nanoseconds - confirm against the trajectory's time step.
exp01_rmsd['time_ns'] = exp01_rmsd['time'] / 10

-- code 1bwv



 aligned
-- code 3zxw



 aligned
-- code 6ftl



 aligned
-- code 6ura

  return np.sqrt(np.sum((a - b) ** 2) / N)




 aligned
-- code 8ruc



 aligned
-- code anci



 aligned
-- code anciab



 aligned
-- code ancicd



 aligned
-- code ancia



 aligned
-- code ancib



 aligned
-- code anciip

  return np.sqrt(np.sum((a - b) ** 2) / N)


  return np.sqrt(np.sum((a - b) ** 2) / N)


 aligned
-- code ancip

 aligned




We only take measurements from the last 50 ns of each simulation; the preceding 25 to 50 ns are treated as burn-in time.

In [7]:
# Time left until the final frame of each simulation: the per-code maximum of
# time_ns minus the frame's own time_ns.
final_time_ns = exp01_rmsd.groupby(level='code')['time_ns'].transform('max')
exp01_rmsd['time_to_end'] = final_time_ns - exp01_rmsd['time_ns']
# Show the frames that fall inside the measurement window.
exp01_rmsd.query('time_to_end < 50')['time_ns']

code   frame
1bwv   500.0    50.000002
       501.0    50.100002
       502.0    50.200002
       503.0    50.300002
       504.0    50.400002
                  ...    
ancip  995.0    99.500003
       996.0    99.600003
       997.0    99.700003
       998.0    99.800003
       999.0    99.900003
Name: time_ns, Length: 6000, dtype: float64

Dumping values to file.

In [8]:
# Write the per-frame RMSD table without its index; 'code' and 'frame' are
# also regular columns, so nothing is lost. `index` is a boolean option, so
# pass False explicitly instead of relying on None being falsy.
exp01_rmsd.to_csv(product['rmsd'], index=False)

### RMSD over subunits

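We compute the RMSD against the average structure, both overall and per subunit.

First, we need to generate the average structure for every selection (whole complex, each subunit type and each individual chain).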

In [9]:
# Segids of the chains grouped under the lsu and ssu selections, in the order
# in which they are processed below.
lsu_chains = ('A', 'E', 'I', 'M', 'C', 'G', 'K', 'O')
ssu_chains = ('B', 'F', 'J', 'N', 'D', 'H', 'L', 'P')

# Per-chain alpha-carbon selections, keyed by the name used for the columns.
lsu_chain_selections = {f'lsu_{chain}': f'name CA and segid {chain}' for chain in lsu_chains}
ssu_chain_selections = {f'ssu_{chain}': f'name CA and segid {chain}' for chain in ssu_chains}

# The individual names are kept as module-level variables because the RMSD
# cell further down refers to them directly.
lsu_A, lsu_E, lsu_I, lsu_M, lsu_C, lsu_G, lsu_K, lsu_O = lsu_chain_selections.values()
ssu_B, ssu_F, ssu_J, ssu_N, ssu_D, ssu_H, ssu_L, ssu_P = ssu_chain_selections.values()

# Selections covering every chain of one subunit type.
lsu = 'name CA and (' + ' or '.join(f'segid {chain}' for chain in lsu_chains) + ')'
ssu = 'name CA and (' + ' or '.join(f'segid {chain}' for chain in ssu_chains) + ')'

# Processing order: whole complex, both subunit types, then every chain.
selection_strings = {
    'overall': 'name CA',
    'ssu': ssu,
    'lsu': lsu,
    **lsu_chain_selections,
    **ssu_chain_selections,
}
# One list per selection; each list receives one entry per system.
average_structures = {selection_key: [] for selection_key in selection_strings}

for key, item in exp01_md.iterrows():
    print(f"-- code {key}", end='')
    trajectory = item['trajectory_dry']
    for selection_key, selection_string in selection_strings.items():
        try:
            averaged = align.AverageStructure(
                trajectory, trajectory,
                select=selection_string, ref_frame=0
            ).run()
        except ValueError:
            # A failed averaging is stored as None so that every list keeps
            # one entry per system.
            # NOTE(review): presumably raised when the system lacks the
            # selected segid - confirm.
            structure = None
        else:
            structure = averaged.results.universe
        average_structures[selection_key].append(structure)
    print(" aligned")

# Attach the averages to the simulation table. The three aggregate selections
# get an "_avg" suffix; the per-chain ones keep their selection name.
for selection_key, structures in average_structures.items():
    if selection_key in ('overall', 'ssu', 'lsu'):
        column = f'{selection_key}_avg'
    else:
        column = selection_key
    exp01_md[column] = structures

-- code 1bwv

 aligned
-- code 3zxw

 aligned
-- code 6ftl

 aligned
-- code 6ura

 aligned
-- code 8ruc

 aligned
-- code anci

 aligned
-- code anciab

 aligned
-- code ancicd

 aligned
-- code ancia

 aligned
-- code ancib

 aligned
-- code anciip

 aligned
-- code ancip

 aligned


Computing the RMSD over the different subunit structures.

In [10]:
overall = 'name CA'

# Output column label -> selection string, in the order the groups are passed
# to the analysis, so labels and selections cannot get out of step.
average_groups = dict(
    ssu=ssu, lsu=lsu,
    lsu_A=lsu_A, lsu_E=lsu_E, lsu_I=lsu_I, lsu_M=lsu_M,
    lsu_C=lsu_C, lsu_G=lsu_G, lsu_K=lsu_K, lsu_O=lsu_O,
    ssu_B=ssu_B, ssu_F=ssu_F, ssu_J=ssu_J, ssu_N=ssu_N,
    ssu_D=ssu_D, ssu_H=ssu_H, ssu_L=ssu_L, ssu_P=ssu_P,
)
average_columns = ['frame', 'time', 'CA', *average_groups]

# One RMSD table per system, each measured against that system's overall
# average structure.
per_system_avg_rmsd = []
for key, item in exp01_md.iterrows():
    print(f"-- code {key}", end='')
    # NOTE(review): confirm that rms.RMSD honours `superposition`; it does
    # not appear among the documented RMSD keyword arguments.
    analysis = rms.RMSD(
        item['trajectory_dry'],
        item['overall_avg'],
        select=overall,
        groupselections=list(average_groups.values()),
        superposition=True,
    ).run()
    system_table = pd.DataFrame(analysis.rmsd, columns=average_columns)
    system_table['code'] = key
    per_system_avg_rmsd.append(system_table)
    print(" aligned")

exp01_rmsd_avg = pd.concat(per_system_avg_rmsd)
# The (code, frame) index is built before the integer cast below, so its
# frame level keeps the values as produced by the analysis.
exp01_rmsd_avg.index = pd.MultiIndex.from_arrays([exp01_rmsd_avg['code'], exp01_rmsd_avg['frame']])
exp01_rmsd_avg = exp01_rmsd_avg.astype({'frame': int, 'time': float})
# NOTE(review): the factor 10 presumably converts the reported time to
# nanoseconds - confirm against the trajectory's time step.
exp01_rmsd_avg['time_ns'] = exp01_rmsd_avg['time'] / 10
# Time left until the final frame of each simulation (per-code maximum of
# time_ns minus the frame's own time_ns).
final_time_ns = exp01_rmsd_avg.groupby(level='code')['time_ns'].transform('max')
exp01_rmsd_avg['time_to_end'] = final_time_ns - exp01_rmsd_avg['time_ns']
# Show the frames that fall inside the measurement window.
exp01_rmsd_avg.query('time_to_end < 50')['time_ns']

-- code 1bwv



 aligned
-- code 3zxw



 aligned
-- code 6ftl

  return np.sqrt(np.sum((a - b) ** 2) / N)


 aligned
-- code 6ura



 aligned
-- code 8ruc



 aligned
-- code anci



 aligned
-- code anciab



 aligned
-- code ancicd



 aligned
-- code ancia



 aligned
-- code ancib

  return np.sqrt(np.sum((a - b) ** 2) / N)


 aligned
-- code anciip

  return np.sqrt(np.sum((a - b) ** 2) / N)


 aligned
-- code ancip

 aligned




code   frame
1bwv   500.0    50.000002
       501.0    50.100002
       502.0    50.200002
       503.0    50.300002
       504.0    50.400002
                  ...    
ancip  995.0    99.500003
       996.0    99.600003
       997.0    99.700003
       998.0    99.800003
       999.0    99.900003
Name: time_ns, Length: 6000, dtype: float64

In [11]:
# Write the RMSD-against-average table without its index; 'code' and 'frame'
# are also regular columns, so nothing is lost. `index` is a boolean option,
# so pass False explicitly instead of relying on None being falsy.
exp01_rmsd_avg.to_csv(
    product['avg_rmsd'], index=False
)