In [1]:
# Placeholder defaults for the notebook parameters. `md_reference_data` and
# `product` are reassigned with real values by the "# Parameters" cell that follows;
# `upstream` is never reassigned or read in the cells of this notebook.
# NOTE(review): presumably this cell is the tagged parameters cell that a
# papermill/ploomber-style runner looks for -- confirm before restructuring it.
md_reference_data = None
upstream = None
product = None

In [2]:
# Parameters
# `md_reference_data` maps a simulation code to the files describing that run:
#   - "dry_dcd": file name of the trajectory (passed to MDAnalysis in `load_trajectory`)
#   - "dry_pdb": file name of the matching structure (used as topology and reference)
#   - "time":    label copied into the results; only "extant" and "ancient" appear here
# The file names are appended to the `path` prefix defined in the loading cell.
# NOTE: "ancip" is the only entry whose pdb name carries a ".realigned" suffix.
# NOTE(review): presumably this cell is injected by the pipeline runner rather than
# edited by hand -- confirm before changing values here.
md_reference_data = {
    "1bwv": {
        "dry_dcd": "1bwv.exp01.md01.dry.dcd",
        "dry_pdb": "1bwv.exp01.md01.dry.pdb",
        "time": "extant",
    },
    "3zxw": {
        "dry_dcd": "3zxw.exp01.md01.dry.dcd",
        "dry_pdb": "3zxw.exp01.md01.dry.pdb",
        "time": "extant",
    },
    "6ftl": {
        "dry_dcd": "6ftl.exp01.md01.dry.dcd",
        "dry_pdb": "6ftl.exp01.md01.dry.pdb",
        "time": "extant",
    },
    "6ura": {
        "dry_dcd": "6ura.exp01.md01.dry.dcd",
        "dry_pdb": "6ura.exp01.md01.dry.pdb",
        "time": "extant",
    },
    "8ruc": {
        "dry_dcd": "8ruc.exp01.md01.dry.dcd",
        "dry_pdb": "8ruc.exp01.md01.dry.pdb",
        "time": "extant",
    },
    "anci": {
        "dry_dcd": "anci.exp01.md01.dry.dcd",
        "dry_pdb": "anci.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "anciab": {
        "dry_dcd": "anciab.exp01.md01.dry.dcd",
        "dry_pdb": "anciab.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "ancicd": {
        "dry_dcd": "ancicd.exp01.md01.dry.dcd",
        "dry_pdb": "ancicd.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "ancia": {
        "dry_dcd": "ancia.exp01.md01.dry.dcd",
        "dry_pdb": "ancia.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "ancib": {
        "dry_dcd": "ancib.exp01.md01.dry.dcd",
        "dry_pdb": "ancib.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "anciip": {
        "dry_dcd": "anciip.exp01.md01.dry.dcd",
        "dry_pdb": "anciip.exp01.md01.dry.pdb",
        "time": "ancient",
    },
    "ancip": {
        "dry_dcd": "ancip.exp01.md01.dry.dcd",
        "dry_pdb": "ancip.exp01.md01.dry.realigned.pdb",
        "time": "ancient",
    },
}
# Output locations. The cells of this notebook write to "rbcl_alignment",
# "rbcs_alignment", "rbcs_rmsf", "rbcl_rmsf" and "pdb_bfactor_map".
# "nb" is not referenced by any cell here -- presumably it is the path of the
# executed notebook, used by the pipeline runner (TODO confirm).
# NOTE: every path is absolute and machine-specific.
product = {
    "nb": "/mnt/researchdrive/bCuevas/r23/github_repo/RuBisCO_evolution/analysis/output/out-02-R23-RMSF.ipynb",
    "pdb_bfactor_map": "/mnt/researchdrive/bCuevas/r23/github_repo/RuBisCO_evolution/analysis/output/1bwv-RMSF.pdb",
    "rbcs_rmsf": "/mnt/researchdrive/bCuevas/r23/github_repo/RuBisCO_evolution/analysis/output/rbcs-rmsf.csv",
    "rbcl_rmsf": "/mnt/researchdrive/bCuevas/r23/github_repo/RuBisCO_evolution/analysis/output/rbcl-rmsf.csv",
    "rbcl_alignment": "/mnt/researchdrive/bCuevas/r23/github_repo/RuBisCO_evolution/analysis/output/rbcl.fasta",
    "rbcs_alignment": "/mnt/researchdrive/bCuevas/r23/github_repo/RuBisCO_evolution/analysis/output/rbcs.fasta",
}


# R23 - MD alignment and RMSF extraction

## Contents

1. Alignment
2. RMSD
3. RMSD against average structure
4. RMSD over subunit
5. RMSF

## Alignment and preparation

In [3]:
import json
import os

import MDAnalysis as mda
import pandas as pd
import prody as pdy
from MDAnalysis.analysis import rms, align
from pyfamsa import Aligner, Sequence

from mdtools.mapping import map_alignment_to_structure, align_structure_sequences
#matplotlib.rcParams['fontname'] = "Arial"



In [4]:
def load_trajectory(code, data, path):
    """
    Load everything needed to analyse one simulation.

    Parameters
    ----------
    code : label of the simulation; returned unchanged under ``code``.
    data : mapping providing the ``dry_pdb`` and ``dry_dcd`` file names and
        the ``time`` label of the simulation.
    path : directory that contains both files. A trailing path separator is
        optional because the names are combined with ``os.path.join``.

    Returns
    -------
    dict with the entries
    - ``code``: the given code
    - ``trajectory_dry``: an MDAnalysis Universe built from the pdb and the
      dried (water-removed) dcd, loaded in memory
    - ``reference_pdy``: the starting PDB parsed with ProDy (pretty useful
      when MDAnalysis fails)
    - ``reference``: an MDAnalysis Universe with the starting PDB only
    - ``time``: ``data['time']``
    """
    # Joining instead of concatenating keeps the result identical for a `path`
    # that ends with a separator and also works for one that does not.
    pdb_file = os.path.join(path, data['dry_pdb'])
    dcd_file = os.path.join(path, data['dry_dcd'])
    return dict(
        code=code,
        trajectory_dry=mda.Universe(
            pdb_file,
            dcd_file,
            frames='all', in_memory=True
        ),
        reference_pdy=pdy.parsePDB(pdb_file),
        reference=mda.Universe(pdb_file),
        time=data['time']
    )


Loading all the simulations.

In [5]:
# Load every simulation listed in `md_reference_data` and collect the
# resulting records in a table indexed by simulation code.
path = '../../../simulations/'
exp01_md = []
for sim_code, sim_files in md_reference_data.items():
    # The progress line is completed once the (slow) loading has finished.
    print(f"-- code {sim_code}", end='')
    record = load_trajectory(code=sim_code, data=sim_files, path=path)
    exp01_md.append(record)
    print(" loaded")
exp01_md = pd.DataFrame.from_records(exp01_md).set_index('code')

-- code 1bwv



@> 77672 atoms and 1 coordinate set(s) were parsed in 2.86s.


 loaded
-- code 3zxw

@> 69984 atoms and 1 coordinate set(s) were parsed in 0.66s.


 loaded
-- code 6ftl

@> 77416 atoms and 1 coordinate set(s) were parsed in 0.85s.


 loaded
-- code 6ura

@> 54648 atoms and 1 coordinate set(s) were parsed in 1.53s.


 loaded
-- code 8ruc

@> 74224 atoms and 1 coordinate set(s) were parsed in 0.85s.


 loaded
-- code anci

@> 72128 atoms and 1 coordinate set(s) were parsed in 1.77s.


 loaded
-- code anciab

@> 72640 atoms and 1 coordinate set(s) were parsed in 0.88s.


 loaded
-- code ancicd

@> 76208 atoms and 1 coordinate set(s) were parsed in 0.41s.


 loaded
-- code ancia

@> 73536 atoms and 1 coordinate set(s) were parsed in 3.55s.


 loaded
-- code ancib

@> 72352 atoms and 1 coordinate set(s) were parsed in 1.66s.


 loaded
-- code anciip

@> 55120 atoms and 1 coordinate set(s) were parsed in 5.33s.


 loaded
-- code ancip

@> 54768 atoms and 1 coordinate set(s) were parsed in 1.97s.


 loaded


# Analysis of the RMSF

This part is a little bit convoluted because it requires mapping the residues to a common alignment (otherwise, the comparison doesn't make much sense).

In [6]:
def alignment_to_fasta(alignment):
    """
    Serialise aligned sequences as FASTA text.

    Parameters
    ----------
    alignment : iterable of mappings, each with an ``'id'`` and a
        ``'sequence'`` entry.

    Returns
    -------
    str with one ``>id`` header line followed by one sequence line per record.
    The identifier follows ``>`` directly (no space), so strict parsers do not
    read an empty identifier, and every line -- including the last one -- ends
    with a newline. An empty alignment gives an empty string.
    """
    return "".join(
        f">{item['id']}\n{item['sequence']}\n" for item in alignment
    )

In [7]:
# Align the chain-A sequences of every loaded structure.
large_subunits_alignment = align_structure_sequences(
    exp01_md['reference_pdy'].to_list(),
    exp01_md.index.to_list(),
    chain='A',
)

# LITTLE HACK FOR 6URA
# `small_subunit_index` is True for the codes that are *left out* of the chain-B
# alignment. NOTE(review): presumably these structures have no chain B -- confirm.
small_subunit_index = exp01_md.index.isin(['6ura', 'anciip', 'ancip'])
structures_with_chain_b = exp01_md['reference_pdy'][~small_subunit_index]
codes_with_chain_b = exp01_md[~small_subunit_index].index
small_subunits_alignment = align_structure_sequences(
    structures_with_chain_b.to_list(),
    codes_with_chain_b.to_list(),
    chain='B',
)

# Export both alignments as FASTA files.
with open(product['rbcl_alignment'], 'w') as f:
    f.write(alignment_to_fasta(large_subunits_alignment))
with open(product['rbcs_alignment'], 'w') as f:
    f.write(alignment_to_fasta(small_subunits_alignment))

In [8]:
# Reshape the alignment records so that each simulation code is a column.
# NOTE: this rebinds `large_subunits_alignment` (record list -> DataFrame), so the
# cell can only be re-run after the cell that builds the alignment has been re-run.
large_subunits_alignment = pd.DataFrame(
    large_subunits_alignment
).set_index('id').T


# MDAnalysis selections of the atoms named CA, by segment id.
# `upper_lsu` and `lower_lsu` are defined but not used in this cell.
upper_lsu = 'name CA and (segid A or segid E or segid I or segid M)'
lower_lsu = 'name CA and (segid C or segid G or segid K or segid O)'
lsu = 'name CA and (segid A or segid E or segid I or segid M or segid C or segid G or segid K or segid O)'
# The same eight identifiers, written as a ProDy selection on chains.
lsu_pdy = '(chain A or chain C or chain E or chain G or chain I or chain K or chain M or chain O) and name CA'

exp01_rbcl_rmsf = []

for key, item in exp01_md.iterrows():
    print(f"-- {key} ", end="")
    u = item['trajectory_dry'].select_atoms(lsu)
    rmsf_values = rms.RMSF(u).run().results.rmsf
    forward_residue_map, backward_residue_map = map_alignment_to_structure(
        structure=item['reference_pdy'],
        aligned_sequence=large_subunits_alignment[key].to_list()[0],
        on_chain='A'
    )
    try:
        reference_pdy = item['reference_pdy'].select(lsu_pdy)
    except TypeError:
        # NOTE(review): the retry issues the very same selection; presumably a
        # workaround for a first call that fails intermittently -- confirm
        # before removing it.
        reference_pdy = item['reference_pdy'].select(lsu_pdy)

    # MDAnalysis residues and ProDy residues are paired by position; the ProDy
    # residue is only used to read the chain identifier.
    for i, (res, res_pdy) in enumerate(zip(u.residues, reference_pdy.getHierView().iterResidues())):
        try:
            # Position of the residue number in the backward map; stored as `resnum`.
            # Assumes `backward_residue_map` is a list/tuple, whose `.index`
            # raises ValueError on a miss -- TODO confirm against mdtools.
            resnum = backward_residue_map.index(res.resnum)
        except ValueError:
            # Residue number absent from the map: skipped silently, as before.
            # Any other error (including KeyboardInterrupt) now propagates
            # instead of being swallowed by a bare `except`.
            continue
        exp01_rbcl_rmsf.append(
            dict(
                rmsf=rmsf_values[i], resnum=resnum, chain=res_pdy.getChid(), code=key
            )
        )
    print(" done")


exp01_rbcl_rmsf = pd.DataFrame.from_records(exp01_rbcl_rmsf)

-- 1bwv 

 done
-- 3zxw 

 done
-- 6ftl 

 done
-- 6ura 

 done
-- 8ruc 

 done
-- anci 

 done
-- anciab 

 done
-- ancicd 

 done
-- ancia 

 done
-- ancib 

 done
-- anciip 

 done
-- ancip 

 done


In [9]:
# Reshape the alignment records so that each simulation code is a column.
# NOTE: this rebinds `small_subunits_alignment` (record list -> DataFrame), so the
# cell can only be re-run after the cell that builds the alignment has been re-run.
small_subunits_alignment = pd.DataFrame(
    small_subunits_alignment
).set_index('id').T

# Segment-id based selections; none of the three is used below, where the
# trajectory is selected by `chainID` instead.
upper_ssu = 'name CA and (segid B or segid F or segid J or segid N)'
lower_ssu = 'name CA and (segid D or segid H or segid L or segid P)'
ssu = 'name CA and (segid B or segid F or segid J or segid N or segid D or segid H or segid L or segid P)'
# ProDy selection of the atoms named CA in the eight chains B, D, ..., P.
ssu_pdy = '(chain B or chain D or chain F or chain H or chain J or chain L or chain N or chain P) and name CA'


exp01_rbcs_rmsf = []

for key, item in exp01_md.iterrows():
    print(f"-- {key} ", end="")
    # Same three codes that were left out of the chain-B alignment.
    if key in ['6ura', 'anciip', 'ancip']:
        print(" skipped")
        continue
    u = item['trajectory_dry'].select_atoms('(chainID B or chainID D or chainID F or chainID H or chainID J or chainID L or chainID N or chainID P) and name CA')
    rmsf_values = rms.RMSF(u).run().results.rmsf
    forward_residue_map, backward_residue_map = map_alignment_to_structure(
        structure=item['reference_pdy'],
        aligned_sequence=small_subunits_alignment[key].to_list()[0],
        on_chain='B'
    )
    try:
        reference_pdy = item['reference_pdy'].select(ssu_pdy)
    except KeyError:
        # NOTE(review): the retry issues the very same selection; presumably a
        # workaround for a first call that fails intermittently -- confirm
        # before removing it.
        reference_pdy = item['reference_pdy'].select(ssu_pdy)

    # MDAnalysis residues and ProDy residues are paired by position; the ProDy
    # residue is only used to read the chain identifier.
    for i, (res, res_pdy) in enumerate(zip(u.residues, reference_pdy.getHierView().iterResidues())):
        try:
            # Position of the residue number in the backward map; stored as `resnum`.
            # Assumes `backward_residue_map` is a list/tuple, whose `.index`
            # raises ValueError on a miss -- TODO confirm against mdtools.
            resnum = backward_residue_map.index(res.resnum)
        except ValueError:
            # Residue number absent from the map: reported and skipped. Any
            # other error (including KeyboardInterrupt) now propagates instead
            # of being swallowed by a bare `except`.
            print(f"error with {res.resnum}")
            continue
        exp01_rbcs_rmsf.append(
            dict(
                rmsf=rmsf_values[i], resnum=resnum, chain=res_pdy.getChid(), code=key,
                time=item['time']
            )
        )
    print(" done")

exp01_rbcs_rmsf = pd.DataFrame.from_records(exp01_rbcs_rmsf)


-- 1bwv 

 done
-- 3zxw 

 done
-- 6ftl 

 done
-- 6ura  skipped
-- 8ruc 

 done
-- anci 

 done
-- anciab 

 done
-- ancicd 

 done
-- ancia 

 done
-- ancib 

 done
-- anciip  skipped
-- ancip  skipped


Finally, we map the RMSF values onto a protein structure to visualize them spatially.

In [10]:
# Write both RMSF tables to the CSV files named in `product`
# (small-subunit table first, then the large-subunit table).
for rmsf_table, product_key in (
    (exp01_rbcs_rmsf, 'rbcs_rmsf'),
    (exp01_rbcl_rmsf, 'rbcl_rmsf'),
):
    rmsf_table.to_csv(product[product_key])

In [11]:
# Store the 1bwv RMSF, averaged per `resnum`, in the B-factor field of the
# chain-B residues of the 1bwv reference structure, then write that chain out.
chain_b_1bwv = exp01_md['reference_pdy'].loc['1bwv'].select('chain B')
mean_rmsf_1bwv = (
    exp01_rbcs_rmsf
    .query('code == "1bwv"')
    .groupby('resnum')['rmsf']
    .mean()
    .tolist()
)
# Residues and averages are paired by position; zip stops at the shorter
# sequence, so unpaired residues keep the B-factors they already had.
for residue, rmsf in zip(chain_b_1bwv.getHierView().iterResidues(), mean_rmsf_1bwv):
    residue.setBetas(rmsf)
pdy.writePDB(product['pdb_bfactor_map'], chain_b_1bwv)

'/mnt/researchdrive/bCuevas/r23/github_repo/RuBisCO_evolution/analysis/output/1bwv-RMSF.pdb'