## Code for separating & analyzing contact frequency of U30 on SERF
This script takes the following input files (paths are relative to the `trajectory_processing_example/` directory):
- 'full_traj/full.pdb'
- 'full_traj/full.xtc'
- 'ana_batch/bound_frame_list.txt'
- 'ana_batch/unbound_frame_list.txt'

NB: the trajectory and topology files should be pre-concatenated so that they contain all replicates; the trajectory must also already be corrected for periodic boundary conditions (PBC).

In [1]:
## import packages ##

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mdtraj as md
import pandas as pd
import itertools

In [2]:
## load trajectory w/ mdtraj ##
# coordinates are read from the .xtc file; the .pdb file supplies the topology
xtc_file = 'trajectory_processing_example/full_traj/full.xtc'
pdb_file = 'trajectory_processing_example/full_traj/full.pdb'

print('Loading trajectory...')
traj = md.load(xtc_file, top=pdb_file)
print('Trajectory loaded!')

## quality control: report the size of the system that was just loaded
qc_report = (
    ('How many atoms?   %s', traj.n_atoms),
    ('How many residues?    %s', traj.n_residues),
    ('How many chains?     %s', traj.n_chains),
)
for message, count in qc_report:
    print(message % count)

Loading trajectory...
Trajectory loaded!
How many atoms?   97
How many residues?    97
How many chains?     2


In [4]:
## load in lists of bound and unbound frames
# ndmin=1 keeps the result 1-D even when a file lists only a single frame;
# without it np.loadtxt returns a 0-d array, which cannot be iterated below.
bound_list = np.loadtxt('trajectory_processing_example/ana_batch/bound_frame_list.txt', ndmin=1)
unbound_list = np.loadtxt('trajectory_processing_example/ana_batch/unbound_frame_list.txt', ndmin=1)

# np.loadtxt reads the values as floats by default; frame indices must be
# plain integers before they can be used to slice the trajectory
bound_list_int = [int(x) for x in bound_list]
unbound_list_int = [int(y) for y in unbound_list]

In [5]:
## slice trajectory into bound and unbound frames
# the same slicing is applied with each frame list, in (bound, unbound) order

frame_sets = (bound_list_int, unbound_list_int)
bound_traj, unbound_traj = [traj.slice(frame_ids) for frame_ids in frame_sets]

In [6]:
## define protein and rna molecules in order to analyze them separately
# atom indices of each molecule, selected by chain: chain 0 -> protein, chain 1 -> rna
topology = traj.topology
protein = topology.select("chainid == 0")
rna = topology.select("chainid == 1")

# protein-only copies of the bound, unbound and full trajectories
bound_protein_traj, unbound_protein_traj, protein_slice = (
    source.atom_slice(protein) for source in (bound_traj, unbound_traj, traj)
)

In [20]:
## compute Rgs for bound and unbound frames
# the md.compute_rg output is multiplied by 10 to convert nm > Angstroms
NM_TO_ANGSTROM = 10

boundRgs = md.compute_rg(bound_protein_traj) * NM_TO_ANGSTROM
unboundRgs = md.compute_rg(unbound_protein_traj) * NM_TO_ANGSTROM

## UNCOMMENT to write out lists of bound Rgs for downstream analysis
# (np.savetxt needs both the output file name and the array to write)
#np.savetxt('SERF+U30_bound_Rgs.txt', boundRgs)

## intermolecular contacts analysis

In [14]:
## setup for contact map for BOUND TRAJ ONLY!!! improves signal-noise ##

# get protein and rna residue indices (0-based) straight from the topology:
# chain 0 is the protein and chain 1 is the rna, matching the "chainid" selections
# used when the molecules are defined. Counting atoms and starting the rna at
# len(protein)+1 skipped the first rna residue (index len(protein)), so that
# nucleotide never appeared in any residue pair.
protein_res = [res.index for res in traj.topology.chain(0).residues]
rna_res = np.array([res.index for res in traj.topology.chain(1).residues])

# generate list of (rna residue, protein residue) pairs for distance computation
pairs = list(itertools.product(rna_res, protein_res))

# perform distance calculation on the bound frames only;
# dist is indexed as [frame][pair], pair lists the residue pair of each column
dist, pair = md.compute_contacts(bound_traj, pairs, scheme = 'ca')

distdf = pd.DataFrame(dist)
pairdf = pd.DataFrame(pair)

In [15]:
## using 15 Angstrom cutoff, define per frame and per residue pair whether there exists a contact (1 or 0)
# distances are in nm, so the 15 Angstrom cutoff is written as 1.5 nm
CONTACT_CUTOFF_NM = 1.5

# one vectorized comparison over the whole (frames x pairs) distance array replaces
# the element-by-element Python loop: 1 where the pair is within the cutoff, else 0
contact_bin = (np.asarray(dist) <= CONTACT_CUTOFF_NM).astype(int)

cbin_df = pd.DataFrame(contact_bin)


In [16]:
## now sum the total for each pair over all frames ##
# "1" means a contact (within 1.5 nm); "0" means no contact
# therefore the sum of the values over all frames is the number of frames in contact

# column-wise sum of the (frames x pairs) matrix: one contact count per residue pair
contact_sum = contact_bin.sum(axis=0)

# normalize contact frequency from 0.0 to 1.0 for each residue pair
# by dividing the count by the number of frames that were analyzed
n_frames = len(dist)
contact_freq = contact_sum / n_frames

In [17]:
## do some formatting and write out contact frequency file ##

# long-format table, one row per residue pair:
# column 0 / column 1 = the two residues of the pair (rna, protein order as built in `pairs`),
# column 2 = contact frequency of that pair
pair_freq_table = np.column_stack((pair, contact_freq)).astype(float)
contact_toplot_df = pd.DataFrame(pair_freq_table)

# wide-format table: rows keyed by column 0, columns keyed by column 1, values from column 2
indexed_df = contact_toplot_df.reset_index()
contact_df_clean = indexed_df.pivot(index=[0], columns=[1], values=[2])

# average down the rows, leaving one mean contact frequency per column-1 residue
df_flat = contact_df_clean.mean(axis='index')

# two-column output: residue number, mean contact frequency
data_out = np.transpose(np.array([protein_res, df_flat]))

## UNCOMMENT below to save contact frequency data as text file
#np.savetxt('serf-u30_contact_frequency.txt', data_out)