# Structural Analysis of RosettaScript PDB Output
The goal of this notebook is to develop a general method for correlating structural variation in RosettaScript output (i.e. RMSD) with interesting factors (DDG scores, ensemble size, trial sizes, kT).<br>
<br>
<b>Things to think about:</b>
* How much backbone variation is required/useful?
* Input starting PDB, WT output PDBs, and mutant output PDBs
* All-vs-all RMSDs for any outputs (DDG WT, DDG mutants, minimization comparison structures)
* RMSD for 8A repack sphere (repacking) and overall structure

In [29]:
from Bio.PDB import *
import os

#From Kyle's Finalize.py
def read_mutations_resfile(filenum_dir):
    resfile = os.path.join(filenum_dir, 'mutations_repack.resfile')
    mutations = []
    with open(resfile, 'r') as f:
        post_start = False
        for line in f:
            if post_start:
                line = line.strip()
                pdb_resnum, chain, pikaa, mut_res = line.split()
                mutations.append( [pdb_resnum, chain, pikaa, mut_res] )
            elif line.startswith('start'):
                post_start = True
    return mutations

#Calculates sidechain rmsd        
def rms_mutation(mutations, sidechain_ref, sidechain_target):
    
    #Superimpose
    super_imposer = Bio.PDB.Superimposer()
    
    ref_atoms = sidechain_ref.get_unpacked_list()
    target_atoms = sidechain_target.get_unpacked_list()
    
    super_imposer.set_atoms(ref_atoms, target_atoms)
    return super_imposer.rms

#Calculates global CA rmsd   
def rms_global(ref_coords, alt_coords):
    super_imposer = Bio.PDB.Superimposer()
    ref_atoms = [Atom.Atom('CA','','','','','','CA', element='C') for i, coords in enumerate(ref_coords)]
    target_atoms = [Atom.Atom('CA','','','','','','CA', element='C') for i, coords in enumerate(alt_coords)]
    
    for i in ref_atoms:
        print i
    #super_imposer.set_atoms(ref_atoms, target_atoms)
    return super_imposer.rms

#Input structure
parser = PDBParser()
structure_ref = parser.get_structure('Ref', 'TestJobs/data/59648/1TM1_EI.pdb')
structure_target = parser.get_structure('Target', 'PDB_REDO_Stripped/1TM1_EI.pdb')
    
#Set selection in Bio.PDB speak for ref and target PDBs
for i in mutations:
    sidechain_ref = structure_ref[0][str(i[1])][int(i[0])]
    sidechain_target = structure_target[0][str(i[1])][int(i[0])]

mutations = read_mutations_resfile('TestJobs/data/59648/')
rms_mutation(mutations, sidechain_ref, sidechain_target)
rms_global(structure_ref, structure_target)

<Atom CA>


## Kyle's RMSD Script 

In [None]:
#!/usr/bin/python

program_description = "Script to calculate pairwise RMSD of ensemble"

# Use pyRMSD for RMSD calculations if available as it's much faster than BioPython
try:
    import pyRMSD
    from pyRMSD.matrixHandler import MatrixHandler
    import pyRMSD.RMSDCalculator
    from pyRMSD.availableCalculators import availableCalculators
    # Use CUDA for GPU calculations, if avialable
    if 'QCP_CUDA_MEM_CALCULATOR' in availableCalculators():
        pyrmsd_calc = 'QCP_CUDA_MEM_CALCULATOR'
    else:
        pyrmsd_calc = 'QCP_SERIAL_CALCULATOR'
except ImportError:
    print 'Warning: could not import faster RMSD module. Falling back to slower biopython...'
    pyRMSD = None
    import Bio.PDB
    import Bio.PDB.Atom as Atom

def calc_rms(ref_coords, alt_coords):
    # print ref_coords, alt_coords
    assert( len(ref_coords) == len(alt_coords) )
    if pyRMSD:
        calculator = pyRMSD.RMSDCalculator.RMSDCalculator(pyrmsd_calc, np.array([ref_coords, alt_coords]))
        return calculator.pairwiseRMSDMatrix()[0]
    else:
        super_imposer = Bio.PDB.Superimposer()
        ref_atoms = [Atom.Atom('CA', coords, 0.0, 1.0, '', ' CA ', i+1, element='C') for i, coords in enumerate(ref_coords)]
        alt_atoms = [Atom.Atom('CA', coords, 0.0, 1.0, '', ' CA ', i+1, element='C') for i, coords in enumerate(alt_coords)]
        super_imposer.set_atoms(ref_atoms, alt_atoms)
        return super_imposer.rms

def main():
    parser = argparse.ArgumentParser(description=program_description)

    parser.add_argument('-d', '--database_input',
                        nargs = '+',
                        help = 'Database to read coordinates from')
    parser.add_argument('-f', '--folder',
                        nargs = '+',
                        help = 'Folder to search for PDB files')
    parser.add_argument('-p', '--pdb_file',
                        nargs = '+',
                        help = 'PDB file to cluster')
    parser.add_argument('--single_thread',
                        default = False,
                        action = 'store_true',
                        help = 'Do not use multiprocessing')

    args = parser.parse_args()

    if args.single_thread:
        cpu_count = 1
    else:
        cpu_count = multiprocessing.cpu_count()
        pool = multiprocessing.Pool(cpu_count)

    results_dict = {}

    print 'Starting calculating RMS for all paths vs. all first fragments for each position'
    total_count = len(path_data)
    starting_time = time.time()

    def helper_callback(outer_results):
        for results_tuple in outer_results:
            path_number, rms_results = results_tuple
            results_dict[path_number] = rms_results

    path_nums_for_jobs = [[] for x in xrange(cpu_count)]
    path_coords_for_jobs = [[] for x in xrange(cpu_count)]
    for i, path_coords in enumerate([[anchor_points[x] for x in path] for path in path_data]):
        path_nums_for_jobs[ i%cpu_count ].append( i )
        path_coords_for_jobs[ i%cpu_count ].append( path_coords )

    for path_nums_list, path_coords_list in zip(path_nums_for_jobs, path_coords_for_jobs):
        if args.single_thread:
            helper_callback( paths_against_all_fragments(path_nums_list, path_coords_list, starting_time, total_count, path_length) )
        else:
            pool.apply_async(paths_against_all_fragments, (path_nums_list, path_coords_list, starting_time, total_count, path_length), callback=helper_callback)

    if not args.single_thread:
        pool.close()
        pool.join()

    n = int(reporter_n.value)
    completion_time = time.time()
    print 'Finished! Processed %d %s, took %.3f seconds\n' % (n, 'paths', completion_time-starting_time)

if __name__ == "__main__":
    main()