In [37]:
from Bio.PDB import *
import os

#import pyRMSD
#from pyRMSD.matrixHandler import MatrixHandler
#import pyRMSD.RMSDCalculator
#from pyRMSD.availableCalculators import availableCalculators

#From Kyle's Finalize.py
def read_mutations_resfile(filenum_dir):
    """Read the mutation entries listed in ``mutations_repack.resfile``.

    Every line up to and including the first one that begins with ``start``
    is treated as header and ignored.  Each remaining entry is expected to
    hold exactly four whitespace-separated fields.

    Args:
        filenum_dir: Directory that contains ``mutations_repack.resfile``.

    Returns:
        A list of ``[pdb_resnum, chain, pikaa, mut_res]`` lists of strings,
        in file order.

    Raises:
        IOError / OSError: if the resfile cannot be opened.
        ValueError: if an entry does not have exactly four fields.
    """
    resfile = os.path.join(filenum_dir, 'mutations_repack.resfile')
    mutations = []
    with open(resfile, 'r') as f:
        post_start = False
        for line_number, line in enumerate(f, 1):
            if not post_start:
                # Still in the header; the 'start' line itself is not an entry
                post_start = line.startswith('start')
                continue

            fields = line.split()
            # Blank lines (e.g. a trailing newline at the end of the file) and
            # full-line '#' comments carry no mutation and must not crash the parse
            if not fields or fields[0].startswith('#'):
                continue
            if len(fields) != 4:
                raise ValueError(
                    '%s line %d: expected 4 fields (resnum chain command residue), got %d: %r'
                    % (resfile, line_number, len(fields), line.strip())
                )
            mutations.append(fields)
    return mutations

#From Kyle's Finalize.py
def find_neighbors(filenum_dir, pdb_path, neighbor_distance = 8.0):
    """Collect residues whose CB atom lies close to the CB of a mutated residue.

    Args:
        filenum_dir: Directory holding the resfile read by
            ``read_mutations_resfile``.
        pdb_path: Path of the PDB file to parse; it must contain one model.
        neighbor_distance: CB-CB distance cutoff; a residue is kept when its
            distance to a mutated residue is strictly below this value.

    Returns:
        A set of ``(residue_id, chain_id)`` tuples, where ``residue_id`` is
        the value returned by the residue's ``get_id()``.

    Raises:
        AssertionError: if the file does not hold exactly one model.
        KeyError: if a mutated residue is not present in the structure.
    """
    mutation_entries = read_mutations_resfile(filenum_dir)
    structure = PDBParser(PERMISSIVE=1).get_structure('Open', pdb_path)

    # There should only be one model in PDB file
    assert( sum(1 for _ in structure.get_models()) == 1 )

    model = structure[0]
    chain_ids = [chain.get_id() for chain in model.get_chains()]
    nearby = set()
    for res_label, chain_label, _pikaa, _mut_aa in mutation_entries:
        # A residue label is either a plain number or a number followed by a
        # one-character insertion code
        try:
            position = int( res_label )
            insertion_code = ' '
        except ValueError:
            position = int( res_label[:-1] )
            insertion_code = res_label[-1]

        target = model[str(chain_label)][(' ', position, insertion_code)]
        for chain_name in chain_ids:
            residue_keys = [res.get_id() for res in model[chain_name].get_residues()]
            for residue_key in residue_keys:
                try:
                    # Kyle note - might be good to do something else for consistency, since not all residues have CB
                    separation = target['CB'] - model[chain_name][residue_key]['CB']
                except KeyError:
                    # One of the two residues has no CB atom; skip the pair
                    continue
                if separation < neighbor_distance:
                    nearby.add( (residue_key, chain_name) )

    return nearby

#Residue list generator
def _residue_matcher(res_id):
    """Build a predicate that tests a Bio.PDB residue id tuple against *res_id*."""
    if isinstance(res_id, tuple):
        # Full residue id such as (' ', 10, ' '): require an exact match
        return lambda residue_id: tuple(residue_id) == res_id

    label = str(res_id)
    if label and not label[-1].isdigit():
        # Number followed by an insertion code, e.g. '52A'
        number, icode = label[:-1], label[-1]
        return lambda residue_id: (str(residue_id[1]) == number
                                   and residue_id[2] == icode)

    # Plain residue number: compare the sequence number only
    return lambda residue_id: str(residue_id[1]) == label


def _is_selected_atom(atom_name):
    """Tell whether an atom name passes the hydrogen filter of generate_lists."""
    return 'H' not in atom_name or 'O' in atom_name or 'N' in atom_name


def _select_atoms(structure, chain_id, matches):
    """Return the atoms of *structure* on *chain_id* whose residue id satisfies *matches*."""
    selected = []
    for atom in structure.get_atoms():
        residue_full_id = atom.get_parent().get_full_id()
        if residue_full_id[2] != chain_id:
            continue
        if matches(residue_full_id[3]) and _is_selected_atom(atom.get_id()):
            selected.append(atom)
    return selected


def generate_lists(reslist, ref_struc, alt_struc):
    """Collect the atoms of the listed residues from two structures.

    Args:
        reslist: Iterable of entries whose first item identifies a residue and
            whose second item is the chain id.  The residue may be given as a
            residue number (int or string such as ``'10'``), a number with an
            insertion code (``'52A'``), or a full residue id tuple such as
            ``(' ', 10, ' ')``.  Any further items are ignored.
        ref_struc: Reference structure; must provide ``get_atoms()``.
        alt_struc: Structure to compare against the reference.

    Returns:
        ``(ref_atoms, alt_atoms)``: two lists of atoms, ordered by ``reslist``
        entry and then by each structure's own atom order.

    Atoms whose name contains ``'H'`` are dropped unless the name also
    contains ``'O'`` or ``'N'``, so names such as ``OH`` or ``NH1`` are kept.
    NOTE(review): a hydrogen named e.g. ``HN`` would pass this filter too.
    """
    ref_atoms = []
    alt_atoms = []

    for res in reslist:
        matches = _residue_matcher(res[0])
        ref_atoms.extend(_select_atoms(ref_struc, res[1], matches))
        alt_atoms.extend(_select_atoms(alt_struc, res[1], matches))

    return ref_atoms, alt_atoms

#Calculates global CA RMSD (only guaranteed to work on identical structures!)
#"Identical" means both come from the same starting PDB, with movers applied to one of them
def rms_global(ref_struc, alt_struc):
    """Return the RMSD over all atoms named ``CA`` in two structures.

    Atoms are paired in the order each structure yields them, so both
    structures must list their CA atoms in the same order.

    Args:
        ref_struc: Reference structure; must provide ``get_atoms()``.
        alt_struc: Structure compared against the reference.

    Returns:
        The ``rms`` attribute of the ``Superimposer`` after ``set_atoms``.

    NOTE(review): the selection is by atom name only, so any other atom
    named ``CA`` would be included as well - confirm inputs are protein-only.
    """
    ref_atoms = [atom for atom in ref_struc.get_atoms() if atom.get_name() == 'CA']
    alt_atoms = [atom for atom in alt_struc.get_atoms() if atom.get_name() == 'CA']

    # Use the star-imported class directly: ``from Bio.PDB import *`` does not
    # bind the name ``Bio``, so ``Bio.PDB.Superimposer`` is not reliably in scope
    super_imposer = Superimposer()
    super_imposer.set_atoms(ref_atoms, alt_atoms)

    return super_imposer.rms

#ALLxALL Global RMSD
def all_by_all(datadir):
    """Print and return the global CA RMSD for every pair of PDB files in a directory.

    Each ``.pdb`` file in ``datadir`` is parsed once and every unordered pair
    is compared with ``rms_global``.  Files are taken in ``os.listdir`` order,
    which is not sorted.

    Args:
        datadir: Directory to scan for files ending in ``.pdb``.

    Returns:
        A list with one RMSD per pair, in the order the pairs were printed.
    """
    parser = PDBParser()
    # List the directory once so indices stay consistent across both loops
    pdb_names = [name for name in os.listdir(datadir) if name.endswith('.pdb')]
    # Parse every file once instead of once per pair; paths are built from
    # datadir rather than a fixed job directory
    structures = [parser.get_structure(name, os.path.join(datadir, name))
                  for name in pdb_names]

    all_rmsd = []
    for i, ref_name in enumerate(pdb_names):
        for j in range(i):
            target_name = pdb_names[j]
            RMSD = rms_global(structures[i], structures[j])
            # The [9:-4] slice drops the first nine characters and the '.pdb' suffix
            print('%s:%s = %s' % (target_name[9:-4], ref_name[9:-4], RMSD))
            all_rmsd.append(RMSD)
    return all_rmsd

#Neighborhood RMSD
def rms_neighborhood(neighbors, ref_struc, alt_struc):
    """Return the RMSD over the atoms selected for a set of neighbor residues.

    Args:
        neighbors: Residue entries passed straight to ``generate_lists``.
        ref_struc: Reference structure.
        alt_struc: Structure compared against the reference.

    Returns:
        The ``rms`` attribute of the ``Superimposer`` after ``set_atoms``.
    """
    ref_atoms, alt_atoms = generate_lists(neighbors, ref_struc, alt_struc)

    def getKey(item):
        # First element of the last full-id component (the atom name)
        return item.get_full_id()[4][0]

    # Sort both lists by the same key so atoms pair up position by position;
    # sorted() is stable, so atoms sharing a name keep their original order
    ref_atoms = sorted(ref_atoms, key=getKey)
    alt_atoms = sorted(alt_atoms, key=getKey)

    # Use the star-imported class directly: ``from Bio.PDB import *`` does not
    # bind the name ``Bio``, so ``Bio.PDB.Superimposer`` is not reliably in scope
    super_imposer = Superimposer()
    super_imposer.set_atoms(ref_atoms, alt_atoms)

    return super_imposer.rms

#Calculates sidechain rmsd        
def rms_mutation(filenum_dir, ref_struc, alt_struc):
    """Return the RMSD over the atoms selected for the residues listed in the resfile.

    The residues come from ``read_mutations_resfile(filenum_dir)`` and the
    atoms from ``generate_lists``.

    Args:
        filenum_dir: Directory holding the mutations resfile.
        ref_struc: Reference structure.
        alt_struc: Structure compared against the reference.

    Returns:
        The ``rms`` attribute of the ``Superimposer`` after ``set_atoms``.
    """
    mutations = read_mutations_resfile(filenum_dir)
    ref_atoms, alt_atoms = generate_lists(mutations, ref_struc, alt_struc)

    def getKey(item):
        # First element of the last full-id component (the atom name)
        return item.get_full_id()[4][0]

    # Sort both lists by the same key so atoms pair up position by position
    ref_atoms = sorted(ref_atoms, key=getKey)
    alt_atoms = sorted(alt_atoms, key=getKey)

    # Use the star-imported class directly: ``from Bio.PDB import *`` does not
    # bind the name ``Bio``, so ``Bio.PDB.Superimposer`` is not reliably in scope
    super_imposer = Superimposer()
    super_imposer.set_atoms(ref_atoms, alt_atoms)

    return super_imposer.rms

#Input structure (for now, should transfer to a different function later)
def simple_input(ref_path='TestJobs/data/59648/1TM1_EI.pdb',
                 target_path='TestJobs/output/59648/1TM1_EI_0003.pdb'):
    """Parse a reference and a target PDB file.

    Args:
        ref_path: Path of the reference PDB; defaults to the test-job input.
        target_path: Path of the PDB to compare; defaults to one test-job output.

    Returns:
        ``(structure_ref, structure_target)`` as returned by
        ``PDBParser.get_structure``.
    """
    parser = PDBParser()
    structure_ref = parser.get_structure('Ref', ref_path)
    structure_target = parser.get_structure('Target', target_path)
    return structure_ref, structure_target

#Job-specific inputs; the neighbor search runs as soon as this cell is executed
filenum_dir = 'TestJobs/data/59648/'
datadir = 'TestJobs/output/59648'
pdb_path = filenum_dir + '1TM1_EI.pdb'
neighbors = find_neighbors(filenum_dir, pdb_path, neighbor_distance=8)

#Action!!!
def main():
    """Load the test structures and print the RMSD over the resfile residues."""
    # Other analyses that can be swapped in here:
    #   all_by_all(datadir)
    #   rms_global(ref, target)
    #   rms_neighborhood(neighbors, ref, target)
    ref, target = simple_input()
    mutation_rmsd = rms_mutation(filenum_dir, ref, target)
    print(mutation_rmsd)

main()

0.107679458755


# Structural Analysis of RosettaScript PDB Output
The goal of this notebook is to develop a general method for correlating structural variation in RosettaScript output (i.e. RMSD) with interesting factors (DDG scores, ensemble size, trial sizes, kT).<br>
<br>
<b>Things to think about:</b>
* How much backbone variation is required/useful?
* Input starting PDB, WT output PDBs, and mutant output PDBs
* All-vs-all RMSDs for any outputs (DDG WT, DDG mutants, minimization comparison structures)
* RMSD for the 8 Å repack sphere and for the overall structure

## Kyle's RMSD Script 

In [None]:
#!/usr/bin/python

program_description = "Script to calculate pairwise RMSD of ensemble"

# Prefer pyRMSD for RMSD calculations when it is installed, as it's much
# faster than BioPython; otherwise fall back to BioPython
try:
    import pyRMSD
    from pyRMSD.matrixHandler import MatrixHandler
    import pyRMSD.RMSDCalculator
    from pyRMSD.availableCalculators import availableCalculators
    # Use CUDA for GPU calculations, if available
    pyrmsd_calc = (
        'QCP_CUDA_MEM_CALCULATOR'
        if 'QCP_CUDA_MEM_CALCULATOR' in availableCalculators()
        else 'QCP_SERIAL_CALCULATOR'
    )
except ImportError:
    # A falsy pyRMSD tells calc_rms to take the BioPython code path
    pyRMSD = None
    print('Warning: could not import faster RMSD module. Falling back to slower biopython...')
    import Bio.PDB
    import Bio.PDB.Atom as Atom

def calc_rms(ref_coords, alt_coords):
    """Return the RMSD between two equally long lists of coordinates.

    Uses pyRMSD when the module-level ``pyRMSD`` name is truthy, otherwise
    BioPython's ``Superimposer``.

    Args:
        ref_coords: Sequence of reference coordinates, one entry per point.
        alt_coords: Sequence of coordinates to compare, same length.

    Returns:
        With pyRMSD, the first entry of ``pairwiseRMSDMatrix()``; otherwise
        the ``rms`` attribute of the ``Superimposer`` after ``set_atoms``.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    assert( len(ref_coords) == len(alt_coords) )
    if pyRMSD:
        # numpy is not imported at module level in this script, so bring it
        # into scope here, where it is needed
        import numpy as np
        calculator = pyRMSD.RMSDCalculator.RMSDCalculator(pyrmsd_calc, np.array([ref_coords, alt_coords]))
        return calculator.pairwiseRMSDMatrix()[0]

    # BioPython works on Atom objects, so wrap every coordinate in a placeholder CA atom
    super_imposer = Bio.PDB.Superimposer()
    ref_atoms = [Atom.Atom('CA', coords, 0.0, 1.0, '', ' CA ', i+1, element='C') for i, coords in enumerate(ref_coords)]
    alt_atoms = [Atom.Atom('CA', coords, 0.0, 1.0, '', ' CA ', i+1, element='C') for i, coords in enumerate(alt_coords)]
    super_imposer.set_atoms(ref_atoms, alt_atoms)
    return super_imposer.rms

def main():
    """Parse the command line and run the RMS jobs, optionally across a process pool.

    NOTE(review): this function uses several names that are not defined or
    imported anywhere in the visible source (``argparse``, ``multiprocessing``,
    ``time``, ``path_data``, ``anchor_points``, ``paths_against_all_fragments``,
    ``path_length``, ``reporter_n``) - confirm where they are meant to come from.
    """
    parser = argparse.ArgumentParser(description=program_description)

    # NOTE(review): -d, -f and -p are parsed but never read below
    parser.add_argument('-d', '--database_input',
                        nargs = '+',
                        help = 'Database to read coordinates from')
    parser.add_argument('-f', '--folder',
                        nargs = '+',
                        help = 'Folder to search for PDB files')
    parser.add_argument('-p', '--pdb_file',
                        nargs = '+',
                        help = 'PDB file to cluster')
    parser.add_argument('--single_thread',
                        default = False,
                        action = 'store_true',
                        help = 'Do not use multiprocessing')

    args = parser.parse_args()

    # One job bucket per CPU; the pool is only created in multiprocessing mode
    if args.single_thread:
        cpu_count = 1
    else:
        cpu_count = multiprocessing.cpu_count()
        pool = multiprocessing.Pool(cpu_count)

    # Filled by helper_callback; not read or returned in this function
    results_dict = {}

    print 'Starting calculating RMS for all paths vs. all first fragments for each position'
    total_count = len(path_data)
    starting_time = time.time()

    def helper_callback(outer_results):
        # Store each (path_number, rms_results) pair under its path number
        for results_tuple in outer_results:
            path_number, rms_results = results_tuple
            results_dict[path_number] = rms_results

    # Distribute paths round-robin over the buckets, keeping path index and
    # path coordinates in parallel lists
    path_nums_for_jobs = [[] for x in xrange(cpu_count)]
    path_coords_for_jobs = [[] for x in xrange(cpu_count)]
    for i, path_coords in enumerate([[anchor_points[x] for x in path] for path in path_data]):
        path_nums_for_jobs[ i%cpu_count ].append( i )
        path_coords_for_jobs[ i%cpu_count ].append( path_coords )

    # Run each bucket either inline or asynchronously on the pool; both routes
    # hand the worker's return value to helper_callback
    for path_nums_list, path_coords_list in zip(path_nums_for_jobs, path_coords_for_jobs):
        if args.single_thread:
            helper_callback( paths_against_all_fragments(path_nums_list, path_coords_list, starting_time, total_count, path_length) )
        else:
            pool.apply_async(paths_against_all_fragments, (path_nums_list, path_coords_list, starting_time, total_count, path_length), callback=helper_callback)

    # Wait for all submitted jobs to finish
    if not args.single_thread:
        pool.close()
        pool.join()

    # presumably reporter_n is a shared counter updated by the workers - TODO confirm
    n = int(reporter_n.value)
    completion_time = time.time()
    print 'Finished! Processed %d %s, took %.3f seconds\n' % (n, 'paths', completion_time-starting_time)

if __name__ == "__main__":
    main()