In [1]:
from collections import namedtuple
from os import listdir

import ipywidgets as widgets
import pandas as pd
import numpy as np

## Atom structure

In [42]:
Atom = namedtuple('Atom', ['name', 'element', 'aminoacid', 'x', 'y', 'z'])

Atom.parse_name = lambda string: string[12:16].strip()
Atom.parse_element = lambda string: string[12] if len(string.strip()) < 78 else string[76:78].strip()
Atom.parse_aminoacid = lambda string: string[17:20].strip()
Atom.parse_x = lambda string: float(string[30:38].strip())
Atom.parse_y = lambda string: float(string[38:46].strip())
Atom.parse_z = lambda string: float(string[46:54].strip())

Atom.parse = lambda string: Atom(
    Atom.parse_name(string), Atom.parse_element(string), Atom.parse_aminoacid(string),
    Atom.parse_y(string), Atom.parse_x(string), Atom.parse_z(string))

## Select the molecule to compare

In [43]:
path_molecules_to_compare = [f for f in listdir() if 'reference' not in f and f.endswith('.pdb')]
print(path_molecules_to_compare)

['1ACW-01.pdb', '1ACW-02.pdb', '1ACW-03.pdb', '1ACW-04.pdb', '1ACW-05.pdb', '1ACW-06.pdb']


## Read data from pdb file

In [44]:
def read_molecule(path):
    atoms = []
    with open(path, 'r') as f:
        for line in f:
            if line.startswith('ATOM'):
#                 print(line)
                try:
                    atoms.append(Atom.parse(line))
                except Exception as e:
                    print(e)
    return atoms

In [45]:
reference = pd.DataFrame(read_molecule('reference.pdb'))
molecules_to_compare = [pd.DataFrame(read_molecule(m)) for m in path_molecules_to_compare]

In [165]:
molecules_to_compare[0][:10]

Unnamed: 0,name,element,aminoacid,x,y,z
0,N,N,VAL,-2.044,-1.352,0.0
1,H1,H,VAL,-1.261,-1.99,0.0
2,H2,H,VAL,-2.54,-1.446,0.875
3,H3,H,VAL,-2.54,-1.446,-0.875
4,CA,C,VAL,-1.523,0.0,0.0
5,HA,H,VAL,-1.871,0.524,0.89
6,CB,C,VAL,-1.989,0.768,-1.232
7,HB,H,VAL,-3.078,0.814,-1.241
8,CG1,C,VAL,-1.5,0.056,-2.49
9,HG11,H,VAL,-0.411,0.01,-2.481


In [146]:
reference[:10]

Unnamed: 0,name,element,aminoacid,x,y,z
0,N,N,VAL,0.298,0.965,-0.467
1,CA,C,VAL,0.25,1.811,-1.701
2,C,C,VAL,0.4,3.29,-1.32
3,O,O,VAL,1.053,3.628,-0.346
4,CB,C,VAL,1.384,1.417,-2.664
5,CG1,C,VAL,1.262,-0.064,-3.035
6,CG2,C,VAL,2.744,1.666,-2.002
7,H1,H,VAL,1.18,1.151,0.051
8,H2,H,VAL,0.255,-0.04,-0.731
9,H3,H,VAL,-0.512,1.196,0.142


## RMSD calculation

### No Translation

In [143]:
def apply_transformations(transformation, df):
    homogeneous_coordinates = df[['x', 'y', 'z']].copy()
    homogeneous_coordinates['_'] = 1.0
    return homogeneous_coordinates.dot(transformation)
    
def RMSD(molecule1, molecule2, functions=['CA'], transformation1=np.eye(4), transformation2=np.eye(4)):
    molecule1 = molecule1[molecule1.name.isin(functions)].copy()
    molecule1 = apply_transformations(transformation1, molecule1)

    molecule2 = molecule2[molecule2.name.isin(functions)].copy()
    molecule2 = apply_transformations(transformation2, molecule2)
    
    smaller_size = min(len(molecule1), len(molecule2))

    squared_differences = (molecule1[:smaller_size].reset_index(drop=True) - \
                           molecule2[:smaller_size].reset_index(drop=True)) ** 2

    squared_distance = squared_differences.sum(axis=1)
    average_squared_distance = squared_distance.mean()
    average_distance = np.sqrt(average_squared_distance)
    return average_distance

In [145]:
print('Not applying any transfomation.')
for idx, molecule_to_compare in enumerate(molecules_to_compare):
    print('RMSD between the reference and molecule %d: %f' % \
          (idx + 1, RMSD(reference, molecules_to_compare[idx])))

Not applying any transfomation.
RMSD between the reference and molecule 1: 23.519198
RMSD between the reference and molecule 2: 35.708949
RMSD between the reference and molecule 3: 23.615692
RMSD between the reference and molecule 4: 20.787576
RMSD between the reference and molecule 5: 22.118981
RMSD between the reference and molecule 6: 25.326716


### With translation.

In [160]:
ref_point = reference[reference.name == 'CA'].iloc[0]
ref_point

name            CA
element          C
aminoacid      VAL
x             0.25
y            1.811
z           -1.701
Name: 1, dtype: object

In [171]:
ref_point = reference[reference.name == 'CA'].iloc[0]

def infer_translation_matrix(molecule_to_compare):
    ref_to_displace = molecule_to_compare[molecule_to_compare.name == 'CA'].iloc[0]
    print(ref_to_displace)
    
    T_MATRIX = np.eye(4)
    T_MATRIX[0, 3] = (ref_point.x - ref_to_displace.x)
    T_MATRIX[1, 3] = (ref_point.y - ref_to_displace.y)
    T_MATRIX[2, 3] = (ref_point.z - ref_to_displace.z)
    return T_MATRIX

In [173]:
print('Applying transfomation where the first reference alpha carbon is the center of both molecules.')
for idx, molecule_to_compare in enumerate(molecules_to_compare):
    translation = infer_translation_matrix(molecule_to_compare)
    print('RMSD between the reference and molecule %d: %f' % \
          (idx + 1, RMSD(reference, molecules_to_compare[idx], transformation1=translation)))

Applying transfomation where the first reference alpha carbon is the center of both molecules.
name            CA
element          C
aminoacid      VAL
x           -1.523
y                0
z                0
Name: 4, dtype: object
RMSD between the reference and molecule 1: 45.495990
name            CA
element          C
aminoacid      VAL
x           -1.523
y                0
z                0
Name: 4, dtype: object
RMSD between the reference and molecule 2: 52.838069
name            CA
element          C
aminoacid      VAL
x           -1.523
y                0
z                0
Name: 4, dtype: object
RMSD between the reference and molecule 3: 45.545948
name            CA
element          C
aminoacid      VAL
x           -1.523
y                0
z                0
Name: 4, dtype: object
RMSD between the reference and molecule 4: 44.145846
name            CA
element          C
aminoacid      VAL
x           -1.523
y                0
z                0
Name: 4, dtype: object
RMSD bet