# Assignment II
* Leoni Mota Loris

* At first, Load the data.

In [1]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

In [2]:
from collections import namedtuple

Atom = namedtuple('Atom', ['name', 'element', 'aminoacid', 'x', 'y', 'z', 'h', 'residue_seq_number'])

Atom.parse_name = lambda string: string[12:16].strip()
Atom.parse_element = lambda string: string[12] if len(string.strip()) < 78 \
                     else string[76:78].strip()
Atom.parse_aminoacid = lambda string: string[17:20].strip()
Atom.parse_x = lambda string: float(string[30:38].strip())
Atom.parse_y = lambda string: float(string[38:46].strip())
Atom.parse_z = lambda string: float(string[46:54].strip())
Atom.parse_residue_seq_number = lambda string: int(string[23:30].strip())

Atom.parse = lambda string: Atom(
    Atom.parse_name(string), Atom.parse_element(string), Atom.parse_aminoacid(string),
    Atom.parse_y(string), Atom.parse_x(string), Atom.parse_z(string), 1,Atom.parse_residue_seq_number(string))

In [3]:
def read_molecule(path):
    atoms = []
    with open(path, 'r') as f:
        for line in f:
            if line.startswith('ATOM'):
                try:
                    atoms.append(Atom.parse(line))
                except Exception as e:
                    print(e)
    return pd.DataFrame(atoms)

In [4]:
crambin = read_molecule('assets/1crn.pdb')
crambin[:10]

Unnamed: 0,name,element,aminoacid,x,y,z,h,residue_seq_number
0,N,N,THR,14.099,17.047,3.625,1,1
1,CA,C,THR,12.784,16.967,4.338,1,1
2,C,C,THR,12.755,15.685,5.133,1,1
3,O,O,THR,13.825,15.268,5.594,1,1
4,CB,C,THR,12.703,18.17,5.337,1,1
5,OG1,O,THR,12.829,19.334,4.463,1,1
6,CG2,C,THR,11.546,18.15,6.304,1,1
7,N,N,THR,11.555,15.115,5.265,1,2
8,CA,C,THR,11.469,13.856,6.066,1,2
9,C,C,THR,10.785,14.164,7.379,1,2


In [5]:
import re
hsv = plt.get_cmap('hsv')

def plot_molecule(molecule):
    fig = plt.figure(figsize=(16, 9))
    atoms = list(map(lambda n: re.findall('[A-Z]+', n)[0], molecule.name))
    indexes_aminoacids = {name: i for i, name in enumerate(atoms)}
    def aminoacid_name_to_color(name):
        return hsv(indexes_aminoacids[name] / (len(indexes_aminoacids) - 1))

    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(molecule.x, molecule.y, molecule.z, c=list(map(aminoacid_name_to_color, atoms)), s=70)
    ax.set_xlabel('X');ax.set_ylabel('Y');ax.set_zlabel('Z')

    plt.show()

## Question 1

### a)

* I) First we need to map the sequence of characters as in:

**TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATTCPGDYAN**
to a sequence of amino acids 

* II) And then, infer the position of each atom of the amino acid from a standard reference ($\alpha$ carbon) 


* III) Infer the position of multiple amino acids based on the peptide bond.

#### I) map the sequence of characters

In [6]:
aminoacid_mapping = {'T': 'THR', 'C': 'CYS', 'P': 'PRO', 'S': 'SER', 'I': 'ILE',
                     'V': 'VAL', 'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'F': 'PHE',
                     'L': 'LEU', 'G': 'GLY', 'E': 'GLU', 'D': 'ASP', 'Y': 'TYR'}
    
def map_to_aminoacids(sequence):
    return list(map(lambda char: aminoacid_mapping[char], sequence))

In [7]:
aminoacid_sequence = 'TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATTCPGDYAN'

In [8]:
aminoacids_names = map_to_aminoacids(aminoacid_sequence)

______________________________________________________________________________________________________________
______________________________________________________________________________________________________________
______________________________________________________________________________________________________________
______________________________________________________________________________________________________________
    

#### II) infer the position of each atom of the amino acid. For this part, it is not well defined on how to infer the positions.


* We could map this sequence to known sequences of the PDB and hope it is a known protein. This is called 
**Homology Modelling** (**SWISS-MODEL** is an example).


* *Or*, we could predict the protein structure using an any number of softwares such as **I-TASSER (Iterative Threading ASSEmbly Refinement)**


We're are going to use both and compare the results.

I-TASSER:
![assets/TASSER.png](assets/TASSER.png)

* But, if we're not using something already pre-defined, we should get the pdb file for each of those amino acids and infer the position of each amino acid by the peptide bond.

*Obs.:* PDB's for each amino acid were downloaded here:
https://www.nyu.edu/pages/mathmol/library/life/life1.html

In [9]:
aminoacids_df = list(map(lambda aminoacid_name: read_molecule('assets/' + aminoacid_name + '.pdb'), aminoacids_names))

In [10]:
display(aminoacids_df[0])

Unnamed: 0,name,element,aminoacid,x,y,z,h,residue_seq_number
0,N,,THR,-0.083,0.08,0.0,1,1
1,CA,,THR,-0.033,1.539,0.0,1,1
2,C,,THR,1.394,2.032,0.0,1,1
3,O,,THR,2.339,1.268,0.0,1,1
4,CB,,THR,-0.818,2.116,-1.226,1,1
5,OG1,,THR,-2.188,1.743,-1.162,1,1
6,CG2,,THR,-0.842,3.654,-1.361,1,1
7,1H,1.0,THR,0.813,-0.523,0.0,1,1
8,2H,2.0,THR,-1.018,-0.461,0.0,1,1
9,HA,,THR,-0.508,1.904,0.929,1,1


______________________________________________________________________________________________________________
______________________________________________________________________________________________________________
______________________________________________________________________________________________________________
______________________________________________________________________________________________________________
    

#### III) Infer the position of multiple amino acids based on the peptide bond.


* After the nucleotilic attack the resonance delocalization of electrons in the peptide bond forces the creation of a plane where the atoms will be located (amide plan).

In [11]:
import numpy as np

def translation_matrix(delta_x, delta_y, delta_z, ref_x=.0, ref_y=.0, ref_z=.0):
    T = np.eye(4)
    T[0, -1], T[1, -1], T[2, -1] = delta_x - ref_x, delta_y - ref_y, delta_z - ref_z
    return T

def rotation_matrix(x_rotation_rad, y_rotation_rad, z_rotation_rad):
    Rx, Ry, Rz = np.eye(4), np.eye(4), np.eye(4)

    Rx[1, 1], Rx[1, 2], Rx[2, 1], Rx[2, 2] =\
        np.cos(x_rotation_rad), -np.sin(x_rotation_rad), np.sin(x_rotation_rad), np.cos(x_rotation_rad)

    Ry[0, 0], Ry[0, 2], Ry[2, 0], Rx[2, 2] =\
        np.cos(y_rotation_rad), np.sin(y_rotation_rad), -np.sin(y_rotation_rad), np.cos(y_rotation_rad)

    Rz[0, 0], Rx[0, 1], Rx[1, 0], Rx[1, 1] =\
        np.cos(z_rotation_rad), -np.sin(z_rotation_rad), np.sin(z_rotation_rad), np.cos(z_rotation_rad)

    return Rx * Ry * Rz

def apply_transformations(transformation, df):
    homogeneous_coordinates = df[['x', 'y', 'z']].copy()
    return transformation.dot(homogeneous_coordinates.values.T).T[:, :-1]

molecule2[['x', 'y','z']] = apply_transformations(transformation2, molecule2)

NameError: name 'transformation2' is not defined

In [119]:
def form_pepitide_bond(previous_aminoacid, current_aminoacid):
    previous_aminoacid[previous_aminoacid.name == 'C']
    

for i in range(1, len(aminoacids_df)):
    current_aminoacid = aminoacids_df[i]
    previous_aminoacid = aminoacids_df[i - 1]
    form_pepitide_bond(previous_aminoacid, current_aminoacid)