### Attempt to make a new, improved x-ray structure analyser

In [1]:
from typing import Tuple, Optional, List
import warnings
import itertools
import pandas as pd
import numpy as np


from MDAnalysis import Universe
from MDAnalysis.analysis.hydrogenbonds.hbond_analysis import HydrogenBondAnalysis as HBA
from MDAnalysis.analysis import contacts # TODO perhaps remove and use distances implementation?
from MDAnalysis.analysis import distances

  import xdrlib


In [2]:
# Residue-name groups used to decide which interaction type a residue pair can form.

# Salt bridges: positively / negatively charged side chains.
POSITIVE_SB_RESIDUES = (
    "LYS",
    "ARG",
)
NEGATIVE_SB_RESIDUES = (
    "GLU",
    "ASP",
)

# Residues treated as hydrophobic.
HYDROPHOBIC_RESIDUES = (
    "ALA",
    "VAL",
    "LEU",
    "ILE",
    "PRO",
    "PHE",
    "CYS",
)

# TODO: still to be defined -- cation-pi, aromatic and other interaction types.

# The 20 standard amino acids (three-letter codes).
STANDARD_20_RESIDUES = [
    "ALA", "ARG", "ASN", "ASP", "CYS",
    "GLU", "GLN", "GLY", "HIS", "ILE",
    "LEU", "LYS", "MET", "PHE", "PRO",
    "SER", "THR", "TRP", "TYR", "VAL",
]

### Test single frame analysis

In [3]:
# Single-frame contact analysis WITHOUT an explicit topology:
# only the coordinate file is handed to the analysis.
parm_file = "WT_PTP1B_Closed.pdb"   # defined for reference; deliberately not passed below
coord_file = "WT_PTP1B_Closed.pdb"

from tools_proj.contacts.contact_analysis import single_frame_contact_analysis

result_no_topo = single_frame_contact_analysis(
    # topology_file is intentionally omitted in this run.
    coordinates_file=coord_file,
    out_file="test_file.txt",
    first_res=1,
    last_res=250,
)
result_no_topo

Sytem setup complete, identifying interactions now.
Analysis complete, the file test_file.txt has been written to disk
Time taken: 0:00:12.136619


['GLU1 LYS4 saltbridge sc-sc',
 'GLU1 GLU5 vdw mc-wc',
 'GLU1 SER241 vdw sc-wc',
 'MET2 PHE6 vdw mc-wc',
 'MET2 MET234 vdw sc-sc',
 'MET2 PRO240 vdw sc-mc',
 'MET2 VAL243 vdw sc-wc',
 'MET2 ILE245 vdw sc-sc',
 'GLU3 GLU7 vdw mc-wc',
 'LYS4 GLN8 vdw mc-wc',
 'GLU5 ILE9 vdw wc-wc',
 'GLU5 LYS246 saltbridge sc-sc',
 'PHE6 ASP10 vdw wc-wc',
 'PHE6 TRP15 pipi sc-sc',
 'GLU7 LYS11 vdw mc-wc',
 'GLN8 SER12 vdw mc-wc',
 'ILE9 GLY13 vdw mc-mc',
 'ILE9 SER14 vdw wc-wc',
 'ILE9 TRP15 vdw wc-wc',
 'ILE9 ILE18 vdw sc-sc',
 'ASP10 TRP15 vdw wc-sc',
 'SER14 ILE18 vdw wc-wc',
 'TRP15 TYR19 vdw mc-wc',
 'ALA16 GLN20 vdw mc-wc',
 'ALA17 ASP21 vdw mc-wc',
 'ILE18 ILE22 vdw wc-wc',
 'ILE18 LYS246 vdw sc-sc',
 'TYR19 ARG23 vdw wc-wc',
 'GLN20 HIP24 vdw mc-wc',
 'ASP21 GLU25 vdw wc-wc',
 'ILE22 ALA26 vdw mc-wc',
 'ILE22 LYS246 vdw sc-sc',
 'ILE22 LEU249 vdw sc-sc',
 'ILE22 LEU250 vdw wc-sc',
 'GLU25 LEU250 vdw wc-sc',
 'ALA26 LEU250 vdw wc-sc',
 'SER27 PHE51 vdw mc-sc',
 'PHE29 PHE51 pipi sc-sc',
 'PHE29 HI

In [4]:
# Single-frame contact analysis WITH a topology: the same PDB is supplied
# as both the topology and the coordinates, and the residue range runs to 298.
result_yes_topo = single_frame_contact_analysis(
    topology_file="WT_PTP1B_Closed.pdb",
    coordinates_file="WT_PTP1B_Closed.pdb",
    out_file="doesn't_matter_atm",
    first_res=1,
    last_res=298,
)
result_yes_topo

Sytem setup complete, identifying interactions now.
Analysis complete, the file doesn't_matter_atm has been written to disk
Time taken: 0:00:16.544609


['GLU1 LYS4 saltbridge sc-sc',
 'GLU1 GLU5 vdw mc-wc',
 'GLU1 SER241 vdw sc-wc',
 'MET2 PHE6 vdw mc-wc',
 'MET2 MET234 vdw sc-sc',
 'MET2 PRO240 vdw sc-mc',
 'MET2 VAL243 vdw sc-wc',
 'MET2 ILE245 vdw sc-sc',
 'MET2 TYR270 vdw mc-sc',
 'MET2 VAL273 vdw sc-sc',
 'MET2 ILE274 vdw wc-wc',
 'GLU3 GLU7 vdw mc-wc',
 'GLU3 ILE274 vdw wc-sc',
 'LYS4 GLN8 vdw mc-wc',
 'GLU5 ILE9 vdw wc-wc',
 'GLU5 LYS246 saltbridge sc-sc',
 'GLU5 TYR270 vdw sc-sc',
 'PHE6 ASP10 vdw wc-wc',
 'PHE6 TRP15 pipi sc-sc',
 'PHE6 ARG267 vdw sc-wc',
 'PHE6 TYR270 pipi sc-sc',
 'PHE6 LEU271 vdw sc-sc',
 'PHE6 ILE274 vdw sc-sc',
 'GLU7 LYS11 vdw mc-wc',
 'GLN8 SER12 vdw mc-wc',
 'ILE9 GLY13 vdw mc-mc',
 'ILE9 SER14 vdw wc-wc',
 'ILE9 TRP15 vdw wc-wc',
 'ILE9 ILE18 vdw sc-sc',
 'ASP10 TRP15 vdw wc-sc',
 'ASP10 ARG267 saltbridge sc-sc',
 'SER14 ILE18 vdw wc-wc',
 'TRP15 TYR19 vdw mc-wc',
 'TRP15 ALA263 vdw wc-wc',
 'TRP15 LEU266 vdw sc-wc',
 'TRP15 ARG267 vdw sc-wc',
 'TRP15 TYR270 pipi sc-sc',
 'ALA16 GLN20 vdw mc-wc',
 'A

### Testing multiple frame analysis

In [5]:
from tools_proj.contacts.contact_analysis import multi_frame_contact_analysis

# The same multi-model PDB (5 frames) is used as both topology and trajectory.
parm_file = "4YFM_MAB-1_test_traj.pdb"
traj_file = "4YFM_MAB-1_test_traj.pdb"

results_pdb = multi_frame_contact_analysis(
    topology_file=parm_file,
    trajectory_file=traj_file,
    out_file="traj_test.txt",
    report_time_taken=True,
)
results_pdb

Setup complete, identifying interactions now.
Time taken: 0:04:06.737289


[['ALA1 LEU5 vdw mc-wc',
  'ALA1 VAL29 vdw sc-sc',
  'ALA1 ILE252 vdw wc-wc',
  'ALA1 ALA256 vdw sc-sc',
  'PRO2 ALA6 vdw mc-wc',
  'PRO2 GLY30 vdw wc-mc',
  'PRO2 HID31 vdw mc-sc',
  'ASP3 SER7 vdw mc-mc',
  'GLU4 LEU8 vdw mc-wc',
  'GLU4 ILE252 vdw wc-wc',
  'GLU4 ARG255 saltbridge sc-sc',
  'LEU5 GLU9 vdw mc-wc',
  'LEU5 ILE16 vdw wc-sc',
  'LEU5 VAL18 vdw sc-sc',
  'LEU5 HID31 vdw wc-sc',
  'LEU5 ALA249 vdw sc-wc',
  'LEU5 ILE252 vdw sc-sc',
  'LEU5 VAL253 vdw sc-sc',
  'ALA6 LYS10 vdw mc-wc',
  'ALA6 HID31 vdw wc-sc',
  'SER7 ASP11 vdw mc-wc',
  'LEU8 PHE12 vdw mc-wc',
  'LEU8 ILE16 vdw sc-sc',
  'LEU8 THR245 vdw sc-wc',
  'LEU8 ALA248 vdw sc-wc',
  'LEU8 ALA249 vdw sc-wc',
  'LEU8 ILE252 vdw sc-sc',
  'GLU9 GLY13 vdw mc-mc',
  'GLU9 GLY14 vdw wc-mc',
  'GLU9 ARG15 vdw sc-wc',
  'GLU9 ILE16 vdw wc-wc',
  'GLU9 HID31 vdw sc-sc',
  'GLU9 ARG32 saltbridge sc-sc',
  'PHE12 ILE16 vdw sc-sc',
  'PHE12 THR232 vdw sc-sc',
  'PHE12 VAL233 vdw sc-mc',
  'PHE12 PRO234 vdw sc-wc',
  'PHE12 LY

In [6]:
# Multi-frame analysis with the trajectory file only (no topology_file, no last_res).
# NOTE: `traj_file` is whatever the most recently executed cell assigned to it.
results_pdb_single = multi_frame_contact_analysis(
    trajectory_file=traj_file,
    out_file="traj_test.txt",
    first_res=1,
    report_time_taken=True,
)
results_pdb_single

Setup complete, identifying interactions now.
Time taken: 0:04:05.736638


[['ALA1 LEU5 vdw mc-wc',
  'ALA1 VAL29 vdw sc-sc',
  'ALA1 ILE252 vdw wc-wc',
  'ALA1 ALA256 vdw sc-sc',
  'PRO2 ALA6 vdw mc-wc',
  'PRO2 GLY30 vdw wc-mc',
  'PRO2 HID31 vdw mc-sc',
  'ASP3 SER7 vdw mc-mc',
  'GLU4 LEU8 vdw mc-wc',
  'GLU4 ILE252 vdw wc-wc',
  'GLU4 ARG255 saltbridge sc-sc',
  'LEU5 GLU9 vdw mc-wc',
  'LEU5 ILE16 vdw wc-sc',
  'LEU5 VAL18 vdw sc-sc',
  'LEU5 HID31 vdw wc-sc',
  'LEU5 ALA249 vdw sc-wc',
  'LEU5 ILE252 vdw sc-sc',
  'LEU5 VAL253 vdw sc-sc',
  'ALA6 LYS10 vdw mc-wc',
  'ALA6 HID31 vdw wc-sc',
  'SER7 ASP11 vdw mc-wc',
  'LEU8 PHE12 vdw mc-wc',
  'LEU8 ILE16 vdw sc-sc',
  'LEU8 THR245 vdw sc-wc',
  'LEU8 ALA248 vdw sc-wc',
  'LEU8 ALA249 vdw sc-wc',
  'LEU8 ILE252 vdw sc-sc',
  'GLU9 GLY13 vdw mc-mc',
  'GLU9 GLY14 vdw wc-mc',
  'GLU9 ARG15 vdw sc-wc',
  'GLU9 ILE16 vdw wc-wc',
  'GLU9 HID31 vdw sc-sc',
  'GLU9 ARG32 saltbridge sc-sc',
  'PHE12 ILE16 vdw sc-sc',
  'PHE12 THR232 vdw sc-sc',
  'PHE12 VAL233 vdw sc-mc',
  'PHE12 PRO234 vdw sc-wc',
  'PHE12 LY

In [7]:
# Using prmtop and traj...
parm_file = r"4YFM_MAB-1_apo.prmtop" 
traj_file = r"4YFM_MAB-1_test_traj.nc" # 5 frames. 

results_prmtop_traj = multi_frame_contact_analysis(topology_file=parm_file,
                             trajectory_file=traj_file,
                             out_file="doesn't matter now",
                             last_res=20, 
                             report_time_taken = True
)
results_prmtop_traj

Setup complete, identifying interactions now.
Time taken: 0:00:02.178457


[['ALA1 LEU5 vdw mc-wc',
  'PRO2 ALA6 vdw mc-wc',
  'ASP3 SER7 vdw mc-mc',
  'GLU4 LEU8 vdw mc-wc',
  'LEU5 GLU9 vdw mc-wc',
  'LEU5 ILE16 vdw wc-sc',
  'LEU5 VAL18 vdw sc-sc',
  'ALA6 LYS10 vdw mc-wc',
  'SER7 ASP11 vdw mc-wc',
  'LEU8 PHE12 vdw mc-wc',
  'LEU8 ILE16 vdw sc-sc',
  'GLU9 GLY13 vdw mc-mc',
  'GLU9 GLY14 vdw wc-mc',
  'GLU9 ARG15 vdw sc-wc',
  'GLU9 ILE16 vdw wc-wc',
  'PHE12 ILE16 vdw sc-sc'],
 ['ALA1 LEU5 vdw mc-wc',
  'PRO2 ALA6 vdw mc-wc',
  'ASP3 SER7 vdw mc-wc',
  'GLU4 LEU8 vdw mc-wc',
  'LEU5 GLU9 vdw mc-wc',
  'LEU5 VAL18 vdw sc-sc',
  'ALA6 LYS10 vdw mc-wc',
  'SER7 ASP11 vdw wc-wc',
  'LEU8 PHE12 vdw mc-wc',
  'LEU8 ILE16 vdw wc-sc',
  'GLU9 GLY13 vdw mc-mc',
  'GLU9 GLY14 vdw wc-mc',
  'GLU9 ARG15 vdw sc-wc',
  'GLU9 ILE16 vdw wc-wc',
  'PHE12 ILE16 vdw sc-sc'],
 ['ALA1 LEU5 vdw mc-mc',
  'PRO2 ALA6 vdw mc-wc',
  'ASP3 SER7 vdw mc-wc',
  'GLU4 LEU8 vdw mc-wc',
  'LEU5 GLU9 vdw mc-wc',
  'LEU5 VAL18 vdw sc-sc',
  'ALA6 LYS10 vdw mc-wc',
  'SER7 ASP11 vdw mc-wc

### (Side Chain) Hbonds

A topology-based protocol could be used if a topology is provided; otherwise this will work with the standard 20 amino acids. 
Could the expansion simply be to add the backbone atom names to each residue, and then select from the whole amino acid rather than only the side chain? 
Once a hydrogen bond is found, it could perhaps be labelled based on the atoms involved.

H-bond definition:
- Up to 3.5 Å; reducing between 2 and 3.5 Å is good.
- 180 ± 45 degrees? Closer to 180 = better.
- Could try to make a model that swaps between 0 and 1 for this? 

In [None]:
# Side-chain atoms that can ACCEPT a hydrogen bond, keyed by residue name.
# Residue names such as ASH / HIE / HID are presumably AMBER-style
# protonation-state names -- TODO confirm against the topologies in use.
# NOTE(review): "GLU" was previously listed twice with identical values (a
# duplicate dict key is silently overwritten). The duplicate has been removed;
# if a protonated glutamate entry (mirroring "ASH" for "ASP") was intended,
# it still needs to be added.
SC_HBOND_RES_ATOMS_ACCEPTOR = {
    "ASN": ["OD1"],
    "GLN": ["OE1"],
    "SER": ["OG"],
    "THR": ["OG1"],
    "TYR": ["OH"],
    "GLU": ["OE1", "OE2"],
    "ASP": ["OD1", "OD2"],
    "ASH": ["OD1", "OD2"],
    "HIE": ["ND1"],
    "HID": ["NE2"],
}

# Side-chain hydrogen-bond DONORS, keyed by residue name.
# Each residue maps to a list of single-entry dicts of the form
# {donor_heavy_atom_name: [names of the hydrogens bonded to that donor]}.
SC_HBOND_RES_ATOMS_DONOR = {
    "ASN": [{"ND2": ["HD21", "HD22"]}],
    "ARG": [{"NE": ["HE"]}, {"NH1": ["HH11", "HH12"]}, {"NH2": ["HH21", "HH22"]}],
    "GLN": [{"NE2": ["HE21", "HE22"]}],
    "SER": [{"OG": ["HG"]}],
    "THR": [{"OG1": ["HG1"]}],
    "TYR": [{"OH": ["HH"]}],
    "HIE": [{"NE2": ["HE2"]}],
    "HID": [{"ND1": ["HD1"]}],
    "HIP": [{"NE2": ["HE2"]}, {"ND1": ["HD1"]}],
    "LYS": [{"NZ": ["HZ1", "HZ2", "HZ3"]}],
    "LYN": [{"NZ": ["HZ2", "HZ3"]}],
}

# Geometric cutoffs for accepting a hydrogen bond.
HB_DIST_CUTOFF = 3.5      # maximum donor-acceptor distance
HB_ANGLE_IDEAL = 180      # ideal donor-hydrogen-acceptor angle, in degrees
HB_ANGLE_TOLERANCE = 45   # allowed deviation from the ideal angle, in degrees

In [None]:
# Build the MDAnalysis Universe used by the H-bond prototype below.
# The same PDB file is passed as both the first and second argument.
parm_file = traj_file = "WT_PTP1B_Closed.pdb"
universe = Universe(parm_file, traj_file)

In [None]:
# Residue pair (by resid) to test for a side-chain hydrogen bond.
res1, res2 = 39, 63


def _split_residue(universe, resid):
    """Select the main-chain and side-chain atoms of one residue.

    Parameters
    ----------
    universe : MDAnalysis Universe to select from.
    resid : int, residue id as used by the "resid" selection keyword.

    Returns
    -------
    tuple of (mc_sele, sc_sele, mc_atoms, sc_atoms, resname), where the first
    two are the selection strings, the next two the selected atom groups and
    the last the residue name of the first residue matching ``resid``.

    Raises
    ------
    ValueError if no residue with the given resid exists.
    """
    mc_sele = "name C CA O N H and resid " + str(resid)
    sc_sele = "not name C CA O N H and resid " + str(resid)

    # Look the name up through the same "resid" selection as the atoms, so the
    # name and the atoms always refer to the same residue. Indexing
    # universe.residues positionally (resid - 1) is only correct when residue
    # numbering starts at 1 and has no gaps.
    matched_residues = universe.select_atoms("resid " + str(resid)).residues
    if len(matched_residues) == 0:
        raise ValueError(f"No residue with resid {resid} found in the universe.")

    return (
        mc_sele,
        sc_sele,
        universe.select_atoms(mc_sele),
        universe.select_atoms(sc_sele),
        matched_residues.resnames[0],
    )


(res1_mc_sele, res1_sc_sele,
 res1_mc_atoms, res1_sc_atoms, res1_name) = _split_residue(universe, res1)

(res2_mc_sele, res2_sc_sele,
 res2_mc_atoms, res2_sc_atoms, res2_name) = _split_residue(universe, res2)

res1_name, res2_name

In [None]:
# Need to consider that if and elif can both be true, and need to test both! 


# Side-chain hydrogen-bond test for the residue pair (res1, res2).
# A residue can be both a donor and an acceptor, so BOTH directions
# (res1 accepts / res2 donates, and res2 accepts / res1 donates) are tested
# independently instead of with an if/elif.


def _angle_between_two_vectors(vec_a, vec_b):
    """Return the angle between two vectors in degrees, in the range [0, 180]."""
    cos_theta = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    # Clipping guards against rounding pushing the cosine just outside [-1, 1].
    return float(np.degrees(np.arccos(np.clip(cos_theta, -1.0, 1.0))))


def _find_sc_hbonds(acceptor_resname, acceptor_sc_atoms, donor_resname, donor_sc_atoms):
    """Find side-chain hydrogen bonds for one acceptor/donor direction.

    Parameters
    ----------
    acceptor_resname : residue name, must be a key of SC_HBOND_RES_ATOMS_ACCEPTOR.
    acceptor_sc_atoms : atom group holding the acceptor residue's side-chain atoms.
    donor_resname : residue name, must be a key of SC_HBOND_RES_ATOMS_DONOR.
    donor_sc_atoms : atom group holding the donor residue's side-chain atoms.

    Returns
    -------
    list of (acceptor_atom, donor_atom, hydrogen_atom) atom-group triples that
    pass both the distance and the angle criteria.
    """
    hbonds = []
    for acceptor_name in SC_HBOND_RES_ATOMS_ACCEPTOR[acceptor_resname]:
        acceptor_atom = acceptor_sc_atoms.select_atoms("name " + acceptor_name)
        if len(acceptor_atom) == 0:
            # Atom absent from this structure: nothing to test.
            continue

        for donor_entry in SC_HBOND_RES_ATOMS_DONOR[donor_resname]:
            for donor_name, hydrogen_names in donor_entry.items():
                donor_atom = donor_sc_atoms.select_atoms("name " + donor_name)
                if len(donor_atom) == 0:
                    continue

                # Drop donor/acceptor pairs that fail the distance test.
                d_a_dist = distances.distance_array(
                    acceptor_atom.positions, donor_atom.positions
                )[0, 0]
                if d_a_dist > HB_DIST_CUTOFF:
                    continue

                for hydrogen_name in hydrogen_names:
                    hydrogen_atom = donor_sc_atoms.select_atoms("name " + hydrogen_name)
                    if len(hydrogen_atom) == 0:
                        # E.g. a structure without explicit hydrogens.
                        continue

                    # Both vectors start at the hydrogen, so the angle between
                    # them is the donor-hydrogen-acceptor angle.
                    h_to_donor = donor_atom.positions[0] - hydrogen_atom.positions[0]
                    h_to_acceptor = acceptor_atom.positions[0] - hydrogen_atom.positions[0]
                    dha_angle = _angle_between_two_vectors(h_to_donor, h_to_acceptor)

                    # The angle lies in [0, 180], so the deviation from the
                    # ideal is a single absolute difference.
                    if np.abs(dha_angle - HB_ANGLE_IDEAL) < HB_ANGLE_TOLERANCE:
                        hbonds.append((acceptor_atom, donor_atom, hydrogen_atom))
    return hbonds


hbond_is_possible = False
for acceptor_resname, acceptor_sc_atoms, donor_resname, donor_sc_atoms in (
    (res1_name, res1_sc_atoms, res2_name, res2_sc_atoms),
    (res2_name, res2_sc_atoms, res1_name, res1_sc_atoms),
):
    if (acceptor_resname not in SC_HBOND_RES_ATOMS_ACCEPTOR
            or donor_resname not in SC_HBOND_RES_ATOMS_DONOR):
        continue

    hbond_is_possible = True
    for acceptor_atom, donor_atom, hydrogen_atom in _find_sc_hbonds(
        acceptor_resname, acceptor_sc_atoms, donor_resname, donor_sc_atoms
    ):
        print("H-bond found!")
        print(acceptor_atom.names, donor_atom.names, hydrogen_atom.names)

if not hbond_is_possible:
    print("no hbonds possible.")

### Hydrophobic

Requirements:
- 4 Å min distance.
- Between two "hydrophobic" residues 
- The atoms that contact must be non-polar (e.g. carbon).
- The atom should also be from the side-chain.

### Metals?
