# Model crystalographic structures using Modeller

- This notebook aimed to model missing loops inside the PDB structures of the target protein.

In [2]:
from pathlib import Path
from glob import glob
from prody import parsePDB
import sys
sys.path.append('../..')
from helper_modules.run_modeller import *
from helper_modules.get_pdb_ids_from_uniport import *

### Inputs
- Define some basic properties of the target protein.


In [3]:
prot_name       = 'hsp90'
uniprot_id      = 'P07900'
ref_struc_id    = '1uyg'

In [4]:
seq_prot_full = get_seq_from_uniprot(uniprot_id)
print(seq_prot_full)
print(f'\nThere are {len(seq_prot_full)} residues.')

MPEETQTQDQPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDALDKIRYESLTDPSKLDSGKELHINLIPNKQDRTLTIVDTGIGMTKADLINNLGTIAKSGTKAFMEALQAGADISMIGQFGVGFYSAYLVAEKVTVITKHNDDEQYAWESSAGGSFTVRTDTGEPMGRGTKVILHLKEDQTEYLEERRIKEIVKKHSQFIGYPITLFVEKERDKEVSDDEAEEKEDKEEEKEKEEKESEDKPEIEDVGSDEEEEKKDGDKKKKKKIKEKYIDQEELNKTKPIWTRNPDDITNEEYGEFYKSLTNDWEDHLAVKHFSVEGQLEFRALLFVPRRAPFDLFENRKKKNNIKLYVRRVFIMDNCEELIPEYLNFIRGVVDSEDLPLNISREMLQQSKILKVIRKNLVKKCLELFTELAEDKENYKKFYEQFSKNIKLGIHEDSQNRKKLSELLRYYTSASGDEMVSLKDYCTRMKENQKHIYYITGETKDQVANSAFVERLRKHGLEVIYMIEPIDEYCVQQLKEFEGKTLVSVTKEGLELPEDEEEKKKQEEKKTKFENLCKIMKDILEKKVEKVVVSNRLVTSPCCIVTSTYGWTANMERIMKAQALRDNSTMGYMAAKKHLEINPDHSIIETLRQKAEADKNDKSVKDLVILLYETALLSSGFSLEDPQTHANRIYRMIKLGLGIDEDDPTADDTSAAVTEEMPPLEGDDDTSRMEEVD

There are 732 residues.


<h4 style='color: black; background-color: #F9E5AB; padding: 5px;'>
    Important!
</h4>

- <mark>NOTE:</mark> We will used a subsequece of the EGFR and HSP90 protein, as we are only interested in modeling the protein's active site.
- For the EGFR and HSP90 we will use a pdb id as a reference to get only the region containing the active site.

## Keep only the binding site region

- For this protein we only will use the region comprising the binding site.
- Only the subsequence going from positions 16 to 223 will be considered.

In [7]:
# Get the reference structure and its sequence and residue positions
seq_cry, positions_cry = get_structure_sequence(ref_struc_id)
print(len(seq_cry))
smin = positions_cry[0]
smax = positions_cry[-1]

print(smin, smax)

seq_prot = seq_cry
seq_prot

208
16 223


'EVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDALDKIRYESLTDPSKLDSGKELHINLIPNKQDRTLTIVDTGIGMTKADLINNLGTIAKSGTKAFMEALQAGADISMIGQFGVGFYSAYLVAEKVTVITKHNDDEQYAWESSAGGSFTVRTDTGEPMGRGTKVILHLKEDQTEYLEERRIKEIVKKHSQFIGYPITLFVE'

## Start the Modelling process
### Define the input and output directories

In [8]:
OUT_MAIN   = './pdb_structures'

# Get the list of input files
INPUT_DIR = f'{OUT_MAIN}/pdb_chains'
input_files = sorted(glob(f'{INPUT_DIR}/*pdb'))

# Define the output directory
OUTPUT_DIR = f'{OUT_MAIN}/pdb_modeled'
Path(OUTPUT_DIR).mkdir(parents = True, exist_ok = True)

In [10]:
# Model all molecules
for pdb_file in input_files:
    # Load the pdb file
    pdb_chain = parsePDB(pdb_file)
    
    # Run modeller
    run_modeller(
                 pdb_file = pdb_file, 
                 seq_prot = seq_prot, 
                 output_dir = OUTPUT_DIR, 
                 keep_original_resnum = True,
                 num_res_window = 2, 
                 max_var_iterations = 500, 
                 repeat_optimization = 2,
                 chid = 'A',
                 verbose = True,
                 start_position = smin,
                 end_position = smax
                )

>>> ['GLU' 'GLU' 'GLU' ... 'GLU' 'GLU' 'GLU']
./pdb_structures/pdb_chains/1byq_A.pdb
EVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDALDKIRYETLTDPSKLDSGKELHINLIPNKQDRTLTIVDTGIGMTKADLINNLGTIAKSGTKAFMEALQAGADISMIGQFGVGFYSAYLVAEKVTVITKHNDDEQYAWESSAGGSFTVRTDTGEPMGRGTKVILHLKEDQTEYLEERRIKEIVKKHSQFIGYPITLFVE
***** 208
Modelling protein 1byq_A
EVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDALDKIRYETLTDPSKLDSGKELHINLIPNKQDRTLTIVDTGIGMTKADLINNLGTIAKSGTKAFMEALQAGADISMIGQFGVGFYSAYLVAEKVTVITKHNDDEQYAWESSAGGSFTVRTDTGEPMGRGTKVILHLKEDQTEYLEERRIKEIVKKHSQFIGYPITLFVE EVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDALDKIRYESLTDPSKLDSGKELHINLIPNKQDRTLTIVDTGIGMTKADLINNLGTIAKSGTKAFMEALQAGADISMIGQFGVGFYSAYLVAEKVTVITKHNDDEQYAWESSAGGSFTVRTDTGEPMGRGTKVILHLKEDQTEYLEERRIKEIVKKHSQFIGYPITLFVE
*****
Alignment(seqA='EVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDALDKIRYETLTDPSKLDSGKELHINLIPNKQDRTLTIVDTGIGMTKADLINNLGTIAKSGTKAFMEALQAGADISMIGQFGVGFYSAYLVAEKVTVITKHNDDEQYAWESSAGGSFTVRTDTGEPMGRGTKVILHLKEDQTEYLEERRIKEIVKKHSQFIGYPITLFVE', seqB='EVETFAFQAEIAQL

SequenceMismatchError: get_ran_648E> Alignment sequence does not match that in PDB file:        1  ./pdb_structures/pdb_chains/6tn4_A.pdb (You didn't specify the starting and ending residue numbers and chain IDs in the alignment, so Modeller tried to guess these from the PDB file.) Suggestion: put in the residue numbers and chain IDs (see the manual) and run again for more detailed diagnostics. You could also try running with allow_alternates=True to accept alternate one-letter code matches (e.g. B to N, Z to Q).

Finished!