# Model missing regions in CDK2 crystal structures using Modeller

In [1]:
from glob import glob
from pathlib import Path
import sys
sys.path.append('../..')
%load_ext autoreload
%autoreload 2

## Protein target Sequence

To model regions that are missing from the PDB crystal structures, we'll use the CDK2 protein sequence from UniProt as the target sequence.
- <mark>Protein CDK2 (Human)</mark>:
    - UniProtKB: [P24941](https://www.uniprot.org/uniprot/P24941)
    
- Fetch the CDK2 sequence from UniProtKB:

In [2]:
# Import only the helper that is needed instead of `import *`, so it is clear
# where `get_seq_from_uniprot` comes from and the notebook namespace stays clean.
from helper_modules.pdbs_from_uniport import get_seq_from_uniprot

# Target protein: short name (used only for reporting) and UniProtKB accession
prot_name  = 'cdk2'
uniprot_id = 'P24941'

# Sequence for the accession; passed to `run_modeller` below as the target.
# NOTE(review): presumably fetched from UniProt over the network -- confirm
# in `helper_modules.pdbs_from_uniport`.
target_sequence = get_seq_from_uniprot(uniprot_id)
print(f'\nProtein {prot_name.upper()} ({uniprot_id}) has {len(target_sequence)} residues.')
print(target_sequence)


Protein CDK2 (P24941) has 298 residues.
MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPSTAIREISLLKELNHPNIVKLLDVIHTENKLYLVFEFLHQDLKKFMDASALTGIPLPLIKSYLFQLLQGLAFCHSHRVLHRDLKPQNLLINTEGAIKLADFGLARAFGVPVRTYTHEVVTLWYRAPEILLGCKYYSTAVDIWSLGCIFAEMVTRRALFPGDSEIDQLFRIFRTLGTPDEVVWPGVTSMPDYKPSFPKWARQDFSKVVPPLDEDGRSLLSQMLHYDPNKRISAKAALAHPFFQDVTKPVPHLRL


### Define input and output directories

In [3]:
# Root directory holding the crystal conformations (kept as a plain string)
ROOT_DIR = '../../data/crystal_confs/'

# Get the list of input files.
# Paths are joined with `Path` so the trailing slash in ROOT_DIR does not
# produce a doubled separator ('crystal_confs//pdb_chains').
INPUT_DIR = str(Path(ROOT_DIR) / 'pdb_chains')
# Match on the '.pdb' extension rather than any name that merely ends in 'pdb'
input_files = sorted(glob(f'{INPUT_DIR}/*.pdb'))
if not input_files:
    # Make an empty input visible instead of silently modelling nothing later
    print(f'WARNING: no .pdb files found in {INPUT_DIR}')

# Define the output directory (created, with parents, if it does not exist yet)
OUTPUT_DIR = str(Path(ROOT_DIR) / 'pdb_modeled')
Path(OUTPUT_DIR).mkdir(parents = True, exist_ok = True)

## Run Modeller

In [4]:
from helper_modules.run_modeller import run_modeller

In [5]:
# Model all molecules
# NOTE:
# `chid = 'A'` is hardcoded, chains were renamed to 'A' in the 
# `1_Download_CDK2_crystal_structures_from_the_PDB.ipynb` notebook

# Number of input structures to model. Only the first two are processed here,
# which keeps the run and its log short; set to None to model every structure
# in `input_files` (`input_files[:None]` is the full list).
MAX_STRUCTURES = 2

for pdb_file in input_files[:MAX_STRUCTURES]:
    # Run modeller on one structure; results are written under OUTPUT_DIR
    run_modeller(
             pdb_file        = pdb_file, 
             target_sequence = target_sequence, 
             output_dir      = OUTPUT_DIR, 
             keep_original_resnum = True,
             # NOTE(review): in the logged output each 'gap_window' extends the
             # matching 'gap_list' entry by this many residues on both sides
             num_res_window       = 2, 
             max_var_iterations   = 1000, 
             repeat_optimization  = 2,
             chid = 'A', 
             verbose = True
            )
# Marks that the displayed log covers only the structures processed above
print('...')

Modelling protein 1aq1_A
Target sequence
 MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPSTAIREISLLKELNHPNIVKLLDVIHTENKLYLVFEFLHQDLKKFMDASALTGIPLPLIKSYLFQLLQGLAFCHSHRVLHRDLKPQNLLINTEGAIKLADFGLARAFGVPVRTYTHEVVTLWYRAPEILLGCKYYSTAVDIWSLGCIFAEMVTRRALFPGDSEIDQLFRIFRTLGTPDEVVWPGVTSMPDYKPSFPKWARQDFSKVVPPLDEDGRSLLSQMLHYDPNKRISAKAALAHPFFQDVTKPVPHLRL
Input sequence from .pdb file
 MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKI--------VPSTAIREISLLKELNHPNIVKLLDVIHTENKLYLVFEFLHQDLKKFMDASALTGIPLPLIKSYLFQLLQGLAFCHSHRVLHRDLKPQNLLINTEGAIKLADFGL-------------EVVTLWYRAPEILLGCKYYSTAVDIWSLGCIFAEMVTRRALFPGDSEIDQLFRIFRTLGTPDEVVWPGVTSMPDYKPSFPKWARQDFSKVVPPLDEDGRSLLSQMLHYDPNKRISAKAALAHPFFQDVTKPVPHLRL
*****
{'num_gaps': 2, 'gap_lengths': [8, 13], 'gap_list': [[36, 43], [149, 161]], 'gap_window': [[34, 45], [147, 163]]}

                         MODELLER 10.2, 2021/11/15, r12267

     PROTEIN STRUCTURE MODELLING BY SATISFACTION OF SPATIAL RESTRAINTS


                     Copyright(c) 1989-2021 Andrej Sali
                   

Modelling protein 1b38_A
Target sequence
 MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPSTAIREISLLKELNHPNIVKLLDVIHTENKLYLVFEFLHQDLKKFMDASALTGIPLPLIKSYLFQLLQGLAFCHSHRVLHRDLKPQNLLINTEGAIKLADFGLARAFGVPVRTYTHEVVTLWYRAPEILLGCKYYSTAVDIWSLGCIFAEMVTRRALFPGDSEIDQLFRIFRTLGTPDEVVWPGVTSMPDYKPSFPKWARQDFSKVVPPLDEDGRSLLSQMLHYDPNKRISAKAALAHPFFQDVTKPVPHLRL
Input sequence from .pdb file
 MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKI--------VPSTAIREISLLKELNHPNIVKLLDVIHTENKLYLVFEFLHQDLKKFMDASALTGIPLPLIKSYLFQLLQGLAFCHSHRVLHRDLKPQNLLINTEGAIKLADFGLARAFGVPVRTYTHEVVTLWYRAPEILLGCKYYSTAVDIWSLGCIFAEMVTRRALFPGDSEIDQLFRIFRTLGTPDEVVWPGVTSMPDYKPSFPKWARQDFSKVVPPLDEDGRSLLSQMLHYDPNKRISAKAALAHPFFQDVTKPVPHLRL
*****
{'num_gaps': 1, 'gap_lengths': [8], 'gap_list': [[36, 43]], 'gap_window': [[34, 45]]}

check_ali___> Checking the sequence-structure alignment. 

Implied intrachain target CA(i)-CA(i+1) distances longer than  8.0 angstroms:

ALN_POS  TMPL  RID1  RID2  NAM1  NAM2     DIST
----------------------------------------------
END 

1
...
