# Protein targets

In [2]:
# Cargar cada estructura desde el archivo que proporciona DUD

In [1]:
import os
import numpy as np
import pandas as pd
import glob
import json
import pypdb
import time
from prody import *
path_to_dud_target_files = os.path.join(*'../../../Benchmarks/dud_targets'.split('/'))

In [64]:
from Bio.Seq import Seq
from Bio.Blast import NCBIXML
from Bio.Blast.Applications import NcbiblastpCommandline

def get_uniprot_id_from_pdb_structure(prot_name, verbose = True,
                                      path_to_files = F'{path_to_dud_target_files}/'):
    # Read the .pdb file and obtains its sequence
    prot_name = prot_name.lower()
    rec_pdb_file = glob.glob(F'{path_to_files}/{prot_name}/*pdb')[0]
    rec_pdb = parsePDB(rec_pdb_file)
    resnums = rec_pdb.select('ca').getResnums()
    resids = rec_pdb.select('ca').getSequence()
    positions = dict(zip(resnums, resids))
    sequence = ''
    
    for i in range(1, resnums[-1]):
        try:
            sequence += positions[i]
        except:
            sequence += '-'
    while sequence[:5] == '-----':
        sequence = sequence[5:]

    # Saves the sequence  in a temporal fasta file
    with open("./temp_fasta.fasta", "w") as ofile:
        ofile.write(">" + 'Seq' + "\n" + sequence + "\n")
        ofile.close()
    # Perfroms the blastp over uniprot swissprot
    blastx_cline = NcbiblastpCommandline(cmd='blastp', query='temp_fasta.fasta', 
                    db = os.path.join(*'../../../Benchmarks/uniprot_db/uniprot_sprot.fasta'.split('/')), 
                    evalue = 0.0001, outfmt = 5, out="blast_results.xml")
    blastx_cline()
    # Reads the temporary results
    blast_record = NCBIXML.read(open("blast_results.xml"))
    # Gets the first record, with the highest score and from HUMAN
    try:
        human_titles = [i.title for i in blast_record.alignments if 'human' in i.title.lower()]
        title = human_titles[0]
        uniprot_id = title.split('|')[3]
        name = title.split('|')[4]
    except:
        title = blast_record.alignments[0].title
        uniprot_id = blast_record.alignments[0].title.split('|')[3]
        name = title.split('|')[4] + '*'
    if verbose:
        print(prot_name, uniprot_id, '\n', title)
    return {'name': name, 'accession': uniprot_id}

In [108]:
# First we have to obtain the UNIPROT Accession numbers
# dud_targets_uniprot = {}
# for name in dud_targets_names.keys():
#     dud_targets_uniprot[name] = get_uniprot_id_from_pdb_structure(prot_name = name)

### Saving the result

In [67]:

# with open('../../../Benchmarks/dud_targets_uniprot_accessions.json', 'w') as fp:
#     json.dump(dud_targets_uniprot, fp)

# Targets from DEKOIS2.0

In [115]:
# dekois_targets_uniprot = {}
# for key, pdb_id in dekois_targets_names.items():
#     dict_info = pypdb.get_all_info(pdb_id)['polymer']
    
#     if type(dict_info) is list:
#         molecule = dict_info[0]['macroMolecule']
#     elif type(dict_info) is dict:
#         molecule = dict_info['macroMolecule']
#     if type(molecule) is list:
#         molecule = molecule[0]
#     name = molecule['@name']
#     uniprot_id = molecule['accession']['@id']
#     dekois_targets_uniprot[key] = {'name': name, 'accession': uniprot_id}
#     time.sleep(0.5)

In [107]:
# Saving the results
# with open('../../../Benchmarks/dekois2_targets_uniprot_accessions.json', 'w') as fp:
#     json.dump(dekois_targets_uniprot, fp)

# Getting the number of crystals in pdb

In [2]:
# opening the results
dekois_accessions = pd.read_json('../../../Benchmarks/dekois2_targets_uniprot_accessions.json').T
dud_accessions = pd.read_json('../../../Benchmarks/dud_targets_uniprot_accessions.json').T

In [3]:
# Number of targets on each set
print(F'Number of targets in DUD set: {dud_accessions.shape[0]}')

print(F'Number of targets in DEKOIS2.0 set: {dekois_accessions.shape[0]}')

Number of targets in DUD set: 21
Number of targets in DEKOIS2.0 set: 81


### Then we have to merge the dataframes

In [7]:
# Reset the index
dud_accessions.reset_index(inplace=True)
dekois_accessions.reset_index(inplace=True)

# Setting the uniprot accession index as the new index
dud_accessions.set_index('accession', inplace = True)
dekois_accessions.set_index('accession', inplace = True)

# Merging the datasets
df_targets = pd.concat([dud_accessions, dekois_accessions], axis = 0).sort_index()
df_targets = df_targets.iloc[~df_targets.index.duplicated(keep='last')]
# Naming the table
df_targets.columns = ['name', 'long_name']

# Fill a new columns indicating if the target is in a given datset
# for dud
df_targets.loc[:, 'DUD'] = [1 if i in dud_accessions.index else 0 for i in df_targets.index]
# for dekois
df_targets.loc[:, 'DEKOIS2'] = [1 if i in dekois_accessions.index else 0 for i in df_targets.index]

In [8]:
df_targets

Unnamed: 0_level_0,name,long_name,DUD,DEKOIS2
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O14965,AURKA,Aurora kinase A,0,1
O15530,PDK1,3-phosphoinositide-dependent protein kinase 1,0,1
O76074,PDE5,"cGMP-specific 3',5'-cyclic phosphodiesterase",0,1
P00374,DHFR,Dihydrofolate reductase,1,1
P00533,EGFR,Epidermal growth factor receptor,1,1
...,...,...,...,...
Q92731,ER-beta,Estrogen receptor beta,0,1
Q92769,HDAC2,Histone deacetylase 2,0,1
Q9BY41,HDAC8,Histone deacetylase 8,0,1
Q9BYF1,ACE-2,Angiotensin-converting enzyme 2,0,1


In [None]:
# df_targets.to_json('targets_table.json')

## Get the number of crystals available for each target

In [417]:
import requests
from Bio import SeqIO
import pickle
import pypdb

def blast_pdb(uniprot_id, identidad = 95):
    
    file_blast_record = 'target_fastas/' + uniprot_id + '_blast_record.pkl'
    if os.path.isfile(file_blast_record):
        blast_record = pickle.load(open(file_blast_record, 'rb'))
    else:
        # Descargamos el fasta de UniProt
        url_fasta = requests.get("https://www.uniprot.org/uniprot/" + uniprot_id + ".fasta")
        file_name_fasta = 'target_fastas/' + uniprot_id + '.fasta'
        open(file_name_fasta, 'wb').write(url_fasta.content)
        # Leemos la secuenciade aminoácidos
        fasta_prot = SeqIO.read(open(file_name_fasta),'fasta')
        seq_prot = str(fasta_prot.seq)
        try:
            blast_record = blastPDB(seq_prot)
            pickle.dump(blast_record, open(file_blast_record, 'wb'))
        except: pass
    
    pdbids = blast_record.getHits(percent_identity = identidad) # Devuelve un diccionario con cada proteína hit
    len(pdbids) # num de proteínas que tienen un 95% de identidad con la secuencia
    pdbids_list = list(pdbids.keys()) # lista de pdb_ids de las estructuras de la proteina
    print(F'{len(pdbids_list)} hits con una identidad del {identidad}%')
    time.sleep(2)
    return(pdbids_list)

In [77]:
import requests
from Bio import SeqIO
import pickle
import pypdb

def blast_pdb(uniprot_id, identidad = 95):
    
    file_blast_record = 'target_fastas/' + uniprot_id + '_blast_record.pkl'
    if os.path.isfile(file_blast_record):
        blast_record = pickle.load(open(file_blast_record, 'rb'))
    else:
        # Descargamos el fasta de UniProt
        url_fasta = requests.get("https://www.uniprot.org/uniprot/" + uniprot_id + ".fasta")
        fasta = ''.join(url_fasta.text.split('\n')[1:])
        seq_prot = fasta
        try:
            blast_record = blastPDB(seq_prot)
            pickle.dump(blast_record, open(file_blast_record, 'wb'))
        except: pass
    
    pdbids = blast_record.getHits(percent_identity = identidad) # Devuelve un diccionario con cada proteína hit
    len(pdbids) # num de proteínas que tienen un 95% de identidad con la secuencia
    pdbids_list = list(pdbids.keys()) # lista de pdb_ids de las estructuras de la proteina
    print(F'{len(pdbids_list)} hits con una identidad del {identidad}%')
    time.sleep(2)
    return(pdbids_list)

In [None]:
targets_pdb_ids = {}
for uniprot_id in df_targets.index:
    try:
        targets_pdb_ids[uniprot_id] = blast_pdb(uniprot_id, identidad = 95)
    except:
        print(uniprot_id, 'No se pudo...')

@> Blast searching NCBI PDB database for "MDRSK..."
Waiting for 27s to reconnect NCBI for search results.

In [None]:
blast_pdb('Q05769', identidad = 95)

In [80]:
import requests
uniprot_id = 'P24941'
url_fasta = requests.get("https://www.uniprot.org/uniprot/" + uniprot_id + ".fasta")
fasta = ''.join(url_fasta.text.split('\n')[1:])

blast_result = pypdb.blast_from_sequence(fasta, 1e3)

In [153]:
import pypdb
import untangle
import requests
import pandas as pd
import numpy as np

uniprot_id = 'P24941'
url_fasta = requests.get("https://www.uniprot.org/uniprot/" + uniprot_id + ".fasta")
seq = ''.join(url_fasta.text.split('\n')[1:])
eCutoff = 1e-20

payload = {
            'sequence': seq,
            'eCutoff':str(eCutoff),
            'matrix':'BLOSUM62',
            'outputFormat':'XML' # or 'HTML' or nothing for text format
            }

pdbUrl = 'http://www.rcsb.org/pdb/rest/getBlastPDB1'

# create the url with parameters
r = requests.get(pdbUrl, params = payload)

In [154]:
soup = BeautifulSoup(r.content)

In [None]:
for hit in soup.find_all('hit'):
    pdb_id = hit.find('hit_def').get_text().split(':')[0]
    e_val = float(hit.find('hsp_evalue').get_text())
    hit_len = int(hit.find('hit_len').get_text())

In [158]:
# hit_id
# hit_num
x.find_all('hit')[100]

<hit>
<hit_num>101</hit_num>
<hit_id>gnl|BL_ORD_ID|114317</hit_id>
<hit_def>4D1X:1:A|pdbid|entity|chain(s)|sequence</hit_def>
<hit_accession>114317</hit_accession>
<hit_len>298</hit_len>
<hit_hsps>
<hsp>
<hsp_num>1</hsp_num>
<hsp_bit-score>615.15</hsp_bit-score>
<hsp_score>1585</hsp_score>
<hsp_evalue>5.12434e-176</hsp_evalue>
<hsp_query-from>1</hsp_query-from>
<hsp_query-to>298</hsp_query-to>
<hsp_hit-from>1</hsp_hit-from>
<hsp_hit-to>298</hsp_hit-to>
<hsp_query-frame>1</hsp_query-frame>
<hsp_hit-frame>1</hsp_hit-frame>
<hsp_identity>298</hsp_identity>
<hsp_positive>298</hsp_positive>
<hsp_align-len>298</hsp_align-len>
<hsp_qseq>MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETEGVPSTAIREISLLKELNHPNIVKLLDVIHTENKLYLVFEFLHQDLKKFMDASALTGIPLPLIKSYLFQLLQGLAFCHSHRVLHRDLKPQNLLINTEGAIKLADFGLARAFGVPVRTYTHEVVTLWYRAPEILLGCKYYSTAVDIWSLGCIFAEMVTRRALFPGDSEIDQLFRIFRTLGTPDEVVWPGVTSMPDYKPSFPKWARQDFSKVVPPLDEDGRSLLSQMLHYDPNKRISAKAALAHPFFQDVTKPVPHLRL</hsp_qseq>
<hsp_hseq>MENFQKVEKIGEGTYGVVYKARNKLTGEVVALKKIRLDTETE

In [120]:
ro.BlastOutput.BlastOutput_iterations.Iteration.Iteration_hits.Hit

[Element(name = Hit, attributes = {}, cdata = 
           
           
           
           
           
           
         ),
 Element(name = Hit, attributes = {}, cdata = 
           
           
           
           
           
           
         ),
 Element(name = Hit, attributes = {}, cdata = 
           
           
           
           
           
           
         ),
 Element(name = Hit, attributes = {}, cdata = 
           
           
           
           
           
           
         ),
 Element(name = Hit, attributes = {}, cdata = 
           
           
           
           
           
           
         ),
 Element(name = Hit, attributes = {}, cdata = 
           
           
           
           
           
           
         ),
 Element(name = Hit, attributes = {}, cdata = 
           
           
           
           
           
           
         ),
 Element(name = Hit, attributes = {}, cdata = 
           
           
           


## Web Scrapping

In [11]:
import urllib.request
from bs4 import BeautifulSoup

In [12]:
URL_PDBKe = 'https://www.ebi.ac.uk/pdbe/pdbe-kb/proteins/'

In [16]:
uniprot_id = 'Q05769'
url_prot = URL_PDBKe + uniprot_id

prot_data = urllib.request.urlopen( url_prot ).read().decode()

BeautifulSoup(prot_data)

<!DOCTYPE html>
<html lang="en">
<head>
<title>PDBe-KB Protein Pages</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="PDBe-KB Protein Pages" name="description"/>
<meta content="pdbe-kb, uniprot, protein, structure" name="keywords"/>
<meta content="PDBe-KB" name="author"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="width" name="MobileOptimized"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<!-- Android Chrome mobile browser tab color -->
<meta content="#70BDBD" name="theme-color"/>
<!-- FavIcons -->
<link href="/pdbe/pdbe-kb/static/icon/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/pdbe/pdbe-kb/static/icon/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/pdbe/pdbe-kb/static/icon/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link href="/pdbe/pdbe-kb/static/icon/site.webmanifest" rel="manifest"/>
<link color="#5bbad5" href="/pdbe

In [17]:
url_prot

'https://www.ebi.ac.uk/pdbe/pdbe-kb/proteins/Q05769'

In [18]:
import pypdb

pypdb.blast_from_sequence()

In [385]:
pdbids = blast_record.getHits(percent_identity = 95) # Devuelve un diccionario con cada proteína hit
pdbids_list = list(pdbids.keys()) # lista de pdb_ids de las estructuras de la proteina
print(F'Se obtuvo un total de {len(pdbids_list)} hits con una identidad del {identidad}%')

Se obtuvo un total de 75 hits con una identidad del 85%


In [396]:
df_targets

Unnamed: 0_level_0,name,long_name,DUD,DEKOIS2,pdb_ids
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
O14965,AURKA,Aurora kinase A,0,1,0.0
O15530,PDK1,3-phosphoinositide-dependent protein kinase 1,0,1,0.0
O76074,PDE5,"cGMP-specific 3',5'-cyclic phosphodiesterase",0,1,0.0
P00374,DHFR,Dihydrofolate reductase,1,1,0.0
P00533,EGFR,Epidermal growth factor receptor,1,1,0.0
...,...,...,...,...,...
Q92731,ER-beta,Estrogen receptor beta,0,1,0.0
Q92769,HDAC2,Histone deacetylase 2,0,1,0.0
Q9BY41,HDAC8,Histone deacetylase 8,0,1,0.0
Q9BYF1,ACE-2,Angiotensin-converting enzyme 2,0,1,0.0


In [47]:
dud_targets_names = {
    "ACE": "Angiotensin-converting enzyme",
    "ACHE": "Acetylcholine esterase",
    "ADA": "Adenosine deaminase",
    "ALR2": "Aldose reductase",
    "AmpC": "AmpC beta lactamase",
    "AR": "Androgen receptor",
    "CDK2": "Cyclin dependent kinase 2",
    "COMT": "Catechol O-methyltransferase",
    "COX1": "Cyclooxygenase 1",
    "COX2": "Cyclooxygenase 2",
    "DHFR": "Dihydrofolate reductase",
    "EGFr": "Epidermal growth factor receptor kinase",
    "ER_agonist": "Estrogen receptor agonist",
    "ER_antagonist": "Estrogen receptor antagonist",
    "FGFr1": "Fibroblast growth factor receptor kinase",
    "FXa": "Factor Xa",
    "GART": "glycinamide ribonucleotide transformylase",
    "GPB": "Glycogen phosphorylase beta",
    "GR": "Glutocorticoid receptor",
    "HIVPR": "HIV protease",
    "HIVRT": "HIV reverse transcriptase",
    "HMGR": "Hydroxymethylglutaryl-CoA reductase",
    "HSP90": "Human heat shock protein 90 kinase",
    "InhA": "Enoyl ACP reductase",
    "MR": "Mineralcorticoid receptor",
    "NA": "Neuraminidase",
    "P38": "P38 mitogen activated protein kinase",
    "PARP": "Poly(ADP-ribose) polymerase",
    "PDE5": "Phosphodiesterase V",
    "PDGFrb": "Platlet derived growth factor receptor kinase",
    "PNP": "Purine nucleoside phosphorylase",
    "PPARg": "Peroxisome proliferator activated receptor gamma",
    "PR": "Progesterone receptor",
    "RXRa": "Retinoic X receptor alpha",
    "SAHH": "S-adenosyl-homocysteine hydrolase",
    "SRC": "Tyrosine kinase SRC",
    "Thrombin": "Thrombin",
    "TK": "Thymidine kinase",
    "Trypsin": "Trypsin",
    "VEGFr2": "Vascular endothelial growth factor receptor kinase"
}

In [6]:
dekois_targets_names = {
    "11-beta-HSD1": "3tfq",
    "17-beta-HSD1": "3klm",
    "A2A": "3eml",
    "ACE": "1uze",
    "ACE-2": "1r4l",
    "ACHE": "1eve",
    "ADAM17": "3ewj",
    "ADRB2": "3ny9",
    "AKT1": "3qkl",
    "ALR2": "1ah3",
    "AR": "1e3g",
    "AURKA": "3fdn",
    "AURKB": "2vgo",
    "Bcl-2": "2w3l",
    "BRAF": "3skc",
    "CATL": "3bc3",
    "CDK2": "1ckp",
    "COX1": "3kk6",
    "COX2": "1cx2",
    "CTSK": "3kx1",
    "CYP2A6": "1z11",
    "DHFR": "1s3v",
    "EGFR": "1m17",
    "EPHB4": "2vwz",
    "ERBB2": "3pp0",
    "ER-beta": "3oll",
    "FGFR1": "1agw",
    "FKBP1A": "2dg3",
    "FXA": "1f0r",
    "GBA": "2wcg",
    "GR": "1nhz",
    "GSK3B": "3i4b",
    "HDAC2": "3max",
    "HDAC8": "3sff",
    "HIV1PR": "3nu3",
    "HIV1RT": "1s6p",
    "HMGA": "1hw8",
    "HSP90": "1uy6",
    "IGF1R": "3nw7",
    "INHA": "1p44",
    "ITK": "3mj1",
    "JAK3": "3lxl",
    "JNK1": "3elj",
    "JNK2": "3npc",
    "JNK3": "2b1p",
    "KIF11": "3k5e",
    "Lck": "3mpm",
    "MDM2": "3lbk",
    "MK2": "3kc3",
    "MMP2": "1hov",
    "NA": "1a4g",
    "P38": "1ouk",
    "PARP-1": "3l3m",
    "PDE4B": "3frg",
    "PDE5": "1xp0",
    "PDK1": "2xch",
    "PI3Kg": "3dbs",
    "PIM-1": "3r04",
    "PIM-2": "2iwi",
    "PNP": "1b8o",
    "PPARa": "2p54",
    "PPARg": "1fm9",
    "PR": "2w8y",
    "PRKCQ": "1xjd",
    "PYGL-in": "1xoi",
    "PYGL-out": "3dds",
    "QPCT": "2afx",
    "ROCK-1": "3v8s",
    "RXR": "2p1t",
    "SARS-HCoV": "2z94",
    "SIRT2": "1j8f",
    "SRC": "2src",
    "Thrombin": "3rm2",
    "TIE2": "2oo8",
    "TK": "1w4r",
    "TP": "1uou",
    "TPA": "1a5h",
    "TS": "1i00",
    "uPA": "3mhw",
    "VEGFR1": "3hng",
    "VEGFR2": "3c7q"
}