# Protein targets

In [1]:
# Load each structure from the file provided by DUD

In [2]:
import os
import numpy as np
import pandas as pd
import glob
import json
import pypdb
import time
from prody import *
# Relative location of the DUD target folders, assembled with the OS-specific separator
path_to_dud_target_files = os.path.join('..', '..', '..', 'Benchmarks', 'dud_targets')

In [3]:
from Bio.Seq import Seq
from Bio.Blast import NCBIXML
from Bio.Blast.Applications import NcbiblastpCommandline

def get_uniprot_id_from_pdb_structure(prot_name, verbose = True,
                                      path_to_files = F'{path_to_dud_target_files}/'):
    """Find the UniProt entry that best matches the sequence of a target's PDB file.

    The sequence is rebuilt from the C-alpha atoms of the first ``*pdb`` file found
    in ``{path_to_files}/{prot_name}/`` and searched with a local ``blastp`` run
    against the ``uniprot_sprot.fasta`` database.

    Parameters
    ----------
    prot_name : str
        Target folder name; it is lower-cased before use.
    verbose : bool
        When True, print the target name, the accession and the full hit title.
    path_to_files : str
        Directory that contains one sub-folder per target.

    Returns
    -------
    dict
        ``{'name': ..., 'accession': ...}`` taken from the 5th and 4th
        ``|``-separated fields of the selected hit title. When no hit title
        contains "human", the first alignment is used instead and ``'*'`` is
        appended to the name.

    Raises
    ------
    FileNotFoundError
        If no ``*pdb`` file exists for the target.
    IndexError
        If blastp returns no alignment at all.

    Notes
    -----
    Writes ``temp_fasta.fasta`` and ``blast_results.xml`` in the working directory.
    """
    # Read the .pdb file and obtain its sequence
    prot_name = prot_name.lower()
    pdb_candidates = glob.glob(F'{path_to_files}/{prot_name}/*pdb')
    if not pdb_candidates:
        raise FileNotFoundError(
            F'No *pdb file found in {path_to_files}/{prot_name}/')
    rec_pdb = parsePDB(pdb_candidates[0])
    calphas = rec_pdb.select('ca')
    resnums = calphas.getResnums()
    positions = dict(zip(resnums, calphas.getSequence()))

    # Walk residue numbers 1..last INCLUSIVE (the previous range() dropped the
    # last residue); numbers missing from the structure become gap characters.
    last_resnum = int(resnums[-1])
    sequence = ''.join(positions.get(i, '-') for i in range(1, last_resnum + 1))
    # Remove every leading gap (the previous loop removed them five at a time,
    # leaving up to four behind).
    sequence = sequence.lstrip('-')

    # Save the sequence in a temporary fasta file
    fasta_file = 'temp_fasta.fasta'
    blast_xml_file = 'blast_results.xml'
    with open(fasta_file, 'w') as ofile:
        ofile.write('>' + 'Seq' + '\n' + sequence + '\n')

    # Perform the blastp search over UniProt Swiss-Prot (outfmt 5 is read back as XML)
    blastx_cline = NcbiblastpCommandline(cmd='blastp', query=fasta_file,
                    db = os.path.join(*'../../../Benchmarks/uniprot_db/uniprot_sprot.fasta'.split('/')),
                    evalue = 0.0001, outfmt = 5, out=blast_xml_file)
    blastx_cline()

    # Read the temporary results, making sure the handle is closed afterwards
    with open(blast_xml_file) as xml_handle:
        blast_record = NCBIXML.read(xml_handle)
    if not blast_record.alignments:
        raise IndexError(F'blastp returned no alignments for {prot_name}')

    # Prefer the first hit whose title mentions "human"; otherwise fall back to
    # the first alignment and mark the name with '*'.
    human_titles = [i.title for i in blast_record.alignments if 'human' in i.title.lower()]
    if human_titles:
        title = human_titles[0]
        suffix = ''
    else:
        title = blast_record.alignments[0].title
        suffix = '*'
    title_fields = title.split('|')
    uniprot_id = title_fields[3]
    name = title_fields[4] + suffix

    if verbose:
        print(prot_name, uniprot_id, '\n', title)
    return {'name': name, 'accession': uniprot_id}

In [4]:
# First we have to obtain the UNIPROT Accession numbers
# dud_targets_uniprot = {}
# for name in dud_targets_names.keys():
#     dud_targets_uniprot[name] = get_uniprot_id_from_pdb_structure(prot_name = name)

### Saving the result

In [5]:

# with open('../../../Benchmarks/dud_targets_uniprot_accessions.json', 'w') as fp:
#     json.dump(dud_targets_uniprot, fp)

# Targets from DEKOIS2.0

In [22]:
# For every DEKOIS 2.0 target, look up its reference PDB entry and record the
# name and UniProt accession of the first macromolecule of the first polymer.
# NOTE: taking the first macromolecule can select a fusion partner instead of
# the target itself (the saved output lists A2A and ADRB2 as 'Endolysin').
dekois_targets_uniprot = {}
for key, pdb_id in dekois_targets_names.items():
    dict_info = pypdb.get_all_info(pdb_id)['polymer']

    # 'polymer' is either a list of entries or a single entry
    if isinstance(dict_info, list):
        molecule = dict_info[0]['macroMolecule']
    elif isinstance(dict_info, dict):
        molecule = dict_info['macroMolecule']
    else:
        # Previously this case silently reused `molecule` from the last iteration
        raise TypeError(
            F"Unexpected 'polymer' entry of type {type(dict_info).__name__} for {key} ({pdb_id})")
    # 'macroMolecule' may itself be a list; keep its first element
    if isinstance(molecule, list):
        molecule = molecule[0]
    name = molecule['@name']
    uniprot_id = molecule['accession']['@id']
    dekois_targets_uniprot[key] = {'name': name, 'accession': uniprot_id}
    # Short pause between consecutive requests
    time.sleep(0.5)

In [23]:
# Inspect the collected name/accession pairs.
# NOTE: in the saved output A2A and ADRB2 both appear as 'Endolysin' (P00720).
dekois_targets_uniprot

{'11-beta-HSD1': {'name': 'Corticosteroid 11-beta-dehydrogenase isozyme 1',
  'accession': 'P28845'},
 '17-beta-HSD1': {'name': 'Estradiol 17-beta-dehydrogenase 1',
  'accession': 'P14061'},
 'A2A': {'name': 'Endolysin', 'accession': 'P00720'},
 'ACE': {'name': 'Angiotensin-converting enzyme', 'accession': 'P12821'},
 'ACE-2': {'name': 'Angiotensin-converting enzyme 2', 'accession': 'Q9BYF1'},
 'ACHE': {'name': 'Acetylcholinesterase', 'accession': 'P04058'},
 'ADAM17': {'name': 'Disintegrin and metalloproteinase domain-containing protein 17',
  'accession': 'P78536'},
 'ADRB2': {'name': 'Endolysin', 'accession': 'P00720'},
 'AKT1': {'name': 'RAC-alpha serine/threonine-protein kinase',
  'accession': 'P31749'},
 'ALR2': {'name': 'Aldose reductase', 'accession': 'P80276'},
 'AR': {'name': 'Androgen receptor', 'accession': 'P10275'},
 'AURKA': {'name': 'Aurora kinase A', 'accession': 'O14965'},
 'AURKB': {'name': 'Aurora kinase B-A', 'accession': 'Q6DE08'},
 'Bcl-2': {'name': 'Apoptosis r

In [107]:
# Saving the results
# with open('../../../Benchmarks/dekois2_targets_uniprot_accessions.json', 'w') as fp:
#     json.dump(dekois_targets_uniprot, fp)

# Getting the number of crystal structures in the PDB

In [7]:
# Load the saved accession tables. For each one: transpose so every target is a
# row, move the target labels into an 'index' column, and key the table by
# UniProt accession.
dekois_accessions = (
    pd.read_json('../../../Benchmarks/datasets/dekois2_targets_uniprot_accessions.json')
    .T
    .reset_index()
    .set_index('accession')
)
dud_accessions = (
    pd.read_json('../../../Benchmarks/datasets/dud_targets_uniprot_accessions.json')
    .T
    .reset_index()
    .set_index('accession')
)

In [8]:
# Report how many rows (targets) each accession table holds
print(F'Number of targets in DUD set: {len(dud_accessions)}')
print(F'Number of targets in DEKOIS2.0 set: {len(dekois_accessions)}')

Number of targets in DUD set: 21
Number of targets in DEKOIS2.0 set: 81


### Then we have to merge the dataframes

In [9]:
# Show the accession-sorted DEKOIS table from row position 39 onwards
dekois_accessions.sort_index().iloc[39:]

Unnamed: 0_level_0,index,name
accession,Unnamed: 1_level_1,Unnamed: 2_level_1
P17948,VEGFR1,Vascular endothelial growth factor receptor 1
P19793,RXR,Retinoic acid receptor RXR-alpha
P19971,TP,Thymidine phosphorylase
P24941,CDK2,Cyclin-dependent kinase 2
P27907,,Neuraminidase
P28845,11-beta-HSD1,Corticosteroid 11-beta-dehydrogenase isozyme 1
P29274,A2A,Adenosine receptor A2a
P31749,AKT1,RAC-alpha serine/threonine-protein kinase
P35968,VEGFR2,Vascular endothelial growth factor receptor 2
P37231,PPARg,Peroxisome proliferator-activated receptor gamma


In [10]:
# Merging the datasets
df_targets = pd.concat([dud_accessions, dekois_accessions], axis = 0)
# Drop repeated accessions BEFORE sorting: concat keeps the row order
# (DUD first, DEKOIS2 last), so keep='last' always retains the same row.
# Sorting first with the default sort algorithm, which is not stable, gave no
# guarantee about which of the duplicated rows ended up last.
df_targets = df_targets.loc[~df_targets.index.duplicated(keep='last')].sort_index()
# Naming the table
df_targets.columns = ['name', 'long_name']

# Add one 0/1 column per dataset indicating whether the accession belongs to it
# for dud
df_targets['DUD'] = df_targets.index.isin(dud_accessions.index).astype('int64')
# for dekois
df_targets['DEKOIS2'] = df_targets.index.isin(dekois_accessions.index).astype('int64')

In [11]:
# Display the merged table (one row per UniProt accession, with dataset flags)
df_targets

Unnamed: 0_level_0,name,long_name,DUD,DEKOIS2
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O14965,AURKA,Aurora kinase A,0,1
O15530,PDK1,3-phosphoinositide-dependent protein kinase 1,0,1
O76074,PDE5,"cGMP-specific 3',5'-cyclic phosphodiesterase",0,1
P00374,DHFR,Dihydrofolate reductase,1,1
P00533,EGFR,Epidermal growth factor receptor,1,1
...,...,...,...,...
Q92731,ER-beta,Estrogen receptor beta,0,1
Q92769,HDAC2,Histone deacetylase 2,0,1
Q9BY41,HDAC8,Histone deacetylase 8,0,1
Q9BYF1,ACE-2,Angiotensin-converting enzyme 2,0,1


In [12]:
# Number of merged targets flagged as belonging to DUD
int((df_targets['DUD'] == 1).sum())

20

In [50]:
# df_targets.to_json('targets_table.json')

## Get the number of crystals available for each target

In [13]:
from bs4 import BeautifulSoup
import requests

def pdb_ids_from_uniprot(uniprot_id, time_sleep = 2):
    """Scrape the PDB identifiers listed on a UniProt entry page.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession appended to ``https://www.uniprot.org/uniprot/``.
    time_sleep : int or float
        Seconds to wait after the request before returning.

    Returns
    -------
    dict
        ``{'pdb_ids': [...], 'n_crystals': len(pdb_ids)}`` where the ids are the
        texts of every ``<a class="pdb">`` element of the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; previously an error page
        was parsed like a normal one and reported as having 0 structures.
    """
    r = requests.get('https://www.uniprot.org/uniprot/' + uniprot_id, timeout = 60)
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed (and no "guessed parser" warning is emitted).
    soup = BeautifulSoup(r.content, 'html.parser')
    pdb_tags = soup.find_all('a', {'class': 'pdb'})

    pdb_ids = [tag.get_text() for tag in pdb_tags]

    print(uniprot_id, len(pdb_ids))
    time.sleep(time_sleep)
    return {'pdb_ids': pdb_ids, 'n_crystals': len(pdb_ids)}

In [14]:
# Collect the scraped PDB ids of every merged target, keyed by UniProt accession
targets_pdb_ids = {
    uniprot_id: pdb_ids_from_uniprot(uniprot_id)
    for uniprot_id in df_targets.index
}

O14965 156
O15530 68
O76074 38
P00374 79
P00533 213
P00734 392
P00742 146
P00749 146
P00750 11
P00811 106
P00813 2
P03366 337
P03372 296
P04035 22
P04058 118
P04062 32
P04150 43
P04183 4
P04585 214
P04587 71
P04626 34
P04818 60
P05979 30
P06239 56
P06401 20
P06737 18
P07550 36
P07711 38
P07900 300
P08069 30
P08253 11
P09874 67
P0C6X7 101
P10275 82
P10415 27
P11217 1
P11309 165
P11362 66
P11509 11
P12821 47
P12931 64
P14061 27
P15056 81
P15121 145
P17948 12
P19793 89
P19971 4
P21964 12
P22102 29
P22303 51
P23219 1
P24941 416
P27907 5
P28845 40
P29274 52
P31749 32
P35354 8
P35968 54
P37231 225
P43235 61
P45983 32
P45984 2
P48736 95
P49137 26
P49841 87
P52333 36
P52732 58
P53779 51
P54760 23
P55859 21
P62942 51
P78536 23
P80276 5
P9WGR1 97
Q00987 111
Q02763 15
Q04759 8
Q05769 51
Q07343 43
Q07869 21
Q08881 36
Q13464 27
Q16539 242
Q16769 26
Q6DE08 12
Q8IXJ6 31
Q92731 32
Q92769 8
Q9BY41 50
Q9BYF1 22
Q9P1W9 2


In [15]:
# Arrange the results with one row per accession and store them as JSON
targets_pdb_table = pd.DataFrame(targets_pdb_ids).T
targets_pdb_table.to_json('targets_PDB_IDS.json')

## Old functions

In [417]:
# import requests
# from Bio import SeqIO
# import pickle
# import pypdb

def _blast_pdb(uniprot_id, identidad = 95):
    """Return the PDB hits of a UniProt sequence above a percent-identity cutoff.

    The BLAST record is cached as ``target_fastas/<uniprot_id>_blast_record.pkl``.
    Without a cache file, the FASTA is downloaded from UniProt, written to
    ``target_fastas/<uniprot_id>.fasta``, read back with ``SeqIO`` and submitted
    through ``blastPDB``; the resulting record is then pickled.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession used for the download URL and the cache file names.
    identidad : int or float
        Value passed to ``getHits(percent_identity=...)``.

    Returns
    -------
    list
        Keys of the hits dictionary returned by ``getHits``.

    Raises
    ------
    requests.HTTPError
        If the FASTA download answers with an error status.
    Exception
        Whatever ``blastPDB`` raises is now propagated; it used to be swallowed,
        which left ``blast_record`` undefined and failed on the next line.
    """
    # Local imports: the notebook-level imports of these names are commented out
    import pickle
    from Bio import SeqIO

    # Make sure the cache folder exists before reading from / writing to it
    os.makedirs('target_fastas', exist_ok = True)
    file_blast_record = 'target_fastas/' + uniprot_id + '_blast_record.pkl'
    if os.path.isfile(file_blast_record):
        # NOTE: unpickling executes code from the file; only load trusted caches
        with open(file_blast_record, 'rb') as cache_in:
            blast_record = pickle.load(cache_in)
    else:
        # Download the fasta from UniProt
        url_fasta = requests.get("https://www.uniprot.org/uniprot/" + uniprot_id + ".fasta")
        url_fasta.raise_for_status()
        file_name_fasta = 'target_fastas/' + uniprot_id + '.fasta'
        with open(file_name_fasta, 'wb') as fasta_out:
            fasta_out.write(url_fasta.content)
        # Read the amino-acid sequence
        with open(file_name_fasta) as fasta_in:
            fasta_prot = SeqIO.read(fasta_in, 'fasta')
        seq_prot = str(fasta_prot.seq)
        blast_record = blastPDB(seq_prot)
        with open(file_blast_record, 'wb') as cache_out:
            pickle.dump(blast_record, cache_out)

    # Dictionary with one entry per hit that passes the identity cutoff
    pdbids = blast_record.getHits(percent_identity = identidad)
    pdbids_list = list(pdbids.keys()) # PDB ids of the matching structures
    print(F'{len(pdbids_list)} hits con una identidad del {identidad}%')
    time.sleep(2)
    return pdbids_list

In [77]:
# import requests
# from Bio import SeqIO
# import pickle
# import pypdb

def _blast_pdb(uniprot_id, identidad = 95):
    """Return the PDB hits of a UniProt sequence above a percent-identity cutoff.

    The BLAST record is cached as ``target_fastas/<uniprot_id>_blast_record.pkl``.
    Without a cache file, the FASTA text is downloaded from UniProt, its header
    line is dropped, and the remaining lines are joined into the sequence that
    is submitted through ``blastPDB``; the resulting record is then pickled.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession used for the download URL and the cache file name.
    identidad : int or float
        Value passed to ``getHits(percent_identity=...)``.

    Returns
    -------
    list
        Keys of the hits dictionary returned by ``getHits``.

    Raises
    ------
    requests.HTTPError
        If the FASTA download answers with an error status.
    Exception
        Whatever ``blastPDB`` raises is now propagated; it used to be swallowed,
        which left ``blast_record`` undefined and failed on the next line.
    """
    # Local import: the notebook-level import of pickle is commented out
    import pickle

    # Make sure the cache folder exists before reading from / writing to it
    os.makedirs('target_fastas', exist_ok = True)
    file_blast_record = 'target_fastas/' + uniprot_id + '_blast_record.pkl'
    if os.path.isfile(file_blast_record):
        # NOTE: unpickling executes code from the file; only load trusted caches
        with open(file_blast_record, 'rb') as cache_in:
            blast_record = pickle.load(cache_in)
    else:
        # Download the fasta from UniProt
        url_fasta = requests.get("https://www.uniprot.org/uniprot/" + uniprot_id + ".fasta")
        url_fasta.raise_for_status()
        # Skip the header line and concatenate the sequence lines
        seq_prot = ''.join(url_fasta.text.split('\n')[1:])
        blast_record = blastPDB(seq_prot)
        with open(file_blast_record, 'wb') as cache_out:
            pickle.dump(blast_record, cache_out)

    # Dictionary with one entry per hit that passes the identity cutoff
    pdbids = blast_record.getHits(percent_identity = identidad)
    pdbids_list = list(pdbids.keys()) # PDB ids of the matching structures
    print(F'{len(pdbids_list)} hits con una identidad del {identidad}%')
    time.sleep(2)
    return pdbids_list

In [174]:
# import pypdb
# import untangle
# import requests
# import pandas as pd
# import numpy as np

def _blast_pdb(uniprot_id, eCutoff = 0, time_sleep = 2):
    """BLAST a UniProt sequence against the PDB through the RCSB REST service.

    The FASTA text is downloaded from UniProt, its header line is dropped, and
    the sequence is sent to ``getBlastPDB1`` with the BLOSUM62 matrix. The PDB
    id of each ``hit`` element is the part of its ``hit_def`` text before the
    first ``:``.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession used for the FASTA download.
    eCutoff : int or float
        Sent (as a string) as the ``eCutoff`` request parameter.
    time_sleep : int or float
        Seconds to wait after the requests before returning.

    Returns
    -------
    dict
        ``{'pdb_ids': [...], 'n_crystals': len(pdb_ids)}``.

    Raises
    ------
    requests.HTTPError
        If either request answers with an error status; previously an error
        page was parsed like a normal answer and reported as 0 hits.
    """
    url_fasta = requests.get("https://www.uniprot.org/uniprot/" + uniprot_id + ".fasta",
                             timeout = 60)
    url_fasta.raise_for_status()
    # Skip the header line and concatenate the sequence lines
    seq = ''.join(url_fasta.text.split('\n')[1:])

    payload = {
                'sequence': seq,
                'eCutoff':str(eCutoff),
                'matrix':'BLOSUM62',
                'outputFormat':'XML' # or 'HTML' or nothing for text format
                }

    # NOTE(review): legacy RCSB REST endpoint; confirm it is still being served
    pdbUrl = 'http://www.rcsb.org/pdb/rest/getBlastPDB1'

    # Create the url with parameters
    r = requests.get(pdbUrl, params = payload, timeout = 300)
    r.raise_for_status()
    # Parse with an explicitly named parser so the result does not depend on
    # which optional parsers are installed; the lookups below use lower-case
    # tag names.
    soup = BeautifulSoup(r.content, 'html.parser')

    pdb_ids = [hit.find('hit_def').get_text().split(':')[0]
               for hit in soup.find_all('hit')]
    print(uniprot_id, len(pdb_ids))

    time.sleep(time_sleep)
    return {'pdb_ids': pdb_ids, 'n_crystals': len(pdb_ids)}


In [47]:
# Target labels mapped to a descriptive protein name. The keys are the names
# iterated by the (commented-out) UniProt lookup loop near the top of the
# notebook, which passes each one to get_uniprot_id_from_pdb_structure.
# NOTE: this cell sits at the end of the notebook; run it before any cell that
# reads dud_targets_names.
dud_targets_names = {
    "ACE": "Angiotensin-converting enzyme",
    "ACHE": "Acetylcholine esterase",
    "ADA": "Adenosine deaminase",
    "ALR2": "Aldose reductase",
    "AmpC": "AmpC beta lactamase",
    "AR": "Androgen receptor",
    "CDK2": "Cyclin dependent kinase 2",
    "COMT": "Catechol O-methyltransferase",
    "COX1": "Cyclooxygenase 1",
    "COX2": "Cyclooxygenase 2",
    "DHFR": "Dihydrofolate reductase",
    "EGFr": "Epidermal growth factor receptor kinase",
    "ER_agonist": "Estrogen receptor agonist",
    "ER_antagonist": "Estrogen receptor antagonist",
    "FGFr1": "Fibroblast growth factor receptor kinase",
    "FXa": "Factor Xa",
    "GART": "glycinamide ribonucleotide transformylase",
    "GPB": "Glycogen phosphorylase beta",
    "GR": "Glutocorticoid receptor",
    "HIVPR": "HIV protease",
    "HIVRT": "HIV reverse transcriptase",
    "HMGR": "Hydroxymethylglutaryl-CoA reductase",
    "HSP90": "Human heat shock protein 90 kinase",
    "InhA": "Enoyl ACP reductase",
    "MR": "Mineralcorticoid receptor",
    "NA": "Neuraminidase",
    "P38": "P38 mitogen activated protein kinase",
    "PARP": "Poly(ADP-ribose) polymerase",
    "PDE5": "Phosphodiesterase V",
    "PDGFrb": "Platlet derived growth factor receptor kinase",
    "PNP": "Purine nucleoside phosphorylase",
    "PPARg": "Peroxisome proliferator activated receptor gamma",
    "PR": "Progesterone receptor",
    "RXRa": "Retinoic X receptor alpha",
    "SAHH": "S-adenosyl-homocysteine hydrolase",
    "SRC": "Tyrosine kinase SRC",
    "Thrombin": "Thrombin",
    "TK": "Thymidine kinase",
    "Trypsin": "Trypsin",
    "VEGFr2": "Vascular endothelial growth factor receptor kinase"
}

In [21]:
# Target labels mapped to the PDB id that the DEKOIS lookup cell passes to
# pypdb.get_all_info.
# NOTE: this cell sits at the end of the notebook but is consumed by that
# earlier cell; run it first on a fresh kernel.
dekois_targets_names = {
    "11-beta-HSD1": "3tfq",
    "17-beta-HSD1": "3klm",
    "A2A": "3eml",
    "ACE": "1uze",
    "ACE-2": "1r4l",
    "ACHE": "1eve",
    "ADAM17": "3ewj",
    "ADRB2": "3ny9",
    "AKT1": "3qkl",
    "ALR2": "1ah3",
    "AR": "1e3g",
    "AURKA": "3fdn",
    "AURKB": "2vgo",
    "Bcl-2": "2w3l",
    "BRAF": "3skc",
    "CATL": "3bc3",
    "CDK2": "1ckp",
    "COX1": "3kk6",
    "COX2": "1cx2",
    "CTSK": "3kx1",
    "CYP2A6": "1z11",
    "DHFR": "1s3v",
    "EGFR": "1m17",
    "EPHB4": "2vwz",
    "ERBB2": "3pp0",
    "ER-beta": "3oll",
    "FGFR1": "1agw",
    "FKBP1A": "2dg3",
    "FXA": "1f0r",
    "GBA": "2wcg",
    "GR": "1nhz",
    "GSK3B": "3i4b",
    "HDAC2": "3max",
    "HDAC8": "3sff",
    "HIV1PR": "3nu3",
    "HIV1RT": "1s6p",
    "HMGA": "1hw8",
    "HSP90": "1uy6",
    "IGF1R": "3nw7",
    "INHA": "1p44",
    "ITK": "3mj1",
    "JAK3": "3lxl",
    "JNK1": "3elj",
    "JNK2": "3npc",
    "JNK3": "2b1p",
    "KIF11": "3k5e",
    "Lck": "3mpm",
    "MDM2": "3lbk",
    "MK2": "3kc3",
    "MMP2": "1hov",
    "NA": "1a4g",
    "P38": "1ouk",
    "PARP-1": "3l3m",
    "PDE4B": "3frg",
    "PDE5": "1xp0",
    "PDK1": "2xch",
    "PI3Kg": "3dbs",
    "PIM-1": "3r04",
    "PIM-2": "2iwi",
    "PNP": "1b8o",
    "PPARa": "2p54",
    "PPARg": "1fm9",
    "PR": "2w8y",
    "PRKCQ": "1xjd",
    "PYGL-in": "1xoi",
    "PYGL-out": "3dds",
    "QPCT": "2afx",
    "ROCK-1": "3v8s",
    "RXR": "2p1t",
    "SARS-HCoV": "2z94",
    "SIRT2": "1j8f",
    "SRC": "2src",
    "Thrombin": "3rm2",
    "TIE2": "2oo8",
    "TK": "1w4r",
    "TP": "1uou",
    "TPA": "1a5h",
    "TS": "1i00",
    "uPA": "3mhw",
    "VEGFR1": "3hng",
    "VEGFR2": "3c7q"
}

In [3]:
# Sanity check: number of entries in dekois_targets_names
len(dekois_targets_names)

81