In [2]:
# Load each structure from the file provided by DUD

In [110]:
import os
import numpy as np
import pandas as pd
import glob
from prody import *
# Relative folder holding one sub-directory per DUD target; joined component
# by component so the path separator matches the host OS.
path_to_dud_target_files = os.path.join(os.pardir, os.pardir, os.pardir,
                                        'Benchmarks', 'dud_targets')

In [64]:
from Bio.Seq import Seq
from Bio.Blast import NCBIXML
from Bio.Blast.Applications import NcbiblastpCommandline

def get_uniprot_id_from_pdb_structure(prot_name, verbose = True,
                                      path_to_files = F'{path_to_dud_target_files}/'):
    """Find the UniProt entry that best matches a target's PDB structure.

    The first ``*pdb`` file found in ``{path_to_files}/{prot_name}/`` is parsed
    with ProDy and the one-letter sequence of its ``'ca'`` selection is laid
    out by residue number (missing numbers become ``'-'``). That sequence is
    written to ``./temp_fasta.fasta`` and searched with ``blastp`` against the
    local Swiss-Prot FASTA database; the XML report goes to
    ``blast_results.xml``. Both files are created in the current working
    directory and are overwritten on every call.

    Parameters
    ----------
    prot_name : str
        Target name; it is lower-cased before being used as the folder name.
    verbose : bool, default True
        If true, print the target name, the chosen accession and the hit title.
    path_to_files : str
        Directory that contains one sub-directory per target.

    Returns
    -------
    dict
        ``{'name': ..., 'accession': ...}`` taken from the fifth and fourth
        ``'|'``-separated fields of the selected hit title. The first hit whose
        title contains "human" (case-insensitive) is preferred; when there is
        none, the first hit of the report is used and ``'*'`` is appended to
        the name.

    Raises
    ------
    IndexError
        If no ``*pdb`` file is found for the target, or if the BLAST report
        contains no alignments.
    """
    # Read the .pdb file and obtain its sequence
    prot_name = prot_name.lower()
    rec_pdb_file = glob.glob(F'{path_to_files}/{prot_name}/*pdb')[0]
    rec_pdb = parsePDB(rec_pdb_file)
    calphas = rec_pdb.select('ca')
    resnums = calphas.getResnums()
    positions = dict(zip(resnums, calphas.getSequence()))

    # Walk residue numbers 1..last INCLUSIVE; the previous range(1, last)
    # stopped one short and silently dropped the final residue. Numbers with
    # no residue are written as gaps.
    sequence = ''.join(positions.get(i, '-') for i in range(1, resnums[-1] + 1))
    # Trim leading gaps in chunks of five (up to four may remain)
    while sequence[:5] == '-----':
        sequence = sequence[5:]

    # Save the sequence in a temporary fasta file (the with-block closes it)
    with open("./temp_fasta.fasta", "w") as ofile:
        ofile.write(">" + 'Seq' + "\n" + sequence + "\n")

    # Perform the blastp search against UniProt Swiss-Prot
    blastp_cline = NcbiblastpCommandline(cmd='blastp', query='temp_fasta.fasta',
                    db = os.path.join(*'../../../Benchmarks/uniprot_db/uniprot_sprot.fasta'.split('/')),
                    evalue = 0.0001, outfmt = 5, out="blast_results.xml")
    blastp_cline()

    # Read the temporary results, closing the handle once parsed
    with open("blast_results.xml") as results_handle:
        blast_record = NCBIXML.read(results_handle)

    # Prefer the first hit whose title mentions HUMAN; fall back to the first
    # hit overall (flagged with '*') when there is none or its title does not
    # have the expected '|'-separated layout.
    try:
        human_titles = [hit.title for hit in blast_record.alignments
                        if 'human' in hit.title.lower()]
        title = human_titles[0]
        uniprot_id = title.split('|')[3]
        name = title.split('|')[4]
    except IndexError:
        title = blast_record.alignments[0].title
        uniprot_id = title.split('|')[3]
        name = title.split('|')[4] + '*'

    if verbose:
        print(prot_name, uniprot_id, '\n', title)
    return {'name': name, 'accession': uniprot_id}

In [108]:
# First we have to obtain the UNIPROT Accession numbers
# dud_targets_uniprot = {}
# for name in dud_targets_names.keys():
#     dud_targets_uniprot[name] = get_uniprot_id_from_pdb_structure(prot_name = name)

### Saving the result

In [67]:
# Saving the results
import json
# with open('../../../Benchmarks/dud_targets_uniprot_accessions.json', 'w') as fp:
#     json.dump(dud_targets_uniprot, fp)

# Targets from DEKOIS2.0

In [90]:
import pypdb
import time

In [115]:
# dekois_targets_uniprot = {}
# for key, pdb_id in dekois_targets_names.items():
#     dict_info = pypdb.get_all_info(pdb_id)['polymer']
    
#     if type(dict_info) is list:
#         molecule = dict_info[0]['macroMolecule']
#     elif type(dict_info) is dict:
#         molecule = dict_info['macroMolecule']
#     if type(molecule) is list:
#         molecule = molecule[0]
#     name = molecule['@name']
#     uniprot_id = molecule['accession']['@id']
#     dekois_targets_uniprot[key] = {'name': name, 'accession': uniprot_id}
#     time.sleep(0.5)

In [107]:
# Saving the results
# with open('../../../Benchmarks/dekois2_targets_uniprot_accessions.json', 'w') as fp:
#     json.dump(dekois_targets_uniprot, fp)

# Getting the number of crystals in pdb

In [294]:
# Read back the stored accession tables; each JSON file is transposed after
# loading so that its top-level keys become the rows.

dekois_accessions = pd.read_json(
    '../../../Benchmarks/dekois2_targets_uniprot_accessions.json'
).transpose()
dud_accessions = pd.read_json(
    '../../../Benchmarks/dud_targets_uniprot_accessions.json'
).transpose()

In [295]:
# Report how many rows (targets) each accession table holds
for set_label, accession_table in (('DUD', dud_accessions),
                                   ('DEKOIS2.0', dekois_accessions)):
    print(F'Number of targets in {set_label} set: {accession_table.shape[0]}')

Number of targets in DUD set: 21
Number of targets in DEKOIS2.0 set: 81


### Then we have to merge the dataframes

In [304]:
# Pool the UniProt accessions of both benchmarks, DUD first. Despite its name
# this is a plain list: accessions shared by both sets appear twice.
acc_num_set = [*dud_accessions['accession'].to_list(),
               *dekois_accessions['accession'].to_list()]
len(acc_num_set)  # not the last expression of the cell, so nothing is displayed
sorted(acc_num_set)

['O14965',
 'O15530',
 'O76074',
 'P00374',
 'P00374',
 'P00533',
 'P00533',
 'P00720',
 'P00720',
 'P00734',
 'P00742',
 'P00742',
 'P00749',
 'P00750',
 'P00811',
 'P00813',
 'P03366',
 'P03366',
 'P03372',
 'P03372',
 'P04035',
 'P04058',
 'P04062',
 'P04150',
 'P04150',
 'P04183',
 'P04585',
 'P04587',
 'P04626',
 'P04818',
 'P05979',
 'P06239',
 'P06401',
 'P06737',
 'P06737',
 'P07711',
 'P07900',
 'P08069',
 'P08253',
 'P09874',
 'P0C6X7',
 'P10275',
 'P10275',
 'P10415',
 'P11217',
 'P11309',
 'P11362',
 'P11362',
 'P11509',
 'P12821',
 'P12821',
 'P12931',
 'P14061',
 'P15056',
 'P15121',
 'P17948',
 'P19793',
 'P19793',
 'P19971',
 'P21964',
 'P22102',
 'P22303',
 'P23219',
 'P24941',
 'P24941',
 'P27907',
 'P28845',
 'P31749',
 'P35354',
 'P35968',
 'P43235',
 'P45983',
 'P45984',
 'P48736',
 'P49137',
 'P49841',
 'P52333',
 'P52732',
 'P53779',
 'P54760',
 'P55859',
 'P62942',
 'P78536',
 'P80276',
 'P9WGR1',
 'Q00987',
 'Q02763',
 'Q04759',
 'Q05769',
 'Q07343',
 'Q07869',

In [246]:
# Show only the accession column, kept as a one-column DataFrame
dekois_accessions.loc[:, ['accession']]

Unnamed: 0,accession
11-beta-HSD1,P28845
17-beta-HSD1,P14061
A2A,P00720
ACE,P12821
ACE-2,Q9BYF1
...,...
TPA,P00750
TS,P04818
uPA,P00749
VEGFR1,P17948


In [354]:
# Reload both accession tables so this cell gives the same result however
# often it is re-run. reset_index() moves the row labels into an 'index'
# column and set_index() makes the UniProt accession the new row label.
dekois_accessions = (
    pd.read_json('../../../Benchmarks/dekois2_targets_uniprot_accessions.json').T
    .reset_index()
    .set_index('accession')
)
dud_accessions = (
    pd.read_json('../../../Benchmarks/dud_targets_uniprot_accessions.json').T
    .reset_index()
    .set_index('accession')
)

# Merging the datasets. The cell previously referenced the undefined names
# `a`, `b` and `c`; they are replaced by the frames built above.
# NOTE(review): DUD first / DEKOIS2.0 last is assumed, so that keep='last'
# retains the DEKOIS2.0 row for shared accessions -- confirm the intended order.
merged_accessions = pd.concat([dud_accessions, dekois_accessions], axis = 0)
# Drop duplicated accessions BEFORE sorting, so the row that survives does not
# depend on how the sort orders equal index labels.
df_targets = (
    merged_accessions
    .loc[~merged_accessions.index.duplicated(keep='last')]
    .sort_index()
)
# Naming the table
df_targets.columns = ['name', 'long_name']

# Add one 0/1 column per benchmark indicating whether the target's accession
# is present in that dataset
# for dud
df_targets['DUD'] = df_targets.index.isin(dud_accessions.index).astype('int64')
# for dekois
df_targets['DEKOIS2'] = df_targets.index.isin(dekois_accessions.index).astype('int64')



In [355]:
# Display the merged target table
df_targets

Unnamed: 0_level_0,name,long_name,DUD,DEKOIS2
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O14965,AURKA,Aurora kinase A,0,1
O15530,PDK1,3-phosphoinositide-dependent protein kinase 1,0,1
O76074,PDE5,"cGMP-specific 3',5'-cyclic phosphodiesterase",0,1
P00374,DHFR,Dihydrofolate reductase,1,1
P00533,EGFR,Epidermal growth factor receptor,1,1
...,...,...,...,...
Q92731,ER-beta,Estrogen receptor beta,0,1
Q92769,HDAC2,Histone deacetylase 2,0,1
Q9BY41,HDAC8,Histone deacetylase 8,0,1
Q9BYF1,ACE-2,Angiotensin-converting enzyme 2,0,1


In [47]:
# DUD target label -> descriptive name, listed as (label, description) pairs
# in the original order.
# NOTE(review): some descriptions carry spelling variants ("Glutocorticoid",
# "Mineralcorticoid", "Platlet"); they are kept verbatim because they are
# runtime strings -- confirm nothing matches on them before correcting.
_DUD_LABEL_DESCRIPTION_PAIRS = [
    ("ACE", "Angiotensin-converting enzyme"),
    ("ACHE", "Acetylcholine esterase"),
    ("ADA", "Adenosine deaminase"),
    ("ALR2", "Aldose reductase"),
    ("AmpC", "AmpC beta lactamase"),
    ("AR", "Androgen receptor"),
    ("CDK2", "Cyclin dependent kinase 2"),
    ("COMT", "Catechol O-methyltransferase"),
    ("COX1", "Cyclooxygenase 1"),
    ("COX2", "Cyclooxygenase 2"),
    ("DHFR", "Dihydrofolate reductase"),
    ("EGFr", "Epidermal growth factor receptor kinase"),
    ("ER_agonist", "Estrogen receptor agonist"),
    ("ER_antagonist", "Estrogen receptor antagonist"),
    ("FGFr1", "Fibroblast growth factor receptor kinase"),
    ("FXa", "Factor Xa"),
    ("GART", "glycinamide ribonucleotide transformylase"),
    ("GPB", "Glycogen phosphorylase beta"),
    ("GR", "Glutocorticoid receptor"),
    ("HIVPR", "HIV protease"),
    ("HIVRT", "HIV reverse transcriptase"),
    ("HMGR", "Hydroxymethylglutaryl-CoA reductase"),
    ("HSP90", "Human heat shock protein 90 kinase"),
    ("InhA", "Enoyl ACP reductase"),
    ("MR", "Mineralcorticoid receptor"),
    ("NA", "Neuraminidase"),
    ("P38", "P38 mitogen activated protein kinase"),
    ("PARP", "Poly(ADP-ribose) polymerase"),
    ("PDE5", "Phosphodiesterase V"),
    ("PDGFrb", "Platlet derived growth factor receptor kinase"),
    ("PNP", "Purine nucleoside phosphorylase"),
    ("PPARg", "Peroxisome proliferator activated receptor gamma"),
    ("PR", "Progesterone receptor"),
    ("RXRa", "Retinoic X receptor alpha"),
    ("SAHH", "S-adenosyl-homocysteine hydrolase"),
    ("SRC", "Tyrosine kinase SRC"),
    ("Thrombin", "Thrombin"),
    ("TK", "Thymidine kinase"),
    ("Trypsin", "Trypsin"),
    ("VEGFr2", "Vascular endothelial growth factor receptor kinase"),
]
dud_targets_names = dict(_DUD_LABEL_DESCRIPTION_PAIRS)

In [6]:
# DEKOIS2.0 target label -> PDB id, written as one "label pdb_id" pair per
# line. Neither field contains whitespace, so a plain split() separates them;
# the original entry order is preserved.
_DEKOIS_LABEL_PDB_TABLE = """
11-beta-HSD1 3tfq
17-beta-HSD1 3klm
A2A 3eml
ACE 1uze
ACE-2 1r4l
ACHE 1eve
ADAM17 3ewj
ADRB2 3ny9
AKT1 3qkl
ALR2 1ah3
AR 1e3g
AURKA 3fdn
AURKB 2vgo
Bcl-2 2w3l
BRAF 3skc
CATL 3bc3
CDK2 1ckp
COX1 3kk6
COX2 1cx2
CTSK 3kx1
CYP2A6 1z11
DHFR 1s3v
EGFR 1m17
EPHB4 2vwz
ERBB2 3pp0
ER-beta 3oll
FGFR1 1agw
FKBP1A 2dg3
FXA 1f0r
GBA 2wcg
GR 1nhz
GSK3B 3i4b
HDAC2 3max
HDAC8 3sff
HIV1PR 3nu3
HIV1RT 1s6p
HMGA 1hw8
HSP90 1uy6
IGF1R 3nw7
INHA 1p44
ITK 3mj1
JAK3 3lxl
JNK1 3elj
JNK2 3npc
JNK3 2b1p
KIF11 3k5e
Lck 3mpm
MDM2 3lbk
MK2 3kc3
MMP2 1hov
NA 1a4g
P38 1ouk
PARP-1 3l3m
PDE4B 3frg
PDE5 1xp0
PDK1 2xch
PI3Kg 3dbs
PIM-1 3r04
PIM-2 2iwi
PNP 1b8o
PPARa 2p54
PPARg 1fm9
PR 2w8y
PRKCQ 1xjd
PYGL-in 1xoi
PYGL-out 3dds
QPCT 2afx
ROCK-1 3v8s
RXR 2p1t
SARS-HCoV 2z94
SIRT2 1j8f
SRC 2src
Thrombin 3rm2
TIE2 2oo8
TK 1w4r
TP 1uou
TPA 1a5h
TS 1i00
uPA 3mhw
VEGFR1 3hng
VEGFR2 3c7q
"""
dekois_targets_names = dict(
    row.split() for row in _DEKOIS_LABEL_PDB_TABLE.strip().splitlines()
)