# **Computational Drug Discovery [Part 1] Download Bioactivity Data**





Jesús Cea García

# **Instalamos las librerías**

In [None]:
! pip install chembl_webresource_client



# **Importamos las librerías**

In [None]:
# Importamos pandas para el análisis de datos y el cliente para interactuar con
# la API de ChEMBL
import pandas as pd
from chembl_webresource_client.new_client import new_client

# **Buscamos la proteína objetivo**

En este proyecto me quiero centrar en fármacos que actúen sobre la acetilcolinesterasa (AChE) humana. La AChE es una enzima clave en el sistema nervioso que hidroliza la acetilcolina, un neurotransmisor crucial para la transmisión sináptica. Es un objetivo farmacológico en enfermedades neurodegenerativas como el Alzheimer y en intoxicaciones por organofosforados.

In [None]:

# Handle on the ChEMBL client's molecular-target search.
target = new_client.target

# Look up the term 'AChE'; per the original author's note the search yields a
# list of dictionaries (one per matching target).
target_query = target.search('AChE')

# Turn the search results into a DataFrame and display it.
targets = pd.DataFrame.from_dict(
    target_query
)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Torpedo californica,Acetylcholinesterase,25.0,False,CHEMBL4780,"[{'accession': 'P04058', 'component_descriptio...",SINGLE PROTEIN,7787
1,[],Electrophorus electricus,Acetylcholinesterase,25.0,False,CHEMBL4078,"[{'accession': 'O42275', 'component_descriptio...",SINGLE PROTEIN,8005
2,[],Musca domestica,Acetylcholinesterase,25.0,False,CHEMBL5752,"[{'accession': 'Q95P20', 'component_descriptio...",SINGLE PROTEIN,7370
3,[],Danio rerio,Acetylcholinesterase,25.0,False,CHEMBL3308995,"[{'accession': 'Q9DDE3', 'component_descriptio...",SINGLE PROTEIN,7955
4,[],Homo sapiens,Acetylcholinesterase,19.0,False,CHEMBL220,"[{'accession': 'P22303', 'component_descriptio...",SINGLE PROTEIN,9606
...,...,...,...,...,...,...,...,...,...
99,[],Macaca mulatta,Neuronal acetylcholine receptor subunit alpha-7,3.0,False,CHEMBL2150833,"[{'accession': 'Q866A2', 'component_descriptio...",SINGLE PROTEIN,9544
100,[],Musca domestica,Nicotinic acetylcholine receptor alpha 5 subunit,3.0,False,CHEMBL2366408,"[{'accession': 'A9XFY4', 'component_descriptio...",SINGLE PROTEIN,7370
101,[],Periplaneta americana,Nicotinic acetylcholine receptor alpha8 subunit,3.0,False,CHEMBL2366437,"[{'accession': 'H6TY12', 'component_descriptio...",SINGLE PROTEIN,6978
102,[],Drosophila melanogaster,Acetylcholine receptor subunit beta-like 2,3.0,False,CHEMBL2366470,"[{'accession': 'P25162', 'component_descriptio...",SINGLE PROTEIN,7227


# **Seleccionamos y obtenemos información de bioactividad para la acetilcolinesterasa (AChE) humana**

In [None]:
# Select the human acetylcholinesterase target by its attributes rather than by
# row position: position 4 only holds the human entry for the current ranking
# of the search results (see the `score` column), so a fixed index is fragile.
human_ache = targets[
    (targets.organism == 'Homo sapiens')
    & (targets.pref_name == 'Acetylcholinesterase')
    & (targets.target_type == 'SINGLE PROTEIN')
]

# Fail loudly instead of silently picking a different protein/organism.
if human_ache.empty:
  raise ValueError('Human acetylcholinesterase target not found in the search results')

# ChEMBL identifier of the selected target, displayed as the cell output.
selected_target = human_ache.target_chembl_id.iloc[0]
selected_target


'CHEMBL220'

Vamos a centrarnos en los datos de bioactividad para la AChE utilizando los valores de concentración inhibitoria media (IC$_{50}$), que trataremos en nM (columna `standard_value`). La IC$_{50}$ es la concentración de una sustancia necesaria para inhibir en un 50 % un determinado componente o proceso biológico.

In [None]:
# Handle on the ChEMBL client's activity data (same pattern as the target
# search above in this notebook).
activity = new_client.activity

# Restrict the activity records first to the selected target and then to the
# rows whose standard_type is "IC50".
resultado = (
    activity
    .filter(target_chembl_id = selected_target)
    .filter(standard_type = "IC50")
)

In [None]:
# Convert the filtered activity records (`resultado`) into a DataFrame.
df = pd.DataFrame.from_dict(resultado)

In [None]:
# Display the activity DataFrame as the cell output (rendered as a truncated
# preview of the first and last rows/columns).
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.75
1,,,37563,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.1
2,,,37565,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,50.0
3,,,38902,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.3
4,,,41170,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9410,"{'action_type': 'INHIBITOR', 'description': 'N...",,25724873,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5391657,Inhibition of Acetylcholinesterase (unknown or...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,46.0
9411,"{'action_type': 'INHIBITOR', 'description': 'N...",,25724874,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5391657,Inhibition of Acetylcholinesterase (unknown or...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,38.31
9412,"{'action_type': 'INHIBITOR', 'description': 'N...",,25733694,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5393547,Inhibition of recombinant human AChE expressed...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,1.71
9413,,,25733695,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5393547,Inhibition of recombinant human AChE expressed...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,10.0


# **Transformamos el df a .csv**

In [None]:
# Write the raw activity records to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df.to_csv('bioactivity_data1.csv', index = False)

# **Copiamos a Google Drive**

In [None]:
# Colab-only step: mount Google Drive at /content/gdrive/ so files can be
# copied to it by the following cells. force_remount=True is passed so the
# mount is redone even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [None]:
! cp bioactivity_data1.csv "/content/gdrive/My Drive/Colab Notebooks/data"

In [None]:
# List the working directory to confirm the CSV file was written.
! ls


bioactivity_data1.csv  drive  gdrive  sample_data


In [None]:
# Print the first lines of the exported CSV as a quick sanity check of the
# header and the first records.
!head bioactivity_data1.csv

action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholinesterase,B,,,BAO_0000190,BAO_0000357,single protein format,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,,,CHEMBL1148382,J Med Chem,2004.0,"{'bei': '19.61', 'le': '0.36', 'lle': '3.32', 'sei': '9.21'}",CHEMBL133897,,CHEMBL133897,6.12,0,http://www.op

# **Manejo de valores faltantes**

In [None]:
# Keep only the records that have a standard_value; rows where it is missing
# are dropped. The original row labels of `df` are preserved.
# NOTE(review): rows with a missing canonical_smiles are not filtered here —
# confirm whether downstream steps need them removed as well.
df2 = df.dropna(subset=['standard_value'])
df2

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,33969,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.75
1,,,37563,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.1
2,,,37565,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,50.0
3,,,38902,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.3
4,,,41170,[],CHEMBL643384,Inhibitory concentration against acetylcholine...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9410,"{'action_type': 'INHIBITOR', 'description': 'N...",,25724873,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5391657,Inhibition of Acetylcholinesterase (unknown or...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,46.0
9411,"{'action_type': 'INHIBITOR', 'description': 'N...",,25724874,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5391657,Inhibition of Acetylcholinesterase (unknown or...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,38.31
9412,"{'action_type': 'INHIBITOR', 'description': 'N...",,25733694,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5393547,Inhibition of recombinant human AChE expressed...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,1.71
9413,,,25733695,"[{'comments': None, 'relation': '=', 'result_f...",CHEMBL5393547,Inhibition of recombinant human AChE expressed...,B,,,BAO_0000190,...,Homo sapiens,Acetylcholinesterase,9606,,,IC50,uM,UO_0000065,,10.0


# **Preprocesamiento de los datos**

# **Codificando los datos como activos, intermedios o inactivos**

Para la IC$_{50}$, los compuestos con un valor de 10000 nM o más se considerarán inactivos, los de 1000 nM o menos activos, y los que queden entre 1000 y 10000 nM intermedios.

In [None]:
def _classify_ic50(raw_value):
  """Return the bioactivity label for a single IC50 value.

  The value is converted with float() and compared against the notebook's
  thresholds: >= 10000 is 'inactive', <= 1000 is 'active', anything else is
  'intermediate'. The thresholds are read as nM per the notebook text; the
  units column is not checked here.
  """
  numeric_value = float(raw_value)
  if numeric_value >= 10000:
    return 'inactive'
  if numeric_value <= 1000:
    return 'active'
  return 'intermediate'


# One label per row of df2, in row order.
bioactivity_class = [_classify_ic50(raw_value) for raw_value in df2.standard_value]

**Guardamos los ChEMBL ID de las moléculas en una lista**

In [None]:
# Molecule ChEMBL IDs of df2 as a plain Python list, in row order.
mol_cid = list(df2.molecule_chembl_id)

**Realizamos el mismo proceso con canonical_smiles (representación de la estructura química de la molécula) y standard_value**

In [None]:
# Canonical SMILES strings of df2 collected into a plain Python list, in row
# order.
canonical_smiles = [smiles for smiles in df2.canonical_smiles]

In [None]:
# standard_value entries of df2 as a plain Python list, in row order. The
# values are copied exactly as stored in df2 (no numeric conversion here).
standard_value = list(df2.standard_value)

# **Creamos un nuevo data.frame con estas columnas**

In [None]:
# Assemble the preprocessed table: one row per record, built by pairing up the
# four lists element by element.
preprocessed_columns = [
    'molecule_chembl_id',
    'canonical_smiles',
    'bioactivity_class',
    'standard_value',
]
preprocessed_rows = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value))
df3 = pd.DataFrame(data = preprocessed_rows, columns = preprocessed_columns)


In [None]:
# Display the preprocessed DataFrame as the cell output.
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,active,750.0
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,active,100.0
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,inactive,50000.0
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,active,300.0
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,active,800.0
...,...,...,...,...
8125,CHEMBL5398421,COc1cc(O)c2c(c1)C(=O)c1cc(O)c(O)cc1CCN2,inactive,46000.0
8126,CHEMBL11298,N[C@@H](CO)C(=O)O,inactive,38310.0
8127,CHEMBL5395312,CN1CCN(c2ccc(C(=O)Nc3cc(-c4nc5ccccc5[nH]4)n[nH...,intermediate,1710.0
8128,CHEMBL5399112,O=C(Nc1cc(-c2nc3ccccc3[nH]2)n[nH]1)c1ccc(N2CCN...,inactive,10000.0


In [None]:
# Write the preprocessed table to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df3.to_csv('bioactivity_preprocessed_data1.csv', index=False)

In [None]:
! cp bioactivity_preprocessed_data1.csv "/content/gdrive/My Drive/Colab Notebooks/data2"

In [None]:
# List the working directory to confirm both CSV files are present.
! ls

bioactivity_data1.csv  bioactivity_preprocessed_data1.csv  drive  gdrive  sample_data
