## <center> Demonstrating ChEMBL Database for Research in Drug Discovery

In [2]:
! pip install chembl_webresource_client



## Data Collection and Preprocessing

### Accessing data through the chembl_webresource_client API

In [3]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

In [4]:
# Search the ChEMBL target endpoint for the term '3CL' and tabulate the hits.
target = new_client.target
target_query = target.search('3CL')

# One row per matching target; displayed as the cell output.
targets = pd.DataFrame(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,5.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
1,[],Human coronavirus NL63,Replicase polyprotein 1a,5.0,False,CHEMBL3232683,"[{'accession': 'P0C6U6', 'component_descriptio...",SINGLE PROTEIN,277944
2,[],Middle East respiratory syndrome-related coron...,Replicase polyprotein 1ab,5.0,False,CHEMBL4295557,"[{'accession': 'K9N7C7', 'component_descriptio...",SINGLE PROTEIN,1263720
3,[],Feline coronavirus (strain FIPV WSU-79/1146) (...,Replicase polyprotein 1ab,4.0,False,CHEMBL4295624,"[{'accession': 'Q98VG9', 'component_descriptio...",SINGLE PROTEIN,33734
4,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,3.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
5,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,3.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


In [5]:
# Pick the ChEMBL ID stored at row label 5 of the search results; in the saved
# output that row is the SARS-CoV-2 "Replicase polyprotein 1ab" entry.
# NOTE(review): the hard-coded label presumably depends on the order the search
# returns hits in -- confirm it still points at the intended target on re-run.
selectedTarget = targets.loc[5, 'target_chembl_id']
selectedTarget

'CHEMBL4523582'

In [6]:
# Fetch the activity records for the selected target, keeping only IC50 measurements.
activity = new_client.activity
bioActivity = (
    activity
    .filter(target_chembl_id=selectedTarget)
    .filter(standard_type="IC50")
)

# Load the query results into a DataFrame.
df = pd.DataFrame(bioActivity)

In [7]:
# Inspect the SMILES column. Only the last expression of a cell is rendered,
# so the former leading `df.head(3)` produced no output and has been dropped.
df['canonical_smiles']

0       Cc1c(OCC(F)(F)F)ccnc1C[S+]([O-])c1nc2ccccc2[nH]1
1                                  Cc1c(-c2cnccn2)ssc1=S
2                O=c1sn(-c2cccc3ccccc23)c(=O)n1Cc1ccccc1
3      O=C(O[C@@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1cc(O)c(O)...
4                                O=C1C=Cc2cc(Br)ccc2C1=O
                             ...                        
112                               C=CC(=O)c1ccc2ccccc2c1
113                 C[n+]1c2cc(N)ccc2cc2ccc(N)cc21.[Cl-]
114               Nc1ccc(S(=O)(=O)[N-]c2ncccn2)cc1.[Ag+]
115                                                 None
116    C[n+]1c2cc(N)ccc2cc2ccc(N)cc21.Cl.Nc1ccc2cc3cc...
Name: canonical_smiles, Length: 117, dtype: object

In [8]:
# Preview the first three activity records.
df.head(n=3)

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,Dtt Insensitive,19964199,[],CHEMBL4495583,SARS-CoV-2 3CL-Pro protease inhibition IC50 de...,F,,,BAO_0000190,BAO_0000019,...,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,IC50,uM,UO_0000065,,0.39
1,Dtt Insensitive,19964200,[],CHEMBL4495583,SARS-CoV-2 3CL-Pro protease inhibition IC50 de...,F,,,BAO_0000190,BAO_0000019,...,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,IC50,uM,UO_0000065,,0.21
2,Dtt Insensitive,19964201,[],CHEMBL4495583,SARS-CoV-2 3CL-Pro protease inhibition IC50 de...,F,,,BAO_0000190,BAO_0000019,...,Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,2697049,,,IC50,uM,UO_0000065,,0.08


In [12]:
# Keep only the records that have a standard_value; rows are otherwise untouched.
# NOTE(review): rows whose canonical_smiles is missing are still kept here
# (the saved output shows one such row) -- confirm that is intended.
cleanDF = df.dropna(subset=['standard_value'])

### Pre-processing of the bioactivity data

**Labeling compounds as either being active, inactive, or intermediate**

The bioactivity data are **IC50** values in nM. Compounds with an IC50 of 1,000 nM or less are considered **active**, while those with an IC50 of 10,000 nM or more are considered **inactive**. Compounds with values between 1,000 and 10,000 nM are referred to as **intermediate**.

In [11]:
def _label_ic50(raw_value):
    """Return the bioactivity label for one standard_value entry.

    The entry is converted with float(); values >= 10000 map to 'inactive',
    values <= 1000 map to 'active', and everything else to 'intermediate'.
    """
    ic50 = float(raw_value)
    if ic50 >= 10000:
        return 'inactive'
    if ic50 <= 1000:
        return 'active'
    return 'intermediate'


# One label per row of cleanDF, in row order.
activityClass = [_label_ic50(entry) for entry in cleanDF.standard_value]


Collect the molecule ChEMBL IDs into a list

In [15]:
# Collect the molecule ChEMBL IDs of the cleaned records as a plain list.
# list(...) replaces the explicit append loop, whose loop variable was named
# `id` and therefore shadowed the builtin id() in the kernel namespace.
mol_cid = list(cleanDF.molecule_chembl_id)
    

Collect the canonical SMILES column into a list

In [14]:
# Copy the canonical_smiles column of the cleaned records into a plain list.
canonical_smiles = list(cleanDF.canonical_smiles)

In [16]:
# Copy the standard_value column of the cleaned records into a plain list.
standard_values = list(cleanDF.standard_value)

In [23]:
# Column labels for the assembled table, in the same order as the zipped lists.
# NOTE(review): 'IC50(nm)' presumably means nM (nanomolar); the label is left
# unchanged in case other cells refer to it.
final_columns = [
    'molecule_chembl_id',
    'canonical_smiles',
    'bioactivity_class',
    'IC50(nm)',
]

# Pair the four parallel lists row by row, then build the final DataFrame.
data_tuples = list(
    zip(mol_cid, canonical_smiles, activityClass, standard_values)
)
finalDF = pd.DataFrame(data_tuples, columns=final_columns)

In [24]:
# Display the assembled bioactivity table as the cell output.
finalDF

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,IC50(nm)
0,CHEMBL480,Cc1c(OCC(F)(F)F)ccnc1C[S+]([O-])c1nc2ccccc2[nH]1,active,390.0
1,CHEMBL178459,Cc1c(-c2cnccn2)ssc1=S,active,210.0
2,CHEMBL3545157,O=c1sn(-c2cccc3ccccc23)c(=O)n1Cc1ccccc1,active,80.0
3,CHEMBL297453,O=C(O[C@@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1cc(O)c(O)...,intermediate,1580.0
4,CHEMBL4303595,O=C1C=Cc2cc(Br)ccc2C1=O,active,40.0
...,...,...,...,...
112,CHEMBL154580,C=CC(=O)c1ccc2ccccc2c1,intermediate,1240.0
113,CHEMBL354349,C[n+]1c2cc(N)ccc2cc2ccc(N)cc21.[Cl-],intermediate,4980.0
114,CHEMBL1382627,Nc1ccc(S(=O)(=O)[N-]c2ncccn2)cc1.[Ag+],active,750.0
115,CHEMBL4303664,,active,880.0


### Processing Inhibition Data 

In [9]:
# Same target as before, but keep the records whose standard_type is "Inhibition".
newBioactivity = (
    activity
    .filter(target_chembl_id=selectedTarget)
    .filter(standard_type="Inhibition")
)


In [10]:
# Load the inhibition query results into a DataFrame.
# NOTE(review): the saved output shows this cell ended in a KeyboardInterrupt;
# presumably the records are downloaded at this point and the fetch is slow -- confirm.
inhibitionDataFrame = pd.DataFrame(newBioactivity)

KeyboardInterrupt: 