### Install ChEMBL library to download data

In [6]:
! pip install chembl_webresource_client



### Import necessary libraries

In [7]:
import pandas as pd 
import csv

# imported as "new_client" instead of "chembl_webresource_client.new_client"
from chembl_webresource_client.new_client import new_client

### Search for Target Protein

#### Target search for coronavirus

In [8]:
# Search ChEMBL's target endpoint for entries matching the keyword "coronavirus".
# NOTE(review): "drug targets" presumably refers to receptors -- still to be confirmed.
target = new_client.target
target_query = target.search('coronavirus')

# Turn the search hits into a table for inspection.
# Of the 10 results shown below, 3 have target_type SINGLE PROTEIN.
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],Feline coronavirus,Feline coronavirus,14.0,False,CHEMBL612744,[],ORGANISM,12663
2,[],Murine coronavirus,Murine coronavirus,14.0,False,CHEMBL5209664,[],ORGANISM,694005
3,[],Canine coronavirus,Canine coronavirus,14.0,False,CHEMBL5291668,[],ORGANISM,11153
4,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
5,[],Human coronavirus OC43,Human coronavirus OC43,13.0,False,CHEMBL5209665,[],ORGANISM,31631
6,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
7,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
8,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
9,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


#### Select and retrieve bioactivity data for SARS coronavirus 3C-like proteinase

In [9]:
# We will work with the SARS coronavirus 3C-like proteinase (CHEMBL3927).
# The row is looked up by its preferred name instead of a fixed row position:
# the order of the search hits can change, and a positional index would then
# silently select a different target.
target_name = "SARS coronavirus 3C-like proteinase"
matching_ids = targets.loc[targets["pref_name"] == target_name, "target_chembl_id"]

# Fail loudly if the expected target is missing from the search results.
if matching_ids.empty:
    raise LookupError("No ChEMBL target named %r in the search results" % target_name)

selected_target = matching_ids.iloc[0]
selected_target

'CHEMBL3927'

In [10]:
# Fetch the activity records of the selected target protein, keeping IC50 measurements only.
# IC50 is the concentration of a drug needed to inhibit a biological process by 50%.
activity = new_client.activity

# Narrow down in two steps: first by target, then by measurement type.
target_activities = activity.filter(target_chembl_id=selected_target)
selected_target_activities = target_activities.filter(standard_type="IC50")

# Number of records retrieved
len(selected_target_activities)

133

In [11]:
# Load the retrieved activity records into a table.
df = pd.DataFrame.from_dict(selected_target_activities)

# Keep only the compounds that actually report a standard_value.
has_value = df["standard_value"].notna()
df = df[has_value]

# Sanity check: the only standard_type left should be IC50.
print(df["standard_type"].unique())

df

['IC50']


Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5
3,,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.11
4,,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,,,12041507,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.6
129,,,12041508,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.1
130,,,12041509,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,11.5
131,,,12041510,[],CHEMBL2150313,Inhibition of SARS-CoV PLpro expressed in Esch...,B,,,BAO_0000190,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,10.7


In [12]:
# Save the bioactivity data as a CSV file.
# "index=False" leaves the DataFrame's row index out of the file; the column
# headers are still written as the first line.
# No string conversion is needed beforehand: to_csv serialises the values itself.
df.to_csv('bioactivity_data.csv',index=False)

In [13]:
# Show the standard type and standard value of each record, most potent first.
# The standard value represents the potency of the drug: the lower it is, the less
# drug is needed to elicit 50% inhibition of the target protein, so low values are preferred.

# Convert standard_value to float so that sorting is numeric.
df["standard_value"] = df["standard_value"].astype(float)

summary_columns = [
    "activity_id",
    "assay_chembl_id",
    "standard_type",
    "standard_value",
    "standard_units",
]
df[summary_columns].sort_values(by="standard_value", ascending=True)

Unnamed: 0,activity_id,assay_chembl_id,standard_type,standard_value,standard_units
105,1831024,CHEMBL918059,IC50,50.00,nM
99,1830967,CHEMBL918059,IC50,60.00,nM
100,1830968,CHEMBL918059,IC50,63.00,nM
104,1831023,CHEMBL918059,IC50,65.00,nM
103,1831022,CHEMBL918059,IC50,95.00,nM
...,...,...,...,...,...
48,1720647,CHEMBL871248,IC50,407380.28,nM
47,1720646,CHEMBL871248,IC50,500000.00,nM
46,1720645,CHEMBL871248,IC50,501187.23,nM
44,1720643,CHEMBL871248,IC50,1000000.00,nM


### Pre-process Bioactivity Data    

Create a dataframe with the columns `molecule_chembl_id`, `canonical_smiles`, and `standard_value` from `df`, plus a new column `bioactivity_class` that labels each compound as follows:

Bioactivity is expressed as IC<sub>50</sub> values in nM.
- Active: ≤ 1000 nM
- Inactive: ≥ 10,000 nM
- Intermediate: strictly between 1000 nM and 10,000 nM

In [14]:
def _label_ic50(ic50_nm):
    """Return the bioactivity label for a single IC50 value given in nM.

    Values of at most 1000 are "active", values of at least 10000 are
    "inactive", and everything in between is "intermediate".
    """
    value = float(ic50_nm)
    if value <= 1000:
        return "active"
    if value >= 10000:
        return "inactive"
    return "intermediate"


# One label per row of df, in row order.
bioactivity_class = [_label_ic50(ic50) for ic50 in df["standard_value"]]

# Keep only the columns needed downstream and attach the labels.
df2 = df.loc[:, ["molecule_chembl_id", "canonical_smiles", "standard_value"]].copy()
df2["bioactivity_class"] = bioactivity_class

# Persist the pre-processed table as a CSV file (row index omitted).
df2.to_csv("bioactivity_preprocessed_data.csv", index=False)

df2

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,intermediate
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,intermediate
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,inactive
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0,inactive
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0,intermediate
...,...,...,...,...
128,CHEMBL2146517,COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...,10600.0,inactive
129,CHEMBL187460,C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C,10100.0,inactive
130,CHEMBL363535,Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12,11500.0,inactive
131,CHEMBL227075,Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1,10700.0,inactive
