In [1]:
! pip install chembl_webresource_client



In [2]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

## Search for Target Protein

**Target** refers to target proteins or organism that the drug will act on. Drug will come into contact with the organism or protein and induce a modulatory activity (activate or inhibit).

**Hepatitis C** is a **liver disease** caused by the hepatitis C virus (HCV) that can lead to acute or chronic liver disease. Hepatitis C is important to research because it is a significant global health concern, causing chronic liver disease, cirrhosis, and liver cancer if left untreated, and despite recent breakthroughs in treatment, ongoing research is crucial to develop preventive measures like a vaccine, address treatment challenges in difficult populations, and better understand the virus's mechanisms to potentially eradicate it completely

In [3]:
# Target search for Hepatitis C
target = new_client.target
target_query = target.search('Hepatitis C')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Hepacivirus hominis,Hepatitis C virus,19.0,False,CHEMBL379,[],ORGANISM,3052230.0
1,"[{'xref_id': 'P26664', 'xref_name': None, 'xre...",Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,17.0,False,CHEMBL4620,"[{'accession': 'P26664', 'component_descriptio...",SINGLE PROTEIN,11104.0
2,"[{'xref_id': 'D2K2A8', 'xref_name': None, 'xre...",Hepatitis C virus,Hepatitis C virus NS4A protein,15.0,False,CHEMBL2364,"[{'accession': 'D2K2A8', 'component_descriptio...",SINGLE PROTEIN,3052230.0
3,"[{'xref_id': 'Q15004', 'xref_name': None, 'xre...",Homo sapiens,PCNA-associated factor,14.0,False,CHEMBL5574,"[{'accession': 'Q15004', 'component_descriptio...",SINGLE PROTEIN,9606.0
4,[],Homo sapiens,Hepatitis A virus cellular receptor 2,14.0,False,CHEMBL4630879,"[{'accession': 'Q8TDQ0', 'component_descriptio...",SINGLE PROTEIN,9606.0
...,...,...,...,...,...,...,...,...,...
872,[],Drosophila melanogaster,Nicotinic acetylcholine receptor,0.0,False,CHEMBL3350223,"[{'accession': 'P25162', 'component_descriptio...",PROTEIN COMPLEX GROUP,7227.0
873,[],Escherichia coli,1-deoxy-D-xylulose 5-phosphate reductoisomerase,0.0,False,CHEMBL3421521,"[{'accession': 'W8T2T2', 'component_descriptio...",SINGLE PROTEIN,562.0
874,[],Homo sapiens,UDP-glucuronosyltransferases (UGTs),0.0,False,CHEMBL4523985,"[{'accession': 'P22310', 'component_descriptio...",PROTEIN FAMILY,9606.0
875,[],Homo sapiens,Cytochrome P450,0.0,False,CHEMBL4523986,"[{'accession': 'P08684', 'component_descriptio...",PROTEIN FAMILY,9606.0


In [4]:
# selecting and retrieving bioactivity data for Hepatitis C virus polyprotein
selected_target = targets.target_chembl_id[1]
selected_target

'CHEMBL4620'

## Creating the CSV

In [5]:
# only retrieve bioactivity dta for Hepatitis C virus polyprotein (CHEMBL4620) that are reported as IC50
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [6]:
df = pd.DataFrame.from_dict(res)

In [7]:
df.head(5)

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,33392,[],CHEMBL857692,Inhibitory concentration against HCV NS3 prote...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,uM,UO_0000065,,42.0
1,,,33393,[],CHEMBL857692,Inhibitory concentration against HCV NS3 prote...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,uM,UO_0000065,,4.3
2,,,35906,[],CHEMBL857692,Inhibitory concentration against HCV NS3 prote...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,uM,UO_0000065,,12.0
3,,,40613,[],CHEMBL857692,Inhibitory concentration against HCV NS3 prote...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,uM,UO_0000065,,3.6
4,,,40614,[],CHEMBL857692,Inhibitory concentration against HCV NS3 prote...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,uM,UO_0000065,,60.0


`standard value` represents the potency, the lower the number the better the potency of the drug becomes. Ideally we want the inhibitory concentration at 50% to have a low concentration.

In [8]:
df['standard_value']

Unnamed: 0,standard_value
0,42000.0
1,4300.0
2,12000.0
3,3600.0
4,60000.0
...,...
238,950.0
239,610.0
240,202.0
241,27000.0


In [9]:
df.to_csv('hepatitis_c_bioactivity_data.csv', index=False)

In [10]:
# mount google drive into notebook
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [11]:
# create data folder in Colab Notebook fold on google drive
! mkdir "/content/gdrive/My Drive/Colab Notebooks/Computational Drug Discovery: Hepatitis C/hepatitis_c_bioactivity_data"

mkdir: cannot create directory ‘/content/gdrive/My Drive/Colab Notebooks/Computational Drug Discovery: Hepatitis C/hepatitis_c_bioactivity_data’: File exists


In [12]:
! cp hepatitis_c_bioactivity_data.csv "/content/gdrive/My Drive/Colab Notebooks/Computational Drug Discovery: Hepatitis C/hepatitis_c_bioactivity_data"

In [13]:
! ls "/content/gdrive/My Drive/Colab Notebooks/Computational Drug Discovery: Hepatitis C/hepatitis_c_bioactivity_data"

hepatitis_c_bioactivity_data.csv


In [14]:
! head hepatitis_c_bioactivity_data.csv

action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,data_validity_comment,data_validity_description,document_chembl_id,document_journal,document_year,ligand_efficiency,molecule_chembl_id,molecule_pref_name,parent_molecule_chembl_id,pchembl_value,potential_duplicate,qudt_units,record_id,relation,src_id,standard_flag,standard_relation,standard_text_value,standard_type,standard_units,standard_upper_value,standard_value,target_chembl_id,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
,,33392,[],CHEMBL857692,Inhibitory concentration against HCV NS3 protease was determined,B,,,BAO_0000190,BAO_0000357,single protein format,CC[C@H](NC(=O)[C@H](CC1CCCCC1)NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(=O)c1cnccn1)[C@@H](C)CC)C(=O)C(=O)N1CCCC1,,,CHEMBL1136650,Bioorg Med Chem Lett,2003,"{'bei': '6.53', 'le':

In [15]:
df = df[df.standard_value.notna()]
df

Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,,33392,[],CHEMBL857692,Inhibitory concentration against HCV NS3 prote...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,uM,UO_0000065,,42.0
1,,,33393,[],CHEMBL857692,Inhibitory concentration against HCV NS3 prote...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,uM,UO_0000065,,4.3
2,,,35906,[],CHEMBL857692,Inhibitory concentration against HCV NS3 prote...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,uM,UO_0000065,,12.0
3,,,40613,[],CHEMBL857692,Inhibitory concentration against HCV NS3 prote...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,uM,UO_0000065,,3.6
4,,,40614,[],CHEMBL857692,Inhibitory concentration against HCV NS3 prote...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,uM,UO_0000065,,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,,213626,17603004,[],CHEMBL3705251,"Inhibition Assay: Briefly, 2-10 nM of purified...",B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,nM,UO_0000065,,950.0
239,,213627,17603005,[],CHEMBL3705251,"Inhibition Assay: Briefly, 2-10 nM of purified...",B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,nM,UO_0000065,,610.0
240,,214271,17603399,[],CHEMBL3705270,Inhibition Assay: Inhibition activity of HCV N...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,nM,UO_0000065,,202.0
241,,273482,17655802,[],CHEMBL3705926,Polymerase Assay: Assay Protocol: Either wild ...,B,,,BAO_0000190,...,Hepatitis C virus genotype 1a (isolate 1) (HCV),Hepatitis C virus polyprotein,11104,,,IC50,nM,UO_0000065,,27000.0


## Data pre-processing (bioactivity data)

The bioactivity data is in the I50 unit. Compounds having values of less than 1000 nM will be considered to be active while those greater than 10,000 nM will be considered to be inactive. Values between 1,000 and 10,000 nM will be referred to as interemediate.

In [16]:
# labeling compounds as either active, inactive, or intermediate
bioactivity_class = []
for i in df.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

In [17]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df = df[selection]

In [18]:
# pd.concat([df, pd.Series(bioactivity_class)], axis=1)
df2 = pd.DataFrame(data=bioactivity_class, columns=['bioactivity_class'])
df2

Unnamed: 0,bioactivity_class
0,inactive
1,intermediate
2,inactive
3,intermediate
4,inactive
...,...
237,active
238,active
239,active
240,inactive


In [19]:
df = df.merge(df2, left_index=True, right_index=True)
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL273701,CC[C@H](NC(=O)[C@H](CC1CCCCC1)NC(=O)[C@@H](NC(...,42000.0,inactive
1,CHEMBL276488,CC[C@H](NC(=O)[C@H](CC1CCCCC1)NC(=O)[C@@H](NC(...,4300.0,intermediate
2,CHEMBL13773,CC[C@H](NC(=O)[C@H](CC1CCCCC1)NC(=O)[C@@H](NC(...,12000.0,inactive
3,CHEMBL13442,CC[C@H](NC(=O)[C@H](CC1CCCCC1)NC(=O)[C@@H](NC(...,3600.0,intermediate
4,CHEMBL266854,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)c1cnc...,60000.0,inactive
...,...,...,...,...
237,CHEMBL3644938,C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1CN(c2cc(OC)nc(N3...,1150.0,active
238,CHEMBL3644939,C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1CN(c2cc(Cl)nc(N3...,950.0,active
239,CHEMBL3644940,C=C[C@@H]1C[C@]1(NC(=O)[C@@H]1CN(c2cc(Nc3ccccc...,610.0,active
240,CHEMBL3659510,CCC[C@H](NC(=O)[C@@H]1[C@@H]2[C@H](CN1C(=O)[C@...,202.0,inactive


In [20]:
df.to_csv('hepatitis_c_bioactivity_data_preprocessed.csv', index=False)

In [21]:
! cp hepatitis_c_bioactivity_data_preprocessed.csv "/content/gdrive/My Drive/Colab Notebooks/Computational Drug Discovery: Hepatitis C/hepatitis_c_bioactivity_data_preprocessed.csv"