# Project Objective
Use bioinformatics tools to identify potential drugs for the treatment of anthrax.

# Acquire

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# pip install chembl_webresource_client
from chembl_webresource_client.new_client import new_client

In [2]:
# Target search for coronavirus on ChEMBL
target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame.from_dict(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,[],Coronavirus,Coronavirus,17.0,False,CHEMBL613732,[],ORGANISM,11119
1,[],SARS coronavirus,SARS coronavirus,15.0,False,CHEMBL612575,[],ORGANISM,227859
2,[],Feline coronavirus,Feline coronavirus,15.0,False,CHEMBL612744,[],ORGANISM,12663
3,[],Human coronavirus 229E,Human coronavirus 229E,13.0,False,CHEMBL613837,[],ORGANISM,11137
4,"[{'xref_id': 'P0C6U8', 'xref_name': None, 'xre...",SARS coronavirus,SARS coronavirus 3C-like proteinase,10.0,False,CHEMBL3927,"[{'accession': 'P0C6U8', 'component_descriptio...",SINGLE PROTEIN,227859
5,[],Middle East respiratory syndrome-related coron...,Middle East respiratory syndrome-related coron...,9.0,False,CHEMBL4296578,[],ORGANISM,1335626
6,"[{'xref_id': 'P0C6X7', 'xref_name': None, 'xre...",SARS coronavirus,Replicase polyprotein 1ab,4.0,False,CHEMBL5118,"[{'accession': 'P0C6X7', 'component_descriptio...",SINGLE PROTEIN,227859
7,[],Severe acute respiratory syndrome coronavirus 2,Replicase polyprotein 1ab,4.0,False,CHEMBL4523582,"[{'accession': 'P0DTD1', 'component_descriptio...",SINGLE PROTEIN,2697049


In [3]:
# Focusing on 'Replicase polyprotein 1ab' on index row 6
selected_target = targets.target_chembl_id[4]

# Outputs the unique id of the target from ChEMBL
selected_target

'CHEMBL3927'

In [4]:
# Get bioactivity data
activity = new_client.activity

# The standard_type='IC50' filters for bioactivity tests using the IC50 standard of measuring
res = activity.filter(target_chembl_id = selected_target).filter(standard_type='IC50')

In [5]:
df = pd.DataFrame.from_dict(res)
df.head()

# standard_value column represents potency
# a smaller number means a smaller dose is needed to exhibit and effect
# lower value means more potent, higher value means less potent

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1480935,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,7.2
1,,1480936,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,9.4
2,,1481061,[],CHEMBL830868,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.5
3,,1481065,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,13.11
4,,1481066,[],CHEMBL829584,In vitro inhibitory concentration against SARS...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,uM,UO_0000065,,2.0


In [6]:
df.to_csv('bioactivity_data.csv', index=False)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 45 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   activity_comment           0 non-null      object
 1   activity_id                133 non-null    int64 
 2   activity_properties        133 non-null    object
 3   assay_chembl_id            133 non-null    object
 4   assay_description          133 non-null    object
 5   assay_type                 133 non-null    object
 6   assay_variant_accession    0 non-null      object
 7   assay_variant_mutation     0 non-null      object
 8   bao_endpoint               133 non-null    object
 9   bao_format                 133 non-null    object
 10  bao_label                  133 non-null    object
 11  canonical_smiles           133 non-null    object
 12  data_validity_comment      42 non-null     object
 13  data_validity_description  42 non-null     object
 14  document_c

In [8]:
# look for null values
#df[df.value.notna()]
df.value.isna().sum()

0

# Prepare

In [30]:
# Divide the compounds into classes of potency
bioactivity_class = []
for i in df.value:
    if float(i) >= 10000:
        bioactivity_class.append('inactive')
    elif float(i) <= 1000:
        bioactivity_class.append('active')
    else:
        bioactivity_class.append('intermediate')

bioactivity_class

['active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',
 'active',

In [10]:
# this is a list of the moleculues tested against our target
df.molecule_chembl_id

0       CHEMBL187579
1       CHEMBL188487
2       CHEMBL185698
3       CHEMBL426082
4       CHEMBL187717
           ...      
128    CHEMBL2146517
129     CHEMBL187460
130     CHEMBL363535
131     CHEMBL227075
132      CHEMBL45830
Name: molecule_chembl_id, Length: 133, dtype: object

In [11]:
# bioactivity_class has the same length as df.molecule_chembl_id
len(bioactivity_class)

133

In [12]:
# Iterate the molecule_chembl_id to a list
#mol_cid = []
#for i in df.molecule_chembl_id:
#    mol_cid.append(i)

# Iterate canonical_smiles to a list
#canonical_smiles = []
#for i in df.canonical_smiles:
#    canonical_smiles.append(i)

# Iterate standard_value to a list
#standard_value = []
#for i in df.standard_value:
#    standard_value.append(i)

# Alternative method to create the lists
#mol_cid = df.molecule_chembl_id.to_list()
#canonical_smiles = df.canonical_smiles.to_list()
#standard_value = df.standard_value.to_list()

In [32]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
chem_df = df[selection]

# this method gives a warning (not an error)
# chem_df['bioactivity_class'] = bioactivity_class

# alternative method to add bioactivity_class column to dataframe
chem_df = pd.concat([chem_df, pd.Series(bioactivity_class)], axis=1 )
chem_df = chem_df.rename(columns={0:'bioactivity_class'})

chem_df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,active
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,active
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,active
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0,active
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0,active
...,...,...,...,...
128,CHEMBL2146517,COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...,10600.0,active
129,CHEMBL187460,C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C,10100.0,active
130,CHEMBL363535,Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12,11500.0,active
131,CHEMBL227075,Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1,10700.0,active


In [33]:
# Every single compound is an active agent
chem_df.bioactivity_class.value_counts()

active    133
Name: bioactivity_class, dtype: int64

In [35]:
# saved file of bioactivity dataframe
chem_df.to_csv('bioactivity_preprocessed_data.csv', index=False)