In [18]:
import pandas as pd
import numpy as np 
from chembl_webresource_client.new_client import new_client
from sklearn.utils.class_weight import compute_class_weight
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski



In [2]:
target = new_client.target
target_query = target.search('dengue fever')
targets = pd.DataFrame.from_dict(target_query)


In [3]:
selected_target = targets.target_chembl_id[12]


In [4]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)

In [5]:
df.shape[0]

1284

In [6]:
df2 = df[df.standard_value.notna()]


### **Labeling compounds as either being active, inactive or intermediate**
The bioactivity data is in the IC50 unit. Compounds having values of less than 1000 nM will be considered to be **active** while those greater than 10,000 nM will be considered to be **inactive**. As for those values in between 1,000 and 10,000 nM will be referred to as **intermediate**.

In [7]:
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")

In [8]:
counter_=0
for values in bioactivity_class:
    if values=='active':
        counter_+=1


In [9]:
mol_cid = []
for i in df2.molecule_chembl_id:
  mol_cid.append(i)
canonical_smiles = []
for i in df2.canonical_smiles:
  canonical_smiles.append(i)
standard_value = []
for i in df2.standard_value:
  standard_value.append(i)
data_tuples = list(zip(mol_cid, canonical_smiles, bioactivity_class, standard_value))
df3 = pd.DataFrame( data_tuples,  columns=['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'standard_value'])

selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2[selection]


In [10]:
df3
for index,row in df3.iterrows():
  if float(row['standard_value']) >= 10000:
    df3.loc[index,'Compound Activity']='Inactive'
    # bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    df3.loc[index,'Compound Activity']='Intermediate'
  else:
    df3.loc[index,'Compound Activity']='Active'
        
df3
    # df3.loc[index,'Compound Activity']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.loc[index,'Compound Activity']='Inactive'


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,Compound Activity
0,CHEMBL1401841,COc1ccc2nc3cccc(OC)c3nc2c1,100000.0,Inactive
1,CHEMBL1608853,O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O,12310.0,Inactive
2,CHEMBL1429799,O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CC...,100000.0,Inactive
3,CHEMBL246446,O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1,50970.0,Inactive
4,CHEMBL1383455,CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(C...,100000.0,Inactive
...,...,...,...,...
1279,CHEMBL5190612,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccc(C(F)(F)F...,25200.0,Inactive
1280,CHEMBL5176952,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4cccc(C(F)(F)...,23900.0,Inactive
1281,CHEMBL5184942,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccccc4C(F)(F...,24000.0,Inactive
1282,CHEMBL4526128,O=C(N[C@@H](Cc1ccc(O)cc1)C(=O)O)c1cc(-c2ccccc2...,9610.0,Intermediate


In [11]:
def lipinski(smiles, verbose=False):

    moldata= []
    for elem in smiles:
        mol=Chem.MolFromSmiles(elem) 
        moldata.append(mol)
       
    baseData= np.arange(1,1)
    # i=0  
    for i,mol in enumerate(moldata):        
        try:
            desc_MolWt = Descriptors.MolWt(mol)
            desc_MolLogP = Descriptors.MolLogP(mol)
            desc_NumHDonors = Lipinski.NumHDonors(mol)
            desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)
            print(smiles[i])
            row = np.array([smiles[i],desc_MolWt,
                            desc_MolLogP,
                            desc_NumHDonors,
                            desc_NumHAcceptors])   
        
            if(i==0):
                baseData=row
            else:
                baseData=np.vstack([baseData, row])
        except:
            print(i)
            pass
            # i=i+1      
    
    columnNames=["canonical_smiles","MW","LogP","NumHDonors","NumHAcceptors"]   
    descriptors = pd.DataFrame(data=baseData,columns=columnNames)
    
    return descriptors
     


In [19]:
df_lipinski = lipinski(df3.canonical_smiles)
df_lipinski


COc1ccc2nc3cccc(OC)c3nc2c1
O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O
O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CCOCC1
O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1
CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(Cl)c1=O
C[n+]1c2ccccc2nc2ccc(NCCO)cc21.[O-][Cl+3]([O-])([O-])[O-]
C=CCOc1c(Br)cc(/C=C2/C(=O)ON=C2C)cc1OC
Nc1cccc2c(O)cc(S(=O)(=O)O)cc12
O=C(Nc1ccccc1C(=O)O)c1cccc(C(=O)Nc2ccccc2C(=O)O)c1
CCOc1cc(/C=N/NC2=NC(=O)C(CC(=O)O)S2)ccc1OCc1ccc(Cl)cc1
O=c1c(O)cccc2cc(O)c(O)c(O)c12
COC(=O)c1ccc(N2NC(=O)/C(=C/c3cc(OC)c(OC)c(OC)c3)C2=O)cc1
O=C(O)c1ccc(Cl)c(-c2ccc(/C=N/NC(=O)C(O)(c3ccccc3)c3ccccc3)o2)c1
O=C(CN1C(=O)/C(=C2/SC(=S)N(CCS(=O)(=O)O)C2=O)c2ccccc21)Nc1cccc(C(F)(F)F)c1
O=C(O)c1ccc2c(c1Nc1ccccc1)C(=O)c1ccccc1C2=O
COc1cc(C)c2c(c1C=O)Oc1c(c(C)c(O)c3c1C(O)OC3=O)OC2=O
CC(=C\c1ccccc1)/C=C1/SC(=S)N(C(C(=O)O)C(C)C)C1=O
c1coc(-c2nc3ccccc3nc2-c2ccco2)c1
Cc1cc(=O)c2c(o1)C(=O)c1occc1C2=O
CC(C)OP(=O)(OC(C)C)C1(NS(=O)(=O)c2ccc(Cl)cc2)C=C(Cl)C(=O)C(Cl)=C1
O=C(O)c1ccc(NC2=C/C(=N/S(=O)(=O)c3cccs3)c3ccccc3C2

Unnamed: 0,canonical_smiles,MW,LogP,NumHDonors,NumHAcceptors
0,COc1ccc2nc3cccc(OC)c3nc2c1,240.262,2.800200000000001,0,4
1,O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O,268.224,2.4404000000000003,1,4
2,O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CC...,407.42600000000033,1.3856,1,5
3,O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1,306.27700000000004,3.8480000000000016,1,5
4,CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(C...,372.7720000000001,2.36698,0,8
...,...,...,...,...,...
1187,O=C(Nc1nc2cc(O)c(O)cc2s1)c1ccccc1Sc1ccc(C(F)(F...,505.5430000000002,6.156920000000004,1,6
1188,O=C(Nc1nc2cc(O)c(O)cc2s1)c1ccccc1Oc1ccc2cc(O)c...,505.5430000000002,6.156920000000004,1,6
1189,O=C(Nc1cccs1)c1ccccc1Sc1ccc([N+](=O)[O-])cc1,505.5430000000002,6.156920000000004,1,6
1190,O=C(Nc1nc2ccc([N+](=O)[O-])cc2s1)c1ccccc1Sc1cc...,582.678,6.0240000000000045,3,7


In [20]:
df3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,Compound Activity
0,CHEMBL1401841,COc1ccc2nc3cccc(OC)c3nc2c1,100000.0,Inactive
1,CHEMBL1608853,O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O,12310.0,Inactive
2,CHEMBL1429799,O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CC...,100000.0,Inactive
3,CHEMBL246446,O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1,50970.0,Inactive
4,CHEMBL1383455,CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(C...,100000.0,Inactive
...,...,...,...,...
1279,CHEMBL5190612,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccc(C(F)(F)F...,25200.0,Inactive
1280,CHEMBL5176952,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4cccc(C(F)(F)...,23900.0,Inactive
1281,CHEMBL5184942,Cc1ccc(S(=O)(=O)Nc2ccc(-c3nnc(SCc4ccccc4C(F)(F...,24000.0,Inactive
1282,CHEMBL4526128,O=C(N[C@@H](Cc1ccc(O)cc1)C(=O)O)c1cc(-c2ccccc2...,9610.0,Intermediate


In [21]:
df_combined = pd.merge(df3, df_lipinski, on='canonical_smiles', how='inner')


In [22]:
for index,row in df_combined.iterrows():
    df_combined.loc[index,'pIC50']=-np.log10(float(row['standard_value']))


In [23]:
df_combined

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,Compound Activity,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL1401841,COc1ccc2nc3cccc(OC)c3nc2c1,100000.0,Inactive,240.262,2.800200000000001,0,4,-5.000000
1,CHEMBL1608853,O=C(O)c1ccc2c(c1)C(=O)/C(=C\c1ccco1)C2=O,12310.0,Inactive,268.224,2.4404000000000003,1,4,-4.090258
2,CHEMBL1429799,O=C1NN(c2ccccc2)C(=O)/C1=C\c1ccccc1OCC(=O)N1CC...,100000.0,Inactive,407.42600000000033,1.3856,1,5,-5.000000
3,CHEMBL246446,O=C(O)c1ccc2nc(-c3ccco3)c(-c3ccco3)nc2c1,50970.0,Inactive,306.27700000000004,3.8480000000000016,1,5,-4.707315
4,CHEMBL1383455,CCn1nc([N+](=O)[O-])c(C(C#N)c2nc3ccccc3n2C)c(C...,100000.0,Inactive,372.7720000000001,2.36698,0,8,-5.000000
...,...,...,...,...,...,...,...,...,...
1297,CHEMBL4440832,O=C(Nc1nc2cc(O)c(O)cc2s1)c1ccccc1Oc1ccc2cc(O)c...,990.0,Intermediate,505.5430000000002,6.156920000000004,1,6,-2.995635
1298,CHEMBL5192980,O=C(Nc1nc2ccc([N+](=O)[O-])cc2s1)c1ccccc1Sc1cc...,5400.0,Intermediate,582.678,6.0240000000000045,3,7,-3.732394
1299,CHEMBL5176286,O=C(Nc1nc2ccc([N+](=O)[O-])cc2s1)c1ccccc1Sc1cc...,21000.0,Inactive,712.8140000000002,4.005600000000007,6,7,-4.322219
1300,CHEMBL4450411,O=C(O)c1cc2sc(NC(=O)c3ccccc3Sc3ccc(C(F)(F)F)cc...,12300.0,Inactive,428.4690000000001,4.441200000000003,1,4,-4.089905


In [24]:
float(row['standard_value'])

12300.0

In [25]:
import os
os.chdir(r'C:\Users\nedaf\OneDrive\Desktop\Kaveh_Chem_Informatics')

In [26]:
selection = ['canonical_smiles','molecule_chembl_id']
df3_selection = df_combined[selection]
df3_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)


# to run the following code copy and paste whatever after ! in the command line

In [28]:
! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip
! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh


'wget' is not recognized as an internal or external command,
operable program or batch file.


'wget' is not recognized as an internal or external command,
operable program or batch file.


In [30]:
unzip padel.zip

SyntaxError: invalid syntax (2666330793.py, line 1)