In [1]:
import rdkit

In [3]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [5]:
# Load the bioactivity table; later cells read its `canonical_smiles` column
# and (via the combined frame) its `standard_value` column.
df = pd.read_csv('bioactivity_data.csv')


In [6]:
def lipinski(smiles, verbose=False):
    """Compute Lipinski rule-of-five descriptors for a collection of SMILES.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule (for example a DataFrame column).
    verbose : bool, default False
        If True, print every entry that could not be turned into a molecule.

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, with a default integer index
        and the float columns ``MW``, ``LogP``, ``NumHDonors`` and
        ``NumHAcceptors``. Entries that are not strings, or that RDKit cannot
        parse, produce a row of NaN so the result stays row-aligned with the
        input; a single warning reports how many such rows there are.
        Empty input returns an empty frame with the four columns.
    """
    import warnings

    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]

    rows = []
    failed = 0
    for position, elem in enumerate(smiles):
        # Missing values read from CSV arrive as float NaN, not str, and
        # MolFromSmiles returns None (rather than raising) for bad SMILES.
        mol = Chem.MolFromSmiles(elem) if isinstance(elem, str) else None

        if mol is None:
            failed += 1
            if verbose:
                print(f"lipinski: could not parse entry {position}: {elem!r}")
            # Keep a placeholder row so positions still match the input.
            rows.append([np.nan] * len(columnNames))
            continue

        rows.append([
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Lipinski.NumHDonors(mol),
            Lipinski.NumHAcceptors(mol),
        ])

    if failed:
        warnings.warn(
            f"lipinski: {failed} of {len(rows)} entries could not be parsed; "
            "their descriptor rows are NaN."
        )

    # Build the frame once from the collected rows (no repeated vstack), and
    # force float so the dtypes match the original all-float array result.
    return pd.DataFrame(rows, columns=columnNames, dtype=float)

In [7]:
# Compute the four descriptor columns (MW, LogP, NumHDonors, NumHAcceptors)
# for every SMILES string in the table.
df_lipinski = lipinski(df.canonical_smiles)


In [8]:
# Place the descriptor columns next to the bioactivity columns.
# axis=1 aligns rows on the index, so both frames must share the same index.
df_combined = pd.concat([df,df_lipinski], axis=1)


In [9]:
# Inspect the merged frame; the descriptor columns appear after the
# original bioactivity columns.
df_combined

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,toid,type,units,uo_units,upper_value,value,MW,LogP,NumHDonors,NumHAcceptors
0,,530035,[],CHEMBL661155,Inhibition of rat ovarian Cytochrome P450 19A,B,,,BAO_0000190,BAO_0000357,...,,IC50,uM,UO_0000065,,34.0,232.283,1.3532,2.0,3.0
1,,539290,[],CHEMBL649360,In vitro inhibition of rat ovarian aromatase c...,B,,,BAO_0000190,BAO_0000357,...,,IC50,uM,UO_0000065,,2.0,253.301,2.775,1.0,3.0
2,,553204,[],CHEMBL649360,In vitro inhibition of rat ovarian aromatase c...,B,,,BAO_0000190,BAO_0000357,...,,IC50,uM,UO_0000065,,8.0,267.328,3.078,0.0,3.0
3,,615833,[],CHEMBL664329,Binding affinity for Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,,IC50,nM,UO_0000065,,3.0,324.775,2.8509,0.0,6.0
4,,789248,[],CHEMBL664334,In vitro inhibition of rat ovarian microsomal ...,B,,,BAO_0000190,BAO_0000019,...,,IC50,uM,UO_0000065,,6.2,232.283,1.3532,2.0,3.0
5,,819891,[],CHEMBL664334,In vitro inhibition of rat ovarian microsomal ...,B,,,BAO_0000190,BAO_0000019,...,,IC50,uM,UO_0000065,,0.265,251.329,3.5336,0.0,2.0
6,,15742388,[],CHEMBL3624597,Inhibition of rat ovarian aromatase,B,,,BAO_0000190,BAO_0000357,...,,IC50,uM,UO_0000065,,4.6,235.286,3.2941,0.0,2.0
7,,15742389,[],CHEMBL3624597,Inhibition of rat ovarian aromatase,B,,,BAO_0000190,BAO_0000357,...,,IC50,uM,UO_0000065,,0.17,224.263,2.6222,1.0,2.0
8,,15742390,[],CHEMBL3624597,Inhibition of rat ovarian aromatase,B,,,BAO_0000190,BAO_0000357,...,,IC50,uM,UO_0000065,,0.26,224.263,2.6222,1.0,2.0
9,,16433981,[],CHEMBL3755852,Inhibition of aromatase in denucleated ovarian...,B,,,BAO_0000190,BAO_0000357,...,,IC50,uM,UO_0000065,,0.28,316.441,4.0599,0.0,3.0


In [10]:
def pIC50(input):
    """Convert capped IC50 values (nM) to pIC50 = -log10(IC50 in M).

    Parameters
    ----------
    input : pandas.DataFrame
        Frame containing a numeric ``standard_value_norm`` column in nM.

    Returns
    -------
    pandas.DataFrame
        A new frame with a ``pIC50`` column added and ``standard_value_norm``
        removed. The frame passed in is left unmodified, so re-running the
        calling cell gives the same result.
    """
    molar = input['standard_value_norm'] * (10 ** -9)  # Converts nM to M

    return (
        input
        .assign(pIC50=-np.log10(molar))
        # Keyword form: the positional axis argument (`drop(label, 1)`)
        # is rejected by pandas 2.x.
        .drop(columns='standard_value_norm')
    )

Point to note: values greater than 100,000,000 nM are capped at 100,000,000 nM. Without the cap, very large values (above 10^9 nM, i.e. 1 M) would give a negative pIC50 after the negative-logarithm transform.


# Summary statistics of the raw standard_value column (useful for spotting
# extreme values before they are capped).
df_combined.standard_value.describe()


In [14]:
def norm_value(input, max_value=100000000):
    """Cap ``standard_value`` at ``max_value`` and store it as ``standard_value_norm``.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame containing a numeric ``standard_value`` column.
    max_value : number, default 100000000
        Upper bound; larger values are replaced by this bound. Values at or
        below the bound (and missing values) pass through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``standard_value_norm`` added and ``standard_value``
        removed. The frame passed in is left unmodified, so re-running the
        calling cell gives the same result.
    """
    capped = input['standard_value'].clip(upper=max_value)

    return (
        input
        .assign(standard_value_norm=capped)
        # Keyword form: the positional axis argument (`drop(label, 1)`)
        # is rejected by pandas 2.x.
        .drop(columns='standard_value')
    )

In [15]:
# Cap standard_value; the returned frame carries the capped values as
# standard_value_norm in place of the standard_value column.
df_norm = norm_value(df_combined)
df_norm

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,type,units,uo_units,upper_value,value,MW,LogP,NumHDonors,NumHAcceptors,standard_value_norm
0,,530035,[],CHEMBL661155,Inhibition of rat ovarian Cytochrome P450 19A,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,34.0,232.283,1.3532,2.0,3.0,34000.0
1,,539290,[],CHEMBL649360,In vitro inhibition of rat ovarian aromatase c...,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,2.0,253.301,2.775,1.0,3.0,2000.0
2,,553204,[],CHEMBL649360,In vitro inhibition of rat ovarian aromatase c...,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,8.0,267.328,3.078,0.0,3.0,8000.0
3,,615833,[],CHEMBL664329,Binding affinity for Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,IC50,nM,UO_0000065,,3.0,324.775,2.8509,0.0,6.0,3.0
4,,789248,[],CHEMBL664334,In vitro inhibition of rat ovarian microsomal ...,B,,,BAO_0000190,BAO_0000019,...,IC50,uM,UO_0000065,,6.2,232.283,1.3532,2.0,3.0,6200.0
5,,819891,[],CHEMBL664334,In vitro inhibition of rat ovarian microsomal ...,B,,,BAO_0000190,BAO_0000019,...,IC50,uM,UO_0000065,,0.265,251.329,3.5336,0.0,2.0,265.0
6,,15742388,[],CHEMBL3624597,Inhibition of rat ovarian aromatase,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,4.6,235.286,3.2941,0.0,2.0,4600.0
7,,15742389,[],CHEMBL3624597,Inhibition of rat ovarian aromatase,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,0.17,224.263,2.6222,1.0,2.0,170.0
8,,15742390,[],CHEMBL3624597,Inhibition of rat ovarian aromatase,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,0.26,224.263,2.6222,1.0,2.0,260.0
9,,16433981,[],CHEMBL3755852,Inhibition of aromatase in denucleated ovarian...,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,0.28,316.441,4.0599,0.0,3.0,280.0


In [16]:
# Convert the capped nM values to pIC50; the returned frame has a pIC50
# column in place of standard_value_norm.
df_final = pIC50(df_norm)
df_final


Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,type,units,uo_units,upper_value,value,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,,530035,[],CHEMBL661155,Inhibition of rat ovarian Cytochrome P450 19A,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,34.0,232.283,1.3532,2.0,3.0,4.468521
1,,539290,[],CHEMBL649360,In vitro inhibition of rat ovarian aromatase c...,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,2.0,253.301,2.775,1.0,3.0,5.69897
2,,553204,[],CHEMBL649360,In vitro inhibition of rat ovarian aromatase c...,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,8.0,267.328,3.078,0.0,3.0,5.09691
3,,615833,[],CHEMBL664329,Binding affinity for Cytochrome P450 19A1,B,,,BAO_0000190,BAO_0000357,...,IC50,nM,UO_0000065,,3.0,324.775,2.8509,0.0,6.0,8.522879
4,,789248,[],CHEMBL664334,In vitro inhibition of rat ovarian microsomal ...,B,,,BAO_0000190,BAO_0000019,...,IC50,uM,UO_0000065,,6.2,232.283,1.3532,2.0,3.0,5.207608
5,,819891,[],CHEMBL664334,In vitro inhibition of rat ovarian microsomal ...,B,,,BAO_0000190,BAO_0000019,...,IC50,uM,UO_0000065,,0.265,251.329,3.5336,0.0,2.0,6.576754
6,,15742388,[],CHEMBL3624597,Inhibition of rat ovarian aromatase,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,4.6,235.286,3.2941,0.0,2.0,5.337242
7,,15742389,[],CHEMBL3624597,Inhibition of rat ovarian aromatase,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,0.17,224.263,2.6222,1.0,2.0,6.769551
8,,15742390,[],CHEMBL3624597,Inhibition of rat ovarian aromatase,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,0.26,224.263,2.6222,1.0,2.0,6.585027
9,,16433981,[],CHEMBL3755852,Inhibition of aromatase in denucleated ovarian...,B,,,BAO_0000190,BAO_0000357,...,IC50,uM,UO_0000065,,0.28,316.441,4.0599,0.0,3.0,6.552842


In [17]:

# Keep only rows whose bioactivity class is not 'intermediate'.
# NOTE(review): in the recorded run this cell raised AttributeError because
# df_final has no 'bioactivity_class' column. That column must be created or
# loaded in an earlier cell before this filter can work — TODO confirm where
# the class labels are supposed to come from.
df_2class = df_final[df_final.bioactivity_class != 'intermediate']
df_2class

AttributeError: 'DataFrame' object has no attribute 'bioactivity_class'