<a href="https://colab.research.google.com/github/lareadeola/Bioinformatics/blob/main/Drug_Discovery_Part_II_Exploratory_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install conda and rdkit

In [14]:
# Disabled legacy setup, kept for reference only: every line below is commented
# out, so running this cell does nothing. It downloaded Miniconda (Python 3.7),
# installed it under /usr/local/, installed rdkit from the rdkit conda channel,
# and added the conda site-packages directory to sys.path.
# !wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
# !chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
# !bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local/
# !conda install -c rdkit rdkit -y
# import sys
# sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [2]:
import pandas as pd

In [4]:
# Load the preprocessed bioactivity table. The path is relative, so the CSV
# must be present in the notebook's working directory.
# NOTE(review): judging by the displayed output further down, the columns are
# molecule_chembl_id, canonical_smiles, standard_value and bioactivity_class,
# and some rows have missing values -- confirm against the file.
df = pd.read_csv('bioactivity_preprocessed_data.csv')

Calculating Lipinski descriptors

In [5]:
# Christopher Lipinski, a scientist at Pfizer, came up with a set of rules of thumb
# for evaluating the druglikeness of compounds. Druglikeness is based on ADME
# (Absorption, Distribution, Metabolism, and Excretion), also known as the
# pharmacokinetic profile. Lipinski analysed all orally active FDA-approved drugs when
# formulating what came to be known as the Rule-of-Five or Lipinski's Rule. The rules are:

# 1. Molecular weight < 500 Dalton
# 2. Octanol-water partition coefficient (LogP) <= 5
# 3. Hydrogen bond donors <= 5
# 4. Hydrogen bond acceptors <= 10

In [6]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.5 MB)
[K     |████████████████████████████████| 29.5 MB 2.9 MB/s 
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.2


In [7]:
import numpy as np
# Downgrade numpy to a version compatible with rdkit's ABI
!pip install numpy==1.18.5
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

Collecting numpy==1.18.5
  Downloading numpy-1.18.5-cp37-cp37m-manylinux1_x86_64.whl (20.1 MB)
[K     |████████████████████████████████| 20.1 MB 1.9 MB/s 
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.5
    Uninstalling numpy-1.21.5:
      Successfully uninstalled numpy-1.21.5
Successfully installed numpy-1.18.5


RuntimeError: module compiled against ABI version 0x1000009 but this version of numpy is 0x2000000

RuntimeError: module compiled against ABI version 0x1000009 but this version of numpy is 0x2000000

RuntimeError: module compiled against ABI version 0x1000009 but this version of numpy is 0x2000000

RuntimeError: module compiled against ABI version 0x1000009 but this version of numpy is 0x2000000

RuntimeError: module compiled against ABI version 0x1000009 but this version of numpy is 0x2000000

RuntimeError: module compiled against ABI version 0x1000009 but this version of numpy is 0x2000000

Calculate Lipinski Descriptors

In [8]:
def lipinski(smiles, verbose=False):
  """Compute the four Lipinski rule-of-five descriptors for each SMILES string.

  Parameters
  ----------
  smiles : iterable of str
      SMILES strings, one per compound. When a pandas Series is passed, its
      index is reused for the result so the descriptors stay aligned with the
      rows they were computed from (for example after ``dropna()``) when the
      result is later combined with ``pd.concat(..., axis=1)``.
  verbose : bool, default False
      If True, print how many entries could not be turned into a molecule.

  Returns
  -------
  pandas.DataFrame
      One row per input entry with float columns ``MW``, ``LogP``,
      ``NumHDonors`` and ``NumHAcceptors``. Entries that are not strings, or
      that RDKit cannot parse, produce a row of NaN instead of raising, so the
      row count always equals the number of inputs.
  """
  column_names = ['MW', 'LogP', 'NumHDonors', 'NumHAcceptors']

  # Keep the caller's row labels. A fresh 0..n-1 index would silently attach
  # descriptors to the wrong compounds once rows have been dropped upstream.
  index = smiles.index if isinstance(smiles, pd.Series) else None

  rows = []
  n_failed = 0
  for element in smiles:
    # Chem.MolFromSmiles returns None for SMILES it cannot parse; non-string
    # entries (e.g. missing values) are treated the same way.
    molecule = Chem.MolFromSmiles(element) if isinstance(element, str) else None
    if molecule is None:
      n_failed += 1
      rows.append([float('nan')] * len(column_names))
      continue

    rows.append([Descriptors.MolWt(molecule),
                 Descriptors.MolLogP(molecule),
                 Lipinski.NumHDonors(molecule),
                 Lipinski.NumHAcceptors(molecule)])

  if verbose and n_failed:
    print('lipinski: %d of %d entries could not be parsed' % (n_failed, len(rows)))

  # Build the frame once from the collected rows: this handles zero or one
  # molecule correctly and avoids re-copying the table on every iteration.
  # dtype=float keeps every column floating point, as in the stacked-array
  # version of this function.
  descriptors = pd.DataFrame(rows, columns=column_names, index=index, dtype=float)

  return descriptors

In [9]:
# Compute descriptors only for rows that actually have a SMILES string, then
# give the result those rows' original index labels. pd.concat(..., axis=1)
# aligns on index labels, so without this the descriptors would be matched to
# df by position and shift onto the wrong compounds after any dropped row.
smiles_present = df.canonical_smiles.dropna()
df_lipinski = lipinski(smiles_present)
df_lipinski.index = smiles_present.index

Combine DataFrames

In [10]:
# Display the computed descriptor table (MW, LogP, NumHDonors, NumHAcceptors).
df_lipinski

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
0,281.271,1.89262,0.0,5.0
1,415.589,3.81320,0.0,2.0
2,421.190,2.66050,0.0,4.0
3,293.347,3.63080,0.0,3.0
4,338.344,3.53900,0.0,5.0
...,...,...,...,...
254,436.512,1.47440,3.0,4.0
255,496.583,2.73690,3.0,4.0
256,471.985,2.06450,2.0,4.0
257,465.594,2.19130,2.0,4.0


In [11]:
# Display the loaded bioactivity table that the descriptors will be joined to.
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,intermediate
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,intermediate
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,inactive
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0,inactive
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0,intermediate
...,...,...,...,...
256,CHEMBL5565685,CC1(C)[C@@H]2[C@@H](C(=O)N[C@H](C=O)C[C@@H]3CC...,14.0,active
257,CHEMBL5565858,CC1(C)[C@@H]2[C@@H](C(=O)N[C@H](C=O)C[C@@H]3CC...,48.0,
258,CHEMBL4802135,CC(C)(C)[C@H](NC(=O)C(F)(F)F)C(=O)N1C[C@H]2[C@...,20.0,
259,,,,inactive


In [12]:
# Put the descriptor columns next to the bioactivity columns. With axis=1,
# pd.concat aligns rows by index label (outer join), not by position.
# NOTE(review): df_lipinski is computed from df.canonical_smiles.dropna();
# confirm its index labels still match the df rows they came from, otherwise
# descriptors end up attached to the wrong compounds.
df_combined = pd.concat([df, df_lipinski], axis=1)

In [13]:
# Display the combined table to check that each compound carries its descriptors.
df_combined

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,intermediate,281.271,1.89262,0.0,5.0
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,intermediate,415.589,3.81320,0.0,2.0
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,inactive,421.190,2.66050,0.0,4.0
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0,inactive,293.347,3.63080,0.0,3.0
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0,intermediate,338.344,3.53900,0.0,5.0
...,...,...,...,...,...,...,...,...
256,CHEMBL5565685,CC1(C)[C@@H]2[C@@H](C(=O)N[C@H](C=O)C[C@@H]3CC...,14.0,active,471.985,2.06450,2.0,4.0
257,CHEMBL5565858,CC1(C)[C@@H]2[C@@H](C(=O)N[C@H](C=O)C[C@@H]3CC...,48.0,,465.594,2.19130,2.0,4.0
258,CHEMBL4802135,CC(C)(C)[C@H](NC(=O)C(F)(F)F)C(=O)N1C[C@H]2[C@...,20.0,,499.534,1.09718,3.0,5.0
259,,,,inactive,,,,


Convert IC50 to pIC50