# Compute Descriptors

Computation of the descriptors used in the Principal Component Analysis (PCA):

1. Molecular Descriptors  
    - num_heavy_atoms
    - molecular_weight
    - num_rings
    - num_rings_arom
    - num_rings_ali
    - num_hbd
    - num_hba
    - slogp
    - tpsa
    - num_rotatable_bond
    - num_atom_oxygen
    - num_atom_nitrogen
    - ring_size_min
    - ring_size_max
    - frac_sp3

  
2. Subset Annotations
    - num_lipinski_violations
    - num_veber_violations
    
## Initialization

In [8]:
%reload_ext autoreload
%autoreload 2

import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation notices from the
# imported libraries; consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

# Global Imports
from pathlib import Path
import sys

import pandas as pd
import numpy as np

from rdkit import DataStructs
from rdkit.Chem import AllChem as Chem
# from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

# from Contrib.NP_Score import npscorer

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Project-local Imports
# Project root = parent of the current working directory (same location the
# former `list(Path("..").absolute().parents)[1]` expression evaluated to).
PROJECT_DIR = Path.cwd().parent
# Make the project-local modules importable. The membership test keeps
# sys.path free of duplicate entries when this cell is re-run in one kernel.
if str(PROJECT_DIR) not in sys.path:
    sys.path.append(str(PROJECT_DIR))
import plt_style
import utils as u

# Make sure the ../tmp folder (where the input TSV is read from) exists;
# exist_ok=True makes this a no-op when the folder is already there.
Path("../tmp").mkdir(exist_ok=True)

In [9]:
# Record the RDKit version the outputs of this notebook were generated with.
import rdkit
rdkit.__version__

'2019.09.3'

## Load data

In [2]:
# Input table, read from the ../tmp folder.
input_tsv = '../tmp/chembl_np_deglyco_std.tsv'
# u.read_tsv is a project-local helper. Judging from the displayed output, the
# 'Smiles' column comes back holding RDKit Mol objects, and a structure that
# RDKit cannot parse shows up as a missing value — TODO confirm in utils.
df = u.read_tsv(input_tsv)
df

RDKit ERROR: [13:12:44] Explicit valence for atom # 26 B, 5, is greater than permitted


Unnamed: 0,Col0,Chembl_Id,Smiles
0,0,CHEMBL3916495,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c800>
1,1,CHEMBL3917534,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c760>
2,2,CHEMBL3954218,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c620>
3,3,CHEMBL3956388,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c7b0>
4,4,CHEMBL3953960,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c3f0>
...,...,...,...
38361,9985,CHEMBL1077644,<rdkit.Chem.rdchem.Mol object at 0x7f414eaddc60>
38362,9986,CHEMBL1078842,<rdkit.Chem.rdchem.Mol object at 0x7f414eaddcb0>
38363,9987,CHEMBL1163882,<rdkit.Chem.rdchem.Mol object at 0x7f414eaddd00>
38364,9988,CHEMBL1087405,<rdkit.Chem.rdchem.Mol object at 0x7f414eaddd50>


In [3]:
# Show the rows whose molecule is missing, i.e. the structures that failed to load.
df.loc[df['Smiles'].isna()]

Unnamed: 0,Col0,Chembl_Id,Smiles
11966,2638,CHEMBL510343,


In [4]:
# Filter out failed molecules (rows with a missing 'Smiles' value).
# NOTE: only the first MAX_MOLECULES valid rows are kept, so everything computed
# downstream covers a subset of the data. Set MAX_MOLECULES = None to process
# the complete data set.
MAX_MOLECULES = 100
df = df[df['Smiles'].notna()]
if MAX_MOLECULES is not None:
    df = df.head(MAX_MOLECULES)
df

Unnamed: 0,Col0,Chembl_Id,Smiles
0,0,CHEMBL3916495,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c800>
1,1,CHEMBL3917534,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c760>
2,2,CHEMBL3954218,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c620>
3,3,CHEMBL3956388,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c7b0>
4,4,CHEMBL3953960,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c3f0>
...,...,...,...
95,100,CHEMBL3971406,<rdkit.Chem.rdchem.Mol object at 0x7f414ed60800>
96,101,CHEMBL3972839,<rdkit.Chem.rdchem.Mol object at 0x7f414ed60850>
97,102,CHEMBL3971802,<rdkit.Chem.rdchem.Mol object at 0x7f414ed608a0>
98,103,CHEMBL3972383,<rdkit.Chem.rdchem.Mol object at 0x7f414ed608f0>


## Compute descriptors

In [5]:
# Append the molecular descriptors and the Lipinski/Veber violation counts as
# new columns (the resulting column list is visible in the output below).
# NOTE(review): `df` is overwritten, so re-running this cell feeds the already
# annotated frame back into the helper — confirm compute_descriptors_df
# tolerates that.
df = u.compute_descriptors_df(df, smiles_col='Smiles')
df

Unnamed: 0,Col0,Chembl_Id,Smiles,num_heavy_atoms,molecular_weight,num_rings,num_rings_arom,num_rings_ali,num_hbd,num_hba,slogp,tpsa,num_rotatable_bond,num_atom_oxygen,num_atom_nitrogen,ring_size_min,ring_size_max,frac_sp3,num_lipinski_violations,num_veber_violations
0,0,CHEMBL3916495,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c800>,36.0,504.2723,3.0,0.0,3.0,0.0,8.0,4.3079,105.20,8.0,8.0,0.0,6.0,6.0,0.714286,1.0,1.0
1,1,CHEMBL3917534,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c760>,50.0,686.4183,3.0,1.0,2.0,4.0,7.0,9.7560,132.13,15.0,7.0,0.0,6.0,6.0,0.511628,2.0,3.0
2,2,CHEMBL3954218,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c620>,38.0,526.2567,7.0,0.0,7.0,1.0,8.0,3.1975,114.96,3.0,8.0,0.0,3.0,6.0,0.766667,1.0,1.0
3,3,CHEMBL3956388,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c7b0>,40.0,552.3815,3.0,0.0,3.0,1.0,5.0,7.6352,80.67,9.0,5.0,0.0,6.0,6.0,0.685714,2.0,2.0
4,4,CHEMBL3953960,<rdkit.Chem.rdchem.Mol object at 0x7f414d94c3f0>,22.0,619.6469,2.0,2.0,0.0,0.0,3.0,7.3086,27.69,4.0,3.0,0.0,6.0,6.0,0.142857,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,100,CHEMBL3971406,<rdkit.Chem.rdchem.Mol object at 0x7f414ed60800>,34.0,464.1835,4.0,2.0,2.0,2.0,7.0,5.2843,114.04,3.0,7.0,0.0,6.0,6.0,0.370370,1.0,1.0
96,101,CHEMBL3972839,<rdkit.Chem.rdchem.Mol object at 0x7f414ed60850>,35.0,484.3916,5.0,0.0,5.0,1.0,3.0,7.5680,46.53,6.0,3.0,0.0,3.0,6.0,0.906250,1.0,1.0
97,102,CHEMBL3971802,<rdkit.Chem.rdchem.Mol object at 0x7f414ed608a0>,31.0,426.2770,3.0,1.0,2.0,0.0,4.0,6.6587,52.60,6.0,4.0,0.0,6.0,6.0,0.629630,1.0,1.0
98,103,CHEMBL3972383,<rdkit.Chem.rdchem.Mol object at 0x7f414ed608f0>,43.0,602.3289,2.0,1.0,1.0,12.0,16.0,-1.2452,268.71,14.0,6.0,10.0,6.0,6.0,0.518519,3.0,5.0
