# Compute Descriptors

Computation of the descriptors used in the Principal Component Analysis:

1. Molecular Descriptors  
    - num_heavy_atoms
    - molecular_weight
    - num_rings
    - num_rings_arom
    - num_rings_ali
    - num_hbd
    - num_hba
    - slogp
    - tpsa
    - num_rotatable_bond
    - num_atom_oxygen
    - num_atom_nitrogen
    - ring_size_min
    - ring_size_max
    - frac_sp3

  
2. Subset Annotations
    - num_lipinski_violations
    - num_veber_violations
    
## Initialization

In [3]:
# Load (or reload) the autoreload extension and select mode 2, which re-imports
# modules before each cell runs, so edits to project modules take effect
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# standard
from pathlib import Path
import sys
import warnings
# data
import pandas as pd
import numpy as np
import rdkit
# plot
import matplotlib.pyplot as plt
import seaborn as sns
# notebook
from IPython.core.interactiveshell import InteractiveShell
# dev
# Make the project-local `utils` module importable.
# `Path("..").absolute()` is `<cwd>/..` (pathlib does not collapse the `..`),
# so `parents[1]` is the directory that contains the working directory.
PROJECT_DIR = Path("..").absolute().parents[1]
# Guard the insertion so that re-running this cell does not pile up duplicate
# entries on sys.path.
if str(PROJECT_DIR) not in sys.path:
    sys.path.append(str(PROJECT_DIR))
import utils as u
# Scratch directory for intermediate files; a no-op when it already exists.
Path("../tmp").mkdir(exist_ok=True)

# Notebook session settings: silence every warning and echo the value of each
# top-level expression in a cell rather than only the last one.
warnings.filterwarnings('ignore')
InteractiveShell.ast_node_interactivity = "all"


# Record the versions of the core packages for reproducibility.
print(f"{'RDKit':<30} {rdkit.__version__}")
print(f"{'Pandas':<30} {pd.__version__}")

RDKit                          2020.03.6
Pandas                         1.1.4


## Load data

In [4]:
# Path of the input table, relative to the notebook's working directory.
# NOTE(review): presumably written by an earlier step of the workflow into
# ../tmp -- confirm which notebook produces it.
input_tsv = '../tmp/chembl_np_deglyco_std.tsv'
# Read the table through the project helper and display the resulting frame.
df = u.read_tsv(input_tsv)
df

Unnamed: 0,Col0,Chembl_Id,Smiles
0,0,CHEMBL3916495,CCC(C)C(=O)OC1CC(C)C(OC(C)=O)C2=CC(OC(C)=O)C(C...
1,1,CHEMBL3917534,C=C(C)C(O)CC/C(C)=C/CC1C[C@]2(CC=C(C)C)C(=O)[C...
2,2,CHEMBL3954218,CC(=O)OC1C2O[C@]2(C(C)C2CC(C)=C(C)C(=O)O2)[C@@...
3,3,CHEMBL3956388,CC(C)=CCC[C@]1(C)C(CC=C(C)C)C[C@]2(CC=C(C)C)C(...
4,4,CHEMBL3953960,COc1c(Br)cc(Br)cc1Oc1c(Br)c(Br)cc(Br)c1OC
...,...,...,...
38361,9985,CHEMBL1077644,C[C@@]12C[C@@]3(O)OC(O1)[C@]1(COC(=O)c4ccccc4)...
38362,9986,CHEMBL1078842,C=C(C)C1CC/C(C)=C/CC=C(C)C(O)CC/C(COC(C)=O)=C\...
38363,9987,CHEMBL1163882,COc1ccc(/C=C\c2cc(OC)c(OC)c(OC)c2)c(O)c1O
38364,9988,CHEMBL1087405,C=C1CC2OC(=O)C(=C)C2C(O)/C=C(\C)CCC1=O


## Compute descriptors

In [6]:
# Add the descriptor and rule-violation columns, reading the structures from
# the 'Smiles' column.
# `df` is rebound to the result, so re-running this cell on its own feeds the
# already-augmented frame back into the helper; re-run the load cell first for
# a clean recomputation.
# NOTE(review): the saved output below ends at row index 98 although the loaded
# table runs to index 38364 -- it looks like it came from a subset run;
# re-execute to confirm.
df = u.compute_descriptors_df(df, smiles_col='Smiles')
df

Unnamed: 0,Col0,Chembl_Id,Smiles,num_heavy_atoms,molecular_weight,num_rings,num_rings_arom,num_rings_ali,num_hbd,num_hba,slogp,tpsa,num_rotatable_bond,num_atom_oxygen,num_atom_nitrogen,ring_size_min,ring_size_max,frac_sp3,num_lipinski_violations,num_veber_violations
0,0,CHEMBL3916495,CCC(C)C(=O)OC1CC(C)C(OC(C)=O)C2=CC(OC(C)=O)C(C...,36.0,504.2723,3.0,0.0,3.0,0.0,8.0,4.3079,105.20,8.0,8.0,0.0,6.0,6.0,0.714286,1.0,0.0
1,1,CHEMBL3917534,C=C(C)C(O)CC/C(C)=C/CC1C[C@]2(CC=C(C)C)C(=O)[C...,50.0,686.4183,3.0,1.0,2.0,4.0,7.0,9.7560,132.13,15.0,7.0,0.0,6.0,6.0,0.511628,2.0,1.0
2,2,CHEMBL3954218,CC(=O)OC1C2O[C@]2(C(C)C2CC(C)=C(C)C(=O)O2)[C@@...,38.0,526.2567,7.0,0.0,7.0,1.0,8.0,3.1975,114.96,3.0,8.0,0.0,3.0,6.0,0.766667,1.0,0.0
3,3,CHEMBL3956388,CC(C)=CCC[C@]1(C)C(CC=C(C)C)C[C@]2(CC=C(C)C)C(...,40.0,552.3815,3.0,0.0,3.0,1.0,5.0,7.6352,80.67,9.0,5.0,0.0,6.0,6.0,0.685714,2.0,0.0
4,4,CHEMBL3953960,COc1c(Br)cc(Br)cc1Oc1c(Br)c(Br)cc(Br)c1OC,22.0,619.6469,2.0,2.0,0.0,0.0,3.0,7.3086,27.69,4.0,3.0,0.0,6.0,6.0,0.142857,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,100,CHEMBL3971406,CC(C)=CCC1(C(=O)O)CC(=C(C)C)C(=O)c2oc3c4c(cc(O...,34.0,464.1835,4.0,2.0,2.0,2.0,7.0,5.2843,114.04,3.0,7.0,0.0,6.0,6.0,0.370370,1.0,0.0
96,101,CHEMBL3972839,C=C(CCC(C)C1CC[C@@]2(C)C3CCC4C(C)(C)C(O)CC[C@@...,35.0,484.3916,5.0,0.0,5.0,1.0,3.0,7.5680,46.53,6.0,3.0,0.0,3.0,6.0,0.906250,1.0,0.0
97,102,CHEMBL3971802,CCC(=O)Oc1ccc(OC(=O)CC)c(C[C@]2(C)C(C)CC[C@]3(...,31.0,426.2770,3.0,1.0,2.0,0.0,4.0,6.6587,52.60,6.0,4.0,0.0,6.0,6.0,0.629630,1.0,0.0
98,103,CHEMBL3972383,CC(C)C(NC(=O)C(CCCNC(=N)N)NC(=O)NC(Cc1ccccc1)C...,43.0,602.3289,2.0,1.0,1.0,12.0,16.0,-1.2452,268.71,14.0,6.0,10.0,6.0,6.0,0.518519,3.0,2.0


In [7]:
# Check for failed molecules: select the rows whose 'num_heavy_atoms' is NaN.
# An empty result means no row is missing that descriptor.
# NOTE(review): presumably compute_descriptors_df leaves NaN descriptors for
# SMILES it could not process -- verify in utils.
df[df['num_heavy_atoms'].isna()]

Unnamed: 0,Col0,Chembl_Id,Smiles,num_heavy_atoms,molecular_weight,num_rings,num_rings_arom,num_rings_ali,num_hbd,num_hba,slogp,tpsa,num_rotatable_bond,num_atom_oxygen,num_atom_nitrogen,ring_size_min,ring_size_max,frac_sp3,num_lipinski_violations,num_veber_violations
