# Description

Custom fingerprints built from RDKit data,
plus various other experiments.

This notebook is only a scratchpad for code drafts.

In [6]:
import pandas as pd
from pathlib import Path
import numpy as np
import random
import os
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import RDConfig
from rdkit.Chem import ChemicalFeatures
# Notebook-wide RDKit drawing options applied to molecules rendered inline.
IPythonConsole.ipython_useSVG=True  # ask for SVG depictions
IPythonConsole.drawOptions.addAtomIndices = False  # do not label atoms with their indices
IPythonConsole.drawOptions.addStereoAnnotation = True  # show stereo annotations
IPythonConsole.drawOptions.useBWAtomPalette()  # switch to the black-and-white atom palette
import matplotlib.pyplot as plt
%matplotlib inline

from rdkit.Chem import rdMolHash
# --- Paths (relative to the notebook's working directory) ---
DATAPATH = Path("../data")
PHARPATH = Path("../tmp/pharmacophores")

# --- Reproducibility: seed the stdlib RNG and NumPy's legacy global RNG ---
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# --- Data: the first CSV column becomes the DataFrame index ---
train_df = pd.read_csv(DATAPATH.joinpath("train.csv"), index_col=0)
test_df = pd.read_csv(DATAPATH.joinpath("test.csv"), index_col=0)

## 1. Build a library of molecular fragments and use it
https://www.rdkit.org/docs/GettingStartedInPython.html#molecular-fragments

In [7]:
from rdkit.Chem import FragmentCatalog

# Functional-group definitions that ship with the RDKit data directory.
fName = os.path.join(RDConfig.RDDataDir, 'FunctionalGroups.txt')

# Catalog parameters; 1 and 6 are presumably the lower/upper fragment-size
# bounds of FragCatParams -- confirm against the RDKit documentation.
fparams = FragmentCatalog.FragCatParams(1, 6, fName)

# Display how many functional groups were loaded from the definitions file.
fparams.GetNumFuncGroups()

39

In [8]:
# Generator that extracts fragments from a molecule and adds them to a catalog.
fcgen = FragmentCatalog.FragCatGenerator()
# Catalog configured by `fparams`; it is filled by `fcgen` in the following loop.
fcat = FragmentCatalog.FragCatalog(fparams)

In [9]:
# Fill the fragment catalog from every training molecule.
# The recorded run of this cell took about 3.5 minutes for 5557 molecules.
unparsed_smiles = []
for smiles in tqdm(train_df.Smiles.values):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None instead of
        # raising; handing None to AddFragsFromMol would abort the whole loop,
        # so remember the offending string and move on.
        unparsed_smiles.append(smiles)
        continue
    fcgen.AddFragsFromMol(mol, fcat)

# Stay silent in the normal case; report only when something was skipped.
if unparsed_smiles:
    print(f"Skipped {len(unparsed_smiles)} unparsable SMILES")

100%|██████████| 5557/5557 [03:28<00:00, 26.59it/s]


In [10]:
# Total number of entries now stored in the fragment catalog.
fcat.GetNumEntries()

221893

In [14]:
# Fingerprint generator; used below as fpgen.GetFPForMol(mol, fcat) to get a
# fingerprint for a molecule with respect to the fragment catalog.
fpgen = FragmentCatalog.FragFPGenerator()
# Disabled leftover example: `ms` is not defined in the cells shown here.
#fp = fpgen.GetFPForMol(ms[8],fcat)
#fp.GetNumOnBits()

In [13]:
# Parse every SMILES string into an RDKit molecule, keeping the row order of
# the source frames (RDKit yields None for a string it cannot parse).
train_mols = list(map(Chem.MolFromSmiles, train_df.Smiles.values))
test_mols = list(map(Chem.MolFromSmiles, test_df.Smiles.values))

In [15]:
# One catalog-based fingerprint per training molecule, in the same order as `train_mols`.
fps = [fpgen.GetFPForMol(mol, fcat) for mol in train_mols]

In [18]:
from rdkit.ML.InfoTheory import InfoBitRanker

# Class label per training row; `*1` presumably turns a boolean `Active`
# column into 0/1 integers -- the column dtype is not visible here.
activities = train_df.Active.values*1

# Ranker over all fingerprint bits with two activity classes.
ranker = InfoBitRanker(len(fps[0]), 2)

# Feed each (fingerprint, label) pair to the ranker.
for activity, fp in zip(activities, fps):
    ranker.AccumulateVotes(fp, int(activity))


In [22]:
# Print the highest-ranked fingerprint bits with their catalog descriptions.
# Columns: bit id, gain, count for class 0, count for class 1, fragment description.
# NOTE: the name `top5` is kept for compatibility with later cells, but it
# actually holds the 15 best-ranked rows.
top5 = ranker.GetTopN(15)
# The loop variable is `bit_id` rather than `id`: at notebook-global scope `id`
# would shadow the builtin and stay bound to a float after the loop.
for bit_id, gain, n0, n1 in top5:
    bit_id = int(bit_id)  # GetTopN rows are numeric; the catalog lookup needs an int
    print(bit_id, '%.3f' % gain, int(n0), int(n1), fcat.GetEntryDescription(bit_id))

9702 0.013 52 31 cc<-O>c<-O>
9700 0.013 57 31 c<-O>c<-O>
9707 0.013 52 30 ccc<-O>c<-O>
12252 0.012 51 29 cccc<-O>c<-O>
50156 0.009 2 12 c<-O>c<-O>c<-O>
50169 0.009 2 12 cc<-O>c<-O>c<-O>c
50160 0.009 2 12 cc<-O>c<-O>c<-O>
9718 0.008 30 19 c<-O>c<-O>ccC
50231 0.008 2 11 c<-O>1cccc<-O>c<-O>1
50167 0.008 2 11 ccc<-O>c<-O>c<-O>
50188 0.008 2 11 ccc<-O>c<-O>c<-O>c
50185 0.008 2 11 cccc<-O>c<-O>c<-O>
22012 0.008 29 18 c<-O>c<-O>cc(c)C
50190 0.008 3 11 c<-O>c<-O>cccc<-O>
12263 0.007 46 19 c<-O>c<-O>cccc


In [24]:
# Fingerprint of the first training molecule, inspected in the cells below.
fp = fpgen.GetFPForMol(train_mols[0],fcat)

In [32]:
# Shape of the fingerprint once converted to a NumPy array.
np.asarray(fp).shape

(221893,)

In [35]:
# Indices of the bits that are set in this fingerprint.
np.asarray(fp.GetOnBits())

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165])