# Description

Custom fingerprints based on RDKit data.
Various other experiments.

This notebook exists only as a scratchpad for code drafts.

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import random
import os
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import RDConfig
from rdkit.Chem import ChemicalFeatures
# RDKit notebook drawing options: SVG flag on, atom indices off,
# stereo annotations on, black-and-white atom palette.
IPythonConsole.ipython_useSVG=True
IPythonConsole.drawOptions.addAtomIndices = False
IPythonConsole.drawOptions.addStereoAnnotation = True
IPythonConsole.drawOptions.useBWAtomPalette()
import matplotlib.pyplot as plt
%matplotlib inline

from rdkit.Chem import rdMolHash
# Seed both the stdlib and NumPy global RNGs so sampling below is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Project-relative directories (PHARPATH is not referenced in the visible cells).
DATAPATH = Path("../data")
PHARPATH = Path("../tmp/pharmacophores")

# The first CSV column becomes the DataFrame index.
train_df = pd.read_csv(DATAPATH.joinpath("train.csv"), index_col=0)
test_df = pd.read_csv(DATAPATH.joinpath("test.csv"), index_col=0)

## 1. Build a library of molecular fragments and use it
Reference: https://www.rdkit.org/docs/GettingStartedInPython.html#molecular-fragments

In [2]:
from rdkit.Chem import FragmentCatalog

# Functional-group definition file located under RDKit's data directory.
fName = os.path.join(RDConfig.RDDataDir, 'FunctionalGroups.txt')

# The two integers are passed positionally; presumably they are the minimum and
# maximum fragment size from the RDKit guide linked above -- confirm.
fparams = FragmentCatalog.FragCatParams(1, 6, fName)
fparams.GetNumFuncGroups()

39

In [3]:
# Fragment catalog built from `fparams`, plus the generator object that is
# used below (AddFragsFromMol) to add fragments to the catalog.
fcat = FragmentCatalog.FragCatalog(fparams)
fcgen = FragmentCatalog.FragCatGenerator()

In [4]:
# Add the fragments of every training molecule to the catalog.
# This pass is slow (several minutes in the recorded run).
for train_smiles in tqdm(train_df.Smiles.values):
    fcgen.AddFragsFromMol(Chem.MolFromSmiles(train_smiles), fcat)

100%|██████████| 5557/5557 [04:33<00:00, 20.34it/s]


In [5]:
fcat.GetNumEntries()  # catalog is large, so fingerprints are stored in sparse matrices below

221893

In [6]:
# Fingerprint generator; later cells call fpgen.GetFPForMol(mol, fcat) per molecule.
fpgen = FragmentCatalog.FragFPGenerator()
#fp = fpgen.GetFPForMol(ms[8],fcat)
#fp.GetNumOnBits()

In [7]:
# Parse every SMILES string into an RDKit molecule, keeping the row order.
train_mols = list(map(Chem.MolFromSmiles, train_df.Smiles.values))
test_mols = list(map(Chem.MolFromSmiles, test_df.Smiles.values))

In [8]:
# One catalog-based fingerprint per molecule, in the same order as the inputs.
# Each split gets its own progress bar; both passes are slow.
fps_train = [fpgen.GetFPForMol(mol, fcat) for mol in tqdm(train_mols)]
fps_test = [fpgen.GetFPForMol(mol, fcat) for mol in tqdm(test_mols)]

100%|██████████| 5557/5557 [04:21<00:00, 21.23it/s]
100%|██████████| 1614/1614 [01:02<00:00, 25.74it/s]


In [9]:
# Peek at the first four training fingerprints.
fps_train[:4]

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7ff41b745990>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7ff41b745b20>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7ff41b745b70>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7ff41b745c60>]

In [None]:
# fp = fpgen.GetFPForMol(train_mols[0],fcat)

In [11]:
# np.asarray(fp).shape
from scipy.sparse import csr_matrix

# Matrix shape: one row per training fingerprint, one column per catalog entry.
num_entries = len(fps_train)
num_fp = fcat.GetNumEntries()

# Collect the (row, column) coordinate of every on bit.
row_indices, col_indices = [], []
for mol_idx, fingerprint in enumerate(tqdm(fps_train)):
    on_bits = list(fingerprint.GetOnBits())
    row_indices += [mol_idx] * len(on_bits)
    col_indices += on_bits
row_indices = np.asarray(row_indices)
col_indices = np.asarray(col_indices)
assert len(col_indices) == len(row_indices)

# Every stored element is 1.0, marking the presence of a fragment.
values = np.ones(len(col_indices))
train_frag_matrix = csr_matrix(
    (values, (row_indices, col_indices)),
    shape=(num_entries, num_fp),
)

100%|██████████| 5557/5557 [00:12<00:00, 445.04it/s]


In [13]:
# Same coordinate-list construction as for the training set, applied to the
# test fingerprints. The column count `num_fp` comes from the earlier cell.
row_indices, col_indices = [], []
for mol_idx, fingerprint in enumerate(tqdm(fps_test)):
    on_bits = list(fingerprint.GetOnBits())
    row_indices += [mol_idx] * len(on_bits)
    col_indices += on_bits
row_indices = np.asarray(row_indices)
col_indices = np.asarray(col_indices)
assert len(col_indices) == len(row_indices)

# Every stored element is 1.0, marking the presence of a fragment.
values = np.ones(len(col_indices))
test_frag_matrix = csr_matrix(
    (values, (row_indices, col_indices)),
    shape=(len(fps_test), num_fp),
)

100%|██████████| 1614/1614 [00:04<00:00, 371.50it/s]


In [14]:
# Display the sparse matrix summary (shape, dtype, stored elements).
test_frag_matrix

<1614x221893 sparse matrix of type '<class 'numpy.float64'>'
	with 382062 stored elements in Compressed Sparse Row format>

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the IDF weights on the training matrix only, then apply the same
# fitted transformer to both splits.
tfidf_transformer = TfidfTransformer(smooth_idf=True).fit(train_frag_matrix)
train_frag_matrix_transformed = tfidf_transformer.transform(train_frag_matrix)
test_frag_matrix_transformed = tfidf_transformer.transform(test_frag_matrix)

In [16]:
np.random.seed(SEED + 4)

# Positional row indices of each class.
ids1 = np.where(train_df.Active == 1)[0]
ids0 = np.where(train_df.Active == 0)[0]
m = len(ids0)

# Draw as many class-1 rows as there are class-0 rows, then append all class-0 rows.
sampled_ids1 = np.random.choice(ids1, m)
rebalanced = np.concatenate([sampled_ids1, ids0])

# Reverse variant: draw as many class-0 rows as there are class-1 rows.
# It is drawn second, so the RNG stream matches the original ordering.
# `subset_ids` is not used by any later cell visible in this notebook.
sampled_ids0 = np.random.choice(ids0, len(ids1))
subset_ids = np.concatenate([sampled_ids0, ids1])

# Median IDF weight across all fragment columns.
np.median(tfidf_transformer.idf_)

8.929846429742504

In [17]:
#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf_transformer = TfidfTransformer(smooth_idf=True)
#tfidf_transformer.fit(train_frag_matrix_transformed)
#frag_matrix_transformed = tfidf_transformer.transform(train_frag_matrix_transformed)

In [18]:
from sklearn.naive_bayes import MultinomialNB

In [19]:
np.random.seed(SEED + 5)

# Train on the rebalanced row selection rather than on the raw training set.
X_rebalanced = train_frag_matrix_transformed[rebalanced]
y_rebalanced = train_df.Active.values[rebalanced]
model = MultinomialNB().fit(X_rebalanced, y_rebalanced)

In [20]:
# Sum of predicted labels over all training rows
# (a count of positives, assuming 0/1 or boolean labels -- confirm).
model.predict(train_frag_matrix_transformed).sum()

581

In [44]:
# Column 1 of predict_proba for every training row (all rows, not only `rebalanced`).
preds = model.predict_proba(train_frag_matrix_transformed)[:, 1]#[rebalanced])[:, 1]

In [45]:
# np.exp(1)**model.intercept_

In [46]:
from sklearn.metrics import f1_score, classification_report

In [48]:
threshold = 0.5  # alternative value noted by the author: 0.999

# Boolean mask of training rows whose probability exceeds the threshold.
predicted_active = preds > threshold
print(predicted_active.sum())

# F1 is measured against all training labels, not the rebalanced selection.
f1_score(train_df.Active.values, predicted_active)

581


0.5133418043202033

In [49]:
# Per-class precision, recall and F1 on the full training set at `threshold`.
report = classification_report(
    train_df.Active.values,
    preds > threshold,
    target_names=['Inactive', 'Active'],
)
print(report)

              precision    recall  f1-score   support

    Inactive       1.00      0.93      0.96      5351
      Active       0.35      0.98      0.51       206

    accuracy                           0.93      5557
   macro avg       0.67      0.95      0.74      5557
weighted avg       0.98      0.93      0.95      5557



In [50]:
# from rdkit.ML.InfoTheory import InfoBitRanker
# ranker = InfoBitRanker(len(fps[0]), 2)
# activities = train_df.Active.values*1
# for fp, activity in zip(fps, activities):
#     ranker.AccumulateVotes(fp, int(activity))
# top5 = ranker.GetTopN(15)
# for id,gain,n0,n1 in top5:
#     print(int(id),'%.3f'%gain,int(n0),int(n1), fcat.GetEntryDescription(int(id)))

In [51]:
# rdkit.ML.InfoTheory.BitClusterer.BitClusterer

In [52]:
# Hard class predictions for the test set.
# NOTE(review): this rebinds `preds`, which earlier held training-set probabilities.
preds = model.predict(test_frag_matrix_transformed)#[:, 1] > 0.5

In [53]:
# Sum of the test-set predictions held in `preds`.
preds.sum()

216

In [81]:
# shap_values[0]

In [56]:
# Store the predictions as the "Active" column; this mutates `test_df` in place.
test_df["Active"] = preds

In [58]:
# Write the test-set predictions to disk.
# `to_csv` does not create missing directories, so make sure the output
# directory exists first; otherwise a fresh checkout fails here, after the
# slow fingerprinting cells have already run.
submission_path = Path("../tmp/multinomial_nb_all_frags.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
test_df.to_csv(submission_path)