# Description

Custom fingerprints based on rdkit data. 
Various other things.

I need this notebook simply for the code drafts.

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import random
import os
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import RDConfig
from rdkit.Chem import ChemicalFeatures
IPythonConsole.ipython_useSVG=True
IPythonConsole.drawOptions.addAtomIndices = False
IPythonConsole.drawOptions.addStereoAnnotation = True
IPythonConsole.drawOptions.useBWAtomPalette()
import matplotlib.pyplot as plt
%matplotlib inline

from rdkit.Chem import rdMolHash
PHARPATH = Path("../tmp/pharmacophores")
DATAPATH = Path("../data")

train_df = pd.read_csv(DATAPATH/"train.csv", index_col=0)
test_df = pd.read_csv(DATAPATH/"test.csv", index_col=0)

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

## 1. Build library of the molecular fragments and use it
https://www.rdkit.org/docs/GettingStartedInPython.html#molecular-fragments

In [2]:
fName=os.path.join(RDConfig.RDDataDir,'FunctionalGroups.txt')
from rdkit.Chem import FragmentCatalog
fparams = FragmentCatalog.FragCatParams(1,6,fName)
fparams.GetNumFuncGroups()

39

In [3]:
fcat = FragmentCatalog.FragCatalog(fparams)
fcgen = FragmentCatalog.FragCatGenerator()

In [4]:
for smiles in tqdm(train_df.Smiles.values):
    mol = Chem.MolFromSmiles(smiles)
    fcgen.AddFragsFromMol(mol, fcat)

100%|██████████| 5557/5557 [03:49<00:00, 24.22it/s]


In [5]:
fcat.GetNumEntries()  # a lot - need to use sparse data structures

221893

In [6]:
fpgen = FragmentCatalog.FragFPGenerator()
#fp = fpgen.GetFPForMol(ms[8],fcat)
#fp.GetNumOnBits()

In [7]:
train_mols = [Chem.MolFromSmiles(smiles) for smiles in train_df.Smiles.values]
test_mols = [Chem.MolFromSmiles(smiles) for smiles in test_df.Smiles.values]

In [8]:
fps_train = []
for x in tqdm(train_mols):
    fps_train.append(fpgen.GetFPForMol(x, fcat))
fps_test = []
for x in tqdm(test_mols):
    fps_test.append(fpgen.GetFPForMol(x, fcat))

# fps_test = [fpgen.GetFPForMol(x, fcat) for x in train_mols]

100%|██████████| 5557/5557 [04:25<00:00, 20.95it/s]
100%|██████████| 1614/1614 [01:19<00:00, 20.40it/s]


In [9]:
fps_train[:4]

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f8586dcfbc0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f8586dcfc10>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f8586dcfc60>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f8586dcfcb0>]

In [10]:
# fp = fpgen.GetFPForMol(train_mols[0],fcat)

In [11]:
# np.asarray(fp).shape
from scipy.sparse import csr_matrix
num_fp = fcat.GetNumEntries()
num_entries = len(fps_train)
row_indices = []
col_indices = []
for i, fp in enumerate(tqdm(fps_train)):
    bits = list(fp.GetOnBits())
    col_indices.extend(bits)
    row_indices.extend([i]*len(bits))
col_indices = np.asarray(col_indices)
row_indices = np.asarray(row_indices)
assert len(col_indices) == len(row_indices)
values = np.ones((len(col_indices), ))
train_frag_matrix = csr_matrix((values, (row_indices, col_indices)), shape=(num_entries, num_fp))

100%|██████████| 5557/5557 [00:09<00:00, 575.35it/s]


In [12]:
row_indices = []
col_indices = []
for i, fp in enumerate(tqdm(fps_test)):
    bits = list(fp.GetOnBits())
    col_indices.extend(bits)
    row_indices.extend([i]*len(bits))
col_indices = np.asarray(col_indices)
row_indices = np.asarray(row_indices)
assert len(col_indices) == len(row_indices)
values = np.ones((len(col_indices), ))
test_frag_matrix = csr_matrix((values, (row_indices, col_indices)), shape=(len(fps_test), num_fp))

100%|██████████| 1614/1614 [00:02<00:00, 571.83it/s]


In [13]:
test_frag_matrix

<1614x221893 sparse matrix of type '<class 'numpy.float64'>'
	with 382062 stored elements in Compressed Sparse Row format>

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(smooth_idf=True)
tfidf_transformer.fit(train_frag_matrix)
train_frag_matrix_transformed = tfidf_transformer.transform(train_frag_matrix)
test_frag_matrix_transformed = tfidf_transformer.transform(test_frag_matrix)

In [15]:
np.random.seed(SEED + 4)
ids1 = np.where(train_df.Active == 1)[0]
ids0 = np.where(train_df.Active == 0)[0]
m = len(ids0)
rebalanced = np.concatenate([
    np.random.choice(ids1, m),
    ids0]
)
subset_ids = np.concatenate([
    np.random.choice(ids0, len(ids1)),
    ids1
])
np.median(tfidf_transformer.idf_)

8.929846429742504

In [16]:
#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf_transformer = TfidfTransformer(smooth_idf=True)
#tfidf_transformer.fit(train_frag_matrix_transformed)
#frag_matrix_transformed = tfidf_transformer.transform(train_frag_matrix_transformed)

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
np.random.seed(SEED + 5)
model = MultinomialNB()
model = model.fit(train_frag_matrix_transformed[rebalanced], train_df.Active.values[rebalanced])

In [70]:
model.predict(train_frag_matrix_transformed).sum()

581

In [71]:
train_scores = model.predict_proba(train_frag_matrix_transformed[rebalanced])[:, 1]#[rebalanced])[:, 1]

In [72]:
# np.exp(1)**model.intercept_

In [73]:
from sklearn.metrics import f1_score, classification_report

In [74]:
from sklearn.metrics import roc_curve, f1_score

fpr, tpr, thresholds = roc_curve(train_df.Active.values[rebalanced], train_scores)
opt_tpr_fpr = np.argmax(tpr-fpr)
optimal_threshold = thresholds[opt_tpr_fpr]
print(opt_tpr_fpr, tpr[opt_tpr_fpr-1:opt_tpr_fpr+2]-fpr[opt_tpr_fpr-1:opt_tpr_fpr+2], optimal_threshold)

271 [0.92356569 0.92842459 0.92450009] 0.8328463971224935


In [75]:
threshold = optimal_threshold  # 0.5 #0.999
# using automatic thresholding instead of classification table-based or default 0.5
print((train_scores > threshold).sum())
#f1_score(train_df.Active.values[rebalanced], preds > threshold)
f1_score(train_df.Active.values[rebalanced], train_scores > threshold)

5364


0.9618292113859076

In [76]:
print(classification_report(
    train_df.Active.values[rebalanced],
    train_scores > threshold,
    target_names=['Inactive', 'Active']
    )
)

              precision    recall  f1-score   support

    Inactive       0.96      0.96      0.96      5351
      Active       0.96      0.96      0.96      5351

    accuracy                           0.96     10702
   macro avg       0.96      0.96      0.96     10702
weighted avg       0.96      0.96      0.96     10702



In [77]:
# from rdkit.ML.InfoTheory import InfoBitRanker
# ranker = InfoBitRanker(len(fps[0]), 2)
# activities = train_df.Active.values*1
# for fp, activity in zip(fps, activities):
#     ranker.AccumulateVotes(fp, int(activity))
# top5 = ranker.GetTopN(15)
# for id,gain,n0,n1 in top5:
#     print(int(id),'%.3f'%gain,int(n0),int(n1), fcat.GetEntryDescription(int(id)))

In [78]:
# rdkit.ML.InfoTheory.BitClusterer.BitClusterer

In [85]:
preds = model.predict_proba(test_frag_matrix_transformed)[:, 1] > threshold # > 0.5

In [86]:
preds.sum()

119

In [87]:
# shap_values[0]

In [88]:
test_df["Active"] = preds

In [89]:
test_df.to_csv("../tmp/multinomial_nb_all_frags_opt.csv")