# Description

Custom fingerprints based on rdkit data. 
Various other things.

I need this notebook simply for the code drafts.

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import random
import os
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import RDConfig
from rdkit.Chem import ChemicalFeatures
IPythonConsole.ipython_useSVG=True
IPythonConsole.drawOptions.addAtomIndices = False
IPythonConsole.drawOptions.addStereoAnnotation = True
IPythonConsole.drawOptions.useBWAtomPalette()
import matplotlib.pyplot as plt
%matplotlib inline

from rdkit.Chem import rdMolHash
TMP_DIR = Path("../tmp")
PHARPATH = Path("../tmp/pharmacophores")
DATAPATH = Path("../data")

train_df = pd.read_csv(DATAPATH/"train.csv", index_col=0)
test_df = pd.read_csv(DATAPATH/"test.csv", index_col=0)

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

## 1. Build library of the molecular fragments and use it
https://www.rdkit.org/docs/GettingStartedInPython.html#molecular-fragments

In [2]:
fName=os.path.join(RDConfig.RDDataDir,'FunctionalGroups.txt')
from rdkit.Chem import FragmentCatalog
fparams = FragmentCatalog.FragCatParams(1,6,fName)
fparams.GetNumFuncGroups()

39

In [3]:
fcat = FragmentCatalog.FragCatalog(fparams)
fcgen = FragmentCatalog.FragCatGenerator()

In [4]:
for smiles in tqdm(train_df.Smiles.values):
    mol = Chem.MolFromSmiles(smiles)
    fcgen.AddFragsFromMol(mol, fcat)

100%|██████████| 5557/5557 [06:00<00:00, 15.43it/s]


In [5]:
fcat.GetNumEntries()  # a lot - need to use sparse data structures

221893

In [6]:
fpgen = FragmentCatalog.FragFPGenerator()
#fp = fpgen.GetFPForMol(ms[8],fcat)
#fp.GetNumOnBits()

In [7]:
train_mols = [Chem.MolFromSmiles(smiles) for smiles in train_df.Smiles.values]
test_mols = [Chem.MolFromSmiles(smiles) for smiles in test_df.Smiles.values]

In [8]:
fps_train = []
for x in tqdm(train_mols):
    fps_train.append(fpgen.GetFPForMol(x, fcat))
fps_test = []
for x in tqdm(test_mols):
    fps_test.append(fpgen.GetFPForMol(x, fcat))

# fps_test = [fpgen.GetFPForMol(x, fcat) for x in train_mols]

100%|██████████| 5557/5557 [03:56<00:00, 23.47it/s]
100%|██████████| 1614/1614 [00:59<00:00, 27.28it/s]


In [9]:
fps_train[:4]

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fd9ce7313f0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fd9ce731440>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fd9ce731490>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fd9ce7314e0>]

In [10]:
# fp = fpgen.GetFPForMol(train_mols[0],fcat)

In [140]:
# np.asarray(fp).shape
from scipy.sparse import csr_matrix
num_fp = fcat.GetNumEntries()
num_entries = len(fps_train)
row_indices = []
col_indices = []
for i, fp in enumerate(tqdm(fps_train)):
    bits = list(fp.GetOnBits())
    col_indices.extend(bits)
    row_indices.extend([i]*len(bits))
col_indices = np.asarray(col_indices)
row_indices = np.asarray(row_indices)
assert len(col_indices) == len(row_indices)
values = np.ones((len(col_indices), ))
train_frag_matrix = csr_matrix((values, (row_indices, col_indices)), shape=(num_entries, num_fp))

100%|██████████| 5557/5557 [00:13<00:00, 414.57it/s]


In [141]:
row_indices = []
col_indices = []
for i, fp in enumerate(tqdm(fps_test)):
    bits = list(fp.GetOnBits())
    col_indices.extend(bits)
    row_indices.extend([i]*len(bits))
col_indices = np.asarray(col_indices)
row_indices = np.asarray(row_indices)
assert len(col_indices) == len(row_indices)
values = np.ones((len(col_indices), ))
test_frag_matrix = csr_matrix((values, (row_indices, col_indices)), shape=(len(fps_test), num_fp))

100%|██████████| 1614/1614 [00:05<00:00, 294.88it/s]


# Smart (or not) filtering
1. Select fragments which are present in train and not present in test and remove them
2. For all the remaining ones: select top N (N=2000 or 0.95 model's compulative gain)

In [142]:
# ids_present_in_test = np.unique(test_frag_matrix.indices)
# %%timeit
test_bits_counts = np.asarray(test_frag_matrix.sum(0)).reshape(-1)
useful_bits = np.where(test_bits_counts > 0)[0]
non_useful_bits = np.where(test_bits_counts == 0)[0]

In [143]:
# ~ids_present_in_test & np.arange(test_frag_matrix.shape[1])
#test_frag_matrix = test_frag_matrix[:, useful_bits]
#train_frag_matrix = train_frag_matrix[:, useful_bits]

In [144]:
non_useful_bits = [int(x) for x in non_useful_bits]

In [145]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(smooth_idf=True)
tfidf_transformer.fit(train_frag_matrix)
train_frag_matrix_transformed = tfidf_transformer.transform(train_frag_matrix)
test_frag_matrix_transformed = tfidf_transformer.transform(test_frag_matrix)

In [146]:
ids = np.argsort(tfidf_transformer.idf_)
tfidf_transformer.idf_[ids].shape

(221893,)

In [130]:
from rdkit.ML.InfoTheory import InfoBitRanker
ranker = InfoBitRanker(len(fps_train[0]), 2)
activities = train_df.Active.values*1
records_no_bits = []
for i, (fp, activity) in enumerate(tqdm(zip(fps_train, activities), total=len(fps_train))):
    fp.UnSetBitsFromList(non_useful_bits)
    if fp.GetNumOnBits() == 0:
        records_no_bits.append(i)
        continue
    ranker.AccumulateVotes(fp, int(activity))


100%|██████████| 5557/5557 [12:48<00:00,  7.23it/s]


In [350]:
top_values = ranker.GetTopN(2000)
for identifier, gain, n0, n1 in top_values[:20]:
    print(int(identifier),'%.3f'%gain,int(n0),int(n1), fcat.GetEntryDescription(int(id)))

9702 0.013 52 31 c<-O>c<-O>ccCC
9700 0.013 57 31 c<-O>c<-O>ccCC
9707 0.013 52 30 c<-O>c<-O>ccCC
12252 0.012 51 29 c<-O>c<-O>ccCC
50160 0.009 2 12 c<-O>c<-O>ccCC
50169 0.009 2 12 c<-O>c<-O>ccCC
50156 0.009 2 12 c<-O>c<-O>ccCC
9718 0.008 30 19 c<-O>c<-O>ccCC
50185 0.008 2 11 c<-O>c<-O>ccCC
50188 0.008 2 11 c<-O>c<-O>ccCC
50231 0.008 2 11 c<-O>c<-O>ccCC
50167 0.008 2 11 c<-O>c<-O>ccCC
22012 0.008 29 18 c<-O>c<-O>ccCC
50190 0.008 3 11 c<-O>c<-O>ccCC
12263 0.007 46 19 c<-O>c<-O>ccCC
9709 0.007 47 19 c<-O>c<-O>ccCC
9719 0.007 47 19 c<-O>c<-O>ccCC
22013 0.007 29 16 c<-O>c<-O>ccCC
22011 0.006 7 11 c<-O>c<-O>ccCC
22022 0.006 7 11 c<-O>c<-O>ccCC


In [351]:
bit_indices = np.asarray([int(x[0]) for x in top_values])

In [352]:
# bit_indices = useful_bits

In [353]:
np.random.seed(SEED + 4)
ids1 = np.where(train_df.Active == 1)[0]
ids0 = np.where(train_df.Active == 0)[0]
m = len(ids0)
rebalanced = np.concatenate([
    np.random.choice(ids1, m),
    ids0]
)
subset_ids = np.concatenate([
    np.random.choice(ids0, len(ids1)),
    ids1
])
np.median(tfidf_transformer.idf_)

9.62299361030245

In [354]:
def balance_data(y_train):
    df, counts = np.unique(y_train, return_counts=True)
    m = counts.max()
    index = np.arange(len(y_train))
    new_index = []
    for i, c in zip(df, counts):
        ids = y_train == i
        values = index[ids]
        if c == m:
            new_index.extend(values)
        else:
            new_index.extend(np.random.choice(values, m))
    np.random.shuffle(new_index)
    return new_index

In [355]:
train_frag_matrix_transformed[:, bit_indices]

<5557x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 121019 stored elements in Compressed Sparse Row format>

In [356]:
#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf_transformer = TfidfTransformer(smooth_idf=True)
#tfidf_transformer.fit(train_frag_matrix_transformed)
#frag_matrix_transformed = tfidf_transformer.transform(train_frag_matrix_transformed)

In [357]:
from sklearn.naive_bayes import MultinomialNB

In [358]:
np.random.seed(SEED + 5)

from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
NFOLDS = 11
BALANCE_TRAIN = True
y_full_train = train_df.Active.values
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED + 8)
kfsplits = kf.split(np.arange(train_df.shape[0]), y_full_train)
split_data = dict()
all_train_scores = []
all_train_scores_rebalanced = []
all_test_scores = []
for fold, (train_index, test_index) in enumerate(kfsplits):
    split_data[fold] = (train_index, test_index)
    x_train = train_frag_matrix_transformed[train_index][:, bit_indices]
    y_train = y_full_train[train_index]
    x_val = train_frag_matrix_transformed[test_index][:, bit_indices]
    y_val = y_full_train[test_index]
    if BALANCE_TRAIN:
        balanced_index = balance_data(y_train)
        x_train = x_train[balanced_index]
        y_train = y_train[balanced_index]
    model = MultinomialNB()
    model = model.fit(x_train, y_train)
    train_preds = model.predict(train_frag_matrix_transformed[:, bit_indices])
    train_scores = model.predict_proba(train_frag_matrix_transformed[:, bit_indices])[:, 1]#[rebalanced])[:, 1] 
    print(train_preds.sum())
    all_train_scores.append(train_scores)
    train_scores_balanced = model.predict_proba(
        train_frag_matrix_transformed[rebalanced][:, bit_indices]
    )[:, 1]#[rebalanced])[:, 1]
    all_train_scores_rebalanced.append(train_scores_balanced)

    test_scores = model.predict_proba(
        test_frag_matrix_transformed[:, bit_indices]
    )[:, 1]#[rebalanced])[:, 1] 
    all_test_scores.append(test_scores)

all_train_scores = np.stack(all_train_scores)
all_train_scores_rebalanced = np.stack(all_train_scores_rebalanced)
all_test_scores = np.stack(all_test_scores)
    

510
474
484
509
487
520
492
494
529
528
513


In [359]:
train_frag_matrix_transformed[:, bit_indices]

<5557x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 121019 stored elements in Compressed Sparse Row format>

In [360]:
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import roc_curve, f1_score

neg = (~train_df.Active).sum()
pos = (train_df.Active).sum()
train_scores = all_train_scores.mean(0)
train_scores_rebalanced = all_train_scores_rebalanced.mean(0)
fpr1, tpr1, thresholds1 = roc_curve(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced
)
print(tpr1.shape)
opt_tpr_fpr1 = np.argmax(tpr1-fpr1)
optimal_threshold1 = thresholds1[opt_tpr_fpr1]
print("Select threshold on train data without rebalancing", optimal_threshold1)
print(opt_tpr_fpr1, tpr1[opt_tpr_fpr1-1:opt_tpr_fpr1+2]-fpr1[opt_tpr_fpr1-1:opt_tpr_fpr1+2], optimal_threshold1)
print(classification_report(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced > optimal_threshold1,
    target_names=['Inactive', 'Active']
    )
)
print("the same threshold, on train data without rebalancing")
print(classification_report(
    train_df.Active.values,
    train_scores > optimal_threshold1,
    target_names=['Inactive', 'Active']
    )
)


(912,)
Select threshold on train data without rebalancing 0.5009792646235678
201 [0.51261446 0.51579144 0.51429639] 0.5009792646235678
              precision    recall  f1-score   support

    Inactive       0.69      0.93      0.79      5351
      Active       0.89      0.58      0.71      5351

    accuracy                           0.76     10702
   macro avg       0.79      0.76      0.75     10702
weighted avg       0.79      0.76      0.75     10702

the same threshold, on train data without rebalancing
              precision    recall  f1-score   support

    Inactive       0.98      0.93      0.96      5351
      Active       0.24      0.58      0.34       206

    accuracy                           0.92      5557
   macro avg       0.61      0.75      0.65      5557
weighted avg       0.96      0.92      0.93      5557



In [361]:
train_df.Active.values.mean()

0.03707036170595645

In [362]:

fpr, tpr, thresholds = roc_curve(
    train_df.Active.values,
    train_scores
)
print(tpr.shape)
tp = tpr*pos
fp = fpr*neg
fn = (1-tpr)*pos
f1 = 2*tp/(2*tp+fp+fn)
opt_tpr_fpr = np.argmax(f1)
optimal_threshold2 = thresholds[opt_tpr_fpr]
print("Select threshold on train data without rebalancing", optimal_threshold2)
print(opt_tpr_fpr, tpr[opt_tpr_fpr-1:opt_tpr_fpr+2]-fpr[opt_tpr_fpr-1:opt_tpr_fpr+2], optimal_threshold2)
print("on balanced train")
print(classification_report(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)
print("the same threshold, on train data without rebalancing")
print(classification_report(
    train_df.Active.values,
    train_scores > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)

(870,)
Select threshold on train data without rebalancing 0.7714842680655122
86 [0.36555457 0.37526331 0.37414203] 0.7714842680655122
on balanced train
              precision    recall  f1-score   support

    Inactive       0.62      0.98      0.76      5351
      Active       0.96      0.39      0.56      5351

    accuracy                           0.69     10702
   macro avg       0.79      0.69      0.66     10702
weighted avg       0.79      0.69      0.66     10702

the same threshold, on train data without rebalancing
              precision    recall  f1-score   support

    Inactive       0.98      0.98      0.98      5351
      Active       0.45      0.39      0.42       206

    accuracy                           0.96      5557
   macro avg       0.72      0.69      0.70      5557
weighted avg       0.96      0.96      0.96      5557



In [363]:

optimal_threshold3 = 1. - train_df.Active.mean()
print("Select threshold based on mean Active value", optimal_threshold3)
print("on balanced train")
print(classification_report(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)
print("the same threshold, on train data without rebalancing")
print(classification_report(
    train_df.Active.values,
    train_scores > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)

Select threshold based on mean Active value 0.9629296382940435
on balanced train
              precision    recall  f1-score   support

    Inactive       0.62      0.98      0.76      5351
      Active       0.96      0.39      0.56      5351

    accuracy                           0.69     10702
   macro avg       0.79      0.69      0.66     10702
weighted avg       0.79      0.69      0.66     10702

the same threshold, on train data without rebalancing
              precision    recall  f1-score   support

    Inactive       0.98      0.98      0.98      5351
      Active       0.45      0.39      0.42       206

    accuracy                           0.96      5557
   macro avg       0.72      0.69      0.70      5557
weighted avg       0.96      0.96      0.96      5557



In [364]:
# np.exp(1)**model.intercept_


In [365]:
# from rdkit.ML.InfoTheory import InfoBitRanker
# ranker = InfoBitRanker(len(fps[0]), 2)
# activities = train_df.Active.values*1
# for fp, activity in zip(fps, activities):
#     ranker.AccumulateVotes(fp, int(activity))
# top5 = ranker.GetTopN(15)
# for id,gain,n0,n1 in top5:
#     print(int(id),'%.3f'%gain,int(n0),int(n1), fcat.GetEntryDescription(int(id)))

In [366]:
# rdkit.ML.InfoTheory.BitClusterer.BitClusterer

In [367]:
preds = all_test_scores.mean(0) > optimal_threshold2 # > 0.5

In [368]:
preds.sum()

48

In [346]:
# shap_values[0]

In [369]:
test_df["Active"] = preds

In [370]:
test_df.to_csv("../tmp/multinomial_nb_all_frags_balanced_filtered_v3_2000_nfolds11.csv")

In [371]:
preds.sum()

48