# Description

Custom fingerprints based on rdkit data. 
Various other things.

I need this notebook simply for the code drafts.

In [91]:
import pandas as pd
from pathlib import Path
import numpy as np
import random
import os
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import RDConfig
from rdkit.Chem import ChemicalFeatures
IPythonConsole.ipython_useSVG=True
IPythonConsole.drawOptions.addAtomIndices = False
IPythonConsole.drawOptions.addStereoAnnotation = True
IPythonConsole.drawOptions.useBWAtomPalette()
import matplotlib.pyplot as plt
%matplotlib inline

from rdkit.Chem import rdMolHash
TMP_DIR = Path("../tmp")
PHARPATH = Path("../tmp/pharmacophores")
DATAPATH = Path("../data")

train_df = pd.read_csv(DATAPATH/"train.csv", index_col=0)
test_df = pd.read_csv(DATAPATH/"test.csv", index_col=0)

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

## 1. Build library of the molecular fragments and use it
https://www.rdkit.org/docs/GettingStartedInPython.html#molecular-fragments

In [92]:
fName=os.path.join(RDConfig.RDDataDir,'FunctionalGroups.txt')
from rdkit.Chem import FragmentCatalog
fparams = FragmentCatalog.FragCatParams(1,6,fName)
fparams.GetNumFuncGroups()

39

In [93]:
fcat = FragmentCatalog.FragCatalog(fparams)
fcgen = FragmentCatalog.FragCatGenerator()

In [94]:
for smiles in tqdm(train_df.Smiles.values):
    mol = Chem.MolFromSmiles(smiles)
    fcgen.AddFragsFromMol(mol, fcat)

100%|██████████| 5557/5557 [04:12<00:00, 22.01it/s]


In [95]:
fcat.GetNumEntries()  # a lot - need to use sparse data structures

221893

In [96]:
fpgen = FragmentCatalog.FragFPGenerator()
#fp = fpgen.GetFPForMol(ms[8],fcat)
#fp.GetNumOnBits()

In [97]:
train_mols = [Chem.MolFromSmiles(smiles) for smiles in train_df.Smiles.values]
test_mols = [Chem.MolFromSmiles(smiles) for smiles in test_df.Smiles.values]

In [98]:
fps_train = []
for x in tqdm(train_mols):
    fps_train.append(fpgen.GetFPForMol(x, fcat))
fps_test = []
for x in tqdm(test_mols):
    fps_test.append(fpgen.GetFPForMol(x, fcat))

# fps_test = [fpgen.GetFPForMol(x, fcat) for x in train_mols]

100%|██████████| 5557/5557 [04:51<00:00, 19.04it/s]
100%|██████████| 1614/1614 [01:20<00:00, 20.11it/s]


In [99]:
fps_train[:4]

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f94fe9bea30>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f94fe9bea80>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f94fe9be0d0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f94fef288a0>]

In [100]:
# fp = fpgen.GetFPForMol(train_mols[0],fcat)

In [101]:
# np.asarray(fp).shape
from scipy.sparse import csr_matrix
num_fp = fcat.GetNumEntries()
num_entries = len(fps_train)
row_indices = []
col_indices = []
for i, fp in enumerate(tqdm(fps_train)):
    bits = list(fp.GetOnBits())
    col_indices.extend(bits)
    row_indices.extend([i]*len(bits))
col_indices = np.asarray(col_indices)
row_indices = np.asarray(row_indices)
assert len(col_indices) == len(row_indices)
values = np.ones((len(col_indices), ))
train_frag_matrix = csr_matrix((values, (row_indices, col_indices)), shape=(num_entries, num_fp))

100%|██████████| 5557/5557 [00:13<00:00, 402.64it/s]


In [102]:
row_indices = []
col_indices = []
for i, fp in enumerate(tqdm(fps_test)):
    bits = list(fp.GetOnBits())
    col_indices.extend(bits)
    row_indices.extend([i]*len(bits))
col_indices = np.asarray(col_indices)
row_indices = np.asarray(row_indices)
assert len(col_indices) == len(row_indices)
values = np.ones((len(col_indices), ))
test_frag_matrix = csr_matrix((values, (row_indices, col_indices)), shape=(len(fps_test), num_fp))

100%|██████████| 1614/1614 [00:04<00:00, 390.61it/s]


In [103]:
test_frag_matrix

<1614x221893 sparse matrix of type '<class 'numpy.float64'>'
	with 382062 stored elements in Compressed Sparse Row format>

In [104]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(smooth_idf=True)
tfidf_transformer.fit(train_frag_matrix)
train_frag_matrix_transformed = tfidf_transformer.transform(train_frag_matrix)
test_frag_matrix_transformed = tfidf_transformer.transform(test_frag_matrix)

In [105]:
np.random.seed(SEED + 4)
ids1 = np.where(train_df.Active == 1)[0]
ids0 = np.where(train_df.Active == 0)[0]
m = len(ids0)
rebalanced = np.concatenate([
    np.random.choice(ids1, m),
    ids0]
)
subset_ids = np.concatenate([
    np.random.choice(ids0, len(ids1)),
    ids1
])
np.median(tfidf_transformer.idf_)

8.929846429742504

In [106]:
def balance_data(y_train):
    df, counts = np.unique(y_train, return_counts=True)
    m = counts.max()
    index = np.arange(len(y_train))
    new_index = []
    for i, c in zip(df, counts):
        ids = y_train == i
        values = index[ids]
        if c == m:
            new_index.extend(values)
        else:
            new_index.extend(np.random.choice(values, m))
    np.random.shuffle(new_index)
    return new_index

In [107]:
#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf_transformer = TfidfTransformer(smooth_idf=True)
#tfidf_transformer.fit(train_frag_matrix_transformed)
#frag_matrix_transformed = tfidf_transformer.transform(train_frag_matrix_transformed)

In [108]:
from sklearn.naive_bayes import MultinomialNB

In [109]:
np.random.seed(SEED + 5)

from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
NFOLDS = 5
BALANCE_TRAIN = True
y_full_train = train_df.Active.values
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED + 8)
kfsplits = kf.split(np.arange(train_df.shape[0]), y_full_train)
split_data = dict()
all_train_scores = []
all_train_scores_rebalanced = []
all_test_scores = []
for fold, (train_index, test_index) in enumerate(kfsplits):
    split_data[fold] = (train_index, test_index)
    x_train = train_frag_matrix_transformed[train_index]
    y_train = y_full_train[train_index]
    x_val = train_frag_matrix_transformed[test_index]
    y_val = y_full_train[test_index]
    if BALANCE_TRAIN:
        balanced_index = balance_data(y_train)
        x_train = x_train[balanced_index]
        y_train = y_train[balanced_index]
    model = MultinomialNB()
    model = model.fit(x_train, y_train)
    train_preds = model.predict(train_frag_matrix_transformed)
    train_scores = model.predict_proba(train_frag_matrix_transformed)[:, 1]#[rebalanced])[:, 1] 
    print(train_preds.sum())
    all_train_scores.append(train_scores)
    train_scores_balanced = model.predict_proba(train_frag_matrix_transformed[rebalanced])[:, 1]#[rebalanced])[:, 1]
    all_train_scores_rebalanced.append(train_scores_balanced)

    test_scores = model.predict_proba(test_frag_matrix_transformed)[:, 1]#[rebalanced])[:, 1] 
    all_test_scores.append(test_scores)

all_train_scores = np.stack(all_train_scores)
all_train_scores_rebalanced = np.stack(all_train_scores_rebalanced)
all_test_scores = np.stack(all_test_scores)
    

562
608
650
625
695


In [110]:
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import roc_curve, f1_score

neg = (~train_df.Active).sum()
pos = (train_df.Active).sum()
train_scores = all_train_scores.mean(0)
train_scores_rebalanced = all_train_scores_rebalanced.mean(0)
fpr1, tpr1, thresholds1 = roc_curve(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced
)
print(tpr1.shape)
opt_tpr_fpr1 = np.argmax(tpr1-fpr1)
optimal_threshold1 = thresholds1[opt_tpr_fpr1]
print("Select threshold on train data without rebalancing", optimal_threshold1)
print(opt_tpr_fpr1, tpr1[opt_tpr_fpr1-1:opt_tpr_fpr1+2]-fpr1[opt_tpr_fpr1-1:opt_tpr_fpr1+2], optimal_threshold1)
print(classification_report(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced > optimal_threshold1,
    target_names=['Inactive', 'Active']
    )
)
print("the same threshold, on train data without rebalancing")
print(classification_report(
    train_df.Active.values,
    train_scores > optimal_threshold1,
    target_names=['Inactive', 'Active']
    )
)


(862,)
Select threshold on train data without rebalancing 0.6279235594271741
304 [0.92767707 0.93234909 0.93122781] 0.6279235594271741
              precision    recall  f1-score   support

    Inactive       0.98      0.95      0.96      5351
      Active       0.95      0.98      0.96      5351

    accuracy                           0.96     10702
   macro avg       0.96      0.96      0.96     10702
weighted avg       0.96      0.96      0.96     10702

the same threshold, on train data without rebalancing
              precision    recall  f1-score   support

    Inactive       1.00      0.95      0.97      5351
      Active       0.41      0.98      0.58       206

    accuracy                           0.95      5557
   macro avg       0.71      0.96      0.78      5557
weighted avg       0.98      0.95      0.96      5557



In [111]:
train_df.Active.values.mean()

0.03707036170595645

In [112]:

fpr, tpr, thresholds = roc_curve(
    train_df.Active.values,
    train_scores
)
print(tpr.shape)
tp = tpr*pos
fp = fpr*neg
fn = (1-tpr)*pos
f1 = 2*tp/(2*tp+fp+fn)
opt_tpr_fpr = np.argmax(f1)
optimal_threshold2 = thresholds[opt_tpr_fpr]
print("Select threshold on train data without rebalancing", optimal_threshold2)
print(opt_tpr_fpr, tpr[opt_tpr_fpr-1:opt_tpr_fpr+2]-fpr[opt_tpr_fpr-1:opt_tpr_fpr+2], optimal_threshold2)
print("on balanced train")
print(classification_report(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)
print("the same threshold, on train data without rebalancing")
print(classification_report(
    train_df.Active.values,
    train_scores > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)

(779,)
Select threshold on train data without rebalancing 0.7982320159035632
181 [0.87207726 0.87693163 0.87674475] 0.7982320159035632
on balanced train
              precision    recall  f1-score   support

    Inactive       0.91      0.97      0.94      5351
      Active       0.97      0.90      0.93      5351

    accuracy                           0.94     10702
   macro avg       0.94      0.94      0.94     10702
weighted avg       0.94      0.94      0.94     10702

the same threshold, on train data without rebalancing
              precision    recall  f1-score   support

    Inactive       1.00      0.97      0.98      5351
      Active       0.53      0.90      0.67       206

    accuracy                           0.97      5557
   macro avg       0.76      0.94      0.83      5557
weighted avg       0.98      0.97      0.97      5557



In [113]:

optimal_threshold3 = 1. - train_df.Active.mean()
print("Select threshold based on mean Active value", optimal_threshold3)
print("on balanced train")
print(classification_report(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)
print("the same threshold, on train data without rebalancing")
print(classification_report(
    train_df.Active.values,
    train_scores > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)

Select threshold based on mean Active value 0.9629296382940435
on balanced train
              precision    recall  f1-score   support

    Inactive       0.91      0.97      0.94      5351
      Active       0.97      0.90      0.93      5351

    accuracy                           0.94     10702
   macro avg       0.94      0.94      0.94     10702
weighted avg       0.94      0.94      0.94     10702

the same threshold, on train data without rebalancing
              precision    recall  f1-score   support

    Inactive       1.00      0.97      0.98      5351
      Active       0.53      0.90      0.67       206

    accuracy                           0.97      5557
   macro avg       0.76      0.94      0.83      5557
weighted avg       0.98      0.97      0.97      5557



In [114]:
# np.exp(1)**model.intercept_


In [115]:
# from rdkit.ML.InfoTheory import InfoBitRanker
# ranker = InfoBitRanker(len(fps[0]), 2)
# activities = train_df.Active.values*1
# for fp, activity in zip(fps, activities):
#     ranker.AccumulateVotes(fp, int(activity))
# top5 = ranker.GetTopN(15)
# for id,gain,n0,n1 in top5:
#     print(int(id),'%.3f'%gain,int(n0),int(n1), fcat.GetEntryDescription(int(id)))

In [116]:
# rdkit.ML.InfoTheory.BitClusterer.BitClusterer

In [117]:
preds = all_test_scores.mean(0) > optimal_threshold2 # > 0.5

In [118]:
preds.sum()

90

In [119]:
# shap_values[0]

In [120]:
test_df["Active"] = preds

In [121]:
test_df.to_csv("../tmp/multinomial_nb_all_frags_balanced_v3.csv")