# Description

Custom fingerprints based on rdkit data. 
Various other things.

I need this notebook simply for the code drafts.

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import random
import os
import sys
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import RDConfig
from rdkit.Chem import ChemicalFeatures
IPythonConsole.ipython_useSVG=True
IPythonConsole.drawOptions.addAtomIndices = False
IPythonConsole.drawOptions.addStereoAnnotation = True
IPythonConsole.drawOptions.useBWAtomPalette()
import matplotlib.pyplot as plt
%matplotlib inline

from rdkit.Chem import rdMolHash
TMP_DIR = Path("../tmp")
PHARPATH = Path("../tmp/pharmacophores")
DATAPATH = Path("../data")

train_df = pd.read_csv(DATAPATH/"train.csv", index_col=0)
test_df = pd.read_csv(DATAPATH/"test.csv", index_col=0)

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [2]:
DEP_DIR = "../code"
if DEP_DIR not in sys.path:
    sys.path.append(DEP_DIR)
from common.cleaner import split_df, clean_smiles, collect_df
train_df["cleaned_smiles"] = train_df.Smiles.apply(clean_smiles)
test_df["cleaned_smiles"] = test_df.Smiles.apply(clean_smiles)
train_df = split_df(train_df, smiles_col="cleaned_smiles", keep_columns=["cleaned_smiles", "Smiles"])
test_df = split_df(test_df, smiles_col="cleaned_smiles", keep_columns=["cleaned_smiles", "Smiles"])

train_df.head()

Cannot parse ['[K+]', '[I-]']


Unnamed: 0,original_index,Active,num_parts,original_Smiles,original_cleaned_smiles,Smiles
0,0,False,1.0,COc1ccc2[nH]cc(CCN)c2c1,COc1ccc2[nH]cc(CCN)c2c1,COc1ccc2[nH]cc(CCN)c2c1
1,1,False,1.0,CCCN1CCC[C@H](c2cccc(O)c2)C1.Cl,CCCN1CCC[C@H](c2cccc(O)c2)C1,CCCN1CCC[C@H](c2cccc(O)c2)C1
2,2,False,1.0,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...
3,3,False,1.0,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...
4,4,False,1.0,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1


In [5]:
# collect_df(test_df, split_col="Smiles")


In [6]:
train_df.Active.value_counts()

False    5540
True      218
Name: Active, dtype: int64

## 1. Build library of the molecular fragments and use it
https://www.rdkit.org/docs/GettingStartedInPython.html#molecular-fragments

In [7]:
fName=os.path.join(RDConfig.RDDataDir,'FunctionalGroups.txt')
from rdkit.Chem import FragmentCatalog
fparams = FragmentCatalog.FragCatParams(1,6,fName)
fparams.GetNumFuncGroups()

39

In [8]:
fcat = FragmentCatalog.FragCatalog(fparams)
fcgen = FragmentCatalog.FragCatGenerator()

In [9]:
# (train_df.Smiles.apply(len) == 0).sum()
train_df["molecule"] = train_df.Smiles.apply(lambda x: Chem.MolFromSmiles(x))

In [10]:

for mol in tqdm(train_df.molecule.values):
    #mol = Chem.MolFromSmiles(smiles)
    fcgen.AddFragsFromMol(mol, fcat)

100%|██████████| 5758/5758 [04:06<00:00, 23.33it/s]


In [11]:
fcat.GetNumEntries()  # a lot - need to use sparse data structures

219855

In [12]:
fpgen = FragmentCatalog.FragFPGenerator()
#fp = fpgen.GetFPForMol(ms[8],fcat)
#fp.GetNumOnBits()

In [13]:
train_mols = [Chem.MolFromSmiles(smiles) for smiles in train_df.Smiles.values]
test_mols = [Chem.MolFromSmiles(smiles) for smiles in test_df.Smiles.values]

In [14]:
fps_train = []
for x in tqdm(train_mols):
    fps_train.append(fpgen.GetFPForMol(x, fcat))
fps_test = []
for x in tqdm(test_mols):
    fps_test.append(fpgen.GetFPForMol(x, fcat))

# fps_test = [fpgen.GetFPForMol(x, fcat) for x in train_mols]

100%|██████████| 5758/5758 [03:22<00:00, 28.37it/s]
100%|██████████| 1681/1681 [01:03<00:00, 26.35it/s]


In [15]:
RDConfig.RDDataDir

'/Users/lacemaker/anaconda3/envs/data_env/share/RDKit/Data'

In [16]:
fps_train[:4]

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f8c44775170>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f8c44775300>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f8c44775350>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f8c44775440>]

In [17]:
# fp = fpgen.GetFPForMol(train_mols[0],fcat)

In [18]:
# np.asarray(fp).shape
from scipy.sparse import csr_matrix
num_fp = fcat.GetNumEntries()
num_entries = len(fps_train)
row_indices = []
col_indices = []
for i, fp in enumerate(tqdm(fps_train)):
    bits = list(fp.GetOnBits())
    col_indices.extend(bits)
    row_indices.extend([i]*len(bits))
col_indices = np.asarray(col_indices)
row_indices = np.asarray(row_indices)
assert len(col_indices) == len(row_indices)
values = np.ones((len(col_indices), ))
train_frag_matrix = csr_matrix((values, (row_indices, col_indices)), shape=(num_entries, num_fp))

100%|██████████| 5758/5758 [00:14<00:00, 398.12it/s]


In [19]:
row_indices = []
col_indices = []
for i, fp in enumerate(tqdm(fps_test)):
    bits = list(fp.GetOnBits())
    col_indices.extend(bits)
    row_indices.extend([i]*len(bits))
col_indices = np.asarray(col_indices)
row_indices = np.asarray(row_indices)
assert len(col_indices) == len(row_indices)
values = np.ones((len(col_indices), ))
test_frag_matrix = csr_matrix((values, (row_indices, col_indices)), shape=(len(fps_test), num_fp))

100%|██████████| 1681/1681 [00:04<00:00, 381.56it/s]


# Smart (or not) filtering
1. Select fragments which are present in train and not present in test and remove them
2. For all the remaining ones: select top N (N=2000 or 0.95 model's compulative gain)

In [20]:
# ids_present_in_test = np.unique(test_frag_matrix.indices)
# %%timeit
test_bits_counts = np.asarray(test_frag_matrix.sum(0)).reshape(-1)
useful_bits = np.where(test_bits_counts > 0)[0]
non_useful_bits = np.where(test_bits_counts == 0)[0]

In [21]:
# ~ids_present_in_test & np.arange(test_frag_matrix.shape[1])
#test_frag_matrix = test_frag_matrix[:, useful_bits]
#train_frag_matrix = train_frag_matrix[:, useful_bits]

In [22]:
non_useful_bits = [int(x) for x in non_useful_bits]

In [23]:
len(useful_bits)

76281

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(smooth_idf=True)
tfidf_transformer.fit(train_frag_matrix)
train_frag_matrix_transformed = tfidf_transformer.transform(train_frag_matrix)
test_frag_matrix_transformed = tfidf_transformer.transform(test_frag_matrix)

In [25]:
ids = np.argsort(tfidf_transformer.idf_)
tfidf_transformer.idf_[ids].shape

(219855,)

In [26]:
from rdkit.ML.InfoTheory import InfoBitRanker
ranker = InfoBitRanker(len(fps_train[0]), 2)
activities = train_df.Active.values*1
records_no_bits = []
for i, (fp, activity) in enumerate(tqdm(zip(fps_train, activities), total=len(fps_train))):
    fp.UnSetBitsFromList(non_useful_bits)
    if fp.GetNumOnBits() == 0:
        records_no_bits.append(i)
        continue
    ranker.AccumulateVotes(fp, int(activity))


100%|██████████| 5758/5758 [11:30<00:00,  8.34it/s]


In [27]:
top_values = ranker.GetTopN(4000)
for identifier, gain, n0, n1 in top_values[:20]:
    print(int(identifier),'%.3f'%gain,int(n0),int(n1), fcat.GetEntryDescription(int(identifier)))

9702 0.013 52 31 cc<-O>c<-O>
9700 0.012 57 31 c<-O>c<-O>
9707 0.012 52 30 ccc<-O>c<-O>
12252 0.012 51 29 cccc<-O>c<-O>
50003 0.009 2 12 c<-O>c<-O>c<-O>
50016 0.009 2 12 cc<-O>c<-O>c<-O>c
50007 0.009 2 12 cc<-O>c<-O>c<-O>
9718 0.008 30 19 c<-O>c<-O>ccC
50078 0.008 2 11 c<-O>1cccc<-O>c<-O>1
50014 0.008 2 11 ccc<-O>c<-O>c<-O>
50035 0.008 2 11 ccc<-O>c<-O>c<-O>c
50032 0.008 2 11 cccc<-O>c<-O>c<-O>
21974 0.008 29 18 c<-O>c<-O>cc(c)C
50037 0.007 3 11 c<-O>c<-O>cccc<-O>
12263 0.007 46 19 c<-O>c<-O>cccc
9709 0.006 47 19 cc<-O>c<-O>c
9719 0.006 47 19 ccc<-O>c<-O>c
21975 0.006 29 16 c<-O>c<-O>cccC
21984 0.006 7 11 c<-O>c<-O>cc(c)CC
21973 0.006 7 11 c<-O>c<-O>ccCC


In [28]:
bit_indices = np.asarray([int(x[0]) for x in top_values])

In [29]:
# bit_indices = useful_bits

In [30]:
np.random.seed(SEED + 4)
ids1 = np.where(train_df.Active == 1)[0]
ids0 = np.where(train_df.Active == 0)[0]
m = len(ids0)
rebalanced = np.concatenate([
    np.random.choice(ids1, m),
    ids0]
)
subset_ids = np.concatenate([
    np.random.choice(ids0, len(ids1)),
    ids1
])
np.median(tfidf_transformer.idf_)

8.965371946946728

In [31]:
def balance_data(y_train):
    df, counts = np.unique(y_train, return_counts=True)
    m = counts.max()
    index = np.arange(len(y_train))
    new_index = []
    for i, c in zip(df, counts):
        ids = y_train == i
        values = index[ids]
        if c == m:
            new_index.extend(values)
        else:
            new_index.extend(np.random.choice(values, m))
    np.random.shuffle(new_index)
    return new_index

In [32]:
train_frag_matrix_transformed[:, bit_indices]

<5758x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 190586 stored elements in Compressed Sparse Row format>

In [33]:
#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf_transformer = TfidfTransformer(smooth_idf=True)
#tfidf_transformer.fit(train_frag_matrix_transformed)
#frag_matrix_transformed = tfidf_transformer.transform(train_frag_matrix_transformed)

In [34]:
from sklearn.naive_bayes import MultinomialNB

In [42]:
train_df.Active.isnull().sum()

0

In [44]:
np.random.seed(SEED + 5)

from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
NFOLDS = 15#11
BALANCE_TRAIN = True
y_full_train = train_df.Active.values.astype(int)
kf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED + 8)
kfsplits = kf.split(np.arange(train_df.shape[0]), y_full_train)
split_data = dict()
all_train_scores = []
all_train_scores_rebalanced = []
all_test_scores = []
for fold, (train_index, test_index) in enumerate(kfsplits):
    split_data[fold] = (train_index, test_index)
    x_train = train_frag_matrix_transformed[train_index][:, bit_indices]
    y_train = y_full_train[train_index]
    x_val = train_frag_matrix_transformed[test_index][:, bit_indices]
    y_val = y_full_train[test_index]
    if BALANCE_TRAIN:
        balanced_index = balance_data(y_train)
        x_train = x_train[balanced_index]
        y_train = y_train[balanced_index]
    model = MultinomialNB()
    model = model.fit(x_train, y_train)
    train_preds = model.predict(train_frag_matrix_transformed[:, bit_indices])
    train_scores = model.predict_proba(train_frag_matrix_transformed[:, bit_indices])[:, 1]#[rebalanced])[:, 1] 
    print(train_preds.sum())
    all_train_scores.append(train_scores)
    train_scores_balanced = model.predict_proba(
        train_frag_matrix_transformed[rebalanced][:, bit_indices]
    )[:, 1]#[rebalanced])[:, 1]
    all_train_scores_rebalanced.append(train_scores_balanced)

    test_scores = model.predict_proba(
        test_frag_matrix_transformed[:, bit_indices]
    )[:, 1]#[rebalanced])[:, 1] 
    all_test_scores.append(test_scores)

all_train_scores = np.stack(all_train_scores)
all_train_scores_rebalanced = np.stack(all_train_scores_rebalanced)
all_test_scores = np.stack(all_test_scores)
    

527
555
534
531
550
536
528
537
520
540
537
514
532
536
534


In [45]:
train_frag_matrix_transformed[:, bit_indices]

<5758x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 190586 stored elements in Compressed Sparse Row format>

In [48]:
train_df["Active"] = train_df.Active.astype(int)

In [49]:
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import roc_curve, f1_score

neg = (~train_df.Active).sum()
pos = (train_df.Active).sum()
train_scores = all_train_scores.mean(0)
train_scores_rebalanced = all_train_scores_rebalanced.mean(0)
fpr1, tpr1, thresholds1 = roc_curve(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced
)
print(tpr1.shape)
opt_tpr_fpr1 = np.argmax(tpr1-fpr1)
optimal_threshold1 = thresholds1[opt_tpr_fpr1]
print("Select threshold on train data without rebalancing", optimal_threshold1)
print(opt_tpr_fpr1, tpr1[opt_tpr_fpr1-1:opt_tpr_fpr1+2]-fpr1[opt_tpr_fpr1-1:opt_tpr_fpr1+2], optimal_threshold1)
print(classification_report(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced > optimal_threshold1,
    target_names=['Inactive', 'Active']
    )
)
print("the same threshold, on train data without rebalancing")
print(classification_report(
    train_df.Active.values,
    train_scores > optimal_threshold1,
    target_names=['Inactive', 'Active']
    )
)


(991,)
Select threshold on train data without rebalancing 0.4487115140179196
344 [0.5801444  0.583213   0.57761733] 0.4487115140179196
              precision    recall  f1-score   support

    Inactive       0.79      0.78      0.79      5540
      Active       0.79      0.80      0.79      5540

    accuracy                           0.79     11080
   macro avg       0.79      0.79      0.79     11080
weighted avg       0.79      0.79      0.79     11080

the same threshold, on train data without rebalancing
              precision    recall  f1-score   support

    Inactive       0.99      0.78      0.87      5540
      Active       0.13      0.80      0.22       218

    accuracy                           0.78      5758
   macro avg       0.56      0.79      0.55      5758
weighted avg       0.96      0.78      0.85      5758



In [50]:
train_df.Active.values.mean()

0.03786036818339701

In [51]:

fpr, tpr, thresholds = roc_curve(
    train_df.Active.values,
    train_scores
)
print(tpr.shape)
tp = tpr*pos
fp = fpr*neg
fn = (1-tpr)*pos
f1 = 2*tp/(2*tp+fp+fn)
opt_tpr_fpr = np.argmax(f1)
optimal_threshold2 = thresholds[opt_tpr_fpr]
print("Select threshold on train data without rebalancing", optimal_threshold2)
print(opt_tpr_fpr, tpr[opt_tpr_fpr-1:opt_tpr_fpr+2]-fpr[opt_tpr_fpr-1:opt_tpr_fpr+2], optimal_threshold2)
print("on balanced train")
print(classification_report(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)
print("the same threshold, on train data without rebalancing")
print(classification_report(
    train_df.Active.values,
    train_scores > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)

(958,)
Select threshold on train data without rebalancing 0.5194611816343447
187 [0.55045044 0.54774285 0.55691717] 0.5194611816343447
on balanced train
              precision    recall  f1-score   support

    Inactive       0.71      0.94      0.81      5540
      Active       0.91      0.61      0.73      5540

    accuracy                           0.78     11080
   macro avg       0.81      0.78      0.77     11080
weighted avg       0.81      0.78      0.77     11080

the same threshold, on train data without rebalancing
              precision    recall  f1-score   support

    Inactive       0.98      0.94      0.96      5540
      Active       0.29      0.61      0.39       218

    accuracy                           0.93      5758
   macro avg       0.64      0.77      0.68      5758
weighted avg       0.96      0.93      0.94      5758



In [52]:

optimal_threshold3 = 1. - train_df.Active.mean()
print("Select threshold based on mean Active value", optimal_threshold3)
print("on balanced train")
print(classification_report(
    train_df.Active.values[rebalanced],
    train_scores_rebalanced > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)
print("the same threshold, on train data without rebalancing")
print(classification_report(
    train_df.Active.values,
    train_scores > optimal_threshold2,
    target_names=['Inactive', 'Active']
    )
)

Select threshold based on mean Active value 0.962139631816603
on balanced train
              precision    recall  f1-score   support

    Inactive       0.71      0.94      0.81      5540
      Active       0.91      0.61      0.73      5540

    accuracy                           0.78     11080
   macro avg       0.81      0.78      0.77     11080
weighted avg       0.81      0.78      0.77     11080

the same threshold, on train data without rebalancing
              precision    recall  f1-score   support

    Inactive       0.98      0.94      0.96      5540
      Active       0.29      0.61      0.39       218

    accuracy                           0.93      5758
   macro avg       0.64      0.77      0.68      5758
weighted avg       0.96      0.93      0.94      5758



In [53]:
# np.exp(1)**model.intercept_


In [54]:
# from rdkit.ML.InfoTheory import InfoBitRanker
# ranker = InfoBitRanker(len(fps[0]), 2)
# activities = train_df.Active.values*1
# for fp, activity in zip(fps, activities):
#     ranker.AccumulateVotes(fp, int(activity))
# top5 = ranker.GetTopN(15)
# for id,gain,n0,n1 in top5:
#     print(int(id),'%.3f'%gain,int(n0),int(n1), fcat.GetEntryDescription(int(id)))

In [55]:
# rdkit.ML.InfoTheory.BitClusterer.BitClusterer

In [56]:
preds = all_test_scores.mean(0) > optimal_threshold2 # > 0.5

In [57]:
preds.sum()

170

In [58]:
# shap_values[0]

In [59]:
test_df["Active"] = preds

In [60]:
from common.cleaner import collect_df

In [64]:
submission_df = pd.read_csv(DATAPATH/"test.csv", index_col=0)
submission_df.tail()

Unnamed: 0,Smiles
1609,NS(=O)(=O)c1cc2c(cc1Cl)NC(C1CC3C=CC1C3)NS2(=O)=O
1610,Cc1cccc(Nc2ccncc2S(=O)(=O)NC(=O)NC(C)C)c1
1611,CCCC(=O)O[C@]1(C(=O)CO)CC[C@H]2[C@@H]3CCC4=CC(...
1612,CN(C)c1cccc(Oc2cnc(Nc3cccc(O)c3)nc2)c1
1613,O=C(O)c1ccccc1-c1c2ccc(=O)cc-2oc2cc(O)ccc12


In [65]:
df = test_df.groupby("original_Smiles").agg({
    # "Smiles": lambda x: ".".join(x),
    "Active": lambda x: x.any(),
    # "original_Smiles": "first"
}).reset_index()
activity_dict = {smiles: activity for smiles, activity in df.values}

In [67]:
submission_df["Active"] = submission_df.Smiles.apply(lambda x: activity_dict[x])

In [68]:
submission_df.to_csv("../tmp/multinomial_nb_all_frags_balanced_cleaned_v4_4000_nfolds15.csv")

In [69]:
preds.sum()

170