I've decided to preprocess the input dataset so that it can be reused later and so that pharmacophores can be recomputed on it. To make this preprocessing reproducible, I'm doing it in this notebook.

I also want to take a look at the train/test split and construct a better validation dataset.

In [1]:
import sys
import os
import numpy as np
import pandas as pd
from pathlib import Path
import random
# One seed shared by every global random number generator used in this notebook.
SEED = 123
# Seed NumPy's legacy global generator and the stdlib generator.
np.random.seed(SEED)
random.seed(SEED)

In [2]:
# Make the project's local packages importable from the notebook directory.
CODEPATH = "../code"
if CODEPATH not in sys.path:
    sys.path.append(CODEPATH)

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from common.cleaner import split_df, clean_smiles, collect_df

In [5]:
# Directory holding the raw CSV files; both are read with their first column as the index.
DATAPATH = Path("../data")
train_df, test_df = (
    pd.read_csv(DATAPATH / csv_name, index_col=0)
    for csv_name in ("train.csv", "test.csv")
)


In [6]:
# Add a `cleaned` column to both frames by running clean_smiles on every SMILES string.
for frame in (train_df, test_df):
    frame["cleaned"] = frame["Smiles"].apply(clean_smiles)

Cannot parse ['[I-]', '[K+]'] ['K']


In [7]:
# Preview the first rows of the training frame, including the new `cleaned` column.
train_df.head()

Unnamed: 0,Smiles,Active,cleaned
0,COc1ccc2[nH]cc(CCN)c2c1,False,COc1ccc2[nH]cc(CCN)c2c1
1,CCCN1CCC[C@H](c2cccc(O)c2)C1.Cl,False,CCCN1CCC[C@H](c2cccc(O)c2)C1
2,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...,False,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...
3,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...,False,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...
4,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1,False,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1


In [8]:
# Split both frames on the cleaned SMILES while carrying the raw and cleaned
# strings along; split_df exposes them as `original_Smiles` / `original_cleaned`.
kept_columns = ("Smiles", "cleaned")
train_df_splitted = split_df(train_df, smiles_col="cleaned", keep_columns=list(kept_columns))
test_df_splitted = split_df(test_df, smiles_col="cleaned", keep_columns=list(kept_columns))

In [9]:
# Preview the first rows of the split training frame.
train_df_splitted.head()

Unnamed: 0,original_index,Active,num_parts,original_Smiles,original_cleaned,part
0,0,False,1.0,COc1ccc2[nH]cc(CCN)c2c1,COc1ccc2[nH]cc(CCN)c2c1,COc1ccc2[nH]cc(CCN)c2c1
1,1,False,1.0,CCCN1CCC[C@H](c2cccc(O)c2)C1.Cl,CCCN1CCC[C@H](c2cccc(O)c2)C1,CCCN1CCC[C@H](c2cccc(O)c2)C1
2,2,False,1.0,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...
3,3,False,1.0,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...
4,4,False,1.0,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1


In [10]:
# Sanity check that the split is revertible: re-collect the parts, join the
# result back onto train_df, and compare the shapes.
df = collect_df(train_df_splitted)
df = df.rename(columns={"original_Smiles": "Smiles"})
roundtrip = df.merge(train_df, on=["Smiles", "Active"], how="left")
roundtrip.shape == train_df.shape

Found Active


True

In [11]:
# Persist the split frames next to the raw data so later steps can reuse them.
for frame, filename in (
    (train_df_splitted, "train_splitted.csv"),
    (test_df_splitted, "test_splitted.csv"),
):
    frame.to_csv(DATAPATH / filename)

In [12]:
from DeepPurpose.utils import smiles2pubchem, smiles2morgan



In [13]:
from tqdm.auto import tqdm
# One (original_index, part, active, fingerprint) tuple per row of the split
# training frame, in row order. This is slow: smiles2pubchem runs once per row.
train_rows = train_df_splitted[["original_index", "part", "Active"]].values
train_fingerprints = [
    (original_index, part, active, smiles2pubchem(part))
    for original_index, part, active in tqdm(train_rows)
]


100%|██████████| 5758/5758 [04:49<00:00, 19.91it/s]


In [14]:
# Same as for the training frame, but the test rows carry no activity label:
# one (original_index, part, fingerprint) tuple per row, in row order.
test_rows = test_df_splitted[["original_index", "part"]].values
test_fingerprints = [
    (original_index, part, smiles2pubchem(part))
    for original_index, part in tqdm(test_rows)
]


100%|██████████| 1681/1681 [01:47<00:00, 15.71it/s]


In [15]:
import numpy as np
# The fingerprint is the last element of every tuple; stack them into one
# array per dataset (row i corresponds to row i of the split frame).
train_fp = np.stack([fingerprint for *_, fingerprint in train_fingerprints])
test_fp = np.stack([fingerprint for *_, fingerprint in test_fingerprints])

In [16]:
from catboost import CatBoostClassifier

# Train-vs-test discrimination dataset: features are the fingerprints of both
# datasets (train rows first), and the label says where a row came from
# (0.0 = train, 1.0 = test) -- it is NOT the activity label.
x_train = np.concatenate((train_fp, test_fp), axis=0)
y_train = np.concatenate(
    (np.zeros(len(train_fp)), np.ones(len(test_fp))),
    axis=0,
)

In [17]:
# Classifier that tries to tell train rows (label 0) from test rows (label 1).
# NOTE(review): eval_set is the same data the model is fitted on, so the
# plotted F1 is an in-sample score -- confirm this is intended.
catboost_params = dict(
    random_seed=42,
    depth=11,
    iterations=100,
    auto_class_weights="Balanced",
    eval_metric="F1",
)
model = CatBoostClassifier(**catboost_params)
model.fit(
    x_train,
    y_train,
    eval_set=(x_train, y_train),
    plot=True,
    silent=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f90e37f5290>

In [18]:
# Class probabilities for every row of x_train (train rows first, then test
# rows); these are the same rows the model was fitted on.
predictions = model.predict_proba(x_train)

In [19]:
# (rows whose predicted origin matches the label, total rows, number of train rows)
predicted_origin = predictions.argmax(axis=1)
(predicted_origin == y_train).sum(), len(y_train), len(train_fp)

(6812, 7439, 5758)

In [20]:
# Probability of class 1 ("comes from the test set") for every row.
preds1 = predictions[:, 1]

In [21]:
# Class-1 probabilities of the rows labelled 1, i.e. the rows that came from test_fp.
preds1[y_train==1]  # test dataset

array([0.72892155, 0.82534539, 0.75346306, ..., 0.54401058, 0.76688576,
       0.65985463])

In [22]:
# Class-1 ("test-like") probabilities of the train rows only; position i
# corresponds to row i of train_fp / train_fingerprints.
train_preds = preds1[y_train==0]

In [23]:
# Train row positions ordered from the highest to the lowest test-likeness
# score; show the top len(test_fp) scores next to the test set size.
ids = np.flip(np.argsort(train_preds))
top_scores = train_preds[ids][:len(test_fp)]
top_scores, len(test_fp)

(array([0.80396857, 0.79183887, 0.78817349, ..., 0.34115085, 0.34108438,
        0.34108438]),
 1681)

In [24]:
# Activity flag of every train row (third tuple element), plus the row
# positions of the active and of the inactive rows.
activities = np.asarray([active for _, _, active, _ in train_fingerprints])
active1 = np.flatnonzero(activities)
active0 = np.flatnonzero(~activities)

In [54]:
# Fraction of each activity class to pick.
N = 0.20
n0, n1 = (int(len(group) * N) for group in (active0, active1))
print(n0, n1)

# Within each class, rank rows by descending test-likeness score and keep the
# top n rows; pos_ids / neg_ids are positions into train_preds.
ids1 = np.argsort(train_preds[active1])[::-1]
ids0 = np.argsort(train_preds[active0])[::-1]
pos_ids = active1[ids1[:n1]]
neg_ids = active0[ids0[:n0]]

1108 43


In [55]:
# For comparison: the n0 + n1 highest-scoring rows regardless of class, with
# their share and count of actives.
ids_ = np.argsort(train_preds)[::-1][:n0 + n1]
top_activities = activities[ids_]
top_activities.mean(), top_activities.sum()

(0.03909643788010426, 45)

In [56]:
# Candidate validation rows: selected actives followed by selected inactives
# (positions into train_preds).
val_ids = np.concatenate((pos_ids, neg_ids))
val_ids.shape

(1151,)

In [57]:
# Median test-likeness score of the selected validation rows.
np.median(train_preds[val_ids])

0.49519580765819243

In [58]:
# Positions of the train rows that were NOT selected for validation.
# np.setdiff1d replaces the original list comprehension, whose
# `x not in val_ids` test scanned the whole val_ids array once per row.
train_ids_ = np.setdiff1d(np.arange(len(train_preds)), val_ids)
# Median test-likeness score of the remaining rows, to compare with the
# validation rows' median.
np.median(train_preds[train_ids_])

0.2060434529900928

In [59]:
# Put the selected validation positions in ascending order.
val_ids = np.sort(val_ids)

In [60]:
# Flag the validation rows of the split training frame.
# val_ids are ROW POSITIONS: they index train_preds, which was built row by
# row from train_df_splitted. They are not `original_index` values, so the
# mask is built by position. The previous membership test against
# `original_index` marks the wrong rows whenever row position and
# original_index differ (for example, if split_df emits several rows for one
# molecule), and it rebuilt set(val_ids) for every row.
val_mask = np.zeros(len(train_df_splitted), dtype=bool)
val_mask[val_ids] = True
train_df_splitted["val_index"] = val_mask

In [61]:
# Save the split training frame together with its `val_index` flag column.
train_df_splitted.to_csv(DATAPATH / "train_splitted_val.csv")