In [1]:
%load_ext autoreload
%autoreload 2

import typing
from typing import List, Iterable
import pickle
import numpy as np
import pandas as pd

import rdkit
from rdkit.Chem import AllChem as AllChem
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate

In [2]:
with open("AID_628/data.pkl","rb") as file:
    data = pickle.load(file)
    
smiles_ls = data.get("smiles_ls")
mol_ls = data.get("mol_ls")
fp_ls = data.get("fp_ls")
activity_ls = data.get("activity_ls")
ds_ls = data.get("ds_ls")

del data

X = np.concatenate([np.array(ds_ls), np.array(fp_ls)], axis=1)
y = np.array(activity_ls)
print(X.shape, y.shape)

SEED = 0
P_INIT = 0.15
P_ITER = 0.05
N_ITER = 4
P_EXPLOIT = 0.8
N_TOTAL = len(fp_ls)

(63662, 1121) (63662,)


In [18]:
N_JOBS = 4

# define model
rf = RandomForestClassifier(n_estimators=1200, class_weight="balanced",
    max_features='log2',bootstrap=True,min_samples_split = 8,
    min_samples_leaf = 3, n_jobs = N_JOBS,random_state=SEED)

svm = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
    fit_intercept=True, max_iter=10000,tol=0.001, shuffle=True, verbose=0,
    epsilon=0.1, n_jobs=N_JOBS, random_state=None, learning_rate='optimal',
    eta0=0.0007, power_t=0.5, class_weight='balanced', warm_start=False,
    average=5)

lgbm = lgb.LGBMClassifier(boosting_type='dart', num_leaves=42, max_depth=-1,
    learning_rate=0.25, n_estimators=1200, subsample_for_bin=200000,
    objective='binary', is_unbalance=False, max_bin=200,
    min_child_weight=0.001, min_child_samples=30, subsample=1.0,
    subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0,
    random_state=None, n_jobs=N_JOBS, silent=True,
    importance_type='split')


model_ls = [rf, svm, lgbm]

In [22]:
scoring = ['accuracy', 'precision', 'recall', 'roc_auc',]
scores = cross_validate(rf, X, y, scoring=scoring)
scores_ls = [cross_validate(model, X, y, scoring=scoring) for model in model_ls]

[LightGBM] [Info] Number of positive: 1744, number of negative: 49185
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.115025 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16402
[LightGBM] [Info] Number of data points in the train set: 50929, number of used features: 1119
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034244 -> initscore=-3.339407
[LightGBM] [Info] Start training from score -3.339407
[LightGBM] [Info] Number of positive: 1744, number of negative: 49185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.155851 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16486
[LightGBM] [Info] Number of data points in the train set: 50929, number of used features: 1119
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034244 -> initscore=-3.33940

In [23]:
scores_ls

# svm fitted everything to positive...

# rf > lgbm in auc, lgbm > rf in recall

[{'fit_time': array([94.21736455, 91.98897028, 89.56756973, 92.68861818, 92.38944411]),
  'score_time': array([6.17101908, 8.45499635, 6.12318063, 5.78977561, 6.04073596]),
  'test_accuracy': array([0.96654363, 0.96560119, 0.96591266, 0.966541  , 0.96567703]),
  'test_precision': array([0.91666667, 0.44444444, 0.55555556, 0.72727273, 0.48571429]),
  'test_recall': array([0.02522936, 0.01834862, 0.02293578, 0.03669725, 0.03899083]),
  'test_roc_auc': array([0.68786207, 0.67512868, 0.67593362, 0.68861564, 0.77434893])},
 {'fit_time': array([1.30261183, 6.03028321, 1.50522566, 1.49150443, 8.75233364]),
  'score_time': array([0.1477282 , 0.0899713 , 0.09896803, 0.09600353, 0.08252239]),
  'test_accuracy': array([0.03424173, 0.03424173, 0.03424442, 0.03424442, 0.03424442]),
  'test_precision': array([0.03424173, 0.03424173, 0.03424442, 0.03424442, 0.03424442]),
  'test_recall': array([1., 1., 1., 1., 1.]),
  'test_roc_auc': array([0.57235533, 0.55648838, 0.57013133, 0.55605164, 0.55263739])

In [5]:
def random_pick(fp_ls: List[rdkit.DataStructs.cDataStructs.ExplicitBitVect], 
                n: int, 
                seed: int = 42) -> List[int]:
    """Random diverse compound picking based on Rdkit MaxMinPicker"""
    picker = MaxMinPicker()
    return list(picker.LazyBitVectorPick(fp_ls, len(fp_ls), n, seed=seed))


class Indice():
    def __init__(self, size: int) -> None:
        self.unsampled = list(range(size))
        self.sampled = []
        
    def add(self, idxs: Iterable[int]) -> None:
        self.sampled = list(set(self.sampled + list(idxs)))
        self.unsampled = list(set(self.unsampled) - set(self.sampled))

# test
idxs = Indice(10)

idxs.add([1,2])
print(idxs.sampled)
print(idxs.unsampled)

idxs.add([1,2,8])
print(idxs.sampled)
print(idxs.unsampled)

idxs.add([1,2,8])
print(idxs.sampled)
print(idxs.unsampled)

[1, 2]
[0, 3, 4, 5, 6, 7, 8, 9]
[8, 1, 2]
[0, 3, 4, 5, 6, 7, 9]
[8, 1, 2]
[0, 3, 4, 5, 6, 7, 9]


In [6]:
clf = RandomForestClassifier(n_estimators=1200, class_weight="balanced",
    max_features='log2',bootstrap=True,min_samples_split = 8,
    min_samples_leaf = 3, n_jobs = 4,random_state=SEED)

# select initial subset
init_idx = random_pick(fp_ls, int(P_INIT*len(fp_ls)))
idxs = Indice(len(fp_ls))
idxs.add(init_idx)

for _ in range(N_ITER):
    
    print(np.array(activity_ls)[idxs.sampled].sum() / np.array(activity_ls).sum())
    
    # train supversied model
    # form dataset
    X_sampled = X[idxs.sampled]
    y_sampled = y[idxs.sampled]
    X_unsampled = X[idxs.unsampled]

    # fit & make prediction
    clf.fit(X_sampled, y_sampled)
    probs = clf.predict_proba(X_unsampled)[:,0]  # prob of being inactive

    # select next batch
    # add exploitation set
    n_exploit = int(P_ITER * N_TOTAL * P_EXPLOIT)
    idx_exploit = np.argsort(probs)[:n_exploit]  # sorted from low -> high
    idxs.add(np.array(idxs.unsampled)[idx_exploit])

    # add exploration set
    n_explore = int(P_ITER * N_TOTAL * (1-P_EXPLOIT))
    idx_explore = random_pick([fp_ls[i] for i in idxs.unsampled], n_explore)
    idxs.add(np.array(idxs.unsampled)[idx_explore])

np.array(activity_ls)[idxs.sampled].sum() / np.array(activity_ls).sum()

0.16559633027522935
0.36009174311926606
0.4651376146788991
0.5344036697247706


0.594954128440367

In [23]:
ytrue_idxs = np.where(np.array(activity_ls))[0]
ytrue = np.array(smiles_ls)[ytrue_idxs]

yhat_idxs = np.where(np.array(activity_ls)[idxs.sampled])[0]
yhat = np.array(smiles_ls)[hits_idxs]
yhat.shape

(1297,)

In [9]:
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams

pains_idxs = []

# initialize filter
params = FilterCatalogParams()
params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
catalog = FilterCatalog(params)


for i, mol in zip(range(len(mol_ls)), mol_ls):
    entry = catalog.GetFirstMatch(mol)  # Get the first matching PAINS
    if entry is not None:
        pains_idxs.append(i)

In [24]:
print(f"% PAINS: {len(pains_idxs)/len(activity_ls)}")
print(f"% Hits: {sum(activity_ls)/len(activity_ls)}")

print(f"# PAINs active: {len(pains_idxs) + len(ytrue_idxs) - len(set(pains_idxs+list(ytrue_idxs)))}")
print(f"% PAINs active: {(len(pains_idxs) + len(ytrue_idxs) - len(set(pains_idxs+list(ytrue_idxs)))) / len(ytrue_idxs)}")

print(f"# PAINs sampled hit: {len(pains_idxs) + len(yhat_idxs) - len(set(pains_idxs+list(yhat_idxs)))}")
print(f"% PAINs sampled hit: {(len(pains_idxs) + len(yhat_idxs) - len(set(pains_idxs+list(yhat_idxs)))) / len(yhat_idxs)}")

% PAINS: 0.03582985140272062
% Hits: 0.03424334767993466
# PAINs active: 116
% PAINs active: 0.05321100917431193
# PAINs sampled hit: 42
% PAINs sampled hit: 0.03238242097147263


In [None]:
# => not biasing towards PAINs