In [1]:
%load_ext autoreload
%autoreload 2

import typing
from typing import List, Iterable
import pickle
import numpy as np
import pandas as pd

import rdkit
from rdkit.Chem import AllChem as AllChem
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from utils import *

# Run-wide settings shared by the screening cells below.
SEED = 0  # passed as random_state to RandomForestClassifier
P_INIT = 0.15  # initial picks are sized as int(P_INIT * number of compounds)
P_ITER = 0.05  # default p_iter of iHTS: each round selects int(p_iter * len(y)) indices
N_ITER = 4  # default n_iter of iHTS: number of select/observe rounds
P_EXPLORE = 0.2  # second argument to Supervised_learner; presumably an exploration fraction — TODO confirm in utils

Loaded 97 descriptor functions


In [2]:
# Load the pickled dataset stored under dataset/AID_1259374.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load trusted files.
with open("dataset/AID_1259374/data.pkl","rb") as file:
    data = pickle.load(file)
    
# .get returns None for a missing key, so a missing "fp_ls" only surfaces at len(fp_ls) below.
fp_ls = data.get("fp_ls")
y = np.array(data.get("activity_ls"))

# initial picks using RDkit
# random_pick is not defined in this notebook (presumably it comes from utils);
# it is asked for int(P_INIT * len(fp_ls)) picks.
init_idxs = random_pick(fp_ls, int(P_INIT*len(fp_ls)))

In [2]:
def iHTS(learner, y, init_idxs, n_iter=N_ITER, p_iter=P_ITER):
    """Run an iterative screening loop and record the sampled set per round.

    The learner is first shown the labels of ``init_idxs``. Then, for
    ``n_iter`` rounds, it selects ``int(p_iter * len(y))`` indices from the
    tracker's ``unsampled`` pool and is shown their labels. Each round prints
    ``"<round>:"`` followed by the value of ``hit_screened(y, sampled)``.

    Args:
        learner: object exposing ``observe(idxs, labels)`` and
            ``select(candidates, n)``.
        y: labels, indexed with the index collections (``y[idxs]``).
        init_idxs: indices observed before the first round.
        n_iter: number of selection rounds.
        p_iter: per-round batch size as a fraction of ``len(y)``.

    Returns:
        A list with one entry per round: ``Indice.sampled`` as read right
        after that round's indices were added.
        NOTE(review): whether the entries are independent snapshots depends
        on what ``Indice.sampled`` returns — confirm in utils.
    """
    history = []
    tracker = Indice(len(y))

    # Seed the tracker and the learner with the initial picks.
    tracker.add(init_idxs)
    learner.observe(init_idxs, y[init_idxs])

    for round_no in range(n_iter):
        # Emit the round label first so progress is visible while selecting.
        print(f"{round_no}:", end='')

        pool = tracker.unsampled
        batch_size = int(p_iter * len(y))
        picked = learner.select(pool, batch_size)

        learner.observe(picked, y[picked])
        tracker.add(picked)
        history.append(tracker.sampled)

        score = hit_screened(y, tracker.sampled)
        print(f"{score}")

    return history

In [5]:
"""Random Forest"""
# Balanced random forest wrapped in a Supervised_learner, then one iHTS run.
# NOTE(review): the recorded output of this cell is a ValueError ("Input X
# contains infinity or a value too large for dtype('float32')"), so the
# features handed to the classifier presumably need cleaning first.
clf = RandomForestClassifier(
    n_estimators=1200,
    max_features="log2",
    bootstrap=True,
    min_samples_split=8,
    min_samples_leaf=3,
    class_weight="balanced",
    n_jobs=16,
    random_state=SEED,
)
learner = Supervised_learner(clf, P_EXPLORE, data)

results = iHTS(learner, y, init_idxs)

  array = numpy.asarray(array, order=order, dtype=dtype)


ValueError: Input X contains infinity or a value too large for dtype('float32').

In [3]:
# Reload dataset/AID_1259374; only the labels are unpacked here, while `data`
# itself is handed to Thompson_learner in the next cell.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load trusted files.
with open("dataset/AID_1259374/data.pkl","rb") as file:
    data = pickle.load(file)
    
# Direct indexing raises KeyError if "activity_ls" is missing.
y = np.array(data["activity_ls"])

In [4]:
# initial picks: naive_random_choice is given range(len(y)) and a count of int(P_INIT * len(y))
# NOTE(review): the previous "using RDkit" wording does not obviously match this call — confirm in utils
init_idxs = naive_random_choice(range(len(y)), int(P_INIT*len(y)))
# Thompson_learner is built from the whole loaded `data` object
learner = Thompson_learner(data)
results = iHTS(learner, y, init_idxs)

Initialized 65626 scaffolds for 646073 compounds
0:0.18976109215017065
1:0.23890784982935154
2:0.29146757679180885
3:0.3412969283276451


In [2]:
# Load dataset/AID_628 and build a Thompson_learner from it. The label
# extraction, initial picks and the iHTS run are all commented out in this
# cell, so only `data` and `learner` are (re)defined here.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load trusted files.
with open("dataset/AID_628/data.pkl","rb") as file:
    data = pickle.load(file)
    
# fp_ls = data["fp_ls"]
# y = np.array(data["activity_ls"])

# scaffold_dict = (data["scaffold_dict"])
# print(len(scaffold_dict.keys()))

# # initial picks using RDkit
# init_idxs = random_pick(fp_ls, int(P_INIT*len(fp_ls)))


"""Thompson sampling"""

learner = Thompson_learner(data)

# results = iHTS(learner, y, init_idxs)

Initialized 15545 scaffolds for 63662 compounds


In [5]:
# Size of the `unsampled` collection on the first entry of learner.scf_ls
# (presumably the not-yet-screened compounds of the first scaffold — TODO confirm in utils).
len(learner.scf_ls[0].unsampled)

515

In [5]:
# Show only the first entries: the learner reports thousands of scaffolds on
# construction, and displaying the whole list floods the notebook output.
learner.scf_ls[:20]

[CC1CCCC(CCCC2CCCCC2)C1 | 4 / 103,
 CC(CC1CCC2CCCCC21)C1CCC(C2CCC(C3CCCCC3)CC2)CC1 | 1 / 1,
 C1CCC(C2CCC(C3CCCCC3)CC2)CC1 | 3 / 259,
 CC(CCCCCCCC(C)CCC1C(C)CCC1C)CCCCC1CCC2CC(C)CC12 | 0 / 0,
 CC(CCCCCCCC(C)CCCCCCC1CCCC1C)CCCCC1CCC2CC(C)CC12 | 1 / 1,
 CC(CCCCCCCC(C)CCCCC1CCC2CC(C)CC12)CCCCCCCC1CCC(C)C1 | 1 / 1,
 CC(CCCCCCCC(C)CCCCC1CCC2CC(C)CC12)CCCCCCC1CCCC1C | 1 / 1,
 CC(CCCCCCCC(C)CCCCCCC1CCC(C)C1C)CCCCC1CCC2CC(C)CC12 | 1 / 1,
 CC(CCCCCCCC(C)CCCCCCC1CCCC1)CCCCC1CCC2CC(C)CC12 | 0 / 0,
 CC(C1CCCCC1)C1CCC(C2CCC(C3CCCCC3)CC2)CC1 | 2 / 14,
 C1CCC2CCCCC2C1 | 17 / 1000,
 CC1CC2CCCC2C1 | 2 / 11,
 CC1C2CCCCC2C(C2C3CCCCC3C(C)C3C(CC4CCCCC4)CCCC32)C2CCCC(CC3CCCCC3)C12 | 0 / 0,
 CC(C1CCC(C2CCC(C3CCCCC3)CC2)CC1)C1CC1 | 0 / 1,
 CC(CCCCC(C)C(CC(C)C1CCCCC1)C(CC1CCCCC1CC1CCCCC1)C1CCCC1)CCC(C)CCCC1CCC(C2CCCC2)C1 | 0 / 0,
 CC(C)(C1CCC(C2CCC(C3CCCCC3)CC2)CC1)C1CC1 | 0 / 0,
 CC1CCCC2C1CCC1CCCCC12 | 0 / 10,
 CC1CCCC1 | 3 / 169,
 CC(CCCC1CCCCC1)CC1CCCCC1 | 10 / 491,
 CC(CCCCCC1CCC(C)C1)CCC1C(C)C2CCCCC2C1C |

In [26]:
# Random-forest screening run on each dataset under dataset/test.
# NOTE(review): the recorded output stops at the first dataset with a
# ValueError ("Input X contains infinity or a value too large for
# dtype('float32')"), so the features presumably need cleaning before fitting.
# NOTE(review): `results` is overwritten on every pass; only the last
# dataset's results remain after the loop.
for path in [
    "dataset/test/AID_1259354/data.pkl",
    "dataset/test/AID_488969/data.pkl",
    "dataset/test/AID_598/data.pkl"
]:

    print(path)

    # Keep the handle name distinct from the loop variable so `path` still
    # names the file (not a closed file object) after the `with` block.
    # NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
    with open(path, "rb") as fh:
        data = pickle.load(fh)

    fp_ls = data.get("fp_ls")
    y = np.array(data.get("activity_ls"))

    # initial picks using RDkit
    init_idxs = random_pick(fp_ls, int(P_INIT*len(fp_ls)))

    # A fresh classifier and learner are built for every dataset.
    clf = RandomForestClassifier(n_estimators=1200, class_weight="balanced",
        max_features='log2',bootstrap=True,min_samples_split = 8,
        min_samples_leaf = 3, n_jobs = 16,random_state=SEED)
    learner = Supervised_learner(clf, P_EXPLORE, data)

    results = iHTS(learner, y, init_idxs)

dataset/test/AID_1259354/data.pkl


  array = numpy.asarray(array, order=order, dtype=dtype)


ValueError: Input X contains infinity or a value too large for dtype('float32').

In [16]:
# Count feature values that are non-finite once cast to float32. The recorded
# error complains about "infinity or a value too large for dtype('float32')",
# which a NaN-only count cannot detect; ~isfinite covers NaN, +/-inf and
# values that overflow to inf in the float32 cast.
with np.errstate(over="ignore"):  # the overflow-to-inf is exactly what is being counted
    fp_as_f32 = np.array(data['fp_ls']).astype(np.float32)
(~np.isfinite(fp_as_f32)).sum()

0

In [None]:
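# NOTE: descriptor 'EState_VSA11' is too big (~1e38) — likely the source of the "value too large for dtype('float32')" ValueError recorded above.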

In [48]:
# Thompson-sampling screening run on each dataset under dataset/test.
# NOTE(review): `results` is overwritten on every pass; only the last
# dataset's results remain after the loop.
for path in [
    "dataset/test/AID_1259354/data.pkl",
    "dataset/test/AID_488969/data.pkl",
    "dataset/test/AID_598/data.pkl"
]:

    print(path)

    # Keep the handle name distinct from the loop variable so `path` still
    # names the file (not a closed file object) after the `with` block.
    # NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
    with open(path, "rb") as fh:
        data = pickle.load(fh)

    fp_ls = data.get("fp_ls")
    y = np.array(data.get("activity_ls"))

    # initial picks using RDkit
    init_idxs = random_pick(fp_ls, int(P_INIT*len(fp_ls)))

    # A fresh learner is built from each dataset's `data`.
    learner = Thompson_learner(data)

    results = iHTS(learner, y, init_idxs)

dataset/test/AID_1259354/data.pkl
Initialized 13955 scaffolds for 56366 compounds
0:0.26918859649122806
1:0.3119517543859649
2:0.36019736842105265
3:0.40076754385964913
dataset/test/AID_488969/data.pkl
Initialized 21391 scaffolds for 105158 compounds
0:0.21421975992613113
1:0.26500461680517085
2:0.314404432132964
3:0.3601108033240997
dataset/test/AID_598/data.pkl
Initialized 17876 scaffolds for 85210 compounds
0:0.23998444185141968
1:0.2849085958770906
2:0.3335278101905873
3:0.3755348113574485
