In [None]:
# Put the sibling `release` directory on the import path. The project-local
# modules imported further down (e.g. `predictor`, `utils`) presumably live
# there -- confirm against the repository layout.
import sys
sys.path.append('../release')

In [None]:
import pandas as pd
import numpy as np

from rdkit.Chem import PandasTools
PandasTools.RenderImagesInAllDataFrames(images=True)
import rdkit.Chem as Chem
from rdkit import DataStructs

from predictor import VanillaQSAR
from utils import get_fp
from sklearn.ensemble import RandomForestClassifier as RFC

In [None]:
# Load the EGFR table, build an RDKit molecule for every SMILES string, and
# keep an independent copy of the rows whose `predictions` exceed 0.75.
exp_data = pd.read_csv('../data/egfr_with_pubchem.csv')
exp_data['molecules'] = exp_data['smiles'].apply(Chem.MolFromSmiles)
is_active = exp_data['predictions'] > 0.75
exp_actives = exp_data[is_active].copy()

# Selecting promising molecules from a general kinase library

In [None]:
# Configure a VanillaQSAR wrapper around random-forest classifiers (10 ensemble
# members, 250 trees each, 10 parallel jobs) and load the saved EGFR checkpoint.
model_instance = RFC
model_params = dict(n_estimators=250, n_jobs=10)
my_predictor = VanillaQSAR(
    model_instance=model_instance,
    model_params=model_params,
    ensemble_size=10,
)
my_predictor.load_model('../checkpoints/predictor/egfr_rfc')

In [None]:
!unzip ../data/Enamine_Kinase_Library_plated.zip

In [None]:
# Open the extracted SD file with RDKit's supplier and report how many
# records it contains.
sdf_path = '../data/Enamine_Kinase_Library_plated_64000cmds_20200524.sdf'
kin_lib = Chem.SDMolSupplier(sdf_path)
n_records = len(kin_lib)
print(n_records)

In [None]:
# Tabulate the library: one row per molecule together with its SMILES string.
# SDMolSupplier yields None for records it cannot parse; drop those so that
# MolToSmiles is never handed a missing molecule.
kin_lib = pd.DataFrame({'molecules': [mol for mol in kin_lib if mol is not None]})
kin_lib['smiles'] = kin_lib.molecules.apply(Chem.MolToSmiles)

# NOTE(review): predict() presumably returns the SMILES it could featurize,
# their predicted values, and the SMILES it rejected -- confirm against
# VanillaQSAR.predict.
mols, props, nan_smiles = my_predictor.predict(kin_lib.smiles, get_features=get_fp)
mapper = dict(zip(mols, props))

# Series.map leaves NaN (rather than None) for SMILES without a prediction,
# so later numeric comparisons such as `predictions > 0` keep working.
kin_lib['predictions'] = kin_lib.smiles.map(mapper)

In [None]:
# draw some molecules with nonzero predicted activity
predicted_active = kin_lib[kin_lib.predictions > 0]
# Cap the draw at the number of available rows (sample() raises when asked for
# more than exist) and fix the seed so a re-run shows the same molecules.
sample = predicted_active.sample(n=min(25, len(predicted_active)), random_state=42)
sample

In [None]:
# Caption each structure with its predicted value, formatted to three decimals.
labels = sample.predictions.apply('{:.3f}'.format)
# Pass plain lists: a pandas Series is indexed by row label (here the shuffled
# labels kept by .sample) rather than by position, and its truth value is
# ambiguous -- both of which list-oriented drawing code trips over.
Chem.Draw.MolsToGridImage(sample.molecules.tolist(), legends=labels.tolist())

## Filter for fused ring motif

In [None]:
# SMARTS for an aromatic atom joined by aromatic bonds to three aromatic
# neighbours; used here as the marker for the fused-ring motif.
substruct = Chem.MolFromSmarts('a(:a)(:a)(:a)')


def _matches_fused_motif(mol):
    """Return True when `mol` contains the `substruct` pattern."""
    return mol.HasSubstructMatch(substruct)


# Flag every library molecule, keep the flagged rows, and report how many remain.
kin_lib['is_fused'] = kin_lib['molecules'].apply(_matches_fused_motif)
fused_lib = kin_lib.loc[kin_lib['is_fused']]
print(fused_lib.shape[0])

In [None]:
# Draw up to 25 fused-ring molecules. The size is capped at the number of rows
# available (sample() raises when asked for more than exist) and the seed is
# fixed so a re-run renders the same grid.
sample = fused_lib.sample(n=min(25, len(fused_lib)), random_state=42)
# Hand the drawing routine a plain list rather than a label-indexed Series.
Chem.Draw.MolsToGridImage(sample.molecules.tolist(), molsPerRow=5)

## Analyze properties of molecules with fused rings

In [None]:
from analysis_utils import _plot_similarities
# The RDKit molecule objects are stored in the `molecules` column of the
# library frame, so that is what gets passed (with from_smiles=False).
_plot_similarities(fused_lib.molecules, from_smiles=False, sample_size=1000, bins=50)

In [None]:
# number of predicted active molecules so low as to be negligible
is_predicted_active = fused_lib.predictions.gt(0)
sum(is_predicted_active)

In [None]:
from analysis_utils import compare_libraries

# Compare the fused-ring subset with the experimental actives on molecular
# weight and logP, without plotting, and keep the scaffolds reported as shared.
shared_scaffolds = compare_libraries(
    fused_lib,
    exp_actives,
    sample_size=1000,
    bins=50,
    properties=['MolWt', 'MolLogP'],
    return_shared_scaffolds=True,
    plot=False,
)

In [None]:
# Display the value returned by compare_libraries for inspection.
shared_scaffolds

# Select molecules with novel scaffolds

In [None]:
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles


def _has_novel_scaffold(mol):
    """Return True when the Murcko scaffold SMILES of `mol` is not in `shared_scaffolds`."""
    # NOTE(review): relies on `in` testing membership among scaffold SMILES;
    # confirm the container type returned by compare_libraries.
    return MurckoScaffoldSmiles(mol=mol) not in shared_scaffolds


# `fused_lib` is a filtered slice of `kin_lib`; assign() builds a new frame
# instead of writing a column into that slice, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
fused_lib = fused_lib.assign(novel_scaffold=fused_lib.molecules.apply(_has_novel_scaffold))
fused_lib_novel = fused_lib[fused_lib.novel_scaffold]

In [None]:
# Write the SMILES and predictions of the novel-scaffold molecules as
# comma-separated rows with no header line and no index column.
fused_lib_novel.to_csv(
    '../data/Enamine_Kinase_Library_filtered.smi',
    columns=['smiles', 'predictions'],
    header=False,
    index=False,
)

In [None]:
# remove salts to enable tokenization
# The pattern is a raw string so the regex escapes are not parsed as (invalid)
# Python string escapes. replace() returns a new frame rather than mutating a
# slice of `fused_lib` in place.
# NOTE(review): the pattern matches the literal text ",[K+]"; RDKit writes
# disconnected fragments separated by "." -- confirm this is the intended match.
fused_lib_novel = fused_lib_novel.replace(to_replace=r'\,\[K\+]', value='', regex=True)
# Keep only molecules with a positive predicted value before exporting.
fused_lib_novel = fused_lib_novel[fused_lib_novel.predictions > 0]
fused_lib_novel[['smiles', 'predictions']].to_csv('../data/egfr_enamine.smi',
                                                  header=False, index=False)
