In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import subprocess
import os

import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import rdmolops
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.ML.Cluster import Butina

# silence RDKit logger messages (every 'rdApp.*' channel)
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

import matplotlib
from matplotlib import rc

# global figure typography: 8 pt text ...
font = dict(size = 8)
matplotlib.rc('font', **font)

# ... rendered with the sans-serif family, Arial as the sans-serif face
matplotlib.rcParams.update({
    'font.sans-serif': "Arial",
    'font.family': "sans-serif",
})

# Part 1: Read in data and get the compounds with the lowest predicted killing scores

In [8]:
res_folder = ''

# Inputs / outputs of this cell, gathered in one place so they are easy to audit.
n_worst = 1000  # size of the initial pool of lowest-scoring compounds
predictions_path = '../out/round3finalval08162021_broad_chunks_final_predictions_broad800k_08_20_2021.csv'
meta_path = '../../beta_lactam/data/PublicStructures.txt'
worst_mols_path = '../out/round3finalval08162021_plots/worst_mols_broad800k_kill02_predicted_hits.csv'

# Rank every compound by its predicted 'hit_kill' score, lowest first, and keep
# the first n_worst rows. The explicit .copy() detaches the slice from the sorted
# frame so that adding a column below does not raise SettingWithCopyWarning.
df = pd.read_csv(predictions_path)
df = df.sort_values(by = 'hit_kill', ascending = True)
df = df.iloc[0:n_worst,:].copy() # get worst n_worst to start

# The metadata table is keyed on an upper-case 'SMILES' column, so mirror the
# lower-case 'smiles' column under that name before joining.
df['SMILES'] = df['smiles']
meta = pd.read_csv(meta_path, sep = '\t')
# Inner join: compounds without a matching 'SMILES' row in the metadata are dropped.
df = df.merge(meta, on = 'SMILES')

# use same functions as described in 2C.2_calculate_tanimoto_similarity_of_hits.ipynb
# NOTE(review): compute_tanimoto_against_abx / compute_tanimoto_against_training_set
# are not defined in the visible part of this notebook; they must already be in
# scope (defined or imported) for this cell to run on a fresh kernel -- confirm.
smis = list(df['SMILES'])
print("Computing tanimoto scores against abx...")
df = compute_tanimoto_against_abx(smis, df)
print("Computing tanimoto scores against training set...")
df = compute_tanimoto_against_training_set(smis, df)

# Keep only rows whose closest-training-set similarity is strictly below 1
# (presumably this removes molecules identical to a training compound -- confirm).
df = df[df['tanimoto similarity to closest train set'] < 1]

# Create the output folder if needed so the write does not fail on a fresh checkout.
# NOTE(review): the following cell reads '../out/worst_mols_broad800k_model_predictions.csv',
# which is not the file written here -- confirm which file is the intended hand-off.
os.makedirs(os.path.dirname(worst_mols_path), exist_ok = True)
df.to_csv(worst_mols_path, index = False)

In [9]:
# Similarity cutoff (same value as applied earlier): a compound is kept only if
# it is below this Tanimoto similarity to BOTH reference sets.
simthresh = 0.9

# NOTE(review): this input file is not the one written by the preceding cell, and
# the 300-row cut below relies on the file's existing row order -- confirm both.
df = pd.read_csv('../out/worst_mols_broad800k_model_predictions.csv')

# Build one boolean mask per reference set, then keep rows passing both.
below_train_cutoff = df['tanimoto similarity to closest train set'] < simthresh
below_abx_cutoff = df['tanimoto similarity to closest abx'] < simthresh
df = df[below_train_cutoff & below_abx_cutoff].head(300)

# Persist the final selection and display it as the cell output.
df.to_csv('../out/finalized_selected_worst_mols_broad800k_model_predictions.csv', index = False)
df

Unnamed: 0,SMILES,tanimoto similarity to closest train set,closest train set smiles,closest train set name,tanimoto similarity to closest abx,closest abx smiles,closest abx name,smiles,hit_inh,hit_inh_epi_unc,hit_kill,hit_kill_epi_unc,Name,PROJECT_CODE,PROJECT_NAME,CXSMILES,STEREO_COMMENTS,row_num
0,COCc1ccc(s1)C(=O)N1CCC(CC1)(C(=O)O)n1cccn1,0.446043,CC[C@@]12CCCN3CCc4c([C@H]13)n(c1ccccc41)[C@](O...,vincamine,0.439402,CC1(C)S[C@@H]2[C@H](NC(=O)COc3ccccc3)C(=O)N2[C...,PENIMEPICYCLINE,COCc1ccc(s1)C(=O)N1CCC(CC1)(C(=O)O)n1cccn1,0.000018,1.938463e-09,6.434774e-07,2.746465e-12,BRD-K52631221,2001,General HTS Sets,COCc1ccc(s1)C(=O)N1CCC(CC1)(C(O)=O)n1cccn1,as drawn,0
1,CCOC(=O)C1CCN(CC1)C(=S)NC(=O)c1cccs1,0.375624,Clc1cccc2cc(sc12)C(=O)N[C@H]1CN2CCC1CC2,EVP-6124,0.285604,CC(=O)C(=O)N[C@@H]1/C=C(C)/C=C/[C@@H](OC(=O)N2...,TERDECAMYCIN,CCOC(=O)C1CCN(CC1)C(=S)NC(=O)c1cccs1,0.000004,1.784110e-10,6.488524e-07,3.357698e-12,BRD-K82870356,2001,General HTS Sets,CCOC(=O)C1CCN(CC1)C(=S)NC(=O)c1cccs1,as drawn,1
2,CCN1CCCC1=O.CO[C@H]1CN(CCO)S(=O)(=O)c2ccccc21....,0.829735,CC[C@H](C)[C@@H]1N[C@@H]2C=Cc3c(cc(nc3[C@H]2O)...,thiostrepton,0.793291,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4c(=O)c(c5o...,RIFALAZIL,CCN1CCCC1=O.CO[C@H]1CN(CCO)S(=O)(=O)c2ccccc21....,0.000002,2.825133e-11,9.749020e-07,6.664941e-12,BRD-M17049327,2001,General HTS Sets,CCN1CCCC1=O.CC(C)c1nc(=O)cc(C)[nH]1.Fc1cccc(Cl...,as drawn,2
3,ONC(=O)c1ccc(s1)C#C,0.279221,"Cc1cc(sc1[C@@H](N)C(O)=O)C(O)=O |&1:6,r|",3-MATIDA,0.175266,CC(C)=CCC/C(C)=C/CC/C(C)=C/CN1C(=O)c2cccc(O)c2...,DIAZEPINOMICIN,ONC(=O)c1ccc(s1)C#C,0.000002,4.871869e-11,9.844318e-07,1.142022e-11,BRD-K20971343,2001,General HTS Sets,ONC(=O)c1ccc(s1)C#C,as drawn,3
4,CCNC(=S)NNC(=O)c1csc(c1)C(C)C,0.312549,ClC=C1Nc2sc3CCCCc3c2C(=O)N1,"2-(chloromethyl)-5,6,7,8-tetrahydro[1]benzothi...",0.239979,Cc1c(N)nc([C@H](CC(N)=O)NC[C@H](N)C(N)=O)nc1C(...,TALISOMYCIN,CCNC(=S)NNC(=O)c1csc(c1)C(C)C,0.000002,5.079206e-11,9.858252e-07,9.323063e-12,BRD-K29372009,2001,General HTS Sets,CCNC(=S)NNC(=O)c1csc(c1)C(C)C,as drawn,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,O=C(N1CCC[C@@H](C1)c1ccn[nH]1)c1ccc(cc1)Cn1cnnn1,0.451419,Fc1ccc(cc1)C(=O)N1CCC[C@@H](C1)c1nc(no1)-c1ccc...,ADX-47273,0.386072,Cc1ccc2c(c1O)NC(O)C1CC(/C=C/C(N)=O)=CN1C2=O,ANTHRAMYCIN,O=C(N1CCC[C@@H](C1)c1ccn[nH]1)c1ccc(cc1)Cn1cnnn1,0.000015,2.008649e-09,3.221358e-06,6.639079e-11,BRD-A87246208,2001,General HTS Sets,O=C(N1CCC[C@@H](C1)c1ccn[nH]1)c1ccc(Cn2cnnn2)c...,as drawn,295
296,COC(=O)c1sccc1NC(=S)N(C)CCC#N,0.482402,CCCN[C@@H](C)C(=O)Nc1c(C)csc1C(=O)OC |&1:4|,articaine,0.304162,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4c(c3C2=O)C...,RIFABUTIN,COC(=O)c1sccc1NC(=S)N(C)CCC#N,0.000006,2.638478e-10,3.230704e-06,9.944416e-11,BRD-K21079502,2001,General HTS Sets,COC(=O)c1sccc1NC(=S)N(C)CCC#N,as drawn,296
297,CCn1ncc(n1)NC(=O)c1ccc(s1)[C@H]1CCCN1Cc1[nH]cnc1C,0.527582,CN1CCN(C)C(=O)[C@@H]1c1ccc(Nc2nc(cn(C)c2=O)-c2...,GDC-0834,0.512545,Cc1c(N)nc([C@H](CC(N)=O)NC[C@H](N)C(N)=O)nc1C(...,TALISOMYCIN,CCn1ncc(n1)NC(=O)c1ccc(s1)[C@H]1CCCN1Cc1[nH]cnc1C,0.000015,1.592624e-09,3.232398e-06,5.600136e-11,BRD-A30288233,7159,Calico PRISM,CCn1ncc(NC(=O)c2ccc(s2)[C@H]2CCCN2Cc2[nH]cnc2C...,as drawn,297
298,O=C(Cc1ccccc1)NNC(=S)NC(=O)c1cccs1,0.353811,Cc1[nH]c2ccccc2c1CCNC(=O)c1cccs1,CK-636,0.260530,N[C@@H](C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(Cl)CC[...,LORACARBEF,O=C(Cc1ccccc1)NNC(=S)NC(=O)c1cccs1,0.000005,1.546700e-10,3.245806e-06,7.158436e-11,BRD-K91382924,2001,General HTS Sets,O=C(Cc1ccccc1)NNC(=S)NC(=O)c1cccs1,as drawn,298
