In [1]:
import warnings
# Ignore every FutureWarning from here on. The filter is installed before the
# pandas import so it is already active while pandas loads.
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
# NOTE(review): pred_utils is presumably a project-local module; the helper is
# called below to score compounds against reference datasets.
from pred_utils import compute_tanimoto_against_dataset

# Part 1: Process predictions from model

In [2]:
# Load the Broad800K model predictions.
df = pd.read_csv('../out/broad800k_model_predictions_08_20_2021.csv')

# Keep only rows with predicted kill > 0.2. The explicit .copy() detaches the
# filtered frame from the one it was sliced from, so the column assignment
# below is an ordinary write and not a chained assignment on a slice (which
# pandas reports as SettingWithCopyWarning).
df = df[df['hit_kill'] > 0.2].copy()
# Mirror the lower-case 'smiles' column under 'SMILES', the key used for the
# metadata merge below.
df['SMILES'] = df['smiles']

# Merge with the structure metadata and grab the SMILES strings.
# merge() defaults to an inner join, so predictions whose SMILES does not occur
# in the metadata table are dropped here.
meta = pd.read_csv('../data/PublicStructures.txt', sep = '\t')
df = df.merge(meta, on = 'SMILES')
smis = list(df['SMILES'])

# Compute tanimoto similarity to the nearest-neighbor antibiotic.
print("Computing tanimoto scores against abx...")
abx = pd.read_csv('../data/chembl_abx.csv', sep = ';') # set of anything labelled 'Antibiotics' from ChEMBL
# NOTE(review): compute_tanimoto_against_dataset is defined in pred_utils and
# not visible here; presumably it returns df with the similarity / closest
# smiles / closest name columns for `dataset_name` added -- confirm.
df = compute_tanimoto_against_dataset(smis, df, abx, dataset_name = 'abx', smi_col='Smiles', name_col = 'Name')

# Compute tanimoto similarity to the nearest-neighbor training-set compound.
print("Computing tanimoto scores against training set...")
ts = pd.read_csv('../out/data_prep_for_ml_fullset.csv')
# NOTE(review): the same `smis` list is reused with the frame returned by the
# first call; this assumes the helper keeps the row order and count -- confirm.
df = compute_tanimoto_against_dataset(smis, df, ts, dataset_name = 'train set', smi_col='SMILES', name_col = 'Name')

# Keep only rows whose similarity to the closest training compound is strictly
# below 1 (presumably this removes compounds already present in the training
# set). Rows with a missing similarity value also fail the comparison and are
# dropped.
df = df[df['tanimoto similarity to closest train set'] < 1]

# Persist the filtered table without the index, then display it as the cell output.
df.to_csv('../out/broad800k_model_predictions_kill02_08_20_2021.csv', index = False)
df

Computing tanimoto scores against abx...
Computing tanimoto scores against training set...


Unnamed: 0,SMILES,tanimoto similarity to closest train set,closest train set smiles,closest train set name,tanimoto similarity to closest abx,closest abx smiles,closest abx name,smiles,hit_inh,hit_inh_epi_unc,hit_kill,hit_kill_epi_unc,Name,PROJECT_CODE,PROJECT_NAME,CXSMILES,STEREO_COMMENTS
0,C[N+](C)(CCOc1ccccc1)Cc1ccccc1.OC(=O)c1cc2cccc...,0.435644,CCN1CCC[C@H]1CNC(=O)c1c(OC)ccc(Br)c1OC,remoxipride,0.339597,Nc1ccc(C(=O)Oc2ccccc2)c(O)c1,PHENYL AMINOSALICYLATE,C[N+](C)(CCOc1ccccc1)Cc1ccccc1.OC(=O)c1cc2cccc...,0.089759,0.003944,0.234748,0.022272,BRD-M39654024,2001,General HTS Sets,OC(=O)c1cc2ccccc2c(Br)c1O.C[N+](C)(CCOc1ccccc1...,as drawn
1,CN(C)c1ccc(cc1)-c1nc2ccc3ccccc3c2c2c1CCC2,0.595642,CC[C@H](NC(=O)c1c(C)c(nc2ccccc12)-c1ccccc1)c1c...,SB-222200,0.476708,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4c(=O)c(c5o...,RIFALAZIL,CN(C)c1ccc(cc1)-c1nc2ccc3ccccc3c2c2c1CCC2,0.032539,0.000854,0.272220,0.030580,BRD-K92834382,2001,General HTS Sets,CN(C)c1ccc(cc1)-c1nc2ccc3ccccc3c2c2CCCc12,as drawn
2,O[C@@H](CNCc1ccccc1)Cn1c2ccc(cc2c2cc(ccc12)Cl)Cl,0.636771,CN(C)C[C@@H](O)Cn1c2ccc(Br)cc2c2cc(Br)ccc12 |&...,wiskostatin,0.350282,CCn1cc(C(=O)O)c(=O)c2ccc(-c3ccncc3)cc21,ROSOXACIN,O[C@@H](CNCc1ccccc1)Cn1c2ccc(cc2c2cc(ccc12)Cl)Cl,0.069170,0.002910,0.236441,0.032415,BRD-A79555596,2001,General HTS Sets,O[C@@H](CNCc1ccccc1)Cn1c2ccc(Cl)cc2c2cc(Cl)ccc...,as drawn
3,Clc1ccc(cc1)-c1cn2c(nc3ccccc32)n1CCCNC1CCCCC1,0.492949,COc1cc2CC(C)(C)n3c(cc4nc5ccccc5nc34)-c2cc1OC,YM-90709,0.457711,CC[C@H]1OC(=O)[C@H](C)C(=O)[C@H](C)[C@@H](O[C@...,TELITHROMYCIN,Clc1ccc(cc1)-c1cn2c(nc3ccccc32)n1CCCNC1CCCCC1,0.046292,0.002590,0.283152,0.045922,BRD-K25973121,2001,General HTS Sets,Clc1ccc(cc1)-c1cn2c3ccccc3nc2n1CCCNC1CCCCC1,as drawn
4,O[C@@H](CNC1CCCCC1)CN1c2ccccc2Sc2ccccc21,0.806202,CN(C)CCCN1c2ccccc2Sc2ccccc12,promazine,0.316276,N#Cc1c(N2C[C@@H]3NCCO[C@H]3C2)c(F)cc2c(=O)c(C(...,FINAFLOXACIN,O[C@@H](CNC1CCCCC1)CN1c2ccccc2Sc2ccccc21,0.042517,0.001975,0.202662,0.025442,BRD-A33120153,2001,General HTS Sets,"O[C@@H](CNC1CCCCC1)CN1c2ccccc2Sc2ccccc12 |&1:1,r|",as drawn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,ONC(=O)CCCCCCC(=O)N/N=C/C[P+](c1ccccc1)(c1cccc...,0.451411,CC1=C(C)C(=O)C(CCCCCCCCCC[P+](c2ccccc2)(c2cccc...,visomitin,0.204197,CC1=C\[C@@H](O)CC(=O)Cc2nc(co2)C(=O)N2CCC=C2C(...,MIKAMYCIN,ONC(=O)CCCCCCC(=O)N/N=C/C[P+](c1ccccc1)(c1cccc...,0.152810,0.068002,0.231551,0.098328,BRD-K03937843,2001,General HTS Sets,ONC(=O)CCCCCCC(=O)N\N=C\C[P+](c1ccccc1)(c1cccc...,as drawn
513,C/C(=N\NC(=O)CCCCCCC(=O)Nc1ccccc1O)C[P+](c1ccc...,0.426415,CCCCCCCCCCC(C)(C)C(=O)Nc1c(OC)cc(OC)cc1OC,CI-976,0.316321,CN(C)[C@@H]1C(O)=C(C(N)=O)C(=O)[C@@]2(O)C(O)=C...,ERAVACYCLINE,C/C(=N\NC(=O)CCCCCCC(=O)Nc1ccccc1O)C[P+](c1ccc...,0.219038,0.067046,0.360840,0.107617,BRD-K95293872,2001,General HTS Sets,C\C(C[P+](c1ccccc1)(c1ccccc1)c1ccccc1)=N/NC(=O...,as drawn
514,CCCCCCc1nc(cc2c1[Si](O[C@H]2c1ccccc1)(C(C)C)C(...,0.630123,CC[C@H](C)[C@@H]1N[C@@H]2C=Cc3c(cc(nc3[C@H]2O)...,thiostrepton,0.613179,CO[C@H]1/C=C/O[C@@]2(C)Oc3c(C)c(O)c4c(=O)c(c5o...,RIFALAZIL,CCCCCCc1nc(cc2c1[Si](O[C@H]2c1ccccc1)(C(C)C)C(...,0.177634,0.029326,0.217421,0.036573,BRD-K34709721,2001,General HTS Sets,CCCCCCc1nc(CCCc2cc3[C@@H](O[Si](C(C)C)(C(C)C)c...,as drawn
515,NCCCCNCCCN1c2ccccc2Sc2ccc(cc21)Cl,0.963359,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc12,chlorpromazine,0.316936,COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc2c(=O)c(C(...,MOXIFLOXACIN,NCCCCNCCCN1c2ccccc2Sc2ccc(cc21)Cl,0.116063,0.006734,0.317389,0.024718,BRD-K51073125,5001,CMAP,NCCCCNCCCN1c2ccccc2Sc2ccc(Cl)cc12,as drawn
