In [1]:
%matplotlib inline
import pandas as pd
from pipeline import run_pipeline 

  from .autonotebook import tqdm as notebook_tqdm
  from rdkit.Chem import MCS


In [2]:
# One-off preprocessing, kept commented out for provenance (not executed on Run All).
# It builds the "with_metadata" predictions CSV whose path is used as `compound_path`
# in the configuration cell: read the Broad 800K predictions, left-join the structure
# metadata on SMILES, drop duplicate SMILES, keep four columns, and write the result.
# NOTE(review): `to_csv` is called without `index=False`, so the saved file also carries
# the DataFrame index as an extra unnamed column — confirm that is intended.
# # broad800k mols
# broad800k = pd.read_csv('../out/model_preds_on_frags_and_cpds/NG/broad800K_melis_predictions_with_FINALbayHO11152022_11_16_2022.csv')
# metadata = pd.read_csv('../data/static_datasets/PublicStructures.txt', sep = '\t')
# broad800k = broad800k.merge(metadata, left_on = 'smiles', right_on = 'SMILES', how = 'left')
# broad800k = broad800k.drop_duplicates('smiles')
# broad800k = broad800k[['smiles', 'Name', 'hit', 'hit_epi_unc']]
# broad800k.to_csv('../out/model_preds_on_frags_and_cpds/NG/broad800K_melis_predictions_with_FINALbayHO11152022_with_metadata_11_16_2022.csv')

In [3]:
# =============================================================================
# Run configuration — NG round 1 (the actual values used for this run).
# Every name defined here is read by the run_pipeline(...) call in the next cell
# unless noted otherwise.
# =============================================================================

# ---- Inputs / outputs: prediction files for fragments and compounds ---------
fragment_path = "../out/model_preds_on_frags_and_cpds/NG/enamine_frags_18mil_with_FINALbayHO11152022_melis_predictions_11_18_2022.csv"
compound_path = "../out/model_preds_on_frags_and_cpds/NG/broad800K_melis_predictions_with_FINALbayHO11152022_with_metadata_11_16_2022.csv"
result_path = "../out/fragment_algorithm_pipeline_runs/12_NG_rd1/"

# Column names inside the two prediction files.
fragment_smi_col = "smiles"
fragment_hit_col = "hit"
compound_smi_col = "smiles"
compound_hit_col = "hit"
cpd_name_col = "Name"

# ---- Filters and thresholds for fragments and compounds ---------------------
fragment_score = 0.2
compound_score = 0.3

# Structural-alert removal; each is one of 'both', 'pains', 'brenk', 'none'.
fragment_remove_pains_brenk = "both"
compound_remove_pains_brenk = "both"

# Druglikeness rules to apply; each list may contain
# 'egan', 'ghose', 'lipinski', 'muegge'. Empty list = none selected.
fragment_druglikeness_filter = []
compound_druglikeness_filter = []

fragment_require_more_than_coh = True

# SMILES patterns for fragments to remove.
# NOTE(review): the nitro group below is written as [N+](O) rather than the more
# usual [N+]([O-]) — confirm this is the form run_pipeline's matcher expects.
nitrofuran = "O=[N+](O)c1ccco1"
sulfonamide = "NS(=O)=O"
quinolone = "O=c1cc[nH]c2ccccc12"
fragment_remove_patterns = [
    nitrofuran,
    sulfonamide,
    quinolone,
]

# ---- Matching and comparison to existing datasets ---------------------------
frags_cannot_disrupt_rings = True
# NOTE(review): the original note here read "must be bigger than 0" while the
# value is 0 — presumably fragment length must exceed this threshold; confirm
# against run_pipeline.
fragment_length_threshold = 0
display_inline_candidates = False
analogues_pval_diff_thresh = 0
analogues_absolute_diff_thresh = 0.05

# ---- Toxicity ---------------------------------------------------------------
toxicity_threshold_if_present = 0.5
toxicity_threshold_require_presence = False

# ---- Known antibiotics ------------------------------------------------------
abx_path = "../data/static_datasets/04052022_CLEANED_v5_antibiotics_across_many_classes.csv"
abx_smiles_col = "Smiles"
abx_name_col = "Name"
cpd_sim_to_abx = 0.5

# ---- Training set -----------------------------------------------------------
train_set_path = "../data/training_data/NG/FULL_10_26_2022.csv"
train_set_smiles_col = "SMILES"
train_set_name_col = "Name"
train_set_hit_col = "hit"
train_set_just_actives = True
train_set_thresh = 0.5
train_set_greater_than = True
cpd_sim_to_train_set = 0.5

# ---- Purchasable libraries --------------------------------------------------
purch_path = "../data/static_datasets/Broad_800K_purchasable.xlsx"
purch_name_col = "BROADID"
purch_name_needs_split = True

# ---- Previously tested libraries (can be more expansive than the train set) -
# NOTE(review): these three values are not among the keyword arguments of the
# run_pipeline(...) call in this notebook — confirm whether they should be passed.
tested_before_path = "../data/training_data/NG/FULL_10_26_2022.csv"
tested_before_name_col = "Name"
tested_before_name_needs_split = False

In [4]:
# Launch the fragment pipeline with the values from the configuration cell.
# One keyword per line, in the same order as before, so a diff shows exactly
# which setting changed between runs.
run_pipeline(
    # input / output locations and column names
    fragment_path=fragment_path,
    compound_path=compound_path,
    result_path=result_path,
    fragment_smi_col=fragment_smi_col,
    compound_smi_col=compound_smi_col,
    fragment_hit_col=fragment_hit_col,
    compound_hit_col=compound_hit_col,
    # score thresholds and structural filters
    fragment_score=fragment_score,
    compound_score=compound_score,
    fragment_require_more_than_coh=fragment_require_more_than_coh,
    fragment_remove_pains_brenk=fragment_remove_pains_brenk,
    compound_remove_pains_brenk=compound_remove_pains_brenk,
    fragment_druglikeness_filter=fragment_druglikeness_filter,
    compound_druglikeness_filter=compound_druglikeness_filter,
    fragment_remove_patterns=fragment_remove_patterns,
    # fragment-to-compound matching
    frags_cannot_disrupt_rings=frags_cannot_disrupt_rings,
    fragment_length_threshold=fragment_length_threshold,
    # toxicity
    toxicity_threshold_if_present=toxicity_threshold_if_present,
    toxicity_threshold_require_presence=toxicity_threshold_require_presence,
    # antibiotics reference set
    abx_path=abx_path,
    abx_smiles_col=abx_smiles_col,
    abx_name_col=abx_name_col,
    # training set reference
    train_set_path=train_set_path,
    train_set_smiles_col=train_set_smiles_col,
    train_set_name_col=train_set_name_col,
    # analogue comparison and display
    analogues_pval_diff_thresh=analogues_pval_diff_thresh,
    analogues_absolute_diff_thresh=analogues_absolute_diff_thresh,
    cpd_name_col=cpd_name_col,
    display_inline_candidates=display_inline_candidates,
    # purchasable library
    purch_path=purch_path,
    purch_name_col=purch_name_col,
    purch_name_needs_split=purch_name_needs_split,
    # similarity cut-offs
    cpd_sim_to_abx=cpd_sim_to_abx,
    cpd_sim_to_train_set=cpd_sim_to_train_set,
    # training-set activity definition
    train_set_just_actives=train_set_just_actives,
    train_set_hit_col=train_set_hit_col,
    train_set_thresh=train_set_thresh,
    train_set_greater_than=train_set_greater_than,
)


Processing fragments...
length of df:  18338026
length of df >0.2:  33552
length of df with more than C,O,H characters:  33528
length of df with valid mols:  33528
length of all preds with clean (no PAINS or Brenk) mols:  33528


  mcs = MCS.FindMCS([mol, pattern_mol], atomCompare='elements',completeRingsOnly = True)


length of df with no O=[N+](O)c1ccco1:  33528
length of df with no NS(=O)=O:  31656
length of df with no O=c1cc[nH]c2ccccc12:  31140

Processing compounds...
length of df:  798200
length of df >0.3:  29546
length of df with valid mols:  29546
length of all preds with clean (no PAINS or Brenk) mols:  6392

Matching fragments in compounds...
number of matched fragments:  283
Previewing dataframe so far...


Unnamed: 0,matched_fragments,fragment_SMILES,length_of_fragment,matched_molecules,number_of_matched_molecules,fragment_scores,full_molecule_scores,average_molecule_score
161,23157,CNC1=NC=CS1,7,"[29, 31, 169, 227, 235, 652, 661, 670, 716, 75...",159,0.213531,"[0.3884902316331863, 0.4377731198072433, 0.472...",0.413301
184,28731,CNC1=CC=C(C(F)(F)F)C=C1,12,"[76, 132, 833, 956, 1132, 2758, 3226, 3231, 32...",78,0.231742,"[0.3294391995668411, 0.3843697693943977, 0.400...",0.46002
52,6433,CNC(=O)NC1=CC=C(Cl)C(Cl)=C1,13,"[64, 83, 84, 405, 434, 438, 677, 687, 727, 878...",56,0.235171,"[0.4876297825574875, 0.4073721832036972, 0.415...",0.413796
162,23291,CNC1=CC=C(F)C(Cl)=C1,10,"[49, 91, 225, 418, 652, 779, 856, 938, 1297, 1...",50,0.261093,"[0.4299478089809417, 0.4020566514134407, 0.481...",0.424398
141,19742,NC(=O)C1=CC=C(Cl)C(Cl)=C1,11,"[613, 875, 918, 1057, 1104, 1160, 1367, 1417, ...",42,0.262383,"[0.555994657278061, 0.4205136322975158, 0.3213...",0.411792


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


number of fragments passing both toxicity filters under 0.5: 192
number of abx:  566
number of train set molecules:  1336
Checking analogues of compounds with and without fragments...


192it [2:43:32, 51.11s/it] 
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


number of fragments with >0.05 (or n/a) absolute value difference between analogues w/ and w/o frag:  179
final number of molecules to test:  667
length of df with purchasable mols:  161
length of all preds with tan abx < 0.5:  145
length of all preds with tan ts < 0.5:  74


Unnamed: 0,Name,smiles,hit,tan to nearest abx,tan to nearest ts,row_num,matching_frags,cluster
8,BRD-K96335988,FC(F)(F)c1ccc(cc1)Nc1nccc(n1)-c1cccc(c1)Cl,0.396698,0.305870,0.434483,0,[1],2
42,BRD-K20597075,CN(C)C(=O)N1CCN(CC1)c1ccc(c(c1)Cl)Cl,0.332390,0.323232,0.394619,1,"[49, 59]",3
66,BRD-A38251550,Clc1ccc2c(c1Cl)OC[C@H]2NC(=O)NCc1cccnc1,0.463516,0.423873,0.447650,2,"[103, 149]",6
67,BRD-A04409093,Fc1ccc(cc1Cl)N1CC[C@@H](Sc2cn[nH]n2)C1=O,0.338019,0.389135,0.392099,3,"[3, 70]",0
72,BRD-K31294592,Cc1ccc(cc1NC(=O)c1ccc(c(c1)Cl)Cl)-c1nc2ncccc2o1,0.321033,0.477233,0.472457,4,[4],8
...,...,...,...,...,...,...,...,...
610,BRD-A75992786,C[C@@H](Oc1ccc(cc1Cl)Cl)C(=O)Nc1c(cccc1C(=O)O)C,0.384132,0.385256,0.385850,69,[8],7
612,BRD-K58005623,Clc1cc(c(s1)Cl)C(=O)Nc1nc(cs1)-c1cccs1,0.405157,0.436316,0.485183,70,[0],5
636,BRD-A24280483,Cc1ncccc1C(=O)N[C@H]1CCN(C1=O)c1ccc(c(c1)F)Cl,0.322885,0.455271,0.488051,71,"[13, 14, 123]",0
649,BRD-K24188608,Fc1c(cccc1NCC1=NOC(=C2SC=NC2=C)N1)Cl,0.418261,0.427403,0.429091,72,[80],2
