In [1]:
%matplotlib inline
import pandas as pd
from pipeline import run_pipeline
# JV note: this notebook recapitulates 5_fragment_prioritization_pipeline_v3_06_10_2022.

  from .autonotebook import tqdm as notebook_tqdm
  from rdkit.Chem import MCS


In [2]:
# Attach compound metadata to the Broad prediction table and save the result.
# The predictions are left-joined to the public-structures table on SMILES,
# reduced to one row per unique 'smiles' value, and trimmed to the three
# columns selected at the end of the chain.
broad800k = pd.read_csv('../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_05_20_2022.csv')
metadata = pd.read_csv('../data/static_datasets/PublicStructures.txt', sep='\t')  # tab-separated text file
broad800k = (
    broad800k
    # how='left' keeps every prediction row, even when no metadata row matches
    .merge(metadata, how='left', left_on='smiles', right_on='SMILES')
    # the join can repeat a prediction when several metadata rows share a SMILES
    .drop_duplicates(subset='smiles')
    [['smiles', 'Name', 'ACTIVITY']]
)
# to_csv is called with default options, so the DataFrame index is written too.
broad800k.to_csv('../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_with_metadata_05_20_2022.csv')

In [3]:
# Configuration for SA round 1 -- these are the ACTUAL VALUES used for the run.

# ---- input / output locations ----
fragment_path = '../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_fragment_preds_gdb11_05_24_2022.csv'
compound_path = '../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_with_metadata_05_20_2022.csv'
result_path = '../out/fragment_algorithm_pipeline_runs/06_SA_rd1/'

# ---- column names in the fragment and compound tables ----
fragment_smi_col = 'SMILES'
compound_smi_col = 'smiles'
fragment_hit_col = compound_hit_col = 'ACTIVITY'  # same score column name in both tables
cpd_name_col = 'Name'

# ---- score thresholds and structural filters ----
# The run log reports the rows remaining at '>0.05' (fragments) and '>0.2' (compounds).
fragment_score = 0.05
compound_score = 0.2
# Allowed values: 'both', 'pains', 'brenk', 'none'
fragment_remove_pains_brenk = compound_remove_pains_brenk = 'both'
# Each list may contain any of 'egan', 'ghose', 'lipinski', 'muegge'; both are empty here
fragment_druglikeness_filter = []
compound_druglikeness_filter = []
fragment_require_more_than_coh = False  # presumably "more than C/O/H atoms" -- TODO confirm in pipeline.py
fragment_remove_patterns = []  # empty: no removal patterns supplied

# ---- fragment/compound matching and analogue comparison ----
frags_cannot_disrupt_rings = False
fragment_length_threshold = 10  # must be bigger than 10
display_inline_candidates = False
analogues_pval_diff_thresh = 0
# The run log describes this as the absolute value difference between
# analogues with and without the fragment.
analogues_absolute_diff_thresh = 0.05

# ---- toxicity ----
# technically for this round, we forced them to have a value for toxicity
toxicity_threshold_if_present = 0.5
toxicity_threshold_require_presence = True

# ---- known antibiotics reference set ----
abx_path = '../data/static_datasets/04052022_CLEANED_v5_antibiotics_across_many_classes.csv'
abx_smiles_col = 'Smiles'
abx_name_col = 'Name'
cpd_sim_to_abx = 0

# ---- training set (path left empty) ----
train_set_path = ''
train_set_smiles_col = 'SMILES'
train_set_name_col = 'Name'
cpd_sim_to_train_set = 0

# ---- purchasable libraries (path left empty) ----
purch_path = ''
purch_name_col = 'BROADID'
purch_name_needs_split = False

# ---- previously tested libraries - can be more expansive than the train set ----
# (path left empty)
tested_before_path = ''
tested_before_name_col = 'Name'
tested_before_name_needs_split = False

In [4]:
# Run the fragment-prioritization pipeline with the settings defined above.
# Every argument is passed by keyword, so the grouping below is for readability only.
# Note: the tested_before_* settings are not forwarded by this call.
run_pipeline(
    # input / output locations
    fragment_path=fragment_path,
    compound_path=compound_path,
    result_path=result_path,
    # column names
    fragment_smi_col=fragment_smi_col,
    compound_smi_col=compound_smi_col,
    fragment_hit_col=fragment_hit_col,
    compound_hit_col=compound_hit_col,
    cpd_name_col=cpd_name_col,
    # score thresholds and structural filters
    fragment_score=fragment_score,
    compound_score=compound_score,
    fragment_require_more_than_coh=fragment_require_more_than_coh,
    fragment_remove_pains_brenk=fragment_remove_pains_brenk,
    compound_remove_pains_brenk=compound_remove_pains_brenk,
    fragment_druglikeness_filter=fragment_druglikeness_filter,
    compound_druglikeness_filter=compound_druglikeness_filter,
    fragment_remove_patterns=fragment_remove_patterns,
    # matching and analogue comparison
    frags_cannot_disrupt_rings=frags_cannot_disrupt_rings,
    fragment_length_threshold=fragment_length_threshold,
    analogues_pval_diff_thresh=analogues_pval_diff_thresh,
    analogues_absolute_diff_thresh=analogues_absolute_diff_thresh,
    display_inline_candidates=display_inline_candidates,
    # toxicity
    toxicity_threshold_if_present=toxicity_threshold_if_present,
    toxicity_threshold_require_presence=toxicity_threshold_require_presence,
    # known antibiotics
    abx_path=abx_path,
    abx_smiles_col=abx_smiles_col,
    abx_name_col=abx_name_col,
    cpd_sim_to_abx=cpd_sim_to_abx,
    # training set
    train_set_path=train_set_path,
    train_set_smiles_col=train_set_smiles_col,
    train_set_name_col=train_set_name_col,
    cpd_sim_to_train_set=cpd_sim_to_train_set,
    # purchasable libraries
    purch_path=purch_path,
    purch_name_col=purch_name_col,
    purch_name_needs_split=purch_name_needs_split,
)


Processing fragments...
length of df:  27523571
length of df >0.05:  238434
length of df with valid mols:  238434
length of all preds with clean (no PAINS or Brenk) mols:  81864

Processing compounds...
length of df:  798200
length of df >0.2:  7236
length of df with valid mols:  7236
length of all preds with clean (no PAINS or Brenk) mols:  3673

Matching fragments in compounds...
number of matched fragments:  181
Previewing dataframe so far...


Unnamed: 0,matched_fragments,fragment_SMILES,length_of_fragment,matched_molecules,number_of_matched_molecules,fragment_scores,full_molecule_scores,average_molecule_score
179,81857,ClC1=CC=CC=C1Cl,8,"[32, 46, 55, 86, 94, 109, 120, 125, 162, 175, ...",263,0.252329,"[0.2011455439031124, 0.2453069861978292, 0.224...",0.252813
67,76739,CNC1=CC=C(Cl)C(Cl)=C1,10,"[32, 94, 194, 200, 237, 263, 265, 377, 405, 46...",109,0.171429,"[0.2011455439031124, 0.2071893790736794, 0.249...",0.23948
2,1496,CCCc1ccc(O)cc1,10,"[0, 154, 255, 257, 277, 284, 300, 302, 306, 35...",106,0.07036,"[0.203643961623311, 0.5237967811524868, 0.2132...",0.270202
174,81806,CC1=CC=C(Cl)C(Cl)=C1,9,"[46, 55, 109, 120, 200, 201, 203, 300, 378, 42...",96,0.083356,"[0.2453069861978292, 0.2249952647835016, 0.235...",0.257822
42,42396,CCCCc1ccc(O)cc1,11,"[0, 255, 257, 277, 284, 300, 354, 431, 443, 47...",71,0.108027,"[0.203643961623311, 0.2132368322461843, 0.2206...",0.251289


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


number of fragments passing both toxicity filters under 0.5: 110
number of abx:  566
Checking analogues of compounds with and without fragments...


77it [1:35:13, 74.20s/it] 
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


number of fragments with >0.05 (or n/a) absolute value difference between analogues w/ and w/o frag:  63
final number of molecules to test:  271


Unnamed: 0,Name,smiles,ACTIVITY,tan to nearest abx,row_num,matching_frags,cluster
0,BRD-K26589290,Oc1ccc(cc1)C1(CCCCC1)c1ccc(cc1)O,0.203644,0.370023,0,"[0, 7]",20
1,BRD-K07179331,Oc1cc(cc(c1)C1CCCCC1)C1CCCCC1,0.312106,0.425824,1,[61],0
2,BRD-K01826743,CN[C@@H](CC(C)C)C(=O)N[C@H]1[C@@H](O)c2ccc(c(c...,0.542936,1.000000,2,"[2, 9, 10]",8
3,BRD-K81916719,CSc1nc2cc(c(cc2[nH]1)Oc1cccc(c1Cl)Cl)Cl,0.451400,0.332386,3,[28],26
4,BRD-K01649396,CN[C@@H]1C[C@@H](c2ccc(c(c2)Cl)Cl)c2ccccc21,0.242427,0.339157,4,[4],12
...,...,...,...,...,...,...,...
266,BRD-K82744348,CCCC(c1cc(c(cc1C)O)C(C)(C)C)c1cc(c(cc1C)O)C(C)...,0.293666,0.331486,266,"[0, 17]",32
267,BRD-A37078961,COc1ccc(c(c1)OC)C1=NN(c2nc(cs2)C(=O)O)[C@@](O)...,0.245453,0.663992,267,"[0, 32]",6
268,BRD-A57439040,CC1=C([C@@H](NC(=O)N1)c1ccc(c(c1)Cl)Cl)C(=O)OC...,0.226732,0.520189,268,"[4, 11]",25
269,BRD-A71348525,COc1cc(c(cc1OCC(=O)O)Cl)[C@H]1NC(=O)NC(=C1C(=O...,0.200840,0.588819,269,"[0, 1]",38
