In [1]:
%matplotlib inline
import pandas as pd
from pipeline import run_pipeline 

  from .autonotebook import tqdm as notebook_tqdm
  from rdkit.Chem import MCS


In [2]:
# =============================================================================
# Run configuration -- ACTUAL VALUES for NG round 1
# NOTE(review): result_path below ends in '13_NG_rd2/' although this header
# says "round 1" -- confirm which round this notebook is meant to describe.
# =============================================================================

# --- Input prediction tables and output directory ---------------------------
fragment_path = "../out/model_preds_on_frags_and_cpds/NG/enamine_frags_18mil_with_FINALbayHO11152022_melis_predictions_11_18_2022.csv"
compound_path = "../out/model_preds_on_frags_and_cpds/NG/extended_screen_set_with_FINALbayHO11152022_melis_predictions_11_22_2022.csv"
result_path = "../out/fragment_algorithm_pipeline_runs/13_NG_rd2/"

# --- Column names in the two prediction tables ------------------------------
# Note the different capitalisation of the SMILES column between the files.
fragment_smi_col = "smiles"
fragment_hit_col = "hit"
compound_smi_col = "SMILES"
compound_hit_col = "hit"
cpd_name_col = "SMILES"  # same column as compound_smi_col is used as the name

# --- Score cutoffs ----------------------------------------------------------
# The pipeline log reports the number of rows scoring above each cutoff.
fragment_score = 0.25
compound_score = 0.5

# --- Structural-alert and druglikeness filters ------------------------------
# PAINS/Brenk setting: one of 'both', 'pains', 'brenk', 'none'.
fragment_remove_pains_brenk = "both"
compound_remove_pains_brenk = "both"
# Druglikeness rules: list containing 'egan', 'ghose', 'lipinski', 'muegge'
# (empty list = none selected).
fragment_druglikeness_filter = []
compound_druglikeness_filter = []
# The pipeline log reports this step as "more than C,O,H characters".
fragment_require_more_than_coh = True

# --- Substructures to exclude from fragments (SMILES) -----------------------
# NOTE(review): in the recorded run the nitrofuran pattern removed no
# fragments (16112 before and after) -- confirm the pattern is written as
# intended.
nitrofuran = "O=[N+](O)c1ccco1"
sulfonamide = "NS(=O)=O"
quinolone = "O=c1cc[nH]c2ccccc12"
fragment_remove_patterns = [
    nitrofuran,
    sulfonamide,
    quinolone,
]

# --- Fragment/compound matching and analogue comparison ---------------------
frags_cannot_disrupt_rings = True
# Original comment read "must be bigger than 0" next to a value of 0.
# NOTE(review): presumably fragments must be longer than this threshold
# rather than the threshold itself being > 0 -- verify against the pipeline.
fragment_length_threshold = 0
display_inline_candidates = False
analogues_pval_diff_thresh = 0
# Logged by the pipeline as the ">0.05 (or n/a) absolute value difference
# between analogues w/ and w/o frag" filter.
analogues_absolute_diff_thresh = 0.05

# --- Toxicity ---------------------------------------------------------------
toxicity_threshold_if_present = 0.5
toxicity_threshold_require_presence = False

# --- Known antibiotics reference set ----------------------------------------
abx_path = "../data/static_datasets/04052022_CLEANED_v5_antibiotics_across_many_classes.csv"
abx_smiles_col = "Smiles"
abx_name_col = "Name"
# Logged by the pipeline as the "tan abx < 0.5" filter.
cpd_sim_to_abx = 0.5

# --- Training set -----------------------------------------------------------
train_set_path = "../data/training_data/NG/FULL_10_26_2022.csv"
train_set_smiles_col = "SMILES"
train_set_name_col = "Name"
train_set_hit_col = "hit"
train_set_just_actives = True
train_set_thresh = 0.5
train_set_greater_than = True
# Logged by the pipeline as the "tan ts < 0.5" filter.
cpd_sim_to_train_set = 0.5

# --- Previously tested libraries - can be more expansive than train set -----
# NOTE(review): these three values are not forwarded to run_pipeline in the
# call cell of this notebook -- confirm whether that is intentional.
tested_before_path = "../data/training_data/NG/FULL_10_26_2022.csv"
tested_before_name_col = "Name"
tested_before_name_needs_split = False

In [3]:
# Run the pipeline with the values set in the configuration cell. Every
# argument is passed by keyword, one per line, grouped by concern.
# The call is left as the cell's final expression so that any value it
# returns is still displayed by the notebook.
# NOTE(review): tested_before_path, tested_before_name_col and
# tested_before_name_needs_split are defined in the configuration cell but
# are not passed here -- confirm whether run_pipeline should receive them.
run_pipeline(
    # input tables and output directory
    fragment_path=fragment_path,
    compound_path=compound_path,
    result_path=result_path,
    # column names
    fragment_smi_col=fragment_smi_col,
    compound_smi_col=compound_smi_col,
    fragment_hit_col=fragment_hit_col,
    compound_hit_col=compound_hit_col,
    cpd_name_col=cpd_name_col,
    # score cutoffs
    fragment_score=fragment_score,
    compound_score=compound_score,
    # structural filters
    fragment_require_more_than_coh=fragment_require_more_than_coh,
    fragment_remove_pains_brenk=fragment_remove_pains_brenk,
    compound_remove_pains_brenk=compound_remove_pains_brenk,
    fragment_druglikeness_filter=fragment_druglikeness_filter,
    compound_druglikeness_filter=compound_druglikeness_filter,
    fragment_remove_patterns=fragment_remove_patterns,
    # fragment/compound matching and analogue comparison
    frags_cannot_disrupt_rings=frags_cannot_disrupt_rings,
    fragment_length_threshold=fragment_length_threshold,
    analogues_pval_diff_thresh=analogues_pval_diff_thresh,
    analogues_absolute_diff_thresh=analogues_absolute_diff_thresh,
    display_inline_candidates=display_inline_candidates,
    # toxicity
    toxicity_threshold_if_present=toxicity_threshold_if_present,
    toxicity_threshold_require_presence=toxicity_threshold_require_presence,
    # known antibiotics
    abx_path=abx_path,
    abx_smiles_col=abx_smiles_col,
    abx_name_col=abx_name_col,
    cpd_sim_to_abx=cpd_sim_to_abx,
    # training set
    train_set_path=train_set_path,
    train_set_smiles_col=train_set_smiles_col,
    train_set_name_col=train_set_name_col,
    train_set_hit_col=train_set_hit_col,
    train_set_just_actives=train_set_just_actives,
    train_set_thresh=train_set_thresh,
    train_set_greater_than=train_set_greater_than,
    cpd_sim_to_train_set=cpd_sim_to_train_set,
)


Processing fragments...
length of df:  18338026
length of df >0.25:  16119
length of df with more than C,O,H characters:  16112
length of df with valid mols:  16112
length of all preds with clean (no PAINS or Brenk) mols:  16112


  mcs = MCS.FindMCS([mol, pattern_mol], atomCompare='elements',completeRingsOnly = True)


length of df with no O=[N+](O)c1ccco1:  16112
length of df with no NS(=O)=O:  14715
length of df with no O=c1cc[nH]c2ccccc12:  14444

Processing compounds...
length of df:  5348852
length of df >0.5:  54850
length of df with valid mols:  54850
length of all preds with clean (no PAINS or Brenk) mols:  9126

Matching fragments in compounds...
number of matched fragments:  135
Previewing dataframe so far...


Unnamed: 0,matched_fragments,fragment_SMILES,length_of_fragment,matched_molecules,number_of_matched_molecules,fragment_scores,full_molecule_scores,average_molecule_score
74,9201,NC(=O)C1=CC=C(Cl)C(Cl)=C1,11,"[5, 30, 87, 90, 180, 221, 514, 556, 557, 558, ...",190,0.262383,"[0.5937710893154144, 0.7019818139076233, 0.591...",0.581501
93,10798,CNC1=CC=C(F)C(Cl)=C1,10,"[8, 75, 314, 636, 647, 963, 967, 1046, 1187, 1...",73,0.261093,"[0.5087220023572445, 0.5270452889800071, 0.585...",0.580187
31,2847,CC(=O)NCC1=CC=C(Cl)C(Cl)=C1,13,"[35, 36, 227, 233, 234, 235, 1022, 1023, 1027,...",55,0.331402,"[0.5706801253557205, 0.6045532840490341, 0.540...",0.582468
23,2182,CNC1=CN=C2C=C(Cl)C(Cl)=CC2=N1,14,"[4058, 4059, 4060, 4061, 4063, 4064, 4065, 406...",36,0.496916,"[0.6016030287742615, 0.5229667621850967, 0.546...",0.551668
10,793,CS(=O)C1=CC=C(Cl)C(Cl)=C1,11,"[315, 536, 773, 775, 776, 917, 921, 1856, 1857...",34,0.272349,"[0.7733694469928741, 0.5540057808160782, 0.597...",0.594385


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


number of fragments passing both toxicity filters under 0.5: 113
number of abx:  566
number of train set molecules:  1336
Checking analogues of compounds with and without fragments...


113it [14:04:59, 448.67s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


number of fragments with >0.05 (or n/a) absolute value difference between analogues w/ and w/o frag:  105
final number of molecules to test:  593
length of all preds with tan abx < 0.5:  490
length of all preds with tan ts < 0.5:  376


Unnamed: 0,SMILES,hit,tan to nearest abx,tan to nearest ts,row_num,matching_frags,cluster
0,CC1=NN=C(NC(=O)C2=CC(Cl)=C(Cl)C=C2)S1,0.580516,0.268247,0.449482,0,[0],27
1,CC(=O)C1=CC=C(NS(=O)(=O)C2=CC=CC(=C2)S(=O)(=O)...,0.509975,0.348165,0.409917,1,[4],56
2,CC1=NN(C2=CC=C(NC(C3=CC=C(Cl)C(Cl)=C3)=O)C=C2)...,0.593771,0.485962,0.491559,2,[0],18
3,O=C(C1=C(Cl)C=CC=C1)NC(C(NC(C2=C(Cl)C=CC=C2)=O...,0.508722,0.348801,0.466387,3,[1],67
4,ClC1=C(Cl)C=C(C=C1)C(=O)NC1CCC(CC2CCC(CC2)NC(=...,0.613747,0.355731,0.325617,4,[0],2
...,...,...,...,...,...,...,...
582,ClC1=C(Cl)C=C(C=C1)C(=O)NC1=NN=C(S1)C1=CN=CC=C1,0.616137,0.379512,0.402768,371,[0],30
588,ClC1=CN=C(NC(=O)C2=C(Cl)C(Cl)=CC=C2)C=C1,0.501333,0.292776,0.475983,372,[7],37
589,Cl.COC(=O)C1=C(NC(=O)NCC2=CC=C(Cl)C(Cl)=C2)C=C...,0.534548,0.329538,0.418256,373,"[10, 11]",66
590,FC1=CC=C(NC2=NN=C(S2)C2=NC=CN=C2)C=C1Cl,0.551679,0.359815,0.359662,374,[1],8
