In [1]:
%matplotlib inline
import pandas as pd
from pipeline import run_pipeline
# JV note: this notebook recapitulates two earlier notebooks:
#   19.0_redo18.0_no_enamine_mistake_fragment_prioritization_pipeline_v6_script_0.05frag_0.15mol_all_frags_800K_12_20_20221
#   19.0_PART2_final_mol_prioritization_corresponding_to_19.0_pipeline_v6_script_0.05frag_0.15mol_all_frags_800K_12_20_2022

  from .autonotebook import tqdm as notebook_tqdm
  from rdkit.Chem import MCS


In [None]:
# Amass fragment predictions from the different source datasets into a single
# SMILES-deduplicated table and write it to disk as one combined CSV.


def load_fragment_preds(path, source, label, smiles_col='SMILES'):
    """Load one fragment-prediction CSV as a (SMILES, ACTIVITY, Source) frame.

    Parameters
    ----------
    path : str
        CSV file of predictions; must contain `smiles_col` and 'ACTIVITY'.
    source : str
        Dataset name written to every row of the 'Source' column.
    label : str
        Short dataset name used only in the printed row-count message.
    smiles_col : str, default 'SMILES'
        Name of the SMILES column in the file; it is renamed to 'SMILES'.

    Returns
    -------
    pandas.DataFrame
        Columns 'SMILES', 'ACTIVITY', 'Source', in that order.
    """
    wanted = [smiles_col, 'ACTIVITY']
    preds = (
        # usecols skips every column we would discard anyway; the explicit
        # [wanted] selection pins the column order, which usecols does not.
        pd.read_csv(path, usecols=wanted)[wanted]
        .rename(columns={smiles_col: 'SMILES'})
        # assign() returns a new frame, so no chained assignment on a slice.
        .assign(Source=source)
    )
    display(preds.head(10))
    print(f'length of {label} preds: ', len(preds))
    return preds


# gdb11
df1 = load_fragment_preds(
    '../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_fragment_preds_gdb11_05_24_2022.csv',
    source='GDB-11',
    label='gdb11',
)

# gdb17 sample
df2 = load_fragment_preds(
    '../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_fragment_preds_gdb17_07_05_2022.csv',
    source='GDB-17 Sample',
    label='gdb17',
)

# enamine fragments (this file names its SMILES column in lowercase)
df3 = load_fragment_preds(
    '../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_enamine_18milfrags_10_13_2022.csv',
    source='Enamine',
    label='enamine',
    smiles_col='smiles',
)

# At first, just deduplicate on the raw SMILES string. drop_duplicates keeps the
# first occurrence, so a fragment present in several datasets keeps the row (and
# 'Source' label) of the earliest dataset in this concat order.
df = pd.concat([df1, df2, df3])
df = df.drop_duplicates('SMILES', ignore_index=True)
df.to_csv('../out/model_preds_on_frags_and_cpds/SA/combined_gdb11_gdb17_enamine_preds.csv', index=False)

In [2]:
# ACTUAL VALUES for SA round 5
# Configuration cell: every name assigned here is a plain module-level variable.
# Most of them are forwarded by keyword to run_pipeline() in the following cell.

# values for processing fragments and compounds
# fragment_path is the combined CSV written by the "amass fragments" cell.
fragment_path = '../out/model_preds_on_frags_and_cpds/SA/combined_gdb11_gdb17_enamine_preds.csv'
compound_path = '../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_with_metadata_05_20_2022.csv'
result_path = '../out/fragment_algorithm_pipeline_runs/10_SA_rd5/'
fragment_smi_col = 'SMILES'
compound_smi_col = 'smiles'
fragment_hit_col = 'ACTIVITY'
# NOTE(review): 'ACTIVTY' looks like a typo of 'ACTIVITY' (compare fragment_hit_col).
# Confirm against the header of the compound CSV before changing it - the column
# may genuinely be spelled this way in that file.
compound_hit_col = 'ACTIVTY'
cpd_name_col = 'Name'

# filters and thresholds for fragments and compounds
# NOTE(review): the notebook's header note references a "0.05frag_0.15mol" run,
# while fragment_score here is 0.1 - confirm which fragment threshold is intended.
fragment_score = 0.1
compound_score = 0.15
fragment_remove_pains_brenk = 'both' # one of 'both', 'pains', 'brenk', 'none'
compound_remove_pains_brenk = 'both' # one of 'both', 'pains', 'brenk', 'none'
fragment_druglikeness_filter = [] # list containing 'egan', 'ghose', 'lipinski', 'muegge'
compound_druglikeness_filter = [] # list containing 'egan', 'ghose', 'lipinski', 'muegge'
fragment_require_more_than_coh = True
fragment_remove_patterns = []

# input for matching and comparison to existing datasets
frags_cannot_disrupt_rings = True
# NOTE(review): the original comment says "must be bigger than 0" yet the value is 0.
# Presumably it means fragments must be larger than this (exclusive) threshold -
# confirm against the pipeline implementation.
fragment_length_threshold = 0 # must be bigger than 0
display_inline_candidates = False
analogues_pval_diff_thresh = 0.05
analogues_absolute_diff_thresh = 0.05

# toxicity
toxicity_threshold_if_present = 0.5
toxicity_threshold_require_presence = False

# antibiotics
abx_path = '../data/static_datasets/04052022_CLEANED_v5_antibiotics_across_many_classes.csv'
abx_smiles_col = 'Smiles'
abx_name_col = 'Name'
cpd_sim_to_abx = 0.7 # presumably a similarity cutoff vs. known antibiotics - TODO confirm metric

# training set
train_set_path = '../data/training_data/SA/combined_SA_screen_39K.csv'
train_set_smiles_col = 'SMILES'
train_set_name_col = 'Compound_ID'
cpd_sim_to_train_set = 0.9 # presumably a similarity cutoff vs. training compounds - TODO confirm metric

# purchasable libraries
purch_path = '../data/static_datasets/Broad_800K_purchasable.xlsx'
purch_name_col = 'BROADID'
purch_name_needs_split = True

# tested before libraries - can be more expansive than train set
# NOTE(review): none of the three tested_before_* values is among the keyword
# arguments of the run_pipeline() call in this notebook - confirm this is intended.
tested_before_path = ''
tested_before_name_col = 'Name'
tested_before_name_needs_split = False

In [3]:
# Collect the settings from the configuration cell into one keyword mapping
# (grouped by topic, one per line) and hand them to the pipeline in a single call.
pipeline_kwargs = dict(
    # input/output locations and column names
    fragment_path=fragment_path,
    compound_path=compound_path,
    result_path=result_path,
    fragment_smi_col=fragment_smi_col,
    compound_smi_col=compound_smi_col,
    fragment_hit_col=fragment_hit_col,
    compound_hit_col=compound_hit_col,
    # score thresholds and structural filters
    fragment_score=fragment_score,
    compound_score=compound_score,
    fragment_require_more_than_coh=fragment_require_more_than_coh,
    fragment_remove_pains_brenk=fragment_remove_pains_brenk,
    compound_remove_pains_brenk=compound_remove_pains_brenk,
    fragment_druglikeness_filter=fragment_druglikeness_filter,
    compound_druglikeness_filter=compound_druglikeness_filter,
    fragment_remove_patterns=fragment_remove_patterns,
    # fragment-to-compound matching
    frags_cannot_disrupt_rings=frags_cannot_disrupt_rings,
    fragment_length_threshold=fragment_length_threshold,
    # toxicity
    toxicity_threshold_if_present=toxicity_threshold_if_present,
    toxicity_threshold_require_presence=toxicity_threshold_require_presence,
    # known antibiotics
    abx_path=abx_path,
    abx_smiles_col=abx_smiles_col,
    abx_name_col=abx_name_col,
    # training set
    train_set_path=train_set_path,
    train_set_smiles_col=train_set_smiles_col,
    train_set_name_col=train_set_name_col,
    # analogue comparison and display
    analogues_pval_diff_thresh=analogues_pval_diff_thresh,
    analogues_absolute_diff_thresh=analogues_absolute_diff_thresh,
    cpd_name_col=cpd_name_col,
    display_inline_candidates=display_inline_candidates,
    # purchasable library
    purch_path=purch_path,
    purch_name_col=purch_name_col,
    purch_name_needs_split=purch_name_needs_split,
    # similarity cutoffs
    cpd_sim_to_abx=cpd_sim_to_abx,
    cpd_sim_to_train_set=cpd_sim_to_train_set,
)
run_pipeline(**pipeline_kwargs)


Processing fragments...
length of df:  18338026


KeyboardInterrupt: 