In [None]:
%matplotlib inline
import pandas as pd
from pipeline import run_pipeline

# Save metadata

In [None]:
# Build the two compound-prediction tables that are written to disk here:
#   1. the Broad 800K predictions merged with the PublicStructures metadata, and
#   2. that table stacked on top of the 5 mil orderable-compound predictions.

# broad800k
broad800k = pd.read_csv('../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_05_20_2022.csv')
metadata = pd.read_csv('../data/static_datasets/PublicStructures.txt', sep = '\t')
# left join on SMILES so that every prediction row survives even without a metadata match
broad800k = broad800k.merge(metadata, left_on = 'smiles', right_on = 'SMILES', how = 'left')
# keep one row per SMILES (the first occurrence)
broad800k = broad800k.drop_duplicates('smiles')
broad800k = broad800k[['smiles', 'Name', 'ACTIVITY']]
# NOTE(review): unlike the combined file written at the end of this cell, this CSV is
# written with the DataFrame index as an extra leading column - confirm whether
# index = False is wanted here before changing it, since existing outputs were
# produced from this layout.
broad800k.to_csv('../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_with_metadata_05_20_2022.csv')

# orderable 5 mil molecules
orderable = pd.read_csv('../out/model_preds_on_frags_and_cpds/SA/combined_orderable_5mil_preds_08_01_2022.csv')
# alias the columns to the names used by broad800k so the two tables line up
orderable = orderable.assign(smiles = orderable['SMILES'], Name = orderable['Database'])
orderable = orderable[['smiles', 'Name', 'ACTIVITY']]

# combine them
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat gives the same row-wise stacking (broad800k rows first).
full_cpd_df = pd.concat([broad800k, orderable])
full_cpd_df.to_csv('../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_and_5mil_orderable_preds.csv', index = False)

# SA Rd 1

In [None]:
# JV note: recapitulates 5_fragment_prioritization_pipeline_v3_06_10_2022

# All settings for this round are collected in one mapping and unpacked into the call.
rd1_settings = {
    # input tables and output directory
    'fragment_path': '../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_fragment_preds_gdb11_05_24_2022.csv',
    'compound_path': '../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_with_metadata_05_20_2022.csv',
    'result_path': '../out/fragment_algorithm_pipeline_runs/06_SA_rd1/',
    # column names in the fragment / compound tables
    'fragment_smi_col': 'SMILES',
    'compound_smi_col': 'smiles',
    'fragment_hit_col': 'ACTIVITY',
    'compound_hit_col': 'ACTIVITY',
    'cpd_name_col': 'Name',
    # score cut-offs
    'fragment_score': 0.05,
    'compound_score': 0.2,
    # substructure / druglikeness filters
    'fragment_remove_pains_brenk': 'both',
    'compound_remove_pains_brenk': 'both',
    'fragment_druglikeness_filter': [],
    'compound_druglikeness_filter': [],
    'fragment_require_more_than_coh': False,
    'fragment_remove_patterns': [],
    'frags_cannot_disrupt_rings': False,
    'fragment_length_threshold': 10,
    'display_inline_candidates': False,
    # analogue comparison thresholds
    'analogues_pval_diff_thresh': 0,
    'analogues_absolute_diff_thresh': 0.05,
    # toxicity settings
    'toxicity_threshold_if_present': 0.5,
    'toxicity_threshold_require_presence': True,
    # antibiotic reference table
    'abx_path': '../data/static_datasets/04052022_CLEANED_v5_antibiotics_across_many_classes.csv',
    'abx_smiles_col': 'Smiles',
    'abx_name_col': 'Name',
    'cpd_sim_to_abx': 0,
    # training-set table (empty path in this round)
    'train_set_path': '',
    'train_set_smiles_col': 'SMILES',
    'train_set_name_col': 'Name',
    'cpd_sim_to_train_set': 0,
    # purchasable-compound table (empty path in this round)
    'purch_path': '',
    'purch_name_col': 'BROADID',
    'purch_name_needs_split': False,
}

run_pipeline(**rd1_settings)


# SA Rd 2

In [None]:
# JV note: recapitulates 13.1_fragment_prioritization_pipeline_v5_script_0.05_17atom_on_800K_and_orderable and out/pipeline_v5_script/frag_0.05_17atom_800K_and_5mil

# All settings for this round are collected in one mapping and unpacked into the call.
rd2_settings = {
    # input tables and output directory
    'fragment_path': '../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_fragment_preds_gdb17_07_05_2022.csv',
    'compound_path': '../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_and_5mil_orderable_preds.csv',
    'result_path': '../out/fragment_algorithm_pipeline_runs/07_SA_rd2/',
    # column names in the fragment / compound tables
    'fragment_smi_col': 'SMILES',
    'compound_smi_col': 'smiles',
    'fragment_hit_col': 'ACTIVITY',
    'compound_hit_col': 'ACTIVITY',
    'cpd_name_col': 'Name',
    # score cut-offs
    'fragment_score': 0.05,
    'compound_score': 0.2,
    # substructure / druglikeness filters
    'fragment_remove_pains_brenk': 'both',
    'compound_remove_pains_brenk': 'both',
    'fragment_druglikeness_filter': [],
    'compound_druglikeness_filter': [],
    'fragment_require_more_than_coh': True,
    'fragment_remove_patterns': [],
    'frags_cannot_disrupt_rings': True,
    'fragment_length_threshold': 0,
    'display_inline_candidates': True,
    # analogue comparison thresholds
    'analogues_pval_diff_thresh': 0,
    'analogues_absolute_diff_thresh': 0.05,
    # toxicity settings
    'toxicity_threshold_if_present': 0.5,
    'toxicity_threshold_require_presence': False,
    # antibiotic reference table
    'abx_path': '../data/static_datasets/04052022_CLEANED_v5_antibiotics_across_many_classes.csv',
    'abx_smiles_col': 'Smiles',
    'abx_name_col': 'Name',
    'cpd_sim_to_abx': 0,
    # training-set table
    'train_set_path': '../data/training_data/SA/37K_sa_screen.csv',
    'train_set_smiles_col': 'SMILES',
    'train_set_name_col': 'Compound_ID',
    'cpd_sim_to_train_set': 0,
    # purchasable-compound table (empty path in this round)
    'purch_path': '',
    'purch_name_col': 'BROADID',
    'purch_name_needs_split': False,
    # previously-tested table (empty path in this round)
    'tested_before_path': '',
    'tested_before_name_col': 'Name',
    'tested_before_name_needs_split': False,
}

run_pipeline(**rd2_settings)

# SA Rd 3

In [None]:
# JV note: recapitulates 16.2_fragment_prioritization_pipeline_v5_script_0.05frag_0.15mol_enamine18milfrags_800K;

# SA round 3: Enamine fragment predictions against the Broad 800K compound table.
run_pipeline(
    fragment_path='../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_enamine_18milfrags_10_13_2022.csv',
    compound_path='../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_with_metadata_05_20_2022.csv',
    result_path='../out/fragment_algorithm_pipeline_runs/08_SA_rd3/',
    # NOTE(review): the fragment-collation cell in this notebook reads this same Enamine
    # file through a lowercase 'smiles' column - confirm an uppercase 'SMILES' column exists too.
    fragment_smi_col='SMILES',
    compound_smi_col='smiles',
    fragment_hit_col='ACTIVITY',
    # was 'ACTIVTY' (typo): the compound table is written by this notebook with the
    # columns smiles / Name / ACTIVITY, so the misspelled name matches no column.
    compound_hit_col='ACTIVITY',
    cpd_name_col='Name',
    # NOTE(review): the notebook name quoted above says "0.05frag" but 0.1 is passed here -
    # confirm which fragment threshold is intended.
    fragment_score=0.1,
    compound_score=0.15,
    fragment_remove_pains_brenk='both',
    compound_remove_pains_brenk='both',
    fragment_druglikeness_filter=[],
    compound_druglikeness_filter=[],
    fragment_require_more_than_coh=True,
    fragment_remove_patterns=[],
    frags_cannot_disrupt_rings=True,
    fragment_length_threshold=0,
    display_inline_candidates=False,
    analogues_pval_diff_thresh=0,
    analogues_absolute_diff_thresh=0.05,
    toxicity_threshold_if_present=0.5,
    toxicity_threshold_require_presence=False,
    abx_path='../data/static_datasets/04052022_CLEANED_v5_antibiotics_across_many_classes.csv',
    abx_smiles_col='Smiles',
    abx_name_col='Name',
    cpd_sim_to_abx=0,
    train_set_path='../data/training_data/SA/37K_sa_screen.csv',
    train_set_smiles_col='SMILES',
    train_set_name_col='Compound_ID',
    cpd_sim_to_train_set=0,
    purch_path='../data/static_datasets/Broad_800K_purchasable.xlsx',
    purch_name_col='BROADID',
    purch_name_needs_split=True,
    tested_before_path='',
    tested_before_name_col='Name',
    tested_before_name_needs_split=False
)

# SA Rd 4

In [None]:
# JV note: recapitulates 18.0_fragment_prioritization_pipeline_v6_script_0.05frag_0.15mol_all_frags_800K_12_12_2022; 
# also 18.0_PART2_final_mol_prioritization_corresponding_to_18.0_pipeline_v6_script_0.05frag_0.15mol_all_frags_800K_12_19_2022;

# SA round 4: Enamine fragment predictions against the Broad 800K compound table,
# with non-zero similarity settings for the antibiotic and training-set tables.
run_pipeline(
    fragment_path='../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_enamine_18milfrags_10_13_2022.csv',
    compound_path='../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_with_metadata_05_20_2022.csv',
    result_path='../out/fragment_algorithm_pipeline_runs/09_SA_rd4/',
    # NOTE(review): the fragment-collation cell in this notebook reads this same Enamine
    # file through a lowercase 'smiles' column - confirm an uppercase 'SMILES' column exists too.
    fragment_smi_col='SMILES',
    compound_smi_col='smiles',
    fragment_hit_col='ACTIVITY',
    # was 'ACTIVTY' (typo): the compound table is written by this notebook with the
    # columns smiles / Name / ACTIVITY, so the misspelled name matches no column.
    compound_hit_col='ACTIVITY',
    cpd_name_col='Name',
    # NOTE(review): the notebook names quoted above say "0.05frag" but 0.1 is passed here -
    # confirm which fragment threshold is intended.
    fragment_score=0.1,
    compound_score=0.15,
    fragment_remove_pains_brenk='both',
    compound_remove_pains_brenk='both',
    fragment_druglikeness_filter=[],
    compound_druglikeness_filter=[],
    fragment_require_more_than_coh=True,
    fragment_remove_patterns=[],
    frags_cannot_disrupt_rings=True,
    fragment_length_threshold=0,
    display_inline_candidates=False,
    analogues_pval_diff_thresh=0.05,
    analogues_absolute_diff_thresh=0,
    toxicity_threshold_if_present=0.5,
    toxicity_threshold_require_presence=False,
    abx_path='../data/static_datasets/04052022_CLEANED_v5_antibiotics_across_many_classes.csv',
    abx_smiles_col='Smiles',
    abx_name_col='Name',
    cpd_sim_to_abx=0.7,
    train_set_path='../data/training_data/SA/combined_SA_screen_39K.csv',
    train_set_smiles_col='SMILES',
    train_set_name_col='Compound_ID',
    cpd_sim_to_train_set=1.0,
    purch_path='../data/static_datasets/Broad_800K_purchasable.xlsx',
    purch_name_col='BROADID',
    purch_name_needs_split=True,
    tested_before_path='',
    tested_before_name_col='Name',
    tested_before_name_needs_split=False
)

# SA Rd 5

### Collate fragments from different datasets

In [None]:
def _load_fragment_preds(path, source, smiles_col = 'SMILES'):
    """Read one fragment-prediction CSV into a SMILES / ACTIVITY / Source frame.

    Parameters
    ----------
    path : str
        CSV file holding at least the columns `smiles_col` and 'ACTIVITY'.
    source : str
        Label written into every row of the added 'Source' column.
    smiles_col : str
        Name of the SMILES column in the file; it is renamed to 'SMILES'.

    Returns
    -------
    pandas.DataFrame
        Columns 'SMILES', 'ACTIVITY', 'Source', in that order.
    """
    wanted = [smiles_col, 'ACTIVITY']
    # usecols avoids parsing columns that are dropped anyway; it does not fix the
    # column order, hence the explicit re-selection afterwards.
    preds = pd.read_csv(path, usecols = wanted)[wanted]
    # rename + assign build a new frame rather than writing a column onto a
    # column-subset frame, which is the pattern behind SettingWithCopyWarning.
    return preds.rename(columns = {smiles_col: 'SMILES'}).assign(Source = source)

# gdb11
df1 = _load_fragment_preds('../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_fragment_preds_gdb11_05_24_2022.csv', 'GDB-11')
display(df1.iloc[0:10])
print('length of gdb11 preds: ', len(df1))

# gdb17 sample
df2 = _load_fragment_preds('../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_fragment_preds_gdb17_07_05_2022.csv', 'GDB-17 Sample')
display(df2.iloc[0:10])
print('length of gdb17 preds: ', len(df2))

# enamine fragments (this file names its SMILES column in lowercase)
df3 = _load_fragment_preds('../out/model_preds_on_frags_and_cpds/SA/FINISHED_combined_enamine_18milfrags_10_13_2022.csv', 'Enamine', smiles_col = 'smiles')
display(df3.iloc[0:10])
print('length of enamine preds: ', len(df3))

# Stack in the order GDB-11, GDB-17, Enamine; drop_duplicates keeps the first
# occurrence, so a SMILES present in several sources keeps the earliest source's row.
df = pd.concat([df1, df2, df3])
df = df.drop_duplicates('SMILES', ignore_index = True) # at first, just deduplicate based on SMILES
df.to_csv('../out/model_preds_on_frags_and_cpds/SA/combined_gdb11_gdb17_enamine_preds.csv', index = False)

In [None]:
# JV note: recapitulates 19.0_redo18.0_no_enamine_mistake_fragment_prioritization_pipeline_v6_script_0.05frag_0.15mol_all_frags_800K_12_20_20221
# also 19.0_PART2_final_mol_prioritization_corresponding_to_19.0_pipeline_v6_script_0.05frag_0.15mol_all_frags_800K_12_20_2022

# SA round 5: the collated GDB-11 / GDB-17 / Enamine fragment predictions against
# the Broad 800K compound table.
run_pipeline(
    fragment_path='../out/model_preds_on_frags_and_cpds/SA/combined_gdb11_gdb17_enamine_preds.csv',
    compound_path='../out/model_preds_on_frags_and_cpds/SA/combined_broad_preds_with_metadata_05_20_2022.csv',
    result_path='../out/fragment_algorithm_pipeline_runs/10_SA_rd5/',
    fragment_smi_col='SMILES',
    compound_smi_col='smiles',
    fragment_hit_col='ACTIVITY',
    # was 'ACTIVTY' (typo): the compound table is written by this notebook with the
    # columns smiles / Name / ACTIVITY, so the misspelled name matches no column.
    compound_hit_col='ACTIVITY',
    cpd_name_col='Name',
    # NOTE(review): the notebook names quoted above say "0.05frag" but 0.1 is passed here -
    # confirm which fragment threshold is intended.
    fragment_score=0.1,
    compound_score=0.15,
    fragment_remove_pains_brenk='both',
    compound_remove_pains_brenk='both',
    fragment_druglikeness_filter=[],
    compound_druglikeness_filter=[],
    fragment_require_more_than_coh=True,
    fragment_remove_patterns=[],
    frags_cannot_disrupt_rings=True,
    fragment_length_threshold=0,
    display_inline_candidates=False,
    analogues_pval_diff_thresh=0.05,
    analogues_absolute_diff_thresh=0.05,
    toxicity_threshold_if_present=0.5,
    toxicity_threshold_require_presence=False,
    abx_path='../data/static_datasets/04052022_CLEANED_v5_antibiotics_across_many_classes.csv',
    abx_smiles_col='Smiles',
    abx_name_col='Name',
    cpd_sim_to_abx=0.7,
    train_set_path='../data/training_data/SA/combined_SA_screen_39K.csv',
    train_set_smiles_col='SMILES',
    train_set_name_col='Compound_ID',
    cpd_sim_to_train_set=0.9,
    purch_path='../data/static_datasets/Broad_800K_purchasable.xlsx',
    purch_name_col='BROADID',
    purch_name_needs_split=True,
    tested_before_path='',
    tested_before_name_col='Name',
    tested_before_name_needs_split=False
)